0
# Validation and Analysis
1
2
Tools for data validation, quality assessment, and statistical analysis. Includes data profiling, constraint validation, summary statistics, and visualization utilities for understanding and validating data quality.
3
4
## Capabilities
5
6
### Data Validation
7
8
Validate data against constraints and rules to ensure data quality.
9
10
```python { .api }
11
def validate(table, constraints=None, header=None) -> Table:
12
"""
13
Validate table data against constraints and expected header structure.
14
15
Parameters:
16
- table: Input table to validate
17
- constraints: List of constraint dictionaries with the following keys:
18
- name (str): Name of the constraint
19
- field (str, optional): Field name to validate (omit for row-level constraints)
20
- test (callable, optional): Function to test field values (e.g., int, float)
21
- assertion (callable, optional): Function returning True/False for validation
22
- optional (bool, optional): If True, missing fields are allowed
23
- header: Expected header tuple/list for structure validation
24
25
Returns:
26
Table with columns: name, row, field, value, error
27
28
Example constraint formats:
29
- Field type test: dict(name='foo_int', field='foo', test=int)
30
- Field assertion: dict(name='bar_enum', field='bar', assertion=lambda v: v in ['A', 'B'])
31
- Row-level assertion: dict(name='no_nulls', assertion=lambda row: None not in row)
32
- Optional field: dict(name='optional_field', field='opt', test=str, optional=True)
33
"""
34
```
35
36
### Data Visualization and Inspection
37
38
Tools for examining and understanding table structure and content.
39
40
```python { .api }
41
def look(table, limit=0, vrepr=None, index_header=None, style=None, truncate=None, width=None):
42
"""
43
Print a table in a formatted display.
44
45
Parameters:
46
- table: Input table
47
- limit: Maximum number of rows to display (0 for all)
48
- vrepr: Function for value representation
49
- index_header: Whether to show row indices
50
- style: Display style ('grid', 'simple', 'minimal')
51
- truncate: Maximum length for cell values
52
- width: Maximum display width
53
"""
54
55
def lookall(table, **kwargs):
56
"""Print entire table with formatting options."""
57
58
def lookstr(table, limit=0, **kwargs) -> str:
59
"""
60
Return table as formatted string.
61
62
Parameters:
63
- table: Input table
64
- limit: Maximum number of rows
65
- kwargs: Formatting options
66
67
Returns:
68
Formatted string representation of table
69
"""
70
71
def lookallstr(table, **kwargs) -> str:
72
"""Return entire table as formatted string."""
73
74
def see(table, limit=0, vrepr=None, index_header=None):
75
"""
76
Print table with field details and statistics.
77
78
Parameters:
79
- table: Input table
80
- limit: Maximum number of rows to analyze
81
- vrepr: Function for value representation
82
- index_header: Whether to show row indices
83
"""
84
```
85
86
### Statistical Analysis
87
88
Calculate summary statistics and analyze data distributions.
89
90
```python { .api }
91
def stats(table, field):
92
"""
93
Return basic statistics for a numeric field.
94
95
Parameters:
96
- table: Input table
97
- field: Numeric field name
98
99
Returns:
100
    Named tuple with count, errors, sum, min, max, mean, pvariance, pstd
101
"""
102
103
def limits(table, field):
104
"""
105
Return minimum and maximum values for a field.
106
107
Parameters:
108
- table: Input table
109
- field: Field name
110
111
Returns:
112
Tuple of (min_value, max_value)
113
"""
114
115
def nrows(table) -> int:
116
"""
117
Count the number of data rows in the table.
118
119
Parameters:
120
- table: Input table
121
122
Returns:
123
Integer count of rows (excluding header)
124
"""
125
```
126
127
### Data Profiling
128
129
Analyze data types, patterns, and quality metrics.
130
131
```python { .api }
132
def typeset(table, field):
133
"""
134
Return set of types found in the specified field.
135
136
Parameters:
137
- table: Input table
138
- field: Field name to analyze
139
140
Returns:
141
Set of Python types found in the field
142
"""
143
144
def typecounter(table, field):
145
"""
146
Return a Counter of Python types found in the specified field.
147
148
Parameters:
149
- table: Input table
150
- field: Field name to analyze
151
152
Returns:
153
Counter object with type counts
154
"""
155
156
def typecounts(table, field) -> Table:
157
"""
158
Return a table with type counts for the specified field.
159
160
Parameters:
161
- table: Input table
162
- field: Field name to analyze
163
164
Returns:
165
Table with type names and counts
166
"""
167
168
def stringpatterncounter(table, field):
169
"""
170
Count string patterns in the specified field.
171
172
Parameters:
173
- table: Input table
174
- field: Field name containing string values
175
176
Returns:
177
Counter with string pattern counts
178
"""
179
180
def stringpatterns(table, field) -> Table:
181
"""
182
Return a table with string pattern counts.
183
184
Parameters:
185
- table: Input table
186
- field: Field name containing string values
187
188
Returns:
189
Table with patterns and their frequencies
190
"""
191
192
def rowlengths(table) -> Table:
193
"""
194
Return counts of row lengths in the table.
195
196
Parameters:
197
- table: Input table
198
199
Returns:
200
Table with row lengths and their frequencies
201
"""
202
```
203
204
### Value Analysis
205
206
Analyze value distributions and frequencies.
207
208
```python { .api }
209
def valuecount(table, field, value, missing=None) -> int:
210
"""
211
Count occurrences of a specific value in a field.
212
213
Parameters:
214
- table: Input table
215
- field: Field name to search
216
- value: Value to count
217
- missing: Value to treat as missing
218
219
Returns:
220
Integer count of occurrences
221
"""
222
223
def valuecounter(table, *field, **kwargs):
224
"""
225
Return a Counter of values in the specified field(s).
226
227
Parameters:
228
- table: Input table
229
- field: Field name(s) to count
230
- kwargs: Additional options
231
232
Returns:
233
Counter object with value frequencies
234
"""
235
236
def valuecounts(table, *field, **kwargs) -> Table:
237
"""
238
Return a table with value counts for the specified field(s).
239
240
Parameters:
241
- table: Input table
242
- field: Field name(s) to count
243
- kwargs: Additional options
244
245
Returns:
246
Table with values and their frequencies
247
"""
248
```
249
250
### Data Quality Assessment
251
252
Identify data quality issues and inconsistencies.
253
254
```python { .api }
255
def diffheaders(t1, t2):
256
"""
257
Find differences between table headers.
258
259
Parameters:
260
- t1: First table
261
- t2: Second table
262
263
Returns:
264
Information about header differences
265
"""
266
267
def diffvalues(t1, t2, field):
268
"""
269
Find different values between tables for a field.
270
271
Parameters:
272
- t1: First table
273
- t2: Second table
274
- field: Field name to compare
275
276
Returns:
277
Information about value differences
278
"""
279
280
def parsecounter(table, field, parsers=(('int', int), ('float', float))):
281
"""
282
Count successful parsing attempts with different parsers.
283
284
Parameters:
285
- table: Input table
286
- field: Field name to test parsing
287
- parsers: List of (name, parser_function) tuples
288
289
Returns:
290
Counter with parsing success counts
291
"""
292
293
def parsecounts(table, field, parsers=(('int', int), ('float', float))) -> Table:
294
"""
295
Return a table with parsing attempt counts.
296
297
Parameters:
298
- table: Input table
299
- field: Field name to test parsing
300
- parsers: List of (name, parser_function) tuples
301
302
Returns:
303
Table with parser names and success counts
304
"""
305
```
306
307
### Performance Monitoring
308
309
Monitor and measure data processing performance.
310
311
```python { .api }
312
def progress(table, batchsize=1000, prefix="", out=None) -> Table:
313
"""
314
Report progress while iterating through table.
315
316
Parameters:
317
- table: Input table
318
- batchsize: Number of rows between progress reports
319
- prefix: Prefix for progress messages
320
- out: Output stream for progress messages
321
322
Returns:
323
Table that reports progress during iteration
324
"""
325
326
def clock(table) -> Table:
327
"""
328
Time table operations.
329
330
Parameters:
331
- table: Input table
332
333
Returns:
334
Table that measures processing time
335
"""
336
```
337
338
### Test Data Generation
339
340
Generate test data for validation and testing purposes.
341
342
```python { .api }
343
def randomtable(numflds=5, numrows=100, wait=0, seed=None) -> Table:
344
"""
345
Generate a table with random data.
346
347
Parameters:
348
- numflds: Number of fields
349
- numrows: Number of rows
350
- wait: Delay between row generation (for testing)
351
- seed: Random seed for reproducibility
352
353
Returns:
354
Table with random data
355
"""
356
357
def dummytable(numflds=5, numrows=100) -> Table:
358
"""
359
Generate a table with dummy data.
360
361
Parameters:
362
- numflds: Number of fields
363
- numrows: Number of rows
364
365
Returns:
366
Table with predictable dummy data
367
"""
368
```
369
370
## Usage Examples
371
372
### Data Inspection and Visualization
373
374
```python
375
import petl as etl
376
377
data = etl.fromcsv('dataset.csv')
378
379
# Quick look at data structure
380
etl.look(data, limit=10)
381
382
# Detailed analysis with statistics
383
etl.see(data, limit=20)
384
385
# Get formatted string representation
386
table_str = etl.lookstr(data, limit=5, style='simple')
387
print(table_str)
388
```
389
390
### Statistical Analysis
391
392
```python
393
import petl as etl
394
395
sales = etl.fromcsv('sales.csv')
396
397
# Basic statistics for numeric field
398
sales_stats = etl.stats(sales, 'amount')
399
print(f"Mean: {sales_stats.mean}, StdDev: {sales_stats.pstd}")
400
401
# Find data range
402
min_val, max_val = etl.limits(sales, 'amount')
403
print(f"Range: {min_val} to {max_val}")
404
405
# Count total rows
406
total_rows = etl.nrows(sales)
407
print(f"Total records: {total_rows}")
408
```
409
410
### Data Profiling
411
412
```python
413
import petl as etl
414
415
customer_data = etl.fromcsv('customers.csv')
416
417
# Analyze data types
418
age_types = etl.typecounts(customer_data, 'age')
419
etl.look(age_types)
420
421
# Check string patterns
422
phone_patterns = etl.stringpatterns(customer_data, 'phone')
423
etl.look(phone_patterns)
424
425
# Analyze row structure
426
row_lengths = etl.rowlengths(customer_data)
427
etl.look(row_lengths)
428
429
# Test parsing capabilities
430
income_parsing = etl.parsecounts(customer_data, 'income', [
431
('int', int),
432
('float', float),
433
('currency', lambda x: float(x.replace('$', '').replace(',', '')))
434
])
435
etl.look(income_parsing)
436
```
437
438
### Value Distribution Analysis
439
440
```python
441
import petl as etl
442
443
survey = etl.fromcsv('survey.csv')
444
445
# Count specific value
446
male_count = etl.valuecount(survey, 'gender', 'Male')
447
print(f"Male respondents: {male_count}")
448
449
# Get value frequency table
450
gender_dist = etl.valuecounts(survey, 'gender')
451
etl.look(gender_dist)
452
453
# Cross-tabulation
454
region_gender = etl.valuecounts(survey, 'region', 'gender')
455
etl.look(region_gender)
456
457
# Get counter for programmatic access
458
age_counter = etl.valuecounter(survey, 'age_group')
459
most_common = age_counter.most_common(3)
460
print(f"Top 3 age groups: {most_common}")
461
```
462
463
### Data Quality Assessment
464
465
```python
466
import petl as etl
467
468
# Compare two datasets
469
current_data = etl.fromcsv('current.csv')
470
previous_data = etl.fromcsv('previous.csv')
471
472
# Check header consistency
473
added, subtracted = etl.diffheaders(current_data, previous_data)
if added or subtracted:
    print("Header fields added:", added, "removed:", subtracted)
476
477
# Compare specific field values
478
added_vals, removed_vals = etl.diffvalues(current_data, previous_data, 'status')
if added_vals or removed_vals:
    print("Status values added:", added_vals, "removed:", removed_vals)
481
482
# Validate data format consistency
483
email_types = etl.typeset(current_data, 'email')
484
if len(email_types) > 1:
485
print("Mixed data types found in email field:", email_types)
486
```
487
488
### Data Validation
489
490
```python
491
import petl as etl
492
493
products = etl.fromcsv('products.csv')
494
495
# Define validation constraints
496
constraints = [
    dict(name='price_positive', field='price',
         assertion=lambda x: x is not None and x > 0),
    dict(name='category_enum', field='category',
         assertion=lambda x: x in ['Electronics', 'Clothing', 'Books']),
    dict(name='stock_nonneg', field='stock',
         assertion=lambda x: isinstance(x, int) and x >= 0)
]
501
502
# Validate data
503
validation_results = etl.validate(products, constraints)
504
etl.look(validation_results)
505
506
# Count validation failures (each row in the result describes one problem)
failure_count = etl.nrows(validation_results)
print(f"Validation failures: {failure_count}")
510
```
511
512
### Performance Monitoring
513
514
```python
515
import petl as etl
516
517
large_dataset = etl.fromcsv('large_file.csv')
518
519
# Monitor processing progress
520
processed = (large_dataset
521
.progress(batchsize=10000, prefix="Processing: ")
522
.select('status', 'active')
523
.convert('amount', float)
524
.sort('timestamp'))
525
526
# Time the operations
527
timed_result = etl.clock(processed)
528
529
# Process with timing
530
result = etl.tocsv(timed_result, 'output.csv')
531
```