0
# Validation and Analysis
1
2
Tools for data validation, quality assessment, and statistical analysis. Includes data profiling, constraint validation, summary statistics, and visualization utilities for understanding and validating data quality.
3
4
## Capabilities
5
6
### Data Validation
7
8
Validate data against constraints and rules to ensure data quality.
9
10
```python { .api }
11
def validate(table, constraints=None, header=None) -> Table:
12
"""
13
Validate table data against constraints and expected header structure.
14
15
Parameters:
16
- table: Input table to validate
17
- constraints: List of constraint dictionaries with the following keys:
18
- name (str): Name of the constraint
19
- field (str, optional): Field name to validate (omit for row-level constraints)
20
- test (callable, optional): Function to test field values (e.g., int, float)
21
- assertion (callable, optional): Function returning True/False for validation
22
- optional (bool, optional): If True, missing fields are allowed
23
- header: Expected header tuple/list for structure validation
24
25
Returns:
26
Table with columns: name, row, field, value, error
27
28
Example constraint formats:
29
- Field type test: dict(name='foo_int', field='foo', test=int)
30
- Field assertion: dict(name='bar_enum', field='bar', assertion=lambda v: v in ['A', 'B'])
31
- Row-level assertion: dict(name='no_nulls', assertion=lambda row: None not in row)
32
- Optional field: dict(name='optional_field', field='opt', test=str, optional=True)
33
"""
34
```
35
36
### Data Visualization and Inspection
37
38
Tools for examining and understanding table structure and content.
39
40
```python { .api }
41
def look(table, limit=0, vrepr=None, index_header=None, style=None, truncate=None, width=None):
42
"""
43
Print a table in a formatted display.
44
45
Parameters:
46
- table: Input table
47
- limit: Maximum number of rows to display (0 for all)
48
- vrepr: Function for value representation
49
- index_header: Whether to show row indices
50
- style: Display style ('grid', 'simple', 'minimal')
51
- truncate: Maximum length for cell values
52
- width: Maximum display width
53
"""
54
55
def lookall(table, **kwargs):
56
"""Print entire table with formatting options."""
57
58
def lookstr(table, limit=0, **kwargs) -> str:
59
"""
60
Return table as formatted string.
61
62
Parameters:
63
- table: Input table
64
- limit: Maximum number of rows
65
- kwargs: Formatting options
66
67
Returns:
68
Formatted string representation of table
69
"""
70
71
def lookallstr(table, **kwargs) -> str:
72
"""Return entire table as formatted string."""
73
74
def see(table, limit=0, vrepr=None, index_header=None):
75
"""
76
Print table with field details and statistics.
77
78
Parameters:
79
- table: Input table
80
- limit: Maximum number of rows to analyze
81
- vrepr: Function for value representation
82
- index_header: Whether to show row indices
83
"""
84
```
85
86
### Statistical Analysis
87
88
Calculate summary statistics and analyze data distributions.
89
90
```python { .api }
91
def stats(table, field):
92
"""
93
Return basic statistics for a numeric field.
94
95
Parameters:
96
- table: Input table
97
- field: Numeric field name
98
99
Returns:
100
    Named tuple with count, errors, sum, min, max, mean, pvariance, pstd
101
"""
102
103
def limits(table, field):
104
"""
105
Return minimum and maximum values for a field.
106
107
Parameters:
108
- table: Input table
109
- field: Field name
110
111
Returns:
112
Tuple of (min_value, max_value)
113
"""
114
115
def nrows(table) -> int:
116
"""
117
Count the number of data rows in the table.
118
119
Parameters:
120
- table: Input table
121
122
Returns:
123
Integer count of rows (excluding header)
124
"""
125
```
126
127
### Data Profiling
128
129
Analyze data types, patterns, and quality metrics.
130
131
```python { .api }
132
def typeset(table, field):
133
"""
134
Return set of types found in the specified field.
135
136
Parameters:
137
- table: Input table
138
- field: Field name to analyze
139
140
Returns:
141
Set of Python types found in the field
142
"""
143
144
def typecounter(table, field):
145
"""
146
Return a Counter of Python types found in the specified field.
147
148
Parameters:
149
- table: Input table
150
- field: Field name to analyze
151
152
Returns:
153
Counter object with type counts
154
"""
155
156
def typecounts(table, field) -> Table:
157
"""
158
Return a table with type counts for the specified field.
159
160
Parameters:
161
- table: Input table
162
- field: Field name to analyze
163
164
Returns:
165
Table with type names and counts
166
"""
167
168
def stringpatterncounter(table, field):
169
"""
170
Count string patterns in the specified field.
171
172
Parameters:
173
- table: Input table
174
- field: Field name containing string values
175
176
Returns:
177
Counter with string pattern counts
178
"""
179
180
def stringpatterns(table, field) -> Table:
181
"""
182
Return a table with string pattern counts.
183
184
Parameters:
185
- table: Input table
186
- field: Field name containing string values
187
188
Returns:
189
Table with patterns and their frequencies
190
"""
191
192
def rowlengths(table) -> Table:
193
"""
194
Return counts of row lengths in the table.
195
196
Parameters:
197
- table: Input table
198
199
Returns:
200
Table with row lengths and their frequencies
201
"""
202
```
203
204
### Value Analysis
205
206
Analyze value distributions and frequencies.
207
208
```python { .api }
209
def valuecount(table, field, value, missing=None) -> int:
210
"""
211
Count occurrences of a specific value in a field.
212
213
Parameters:
214
- table: Input table
215
- field: Field name to search
216
- value: Value to count
217
- missing: Value to treat as missing
218
219
Returns:
220
Integer count of occurrences
221
"""
222
223
def valuecounter(table, *field, **kwargs):
224
"""
225
Return a Counter of values in the specified field(s).
226
227
Parameters:
228
- table: Input table
229
- field: Field name(s) to count
230
- kwargs: Additional options
231
232
Returns:
233
Counter object with value frequencies
234
"""
235
236
def valuecounts(table, *field, **kwargs) -> Table:
237
"""
238
Return a table with value counts for the specified field(s).
239
240
Parameters:
241
- table: Input table
242
- field: Field name(s) to count
243
- kwargs: Additional options
244
245
Returns:
246
Table with values and their frequencies
247
"""
248
```
249
250
### Data Quality Assessment
251
252
Identify data quality issues and inconsistencies.
253
254
```python { .api }
255
def diffheaders(t1, t2):
256
"""
257
Find differences between table headers.
258
259
Parameters:
260
- t1: First table
261
- t2: Second table
262
263
Returns:
264
Information about header differences
265
"""
266
267
def diffvalues(t1, t2, field):
268
"""
269
Find different values between tables for a field.
270
271
Parameters:
272
- t1: First table
273
- t2: Second table
274
- field: Field name to compare
275
276
Returns:
277
Information about value differences
278
"""
279
280
def parsecounter(table, field, parsers=(('int', int), ('float', float))):
281
"""
282
Count successful parsing attempts with different parsers.
283
284
Parameters:
285
- table: Input table
286
- field: Field name to test parsing
287
- parsers: List of (name, parser_function) tuples
288
289
Returns:
290
Counter with parsing success counts
291
"""
292
293
def parsecounts(table, field, parsers=(('int', int), ('float', float))) -> Table:
294
"""
295
Return a table with parsing attempt counts.
296
297
Parameters:
298
- table: Input table
299
- field: Field name to test parsing
300
- parsers: List of (name, parser_function) tuples
301
302
Returns:
303
Table with parser names and success counts
304
"""
305
```
306
307
### Performance Monitoring
308
309
Monitor and measure data processing performance.
310
311
```python { .api }
312
def progress(table, batchsize=1000, prefix="", out=None) -> Table:
313
"""
314
Report progress while iterating through table.
315
316
Parameters:
317
- table: Input table
318
- batchsize: Number of rows between progress reports
319
- prefix: Prefix for progress messages
320
- out: Output stream for progress messages
321
322
Returns:
323
Table that reports progress during iteration
324
"""
325
326
def clock(table) -> Table:
327
"""
328
Time table operations.
329
330
Parameters:
331
- table: Input table
332
333
Returns:
334
Table that measures processing time
335
"""
336
```
337
338
### Test Data Generation
339
340
Generate test data for validation and testing purposes.
341
342
```python { .api }
343
def randomtable(numflds=5, numrows=100, wait=0, seed=None) -> Table:
344
"""
345
Generate a table with random data.
346
347
Parameters:
348
- numflds: Number of fields
349
- numrows: Number of rows
350
- wait: Delay between row generation (for testing)
351
- seed: Random seed for reproducibility
352
353
Returns:
354
Table with random data
355
"""
356
357
def dummytable(numflds=5, numrows=100) -> Table:
358
"""
359
Generate a table with dummy data.
360
361
Parameters:
362
- numflds: Number of fields
363
- numrows: Number of rows
364
365
Returns:
366
Table with predictable dummy data
367
"""
368
```
369
370
## Usage Examples
371
372
### Data Inspection and Visualization
373
374
```python
375
import petl as etl
376
377
data = etl.fromcsv('dataset.csv')
378
379
# Quick look at data structure
380
etl.look(data, limit=10)
381
382
# Detailed analysis with statistics
383
etl.see(data, limit=20)
384
385
# Get formatted string representation
386
table_str = etl.lookstr(data, limit=5, style='simple')
387
print(table_str)
388
```
389
390
### Statistical Analysis
391
392
```python
393
import petl as etl
394
395
sales = etl.fromcsv('sales.csv')
396
397
# Basic statistics for numeric field
398
sales_stats = etl.stats(sales, 'amount')
399
print(f"Mean: {sales_stats.mean}, StdDev: {sales_stats.pstd}")
400
401
# Find data range
402
min_val, max_val = etl.limits(sales, 'amount')
403
print(f"Range: {min_val} to {max_val}")
404
405
# Count total rows
406
total_rows = etl.nrows(sales)
407
print(f"Total records: {total_rows}")
408
```
409
410
### Data Profiling
411
412
```python
413
import petl as etl
414
415
customer_data = etl.fromcsv('customers.csv')
416
417
# Analyze data types
418
age_types = etl.typecounts(customer_data, 'age')
419
etl.look(age_types)
420
421
# Check string patterns
422
phone_patterns = etl.stringpatterns(customer_data, 'phone')
423
etl.look(phone_patterns)
424
425
# Analyze row structure
426
row_lengths = etl.rowlengths(customer_data)
427
etl.look(row_lengths)
428
429
# Test parsing capabilities
430
income_parsing = etl.parsecounts(customer_data, 'income', [
431
('int', int),
432
('float', float),
433
('currency', lambda x: float(x.replace('$', '').replace(',', '')))
434
])
435
etl.look(income_parsing)
436
```
437
438
### Value Distribution Analysis
439
440
```python
441
import petl as etl
442
443
survey = etl.fromcsv('survey.csv')
444
445
# Count specific value
446
male_count = etl.valuecount(survey, 'gender', 'Male')
447
print(f"Male respondents: {male_count}")
448
449
# Get value frequency table
450
gender_dist = etl.valuecounts(survey, 'gender')
451
etl.look(gender_dist)
452
453
# Cross-tabulation
454
region_gender = etl.valuecounts(survey, 'region', 'gender')
455
etl.look(region_gender)
456
457
# Get counter for programmatic access
458
age_counter = etl.valuecounter(survey, 'age_group')
459
most_common = age_counter.most_common(3)
460
print(f"Top 3 age groups: {most_common}")
461
```
462
463
### Data Quality Assessment
464
465
```python
466
import petl as etl
467
468
# Compare two datasets
469
current_data = etl.fromcsv('current.csv')
470
previous_data = etl.fromcsv('previous.csv')
471
472
# Check header consistency
473
added, subtracted = etl.diffheaders(current_data, previous_data)
if added or subtracted:
    print("Header fields added:", added, "removed:", subtracted)
476
477
# Compare specific field values
478
added_vals, removed_vals = etl.diffvalues(current_data, previous_data, 'status')
if added_vals or removed_vals:
    print("Status values added:", added_vals, "removed:", removed_vals)
481
482
# Validate data format consistency
483
email_types = etl.typeset(current_data, 'email')
484
if len(email_types) > 1:
485
print("Mixed data types found in email field:", email_types)
486
```
487
488
### Data Validation
489
490
```python
491
import petl as etl
492
493
products = etl.fromcsv('products.csv')
494
495
# Define validation constraints
496
constraints = [
    dict(name='price_positive', field='price',
         assertion=lambda x: x is not None and x > 0),
    dict(name='category_enum', field='category',
         assertion=lambda x: x in ['Electronics', 'Clothing', 'Books']),
    dict(name='stock_nonneg', field='stock',
         assertion=lambda x: isinstance(x, int) and x >= 0)
]
501
502
# Validate data
503
validation_results = etl.validate(products, constraints)
504
etl.look(validation_results)
505
506
# Count validation failures (each row in the result describes one problem)
failure_count = etl.nrows(validation_results)
print(f"Validation failures: {failure_count}")
510
```
511
512
### Performance Monitoring
513
514
```python
515
import petl as etl
516
517
large_dataset = etl.fromcsv('large_file.csv')
518
519
# Monitor processing progress
520
processed = (large_dataset
521
.progress(batchsize=10000, prefix="Processing: ")
522
.select('status', 'active')
523
.convert('amount', float)
524
.sort('timestamp'))
525
526
# Time the operations
527
timed_result = etl.clock(processed)
528
529
# Process with timing
530
result = etl.tocsv(timed_result, 'output.csv')
531
```