# Data Input/Output

Comprehensive support for reading and writing data from various sources including CSV, JSON, XML, Excel, databases, and many specialized formats. PETL provides a unified interface for data I/O operations with support for different encodings, formats, and data sources.

## Capabilities

### CSV and TSV Operations

Read and write comma-separated values and tab-separated values with extensive formatting options.

```python { .api }
def fromcsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
    """
    Extract a table from a CSV file or source.

    Parameters:
    - source: File path, file object, or URL
    - encoding: Character encoding (default: system default)
    - errors: Error handling strategy ('strict', 'ignore', 'replace')
    - header: Row number for header (0-based) or None for no header
    - csvargs: Additional CSV reader arguments (delimiter, quotechar, etc.)

    Returns:
        Table object
    """

def fromtsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
    """Extract a table from a TSV (tab-separated values) file."""

def tocsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
    """
    Write a table to a CSV file.

    Parameters:
    - table: Input table
    - source: Output file path or file object
    - encoding: Character encoding
    - errors: Error handling strategy
    - write_header: Whether to write header row
    - csvargs: Additional CSV writer arguments
    """

def totsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
    """Write a table to a TSV file."""

def appendcsv(table, source=None, encoding=None, errors='strict', **csvargs):
    """Append a table to an existing CSV file."""

def appendtsv(table, source=None, encoding=None, errors='strict', **csvargs):
    """Append a table to an existing TSV file."""

def teecsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
    """Write a table to CSV while returning the table for further processing."""

def teetsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
    """Write a table to TSV while returning the table for further processing."""
```

### JSON Operations

Handle JSON data in various formats including JSON Lines and arrays of objects.

```python { .api }
def fromjson(source, *args, **kwargs) -> Table:
    """
    Extract a table from JSON lines format or array of objects.

    Parameters:
    - source: File path, file object, or URL containing JSON data
    - args: Additional arguments passed to json.loads
    - kwargs: Additional keyword arguments

    Returns:
        Table object
    """

def fromdicts(dicts, header=None, sample=1000, missing=None) -> Table:
    """
    Construct a table from an iterable of dictionaries.

    Parameters:
    - dicts: Iterable of dictionary objects
    - header: Explicit field names (optional)
    - sample: Number of records to sample for field detection
    - missing: Value for missing fields

    Returns:
        Table object
    """

def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
    """
    Write a table to JSON format.

    Parameters:
    - table: Input table
    - source: Output file path or file object
    - prefix: String to prepend to output
    - suffix: String to append to output
    - args: Additional arguments passed to json.dumps
    - kwargs: Additional keyword arguments
    """

def tojsonarrays(table, source=None, prefix=None, suffix=None, output_header=False, *args, **kwargs):
    """
    Write a table to JSON array format.

    Parameters:
    - table: Input table
    - source: Output file path or file object
    - prefix: String to prepend to output
    - suffix: String to append to output
    - output_header: Whether to include header in output
    - args: Additional arguments
    - kwargs: Additional keyword arguments
    """
```

### Database Operations

Connect to and work with various database systems using SQLAlchemy.

```python { .api }
def fromdb(dbo, query, *args, **kwargs) -> Table:
    """
    Extract a table from a database query.

    Parameters:
    - dbo: Database connection object (SQLAlchemy engine/connection)
    - query: SQL query string
    - args: Query parameters
    - kwargs: Additional arguments

    Returns:
        Table object with query results
    """

def todb(table, dbo, tablename, schema=None, commit=True, create=False, drop=False,
         constraints=True, metadata=None, dialect=None, typeconv=None):
    """
    Write a table to a database.

    Parameters:
    - table: Input table
    - dbo: Database connection object
    - tablename: Target table name
    - schema: Database schema name
    - commit: Whether to commit transaction
    - create: Whether to create table if it doesn't exist
    - drop: Whether to drop existing table
    - constraints: Whether to create constraints
    - metadata: SQLAlchemy metadata object
    - dialect: Database dialect
    - typeconv: Type conversion dictionary
    """

def appenddb(table, dbo, tablename, schema=None, commit=True):
    """
    Append a table to a database table.

    Parameters:
    - table: Input table
    - dbo: Database connection object
    - tablename: Target table name
    - schema: Database schema name
    - commit: Whether to commit transaction
    """
```

### Excel Operations

Read and write Microsoft Excel files in both legacy (.xls) and modern (.xlsx) formats.

```python { .api }
def fromxls(filename, sheet=None, use_view=True, **kwargs) -> Table:
    """
    Extract a table from an Excel .xls file.

    Parameters:
    - filename: Path to Excel file
    - sheet: Sheet name or index (default: first sheet)
    - use_view: Whether to use optimized view
    - kwargs: Additional arguments

    Returns:
        Table object
    """

def fromxlsx(filename, sheet=None, range_string=None, min_row=None, max_row=None,
             min_col=None, max_col=None, read_only=False, **kwargs) -> Table:
    """
    Extract a table from an Excel .xlsx file.

    Parameters:
    - filename: Path to Excel file
    - sheet: Sheet name or index
    - range_string: Excel range string (e.g., 'A1:C10')
    - min_row, max_row: Row range limits
    - min_col, max_col: Column range limits
    - read_only: Whether to open in read-only mode
    - kwargs: Additional arguments

    Returns:
        Table object
    """

def toxls(table, filename, sheet=None, encoding=None, style_compression=0, **kwargs):
    """Write a table to an Excel .xls file."""

def toxlsx(table, filename, sheet=None, write_header=True, mode="replace"):
    """Write a table to an Excel .xlsx file."""

def appendxlsx(table, filename, sheet=None, write_header=False):
    """Append a table to an Excel .xlsx file."""
```

### XML and HTML Operations

Process XML and HTML data for web scraping and data integration.

```python { .api }
def fromxml(source, *args, **kwargs) -> Table:
    """
    Extract a table from an XML file.

    Parameters:
    - source: XML file path or file object
    - args: Additional arguments for XML parsing
    - kwargs: Additional keyword arguments

    Returns:
        Table object
    """

def toxml(table, target=None, root='table', head='row', **kwargs):
    """
    Write a table to XML format.

    Parameters:
    - table: Input table
    - target: Output file path or file object
    - root: Root element name
    - head: Row element name
    - kwargs: Additional XML formatting options
    """

def tohtml(table, source=None, encoding=None, errors='strict', caption=None,
           vrepr=str, lineterminator='\r\n', class_=None, **kwargs):
    """
    Write a table to HTML format.

    Parameters:
    - table: Input table
    - source: Output file path or file object
    - encoding: Character encoding
    - errors: Error handling strategy
    - caption: HTML table caption
    - vrepr: Value representation function
    - lineterminator: Line ending character
    - class_: CSS class for table element
    - kwargs: Additional HTML formatting options
    """

def teehtml(table, source=None, encoding=None, errors='strict', caption=None,
            vrepr=str, lineterminator='\r\n', class_=None, **kwargs) -> Table:
    """Write a table to HTML while returning the table for further processing."""
```

### Specialized Formats

Support for various specialized data formats and scientific computing libraries.

```python { .api }
def fromarray(array) -> Table:
    """
    Construct a table from a NumPy array.

    Parameters:
    - array: NumPy array

    Returns:
        Table object
    """

def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convert a table to a NumPy array.

    Parameters:
    - table: Input table
    - dtype: NumPy data type
    - count: Maximum number of rows (-1 for all)
    - sample: Number of rows to sample for type detection

    Returns:
        NumPy array
    """

def fromdataframe(df, include_index=False) -> Table:
    """
    Construct a table from a Pandas DataFrame.

    Parameters:
    - df: Pandas DataFrame
    - include_index: Whether to include DataFrame index as a column

    Returns:
        Table object
    """

def todataframe(table, index=None, exclude=None, columns=None,
                coerce_float=False, nrows=None):
    """
    Convert a table to a Pandas DataFrame.

    Parameters:
    - table: Input table
    - index: Column to use as DataFrame index
    - exclude: Columns to exclude
    - columns: Column names to use
    - coerce_float: Whether to coerce numeric strings to float
    - nrows: Maximum number of rows

    Returns:
        Pandas DataFrame
    """

def frompickle(source=None) -> Table:
    """Extract a table from a pickle file."""

def topickle(table, source=None, protocol=-1, write_header=True):
    """Write a table to a pickle file."""

def fromavro(source, limit=None, skips=0, **avro_args) -> Table:
    """Extract a table from an Apache Avro file."""

def toavro(table, target, schema=None, sample=9, mode='wb', **avro_args):
    """Write a table to Apache Avro format."""

def fromhdf5(source, where=None, name=None, condition=None,
             condvars=None, start=None, stop=None, step=None, **kwargs) -> Table:
    """
    Extract a table from an HDF5 file using PyTables.

    Parameters:
    - source: HDF5 file path or file object
    - where: Path to HDF5 table within file
    - name: Name of table within HDF5 file
    - condition: Selection condition
    - condvars: Variables for selection condition
    - start: Start index for selection
    - stop: Stop index for selection
    - step: Step size for selection
    - kwargs: Additional PyTables arguments

    Returns:
        Table object
    """

def tohdf5(table, source, where=None, name='table', create=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000, **kwargs):
    """Write a table to HDF5 format using PyTables."""

def fromhdf5sorted(source, *args, **kwargs) -> Table:
    """Extract a table from HDF5 with sorted index reading."""

def frombcolz(source, expression=None, outcols=None, limit=None, skip=0) -> Table:
    """
    Extract a table from a Bcolz compressed array.

    Parameters:
    - source: Bcolz source file or object
    - expression: Query expression for filtering
    - outcols: Output columns to select
    - limit: Maximum number of rows to read
    - skip: Number of rows to skip

    Returns:
        Table object
    """

def tobcolz(table, source=None, mode='w', **bcolz_args):
    """Write a table to Bcolz compressed format."""

def fromgsheet(url=None, title=None, sheet_name=None, encoding='utf-8',
               auth_method='service_account', scopes=None, credentials=None,
               **gsheet_args) -> Table:
    """
    Extract a table from a Google Sheets document.

    Parameters:
    - url: Google Sheets URL
    - title: Sheet title
    - sheet_name: Name of specific sheet
    - encoding: Character encoding
    - auth_method: Authentication method
    - scopes: OAuth scopes
    - credentials: Authentication credentials
    - gsheet_args: Additional Google Sheets arguments

    Returns:
        Table object
    """

def togsheet(table, url=None, title=None, sheet_name=None,
             auth_method='service_account', scopes=None, credentials=None,
             **gsheet_args):
    """Write a table to Google Sheets."""
```

### Data Sources

Flexible data source classes for handling various input/output scenarios.

```python { .api }
class FileSource:
    """File source for reading/writing local files."""
    def __init__(self, filename, mode='rb'): ...

class URLSource:
    """URL-based data source for remote files."""
    def __init__(self, url, **kwargs): ...

class StringSource:
    """In-memory string source."""
    def __init__(self, string_data): ...

class GzipSource:
    """Compressed gzip file source."""
    def __init__(self, filename, mode='rb'): ...

class RemoteSource:
    """Remote file system source using fsspec."""
    def __init__(self, url, **kwargs): ...

class SMBSource:
    """SMB/CIFS network share source."""
    def __init__(self, url, **kwargs): ...
```

## Usage Examples

### Working with CSV Files

```python
import petl as etl

# Read CSV with custom options
table = etl.fromcsv('data.csv', delimiter=';', encoding='utf-8')

# Write CSV with specific formatting
etl.tocsv(table, 'output.csv', delimiter='|', quotechar='"')

# Chain operations with CSV I/O
# Note: fromcsv yields string values, so cast before numeric comparison.
result = (etl.fromcsv('input.csv')
          .select('age', lambda age: int(age) > 18)
          .cut('name', 'age')
          .sort('name'))
result.tocsv('adults.csv')
```

### Database Integration

```python
import petl as etl
from sqlalchemy import create_engine

# Connect to database
engine = create_engine('postgresql://user:pass@host:port/db')

# Extract data from database
table = etl.fromdb(engine, 'SELECT * FROM customers WHERE active = true')

# Process and write back to database
processed = table.convert('email', str.lower).convert('phone', lambda x: x.replace('-', ''))
etl.todb(processed, engine, 'customers_clean', create=True)
```

### JSON Data Processing

```python
import petl as etl

# Read JSON Lines file
table = etl.fromjson('data.jsonl')

# Convert list of dictionaries to table
data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
table = etl.fromdicts(data)

# Write to JSON format
etl.tojson(table, 'output.json')
```

### Excel File Operations

```python
import petl as etl

# Read specific sheet and range
table = etl.fromxlsx('report.xlsx', sheet='Sales', range_string='A1:F100')

# Write to Excel with formatting
etl.toxlsx(table, 'summary.xlsx', sheet='Results')

# Append to existing Excel file
etl.appendxlsx(additional_data, 'summary.xlsx', sheet='Results')
```