# Data Input/Output

Comprehensive I/O capabilities for reading and writing data in a wide range of formats, including CSV, Excel, JSON, SQL databases, HDF5, Parquet, and several statistical package formats.

## Core Imports

```python
import pandas as pd
from pandas import read_csv, read_excel, read_json, read_sql
```

## Capabilities

### CSV and Text Files

Read and write comma-separated values and other delimited text files.

```python { .api }
def read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, date_format=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors='strict', dialect=None, on_bad_lines='error', delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read a comma-separated values (CSV) file into DataFrame.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - sep: str, delimiter to use
    - header: int, list of int, default 'infer', row(s) to use as column names
    - names: array-like, list of column names to use
    - index_col: int, str, sequence of int/str, or False, column(s) to use as row labels
    - usecols: list-like or callable, return subset of columns
    - dtype: type name or dict of column -> type, data type for data or columns
    - na_values: scalar, str, list-like, or dict, additional strings to recognize as NA/NaN
    - parse_dates: bool, list of int or column names, list of lists, or dict
    - chunksize: int, return TextFileReader object for iteration

    Returns:
    DataFrame or TextFileReader
    """

def read_table(filepath_or_buffer, sep='\t', delimiter=None, header='infer', names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, date_format=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors='strict', dialect=None, on_bad_lines='error', delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read a general delimited file into DataFrame.

    Identical to read_csv except that the default delimiter is the tab character.
    """

def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, infer_nrows=100, dtype_backend='numpy_nullable', iterator=False, chunksize=None, **kwargs):
    """
    Read a table of fixed-width formatted lines into DataFrame.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - colspecs: list of tuple (int, int) or 'infer', column specifications
    - widths: list of int, width of each field
    - infer_nrows: int, number of rows to consider when letting the parser determine colspecs

    Returns:
    DataFrame or TextFileReader
    """
```
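For very large files, `chunksize` turns `read_csv` into an iterator of DataFrames so the whole file never has to fit in memory. A minimal sketch (the file name `data.csv` and its columns are hypothetical):

```python
import pandas as pd

# Stream a (hypothetical) CSV in 100,000-row chunks, parsing dates and
# keeping only the columns we need, then combine the partial aggregates.
totals = []
for chunk in pd.read_csv(
    "data.csv",
    usecols=["date", "amount"],
    parse_dates=["date"],
    chunksize=100_000,
):
    totals.append(chunk.groupby(chunk["date"].dt.year)["amount"].sum())

yearly = pd.concat(totals).groupby(level=0).sum()
```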
### Excel Files

Read and write Microsoft Excel files (.xlsx, .xls).

```python { .api }
def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, parse_dates=False, date_parser=None, date_format=None, thousands=None, decimal='.', comment=None, skipfooter=0, storage_options=None, dtype_backend='numpy_nullable', engine_kwargs=None):
    """
    Read an Excel file into a pandas DataFrame.

    Parameters:
    - io: str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
    - sheet_name: str, int, list, or None, names of sheets or sheet positions to read
    - header: int, list of int, default 0, row(s) to use as column names
    - names: array-like, list of column names to use
    - index_col: int, str, list of int, default None, column(s) to use as row labels
    - usecols: str, list-like, or callable, return subset of columns
    - dtype: type name or dict of column -> type, data type for data or columns
    - skiprows: list-like, rows to skip at the beginning
    - nrows: int, number of rows to parse

    Returns:
    DataFrame or dict of DataFrames
    """

class ExcelFile:
    def __init__(self, path_or_buffer, engine=None, storage_options=None, engine_kwargs=None):
        """
        Class for parsing tabular Excel sheets into DataFrame objects.

        Parameters:
        - path_or_buffer: str, bytes, path object, or file-like object
        - engine: str, engine to use for reading ('openpyxl', 'xlrd', 'odf', 'pyxlsb')
        """

    def parse(self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, converters=None, dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=None, mangle_dupe_cols=True, dtype_backend='numpy_nullable', **kwds):
        """Parse specified sheet(s) into DataFrame."""

    @property
    def sheet_names(self):
        """List of sheet names in the workbook."""

class ExcelWriter:
    def __init__(self, path, engine=None, date_format=None, datetime_format=None, mode='w', storage_options=None, if_sheet_exists=None, engine_kwargs=None):
        """
        Class for writing DataFrame objects into Excel sheets.

        Parameters:
        - path: str or file-like object, file path or existing ExcelWriter
        - engine: str, engine to use for writing ('openpyxl', 'xlsxwriter')
        - mode: str, file mode: 'w' to write, 'a' to append
        """

    def close(self):
        """Close the contained workbook and write it to disk."""

    def save(self):
        """Save workbook to disk (removed in pandas 2.0; use close() instead)."""
```
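`ExcelFile` avoids reopening a workbook when several sheets are needed, and `ExcelWriter` collects several DataFrames into one file. A usage sketch; the file and sheet names are hypothetical, and an engine such as openpyxl must be installed:

```python
import pandas as pd

# Write two DataFrames to separate sheets of one workbook.
df_a = pd.DataFrame({"x": [1, 2]})
df_b = pd.DataFrame({"y": [3, 4]})
with pd.ExcelWriter("report.xlsx", engine="openpyxl") as writer:
    df_a.to_excel(writer, sheet_name="A", index=False)
    df_b.to_excel(writer, sheet_name="B", index=False)

# Read it back, parsing each sheet exactly once.
with pd.ExcelFile("report.xlsx") as xls:
    frames = {name: xls.parse(name) for name in xls.sheet_names}
```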
### JSON Files

Read and write JavaScript Object Notation (JSON) format.

```python { .api }
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, convert_axes=None, convert_dates=True, keep_default_dates=True, precise_float=False, date_unit=None, encoding=None, encoding_errors='strict', lines=False, chunksize=None, compression='infer', nrows=None, storage_options=None, dtype_backend='numpy_nullable', engine='ujson'):
    """
    Convert a JSON string to pandas object.

    Parameters:
    - path_or_buf: str, path object, file-like object, or None
    - orient: str, indication of expected JSON string format
    - typ: str, type of object to recover ('frame' or 'series')
    - dtype: bool or dict, data type for data or columns
    - convert_dates: bool or list of str, dates to parse
    - lines: bool, read file as one JSON object per line
    - chunksize: int, return JsonReader object for iteration

    Returns:
    Series, DataFrame, or JsonReader
    """

def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', sep='.', max_level=None):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters:
    - data: dict or list of dicts, unserialized JSON objects
    - record_path: str or list of str, path in each object to list of records
    - meta: list of str, fields to use as metadata for each record
    - sep: str, nested record separator
    - max_level: int, max number of levels to normalize

    Returns:
    DataFrame
    """
```
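`json_normalize` is the usual tool for nested API payloads: `record_path` points at the embedded list and `meta` copies enclosing fields onto each row. A small self-contained sketch:

```python
import pandas as pd

payload = [
    {"user": "ada", "orders": [{"id": 1, "total": 9.5}, {"id": 2, "total": 3.0}]},
    {"user": "bob", "orders": [{"id": 3, "total": 7.25}]},
]

# One row per order, with the enclosing "user" repeated onto each row.
df = pd.json_normalize(payload, record_path="orders", meta="user")
#    id  total user
# 0   1   9.50  ada
# 1   2   3.00  ada
# 2   3   7.25  bob
```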
### SQL Databases

Read data from and write data to SQL databases.

```python { .api }
def read_sql(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, columns=None, chunksize=None, dtype_backend='numpy_nullable', dtype=None):
    """
    Read SQL query or database table into a DataFrame.

    Parameters:
    - sql: str or SQLAlchemy Selectable, SQL query or table name
    - con: ADBC connection, SQLAlchemy connectable, str, or sqlite3 connection
    - index_col: str or list of str, column(s) to use as row labels
    - coerce_float: bool, attempt to convert values of non-string, non-numeric objects to floating point
    - params: list, tuple, mapping, or None, parameters to pass to the execute method
    - parse_dates: list or dict, columns to parse as dates
    - chunksize: int, number of rows to include in each chunk

    Returns:
    DataFrame or Iterator[DataFrame]
    """

def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, chunksize=None, dtype=None, dtype_backend='numpy_nullable'):
    """
    Read SQL query into a DataFrame.

    Parameters:
    - sql: str or SQLAlchemy Selectable, SQL query to be executed
    - con: ADBC connection, SQLAlchemy connectable, str, or sqlite3 connection

    Returns:
    DataFrame or Iterator[DataFrame]
    """

def read_sql_table(table_name, con, schema=None, index_col=None, coerce_float=True, parse_dates=None, columns=None, chunksize=None, dtype_backend='numpy_nullable'):
    """
    Read SQL database table into a DataFrame.

    Parameters:
    - table_name: str, name of SQL table in database
    - con: ADBC connection, SQLAlchemy connectable, or str (plain sqlite3 connections are not supported for whole-table reads)
    - schema: str, name of SQL schema in database to query
    - columns: list, list of column names to select from SQL table

    Returns:
    DataFrame or Iterator[DataFrame]
    """
```
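A self-contained round trip against an in-memory SQLite database; sqlite3 ships with Python, so this runs without SQLAlchemy:

```python
import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
pd.DataFrame({"name": ["ada", "bob"], "score": [91, 78]}).to_sql(
    "results", con, index=False
)

# Parameterized query; passing params instead of formatting the string
# avoids SQL injection.
df = pd.read_sql("SELECT * FROM results WHERE score > ?", con, params=(80,))
con.close()
```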
### Binary Formats

Read and write binary file formats for efficient storage.

```python { .api }
def read_pickle(filepath_or_buffer, compression='infer', storage_options=None):
    """
    Load pickled pandas object (or any object) from file.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - compression: str or dict, compression type and options

    Returns:
    unpickled object
    """

def to_pickle(obj, filepath_or_buffer, compression='infer', protocol=5, storage_options=None):
    """
    Pickle (serialize) object to file.

    Parameters:
    - obj: any object, object to pickle
    - filepath_or_buffer: str, path object, or file-like object
    - compression: str or dict, compression type and options
    - protocol: int, pickle protocol to use
    """

def read_hdf(path_or_buf, key=None, mode='r', errors='strict', where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, dtype_backend='numpy_nullable', **kwargs):
    """
    Read from an HDF5 store, closing it afterwards if this call opened it.

    Parameters:
    - path_or_buf: str, path object, pandas.HDFStore, or file-like object
    - key: str, identifier for group in store
    - mode: str, mode to open file
    - where: list of Term, criteria to select
    - start: int, row number to start selection
    - stop: int, row number to stop selection
    - columns: list, list of columns to return

    Returns:
    DataFrame or Series
    """

class HDFStore:
    def __init__(self, path, mode='a', complevel=None, complib=None, fletcher32=False, **kwargs):
        """
        Dict-like IO interface for storing pandas objects in PyTables.

        Parameters:
        - path: str, file path to HDF5 file
        - mode: str, mode to open file ('a', append, by default)
        - complevel: int, compression level (0-9)
        - complib: str, compression library
        """

    def put(self, key, value, format=None, index=True, append=False, complib=None, complevel=None, min_itemsize=None, nan_rep=None, data_columns=None, encoding=None, errors='strict', track_times=True, dropna=False):
        """Store object in HDFStore."""

    def get(self, key):
        """Retrieve pandas object stored in file."""

    def keys(self):
        """Return list of keys in the store."""

    def close(self):
        """Close the store."""

def read_parquet(path, engine='auto', columns=None, storage_options=None, use_nullable_dtypes=False, dtype_backend='numpy_nullable', filesystem=None, filters=None, **kwargs):
    """
    Load a parquet object, returning a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - engine: str, parquet library to use ('auto', 'pyarrow', 'fastparquet')
    - columns: list, columns to read
    - filters: list of tuples, row group filters

    Returns:
    DataFrame
    """

def read_feather(path, columns=None, use_threads=True, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Load a feather-format object into a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - columns: sequence, columns to read
    - use_threads: bool, whether to parallelize reading

    Returns:
    DataFrame
    """

def read_orc(path, columns=None, dtype_backend='numpy_nullable', filesystem=None, **kwargs):
    """
    Load an ORC object, returning a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - columns: list, columns to read

    Returns:
    DataFrame
    """
```
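Column pruning and row-group filters are where Parquet pays off on wide tables: both are applied before the data reaches pandas. A quick round trip, assuming pyarrow (or fastparquet) is installed; the file name is hypothetical:

```python
import pandas as pd

df = pd.DataFrame({"city": ["Oslo", "Lima"], "pop": [709_000, 10_092_000]})
df.to_parquet("cities.parquet")

# Read back a single column, keeping only rows that pass the filter.
big = pd.read_parquet(
    "cities.parquet",
    columns=["city"],
    filters=[("pop", ">", 1_000_000)],
)
```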
### Web and Markup

Read data from web sources and markup formats.

```python { .api }
def read_html(io, match='.+', header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True, extract_links=None, dtype_backend='numpy_nullable', storage_options=None):
    """
    Read HTML tables into a list of DataFrame objects.

    Parameters:
    - io: str, path object, file-like object, or raw string containing HTML
    - match: str or compiled regex, only tables containing text matching this are returned
    - header: int or list-like, row(s) to use to make column headers
    - index_col: int or list-like, column(s) to use to make row index
    - skiprows: int, list-like, or slice, rows to skip
    - attrs: dict, attributes to match in table tag

    Returns:
    list of DataFrames
    """

def read_xml(path_or_buffer, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', tree_builder=None, stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read XML document into a DataFrame object.

    Parameters:
    - path_or_buffer: str, path object, or file-like object
    - xpath: str, XPath expression to parse desired element(s)
    - namespaces: dict, namespace prefixes and URIs
    - elems_only: bool, parse child elements only
    - attrs_only: bool, parse attributes only
    - encoding: str, encoding of document

    Returns:
    DataFrame
    """

def read_clipboard(sep='\\s+', dtype_backend='numpy_nullable', **kwargs):
    """
    Read text from clipboard and pass to read_csv.

    Parameters:
    - sep: str, delimiter for splitting clipboard contents (a regex; defaults to any whitespace)

    Returns:
    DataFrame
    """
```
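A small read_xml sketch on an inline document, wrapped in StringIO since newer pandas deprecates passing literal markup strings directly; read_html works the same way but returns one DataFrame per matched table. Both rely on optional parsers (lxml, or html5lib/BeautifulSoup for read_html):

```python
from io import StringIO
import pandas as pd

xml = StringIO("""
<books>
  <book><title>Dune</title><year>1965</year></book>
  <book><title>Solaris</title><year>1961</year></book>
</books>
""")

# One row per element matched by the XPath expression;
# child elements become columns.
df = pd.read_xml(xml, xpath=".//book")
#      title  year
# 0     Dune  1965
# 1  Solaris  1961
```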
### Statistical Package Formats

Read data from statistical software packages.

```python { .api }
def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, chunksize=None, iterator=False, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read Stata file into DataFrame.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - convert_dates: bool, convert date variables to pandas datetime
    - convert_categoricals: bool, convert categorical variables to pandas Categorical
    - encoding: str, encoding used to decode text strings
    - preserve_dtypes: bool, preserve Stata data types

    Returns:
    DataFrame or StataReader
    """

def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, chunksize=None, iterator=False, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read SAS files stored in either XPORT or SAS7BDAT format.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - format: str, file format ('xport' or 'sas7bdat'); inferred from the file extension if None
    - encoding: str, encoding for text data
    - chunksize: int, rows to read at a time

    Returns:
    DataFrame or SAS reader (iterator)
    """

def read_spss(path, usecols=None, convert_categoricals=True, dtype_backend='numpy_nullable', storage_options=None):
    """
    Load an SPSS file from the file path, returning a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - usecols: list-like, return subset of columns
    - convert_categoricals: bool, convert categorical variables to pandas Categorical

    Returns:
    DataFrame
    """
```
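These readers share the chunked-iteration pattern from read_csv, and the reader object doubles as a context manager. A sketch for a large Stata file (the file name is hypothetical):

```python
import pandas as pd

# Stream a (hypothetical) large .dta file in 50,000-row chunks
# so memory use stays flat.
rows = 0
with pd.read_stata("survey.dta", chunksize=50_000) as reader:
    for chunk in reader:
        rows += len(chunk)
```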
### Google BigQuery

Read data from Google BigQuery. This requires the optional pandas-gbq package; recent pandas releases deprecate pandas.read_gbq in favor of calling pandas_gbq.read_gbq directly.

```python { .api }
def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, auth_local_webserver=True, dialect=None, location=None, configuration=None, credentials=None, use_bqstorage_api=None, max_results=None, progress_bar_type=None, dtype_backend='numpy_nullable'):
    """
    Load data from Google BigQuery.

    Parameters:
    - query: str, SQL-like query to return data values
    - project_id: str, Google BigQuery account project ID
    - index_col: str, name of result column to use for index
    - col_order: list of str, list of BigQuery column names in desired order
    - reauth: bool, force Google BigQuery to re-authenticate the user
    - dialect: str, SQL dialect for BigQuery ('legacy' or 'standard')

    Returns:
    DataFrame
    """
```
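A hedged sketch; the project ID is a placeholder, the table is one of Google's public datasets, and running it requires pandas-gbq plus Google Cloud credentials:

```python
import pandas as pd

# Credentials are requested interactively on first use.
df = pd.read_gbq(
    "SELECT name, SUM(number) AS total "
    "FROM `bigquery-public-data.usa_names.usa_1910_2013` "
    "GROUP BY name ORDER BY total DESC LIMIT 10",
    project_id="my-project-id",  # hypothetical project
    dialect="standard",
)
```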
## Types

```python { .api }
# File reader classes for chunked reading
class TextFileReader:
    """Iterator for reading CSV files in chunks."""
    def __iter__(self): ...
    def __next__(self): ...
    def get_chunk(self, size=None): ...
    def close(self): ...

class JsonReader:
    """Iterator for reading JSON files in chunks."""
    def __iter__(self): ...
    def __next__(self): ...
    def close(self): ...

# Constants for common option values
class ExcelWriterMode:
    WRITE = 'w'
    APPEND = 'a'

class JSONOrient:
    SPLIT = 'split'
    RECORDS = 'records'
    INDEX = 'index'
    COLUMNS = 'columns'
    VALUES = 'values'
    TABLE = 'table'
```
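The orient values describe the JSON layout on both the write and read sides; a round trip works when the same orient is used for both. A small illustration (to_json/read_json are standard pandas; JSONOrient above just names the accepted strings):

```python
from io import StringIO
import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])

records = df.to_json(orient="records")  # '[{"a":1},{"a":2}]' -- index dropped
split = df.to_json(orient="split")      # keeps columns, index, and data

# Round trip via the matching orient.
back = pd.read_json(StringIO(split), orient="split")
```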