"""
Powerful data structures for data analysis, time series, and statistics.

Comprehensive I/O capabilities for reading and writing data in various
formats including CSV, Excel, JSON, SQL databases, HDF5, Parquet, and many
statistical file formats.
"""
import pandas as pd
from pandas import read_csv, read_excel, read_json, read_sql
# Read and write comma-separated values and other delimited text files.
def read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, date_format=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors='strict', dialect=None, on_bad_lines='error', delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read a comma-separated values (CSV) file into DataFrame.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Source of the CSV data.
    sep : str
        Delimiter to use.
    header : int or list of int, default 'infer'
        Row(s) to use as the column names.
    names : array-like
        List of column names to use.
    index_col : int, str, sequence of int/str, or False
        Column(s) to use as the row labels.
    usecols : list-like or callable
        Return a subset of the columns.
    dtype : type name or dict of column -> type
        Data type for data or columns.
    na_values : scalar, str, list-like, or dict
        Additional strings to recognize as NA/NaN.
    parse_dates : bool, list of int or names, list of lists, or dict
        Controls date parsing.
    chunksize : int
        Return a TextFileReader object for iteration.

    Returns
    -------
    DataFrame or TextFileReader
    """
def read_table(filepath_or_buffer, sep='\t', delimiter=None, header='infer', names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, date_format=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors='strict', dialect=None, on_bad_lines='error', delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read a general delimited file into DataFrame.

    Similar to read_csv but with tab delimiter by default.
    (Fixed: the default was the two-character literal ``'\\t'``
    [backslash + 't'], not an actual tab character.)
    """
def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, infer_nrows=100, dtype_backend='numpy_nullable', iterator=False, chunksize=None, **kwargs):
    """
    Read a table of fixed-width formatted lines into DataFrame.

    Parameters:
    - filepath_or_buffer: str, path object, or file-like object
    - colspecs: list of tuple (int, int) or 'infer', column specifications
    - widths: list of int, width of each field
    - infer_nrows: int, number of rows to consider when letting the parser determine colspecs
    Returns:
    DataFrame or TextFileReader
    """

# Read and write Microsoft Excel files (.xlsx, .xls).
def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, parse_dates=False, date_parser=None, date_format=None, thousands=None, decimal='.', comment=None, skipfooter=0, storage_options=None, dtype_backend='numpy_nullable', engine_kwargs=None):
    """
    Read an Excel file into a pandas DataFrame.

    Parameters
    ----------
    io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
        Source workbook.
    sheet_name : str, int, list, or None
        Names of sheets or sheet positions to read.
    header : int or list of int, default 0
        Row(s) to use as the column names.
    names : array-like
        List of column names to use.
    index_col : int, str, or list of int, default None
        Column(s) to use as the row labels.
    usecols : str, list-like, or callable
        Return a subset of the columns.
    dtype : type name or dict of column -> type
        Data type for data or columns.
    skiprows : list-like
        Rows to skip at the beginning.
    nrows : int
        Number of rows to parse.

    Returns
    -------
    DataFrame or dict of DataFrames
    """
class ExcelFile:
    def __init__(self, path_or_buffer, engine=None, storage_options=None, engine_kwargs=None):
        """
        Class for parsing tabular Excel sheets into DataFrame objects.

        Parameters:
        - path_or_buffer: str, bytes, path object, or file-like object
        - engine: str, engine to use for reading ('openpyxl', 'xlrd', 'odf', 'pyxlsb')
        """
    def parse(self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, converters=None, dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=None, mangle_dupe_cols=True, dtype_backend='numpy_nullable', **kwds):
        """Parse specified sheet(s) into DataFrame."""
    # Fixed: the docstring described this as a property, but it was declared
    # as a plain method; decorate it so attribute access works as documented.
    @property
    def sheet_names(self):
        """Property returning list of sheet names."""
class ExcelWriter:
    def __init__(self, path, engine=None, date_format=None, datetime_format=None, mode='w', storage_options=None, if_sheet_exists=None, engine_kwargs=None):
        """
        Class for writing DataFrame objects into Excel sheets.

        Parameters:
        - path: str or file-like object, file path or existing ExcelWriter
        - engine: str, engine to use for writing ('openpyxl', 'xlsxwriter')
        - mode: str, file mode to use (write or append)
        """
    def close(self):
        """Close the contained workbook."""
    def save(self):
        """Save workbook to disk."""

# Read and write JavaScript Object Notation (JSON) format.
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, convert_axes=None, convert_dates=True, keep_default_dates=True, precise_float=False, date_unit=None, encoding=None, encoding_errors='strict', lines=False, chunksize=None, compression='infer', nrows=None, storage_options=None, dtype_backend='numpy_nullable', engine='ujson'):
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : str, path object, file-like object, or None
        Source of the JSON data.
    orient : str
        Indication of the expected JSON string format.
    typ : {'frame', 'series'}
        Type of object to recover.
    dtype : bool or dict
        Data type for data or columns.
    convert_dates : bool or list of str
        Dates to parse.
    lines : bool
        Read the file as one JSON object per line.
    chunksize : int
        Return a JsonReader object for iteration.

    Returns
    -------
    Series, DataFrame, or JsonReader
    """
def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', sep='.', max_level=None):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters:
    - data: dict or list of dicts, unserialized JSON objects
    - record_path: str or list of str, path in each object to list of records
    - meta: list of str, fields to use as metadata for each record
    - sep: str, nested record separator
    - max_level: int, max number of levels to normalize
    Returns:
    DataFrame
    """

# Read and write data from SQL databases.
def read_sql(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, columns=None, chunksize=None, dtype_backend='numpy_nullable', dtype=None):
    """
    Read a SQL query or database table into a DataFrame.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable
        SQL query or table name.
    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
        Database connection.
    index_col : str or list of str
        Column(s) to use as the row labels.
    coerce_float : bool
        Attempt to convert values of non-string, non-numeric objects to
        floating point.
    params : list, tuple, mapping, or None
        Parameters to pass to the execute method.
    parse_dates : list or dict
        Columns to parse as dates.
    chunksize : int
        Number of rows to include in each chunk.

    Returns
    -------
    DataFrame or Iterator[DataFrame]
    """
def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, chunksize=None, dtype=None, dtype_backend='numpy_nullable'):
    """
    Read a SQL query into a DataFrame.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable
        SQL query to be executed.
    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
        Database connection.

    Returns
    -------
    DataFrame or Iterator[DataFrame]
    """
def read_sql_table(table_name, con, schema=None, index_col=None, coerce_float=True, parse_dates=None, columns=None, chunksize=None, dtype_backend='numpy_nullable'):
    """
    Read SQL database table into a DataFrame.

    Parameters:
    - table_name: str, name of SQL table in database
    - con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
    - schema: str, name of SQL schema in database to query
    - columns: list, list of column names to select from SQL table
    Returns:
    DataFrame or Iterator[DataFrame]
    """

# Read and write binary file formats for efficient storage.
def read_pickle(filepath_or_buffer, compression='infer', storage_options=None):
    """
    Load a pickled pandas object (or any object) from file.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Source of the pickled data.
    compression : str or dict
        Compression type and options.

    Returns
    -------
    The unpickled object.
    """
def to_pickle(obj, filepath_or_buffer, compression='infer', protocol=5, storage_options=None):
    """
    Pickle (serialize) an object to file.

    Parameters
    ----------
    obj : object
        Object to pickle.
    filepath_or_buffer : str, path object, or file-like object
        Destination for the pickled data.
    compression : str or dict
        Compression type and options.
    protocol : int
        Pickle protocol to use.
    """
def read_hdf(path_or_buf, key=None, mode='r', errors='strict', where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, dtype_backend='numpy_nullable', **kwargs):
    """
    Read from the store, closing it if we opened it.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore, or file-like object
        HDF5 store to read from.
    key : str
        Identifier for the group in the store.
    mode : str
        Mode to open the file with.
    where : list of Term
        Criteria to select.
    start : int
        Row number to start selection.
    stop : int
        Row number to stop selection.
    columns : list
        List of columns to return.

    Returns
    -------
    DataFrame or Series
    """
class HDFStore:
    def __init__(self, path, mode='r', complevel=None, complib=None, fletcher32=False, **kwargs):
        """
        Dict-like IO interface for storing pandas objects in PyTables.

        Parameters
        ----------
        path : str
            File path to the HDF5 file.
        mode : str
            Mode to open the file with.
        complevel : int
            Compression level (0-9).
        complib : str
            Compression library.
        """
    def put(self, key, value, format=None, index=True, append=False, complib=None, complevel=None, min_itemsize=None, nan_rep=None, data_columns=None, encoding=None, errors='strict', track_times=True, dropna=False):
        """Store an object in the HDFStore."""
    def get(self, key):
        """Retrieve a pandas object stored in the file."""
    def keys(self):
        """Return the list of keys in the store."""
    def close(self):
        """Close the store."""
def read_parquet(path, engine='auto', columns=None, storage_options=None, use_nullable_dtypes=False, dtype_backend='numpy_nullable', filesystem=None, filters=None, **kwargs):
    """
    Load a parquet object, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        Source of the parquet data.
    engine : {'auto', 'pyarrow', 'fastparquet'}
        Parquet library to use.
    columns : list
        Columns to read.
    filters : list of tuples
        Row group filters.

    Returns
    -------
    DataFrame
    """
def read_feather(path, columns=None, use_threads=True, storage_options=None, dtype_backend='numpy_nullable'):
    """
    Load a feather-format object into a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        Source of the feather data.
    columns : sequence
        Columns to read.
    use_threads : bool
        Whether to parallelize reading.

    Returns
    -------
    DataFrame
    """
def read_orc(path, columns=None, dtype_backend='numpy_nullable', filesystem=None, **kwargs):
    """
    Load an ORC object, returning a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - columns: list, columns to read
    Returns:
    DataFrame
    """

# Read data from web sources and markup formats.
def read_html(io, match='.+', header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True, extract_links=None, dtype_backend='numpy_nullable', storage_options=None):
    """
    Read HTML tables into a list of DataFrame objects.

    Parameters
    ----------
    io : str, path object, file-like object, or raw string containing HTML
        Source of the HTML.
    match : str or compiled regex
        Set of table attributes to match.
    header : int or list-like
        Row(s) to use to make the column headers.
    index_col : int or list-like
        Column(s) to use to make the row index.
    skiprows : int, list-like, or slice
        Rows to skip.
    attrs : dict
        Attributes to match in the table tag.

    Returns
    -------
    list of DataFrames
    """
def read_xml(path_or_buffer, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', tree_builder=None, stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read an XML document into a DataFrame object.

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        Source of the XML document.
    xpath : str
        XPath expression to parse the desired element(s).
    namespaces : dict
        Namespace prefixes and URIs.
    elems_only : bool
        Parse child elements only.
    attrs_only : bool
        Parse attributes only.
    encoding : str
        Encoding of the document.

    Returns
    -------
    DataFrame
    """
def read_clipboard(sep='\\s+', dtype_backend='numpy_nullable', **kwargs):
    """
    Read text from the clipboard and pass it to read_csv.

    Parameters:
    - sep: str, delimiter (regex) for splitting clipboard contents
    Returns:
    DataFrame
    """

# Read data from statistical software packages.
def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, chunksize=None, iterator=False, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read a Stata file into DataFrame.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Source of the Stata data.
    convert_dates : bool
        Convert date variables to pandas datetime.
    convert_categoricals : bool
        Convert categorical variables to pandas Categorical.
    encoding : str
        Encoding used to decode text strings.
    preserve_dtypes : bool
        Preserve Stata data types.

    Returns
    -------
    DataFrame or StataReader
    """
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, chunksize=None, iterator=False, compression='infer', storage_options=None, dtype_backend='numpy_nullable'):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Source of the SAS data.
    format : {'xport', 'sas7bdat'}
        File format.
    encoding : str
        Encoding for text data.
    chunksize : int
        Rows to read at a time.

    Returns
    -------
    DataFrame or SAS Reader
    """
def read_spss(path, usecols=None, convert_categoricals=True, dtype_backend='numpy_nullable', storage_options=None):
    """
    Load an SPSS file from the file path, returning a DataFrame.

    Parameters:
    - path: str, path object, or file-like object
    - usecols: list-like, return subset of columns
    - convert_categoricals: bool, convert categorical variables to pandas Categorical
    Returns:
    DataFrame
    """

# Read data from Google BigQuery.
def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, auth_local_webserver=True, dialect=None, location=None, configuration=None, credentials=None, use_bqstorage_api=None, max_results=None, progress_bar_type=None, dtype_backend='numpy_nullable'):
    """
    Load data from Google BigQuery.

    Parameters:
    - query: str, SQL-Like Query to return data values
    - project_id: str, Google BigQuery Account project ID
    - index_col: str, name of result column to use for index
    - col_order: list(str), list of BigQuery column names in desired order
    - reauth: bool, force Google BigQuery to re-authenticate user
    - dialect: str, SQL dialect for BigQuery ('legacy' or 'standard')
    Returns:
    DataFrame
    """

# File reader classes for chunked reading
class TextFileReader:
    """Iterator for reading CSV files in chunks."""

    def __iter__(self):
        """Return self as the chunk iterator."""
        ...

    def __next__(self):
        """Return the next chunk of rows."""
        ...

    def get_chunk(self, size=None):
        """Return a chunk of up to ``size`` rows."""
        ...

    def close(self):
        """Release the underlying file handle."""
        ...
class JsonReader:
    """Iterator for reading JSON files in chunks."""

    def __iter__(self):
        """Return self as the chunk iterator."""
        ...

    def __next__(self):
        """Return the next chunk of records."""
        ...

    def close(self):
        """Release the underlying file handle."""
        ...
# Storage format constants
class ExcelWriterMode:
    """File-mode constants accepted by ``ExcelWriter``'s ``mode`` argument."""
    WRITE = 'w'  # create a new workbook, overwriting any existing file
    APPEND = 'a'  # add sheets to an existing workbook
class JSONOrient:
    """Accepted values for the ``orient`` argument of ``read_json``."""
    SPLIT = 'split'
    RECORDS = 'records'
    INDEX = 'index'
    COLUMNS = 'columns'
    VALUES = 'values'
    TABLE = 'table'

# Install with Tessl CLI
# npx tessl i tessl/pypi-pandas