Blazingly fast DataFrame library for Python with lazy and eager evaluation modes
Comprehensive I/O support for 15+ file formats, including CSV, Parquet, JSON, Excel, databases, and cloud storage, with both eager reading and lazy scanning for efficient performance and memory usage.
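For example, the same file can be read eagerly or scanned lazily and only materialized on collect (a minimal sketch; the file name "events.csv" and its "status" column are placeholders):
import polars as pl

# Eager: parse the whole file into memory immediately
df = pl.read_csv("events.csv")

# Lazy: build a query plan; nothing is read until .collect()
ok_rows = pl.scan_csv("events.csv").filter(pl.col("status") == "ok").collect()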
Read and scan CSV files with extensive customization options for delimiters, headers, data types, and parsing behavior.
def read_csv(
source,
*,
has_header=True,
columns=None,
new_columns=None,
dtypes=None,
separator=",",
comment_prefix=None,
quote_char='"',
skip_rows=0,
skip_rows_after_header=0,
row_index_name=None,
row_index_offset=0,
sample_size=1024,
eol_char="\n",
raise_if_empty=True,
truncate_ragged_lines=False,
decimal_comma=False,
glob=True,
rechunk=False,
low_memory=False,
use_pyarrow=False,
storage_options=None,
credential_provider=None,
retries=2,
file_cache_ttl=None,
include_file_paths=None
) -> DataFrame:
"""
Read CSV file(s) into DataFrame.
Parameters:
- source: File path, URL, or file-like object
- has_header: Whether CSV has header row
- columns: Column subset to read
- new_columns: New column names
- dtypes: Column data types
- separator: Field delimiter
- comment_prefix: Comment line prefix
- quote_char: Quote character
- skip_rows: Rows to skip at start
- skip_rows_after_header: Rows to skip after header
- row_index_name: Add row index column
- row_index_offset: Row index starting value
- sample_size: Rows sampled to estimate required memory allocation
- eol_char: End-of-line character
- raise_if_empty: Raise error if empty
- truncate_ragged_lines: Handle ragged lines
- decimal_comma: Use comma as decimal separator
- glob: Enable glob patterns
- rechunk: Rechunk to contiguous memory
- low_memory: Use low memory mode
- use_pyarrow: Use PyArrow parser
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- retries: Number of retries
- file_cache_ttl: File cache TTL
- include_file_paths: Include file path column
Returns:
DataFrame with CSV data
"""
def scan_csv(
source,
*,
has_header=True,
separator=",",
comment_prefix=None,
quote_char='"',
skip_rows=0,
dtypes=None,
null_values=None,
missing_utf8_is_empty_string=False,
ignore_errors=False,
cache=True,
with_columns=None,
include_file_paths=None,
n_rows=None,
encoding="utf8",
low_memory=False,
rechunk=False,
skip_rows_after_header=0,
row_index_name=None,
row_index_offset=0,
sample_size=1024,
eol_char="\n",
raise_if_empty=True,
truncate_ragged_lines=False,
decimal_comma=False,
glob=True,
storage_options=None,
credential_provider=None,
retries=2,
file_cache_ttl=None
) -> LazyFrame:
"""
Scan CSV file(s) lazily into LazyFrame.
Parameters: Similar to read_csv
Returns:
LazyFrame for deferred CSV reading
"""Read and scan Parquet files with column selection, predicate pushdown, and parallel processing.
def read_parquet(
source,
*,
columns=None,
n_rows=None,
parallel="auto",
row_index_name=None,
row_index_offset=0,
low_memory=False,
use_pyarrow=False,
storage_options=None,
credential_provider=None,
retries=2,
rechunk=False,
hive_partitioning=None,
hive_schema=None,
try_parse_hive_dates=True,
include_file_paths=None,
allow_missing_columns=False
) -> DataFrame:
"""
Read Parquet file(s) into DataFrame.
Parameters:
- source: File path, URL, or file-like object
- columns: Column subset to read
- n_rows: Number of rows to read
- parallel: Parallel reading mode
- row_index_name: Add row index column
- row_index_offset: Row index starting value
- low_memory: Use low memory mode
- use_pyarrow: Use PyArrow reader
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- retries: Number of retries
- rechunk: Rechunk to contiguous memory
- hive_partitioning: Enable Hive partitioning
- hive_schema: Hive partition schema
- try_parse_hive_dates: Parse Hive date partitions
- include_file_paths: Include file path column
- allow_missing_columns: Allow missing columns
Returns:
DataFrame with Parquet data
"""
def scan_parquet(
source,
*,
n_rows=None,
row_index_name=None,
row_index_offset=0,
parallel="auto",
glob=True,
rechunk=False,
low_memory=False,
cache=True,
storage_options=None,
credential_provider=None,
retries=2,
hive_partitioning=None,
hive_schema=None,
try_parse_hive_dates=True,
include_file_paths=None,
allow_missing_columns=False
) -> LazyFrame:
"""
Scan Parquet file(s) lazily into LazyFrame.
Parameters: Similar to read_parquet
Returns:
LazyFrame for deferred Parquet reading
"""
def read_parquet_schema(source) -> Schema:
"""
Read schema from Parquet file without loading data.
Parameters:
- source: Parquet file path or URL
Returns:
Schema of Parquet file
"""
def read_parquet_metadata(source):
"""
Read metadata from Parquet file.
Parameters:
- source: Parquet file path or URL
Returns:
Parquet file metadata
"""Read JSON and newline-delimited JSON files with flexible schema inference and nested data handling.
def read_json(
source,
*,
schema=None,
schema_overrides=None,
infer_schema_length=None,
batch_size=None,
n_rows=None,
row_index_name=None,
row_index_offset=0,
storage_options=None,
credential_provider=None,
retries=2
) -> DataFrame:
"""
Read JSON file into DataFrame.
Parameters:
- source: File path, URL, or file-like object
- schema: Column schema
- schema_overrides: Override specific column types
- infer_schema_length: Rows to scan for schema inference
- batch_size: Processing batch size
- n_rows: Number of rows to read
- row_index_name: Add row index column
- row_index_offset: Row index starting value
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- retries: Number of retries
Returns:
DataFrame with JSON data
"""
def read_ndjson(
source,
*,
schema=None,
schema_overrides=None,
infer_schema_length=None,
batch_size=None,
n_rows=None,
row_index_name=None,
row_index_offset=0,
ignore_errors=False,
storage_options=None,
credential_provider=None,
retries=2
) -> DataFrame:
"""
Read newline-delimited JSON file into DataFrame.
Parameters: Similar to read_json with additional:
- ignore_errors: Continue on parsing errors
Returns:
DataFrame with NDJSON data
"""
def scan_ndjson(
source,
*,
infer_schema_length=100,
batch_size=1024,
n_rows=None,
low_memory=False,
rechunk=False,
row_index_name=None,
row_index_offset=0,
ignore_errors=False,
schema=None,
schema_overrides=None,
include_file_paths=None,
retries=2
) -> LazyFrame:
"""
Scan NDJSON file(s) lazily into LazyFrame.
Parameters: Similar to read_ndjson
Returns:
LazyFrame for deferred NDJSON reading
"""Read Excel and OpenDocument spreadsheet files with sheet selection and range specification.
def read_excel(
source,
*,
sheet_id=None,
sheet_name=None,
engine=None,
engine_options=None,
read_options=None,
schema_overrides=None,
infer_schema_length=1000,
raise_if_empty=True
) -> DataFrame:
"""
Read Excel file into DataFrame.
Parameters:
- source: Excel file path or file-like object
- sheet_id: Sheet number to read (1-based; 0 selects all sheets)
- sheet_name: Sheet name
- engine: Excel engine ("calamine", "xlsx2csv", "openpyxl")
- engine_options: Engine-specific options
- read_options: Additional read options
- schema_overrides: Override column types
- infer_schema_length: Rows for schema inference
- raise_if_empty: Raise error if empty
Returns:
DataFrame with Excel data
"""
def read_ods(
source,
*,
sheet_id=None,
sheet_name=None,
schema_overrides=None,
infer_schema_length=1000,
raise_if_empty=True
) -> DataFrame:
"""
Read OpenDocument Spreadsheet into DataFrame.
Parameters:
- source: ODS file path or file-like object
- sheet_id: Sheet number to read (1-based; 0 selects all sheets)
- sheet_name: Sheet name
- schema_overrides: Override column types
- infer_schema_length: Rows for schema inference
- raise_if_empty: Raise error if empty
Returns:
DataFrame with ODS data
"""Read and scan Arrow IPC files and streams for efficient data exchange.
def read_ipc(
source,
*,
columns=None,
n_rows=None,
row_index_name=None,
row_index_offset=0,
rechunk=False,
memory_map=True,
storage_options=None,
credential_provider=None,
retries=2,
include_file_paths=None
) -> DataFrame:
"""
Read Arrow IPC file into DataFrame.
Parameters:
- source: IPC file path, URL, or file-like object
- columns: Column subset to read
- n_rows: Number of rows to read
- row_index_name: Add row index column
- row_index_offset: Row index starting value
- rechunk: Rechunk to contiguous memory
- memory_map: Use memory mapping
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- retries: Number of retries
- include_file_paths: Include file path column
Returns:
DataFrame with IPC data
"""
def read_ipc_stream(
source,
*,
columns=None,
n_rows=None,
row_index_name=None,
row_index_offset=0,
rechunk=False,
storage_options=None,
credential_provider=None,
retries=2,
include_file_paths=None
) -> DataFrame:
"""
Read Arrow IPC stream into DataFrame.
Parameters: Similar to read_ipc
Returns:
DataFrame with IPC stream data
"""
def scan_ipc(
source,
*,
n_rows=None,
cache=True,
rechunk=False,
row_index_name=None,
row_index_offset=0,
storage_options=None,
credential_provider=None,
retries=2,
include_file_paths=None,
memory_map=True
) -> LazyFrame:
"""
Scan Arrow IPC file(s) lazily into LazyFrame.
Parameters: Similar to read_ipc
Returns:
LazyFrame for deferred IPC reading
"""
def read_ipc_schema(source) -> Schema:
"""
Read schema from Arrow IPC file without loading data.
Parameters:
- source: IPC file path or URL
Returns:
Schema of IPC file
"""Read data from databases using SQL queries with connection management and credential handling.
def read_database(
query,
connection,
*,
partition_on=None,
partition_range=None,
partition_num=None,
protocol=None,
engine=None,
schema_overrides=None,
execute_options=None
) -> DataFrame:
"""
Read from database using SQL query.
Parameters:
- query: SQL query string
- connection: Database connection or connection string
- partition_on: Column for partitioning
- partition_range: Range for partitioning
- partition_num: Number of partitions
- protocol: Database protocol
- engine: Database engine
- schema_overrides: Override column types
- execute_options: Execution options
Returns:
DataFrame with query results
"""
def read_database_uri(
query,
uri,
*,
partition_on=None,
partition_range=None,
partition_num=None,
protocol=None,
engine=None,
schema_overrides=None,
execute_options=None
) -> DataFrame:
"""
Read from database using connection URI.
Parameters:
- query: SQL query string
- uri: Database connection URI
- partition_on: Column for partitioning
- partition_range: Range for partitioning
- partition_num: Number of partitions
- protocol: Database protocol
- engine: Database engine
- schema_overrides: Override column types
- execute_options: Execution options
Returns:
DataFrame with query results
"""Credential providers for accessing cloud storage services with authentication.
class CredentialProvider:
"""Base class for credential providers."""
class CredentialProviderAWS(CredentialProvider):
def __init__(
self,
*,
access_key_id=None,
secret_access_key=None,
session_token=None,
region_name=None,
profile_name=None,
assume_role_arn=None,
assume_role_session_name=None,
assume_role_external_id=None
):
"""
AWS credential provider.
Parameters:
- access_key_id: AWS access key ID
- secret_access_key: AWS secret access key
- session_token: AWS session token
- region_name: AWS region
- profile_name: AWS profile name
- assume_role_arn: Role ARN to assume
- assume_role_session_name: Assume role session name
- assume_role_external_id: External ID for assume role
"""
class CredentialProviderGCP(CredentialProvider):
def __init__(
self,
*,
service_account_path=None,
service_account_key=None,
project_id=None
):
"""
Google Cloud credential provider.
Parameters:
- service_account_path: Path to service account key file
- service_account_key: Service account key JSON
- project_id: GCP project ID
"""
class CredentialProviderAzure(CredentialProvider):
def __init__(
self,
*,
account_name=None,
account_key=None,
sas_token=None,
tenant_id=None,
client_id=None,
client_secret=None
):
"""
Azure credential provider.
Parameters:
- account_name: Storage account name
- account_key: Storage account key
- sas_token: SAS token
- tenant_id: Azure tenant ID
- client_id: Azure client ID
- client_secret: Azure client secret
"""
class CredentialProviderFunction(CredentialProvider):
def __init__(self, function):
"""
Function-based credential provider.
Parameters:
- function: Function returning credentials
"""
# Type alias for function return
CredentialProviderFunctionReturn = dict[str, str]
Support for additional formats including Avro, Delta Lake, and Iceberg.
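A minimal sketch of lazily scanning Delta and Iceberg tables (paths are placeholders; scan_iceberg can also take a pyiceberg table object):
import polars as pl

# Scan a Delta table at a specific version, filtering before collection
purchases = (
    pl.scan_delta("s3://bucket/events_delta", version=3)
    .filter(pl.col("event") == "purchase")
    .collect()
)

# Scan an Iceberg table via its metadata file
iceberg_lf = pl.scan_iceberg("s3://bucket/warehouse/events/metadata/v2.metadata.json")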
def read_avro(
source,
*,
columns=None,
n_rows=None,
storage_options=None,
credential_provider=None,
retries=2
) -> DataFrame:
"""
Read Avro file into DataFrame.
Parameters:
- source: Avro file path or URL
- columns: Column subset to read
- n_rows: Number of rows to read
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- retries: Number of retries
Returns:
DataFrame with Avro data
"""
def read_delta(
source,
*,
version=None,
columns=None,
storage_options=None,
credential_provider=None,
delta_table_options=None
) -> DataFrame:
"""
Read Delta Lake table into DataFrame.
Parameters:
- source: Delta table path or URL
- version: Table version to read
- columns: Column subset to read
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
- delta_table_options: Delta-specific options
Returns:
DataFrame with Delta table data
"""
def scan_delta(
source,
*,
version=None,
storage_options=None,
credential_provider=None,
delta_table_options=None
) -> LazyFrame:
"""
Scan Delta Lake table lazily into LazyFrame.
Parameters: Similar to read_delta
Returns:
LazyFrame for deferred Delta reading
"""
def scan_iceberg(
source,
*,
storage_options=None,
credential_provider=None
) -> LazyFrame:
"""
Scan Apache Iceberg table lazily into LazyFrame.
Parameters:
- source: Iceberg table path or URL
- storage_options: Cloud storage options
- credential_provider: Cloud credentials
Returns:
LazyFrame for deferred Iceberg reading
"""
def scan_pyarrow_dataset(
source,
*,
allow_pyarrow_filter=True,
pyarrow_options=None
) -> LazyFrame:
"""
Scan PyArrow dataset lazily into LazyFrame.
Parameters:
- source: PyArrow dataset
- allow_pyarrow_filter: Enable PyArrow filtering
- pyarrow_options: PyArrow-specific options
Returns:
LazyFrame for deferred dataset reading
"""Additional I/O utilities including clipboard support and deferred computation.
def read_clipboard(**kwargs) -> DataFrame:
"""
Read data from system clipboard.
Parameters:
- kwargs: Additional options forwarded to read_csv
Returns:
DataFrame with clipboard data
"""
def defer() -> Expr:
"""
Create deferred computation placeholder.
Returns:
Deferred expression
"""import polars as pl
# Read CSV with custom options
df = pl.read_csv(
"data.csv",
has_header=True,
separator=",",
dtypes={"id": pl.Int32, "name": pl.String},
null_values=["", "NULL"]
)
# Read Parquet with column selection
df = pl.read_parquet(
"data.parquet",
columns=["id", "name", "value"],
n_rows=1000
)
# Read JSON with schema override
df = pl.read_json(
"data.json",
schema_overrides={"timestamp": pl.Datetime}
)
# Lazy CSV scanning with predicate pushdown
lazy_df = (
pl.scan_csv("large_data.csv")
.filter(pl.col("amount") > 1000)
.select(["customer_id", "amount", "date"])
.group_by("customer_id")
.agg([
pl.col("amount").sum().alias("total_amount"),
pl.col("date").max().alias("last_date")
])
)
# Execute optimized query
result = lazy_df.collect()
# Lazy Parquet scanning with column selection
result = (
pl.scan_parquet("*.parquet")
.select(["id", "value"])
.filter(pl.col("value") > 0)
.collect()
)
# AWS S3 access with credentials
aws_creds = pl.CredentialProviderAWS(
access_key_id="YOUR_KEY",
secret_access_key="YOUR_SECRET",
region_name="us-east-1"
)
df = pl.read_parquet(
"s3://my-bucket/data.parquet",
credential_provider=aws_creds
)
# Google Cloud Storage
gcp_creds = pl.CredentialProviderGCP(
service_account_path="path/to/service-account.json"
)
df = pl.read_csv(
"gs://my-bucket/data.csv",
credential_provider=gcp_creds
)
# Read from database
df = pl.read_database(
"SELECT * FROM users WHERE age > 18",
connection="postgresql://user:pass@localhost/db"
)
# Partitioned database reading over a connection URI
df = pl.read_database_uri(
"SELECT * FROM large_table",
"postgresql://user:pass@localhost/db",
partition_on="id",
partition_range=(1, 1000000),
partition_num=10
)
)
# Read Excel with specific sheet
df = pl.read_excel(
"report.xlsx",
sheet_name="Summary",
schema_overrides={"date": pl.Date}
)
# Read Avro file
df = pl.read_avro("data.avro")
# Read Delta Lake table
df = pl.read_delta(
"path/to/delta/table",
version=5 # Read specific version
)
# Scan multiple Parquet files with glob
lazy_df = pl.scan_parquet("data/year=*/month=*/*.parquet")
result = lazy_df.collect()
# Streaming large files in batches
reader = pl.read_csv_batched("very_large.csv", batch_size=10000)
while (batches := reader.next_batches(5)) is not None:
    for batch in batches:
        # Process each batch
        processed = batch.with_columns([
            pl.col("amount").mul(1.1).alias("amount_with_tax")
        ])
        # Write or accumulate results
# Reading with error handling
df = pl.read_ndjson(
"messy_data.jsonl",
ignore_errors=True, # Skip malformed lines
infer_schema_length=1000
)
# Include file paths in multi-file reading
df = pl.read_csv(
"data/*.csv",
include_file_paths="source_file"
)
Install with Tessl CLI
npx tessl i tessl/pypi-polars