chDB is an in-process SQL OLAP Engine powered by ClickHouse
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Seamless integration with pandas DataFrames and PyArrow Tables, enabling SQL queries on DataFrames, table joins, and efficient data conversion between formats. This module bridges Python data science workflows with SQL analytical capabilities.
Unified interface for working with multiple data formats including DataFrames, Arrow tables, and Parquet files.
class Table:
def __init__(
self,
parquet_path: str = None,
temp_parquet_path: str = None,
parquet_memoryview: memoryview = None,
dataframe: pd.DataFrame = None,
arrow_table: pa.Table = None,
use_memfd: bool = False
):
"""
Initialize Table with one of multiple data source formats.
Parameters:
- parquet_path: Path to existing Parquet file
- temp_parquet_path: Path to temporary Parquet file (auto-deleted)
- parquet_memoryview: Parquet data as memory view
- dataframe: pandas DataFrame
- arrow_table: PyArrow Table
- use_memfd: Use memory file descriptor on Linux (fallback to temp file)
"""
def to_pandas(self) -> pd.DataFrame:
"""
Convert table data to pandas DataFrame.
Returns:
pd.DataFrame: Data as pandas DataFrame
Raises:
ImportError: If pandas or pyarrow not available
ValueError: If no data buffer available
"""
def flush_to_disk(self):
"""
Flush in-memory data to disk as temporary Parquet file.
Frees memory by converting DataFrame/Arrow table to disk storage.
"""
def rows_read(self) -> int:
"""Get number of rows processed in last query operation."""
def bytes_read(self) -> int:
"""Get number of bytes processed in last query operation."""
def elapsed(self) -> float:
"""Get elapsed time for last query operation in seconds."""Execute SQL queries on DataFrames and tables with automatic table registration.
class Table:
@staticmethod
def queryStatic(*args, **kwargs):
"""
Execute SQL query on provided tables/DataFrames.
Parameters:
- sql: SQL query string with table references (__tbl1__, __tbl2__, __table__)
- tbl1, tbl2, ...: DataFrames or Table objects referenced in query
- output_format: Output format ("DataFrame", "JSON", "CSV", etc.)
Returns:
Query result in specified format
"""Convenience functions for direct DataFrame querying.
def query(*args, **kwargs):
    """
    Execute a SQL query over DataFrames/tables. Alias for Table.queryStatic().

    Parameters:
    - sql: SQL query string
    - **kwargs: Named table parameters (tbl1, tbl2, etc.)

    Returns:
        The query result or a Table object
    """
def sql(*args, **kwargs):
    """Alias of query(); accepts the same arguments and returns the same result."""
def pandas_read_parquet(*args, **kwargs):
    """
    Enhanced pandas Parquet reader with optimizations.

    Parameters:
        Same as pandas.read_parquet()

    Returns:
        pd.DataFrame: Loaded DataFrame
    """

import chdb.dataframe as cdf
import pandas as pd

# Two small DataFrames to demonstrate single-table queries and joins
people = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28]
})
departments = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'department': ['Engineering', 'Sales', 'Marketing', 'Engineering'],
    'salary': [75000, 65000, 70000, 80000]
})

# Query a single DataFrame via the __tbl1__ placeholder
result = cdf.query(
    sql="SELECT name, age FROM __tbl1__ WHERE age > 28",
    tbl1=people
)
print(result.to_pandas())

# Join the two DataFrames in one SQL statement
joined = cdf.query(
    sql="""
    SELECT t1.name, t1.age, t2.department, t2.salary
    FROM __tbl1__ t1
    JOIN __tbl2__ t2 ON t1.id = t2.id
    WHERE t2.salary > 70000
    ORDER BY t2.salary DESC
    """,
    tbl1=people,
    tbl2=departments
)
print(joined.to_pandas())

import chdb.dataframe as cdf
import pandas as pd

# Source data for the example
sales_data = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'product': ['Widget A', 'Widget B', 'Widget A'],
    'quantity': [100, 150, 75],
    'price': [9.99, 14.99, 9.99]
})

# Wrap the DataFrame in a Table object, then aggregate per product
sales_table = cdf.Table(dataframe=sales_data)
result_table = cdf.query(
    sql="""
    SELECT
        product,
        SUM(quantity) as total_quantity,
        SUM(quantity * price) as total_revenue,
        AVG(price) as avg_price
    FROM __table__
    GROUP BY product
    ORDER BY total_revenue DESC
    """,
    table=sales_table
)

# Materialize the result as a pandas DataFrame
summary_df = result_table.to_pandas()
print("Product Summary:")
print(summary_df)

# Inspect query performance metrics
print(f"Rows processed: {result_table.rows_read()}")
print(f"Bytes processed: {result_table.bytes_read()}")
print(f"Query time: {result_table.elapsed():.3f} seconds")

import chdb.dataframe as cdf
import pandas as pd

# Raw order data
orders = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5],
    'customer_id': [101, 102, 101, 103, 102],
    'amount': [250.0, 125.5, 89.99, 450.0, 200.0],
    'order_date': ['2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19']
})

# Step 1: aggregate orders per customer
customer_totals = cdf.query(
    sql="""
    SELECT
        customer_id,
        COUNT(*) as order_count,
        SUM(amount) as total_spent,
        AVG(amount) as avg_order
    FROM __tbl1__
    GROUP BY customer_id
    """,
    tbl1=orders
)

# Step 2: run a second SQL query directly on the first query's result
top_customers = customer_totals.query(
    sql="""
    SELECT
        customer_id,
        total_spent,
        order_count
    FROM __table__
    WHERE total_spent > 200
    ORDER BY total_spent DESC
    """
)
print("Top customers (>$200):")
print(top_customers.to_pandas())

import chdb.dataframe as cdf
# Point a Table at an existing Parquet file on disk
parquet_table = cdf.Table(parquet_path="large_dataset.parquet")

# Run a month-by-month analytical query against the file
analysis = cdf.query(
    sql="""
    SELECT
        DATE_TRUNC('month', date_column) as month,
        category,
        COUNT(*) as record_count,
        SUM(value_column) as total_value,
        AVG(value_column) as avg_value
    FROM __table__
    WHERE date_column >= '2024-01-01'
    GROUP BY month, category
    ORDER BY month, total_value DESC
    """,
    table=parquet_table
)

# Bring the result into pandas for further processing
monthly_analysis = analysis.to_pandas()
print("Monthly analysis:")
print(monthly_analysis)

# Performance metrics for the query
print(f"Processed {analysis.rows_read():,} rows")
print(f"Data size: {analysis.bytes_read() / 1024 / 1024:.2f} MB")

import chdb.dataframe as cdf
import pandas as pd
import numpy as np  # required for np.random below; missing from the original example

# Create a large DataFrame (1M rows)
large_df = pd.DataFrame({
    'id': range(1000000),
    'value': np.random.randn(1000000),
    'category': np.random.choice(['A', 'B', 'C'], 1000000)
})

# Create a Table and flush it to disk to save memory
large_table = cdf.Table(dataframe=large_df, use_memfd=True)
large_table.flush_to_disk()  # DataFrame is now stored as temp Parquet file

# Query the data (loads only needed portions)
summary = cdf.query(
    sql="""
    SELECT
        category,
        COUNT(*) as count,
        AVG(value) as avg_value,
        STDDEV(value) as std_value
    FROM __table__
    GROUP BY category
    ORDER BY category
    """,
    table=large_table
)
print("Large dataset summary:")
print(summary.to_pandas())

import chdb.dataframe as cdf
import pandas as pd

# Quarterly sales held as an in-memory DataFrame
df_sales = pd.DataFrame({
    'product_id': [1, 2, 3],
    'sales_q1': [1000, 1500, 800]
})

# An Arrow table from another source could participate as well, e.g.:
# arrow_inventory = pa.Table.from_pydict({
#     'product_id': [1, 2, 3, 4],
#     'inventory': [50, 75, 30, 100]
# })

# Product catalog loaded straight from a Parquet file
parquet_products = cdf.Table(parquet_path="products.parquet")

# Join across the two different data sources in one query
comprehensive_report = cdf.query(
    sql="""
    SELECT
        p.product_name,
        s.sales_q1,
        p.category,
        p.price
    FROM __products__ p
    JOIN __sales__ s ON p.product_id = s.product_id
    WHERE s.sales_q1 > 900
    ORDER BY s.sales_q1 DESC
    """,
    products=parquet_products,
    sales=df_sales
)
print("Comprehensive product report:")
print(comprehensive_report.to_pandas())

import chdb.dataframe as cdf
import pandas as pd

scores = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [85, 92, 78]
})

# sql() behaves exactly like query()
high_scores = cdf.sql(
    "SELECT name, score FROM __tbl1__ WHERE score > 80",
    tbl1=scores
)
print(high_scores.to_pandas())

import chdb.dataframe as cdf
import pandas as pd
from chdb import ChdbError
df = pd.DataFrame({'a': [1, 2, 3]})
try:
# This will fail due to non-existent column
result = cdf.query(
"SELECT nonexistent_column FROM __tbl1__",
tbl1=df
)
except ChdbError as e:
print(f"Query failed: {e}")
try:
# This will fail due to missing table reference
result = cdf.query("SELECT * FROM missing_table")
except ChdbError as e:
print(f"Table reference error: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-chdb