CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-chdb

chDB is an in-process SQL OLAP Engine powered by ClickHouse

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/dataframe.md

DataFrame Integration

Seamless integration with pandas DataFrames and PyArrow Tables, enabling SQL queries on DataFrames, table joins, and efficient data conversion between formats. This module bridges Python data science workflows with SQL analytical capabilities.

Capabilities

Table Wrapper Class

Unified interface for working with multiple data formats including DataFrames, Arrow tables, and Parquet files.

class Table:
    """Wrapper presenting several tabular data sources behind one interface.

    A Table is constructed from one of several sources (a Parquet file
    path, an in-memory Parquet buffer, a pandas DataFrame, or a PyArrow
    Table) and exposes conversion to pandas, spilling to disk, and
    per-query statistics accessors.
    """

    def __init__(
        self,
        parquet_path: str = None,
        temp_parquet_path: str = None,
        parquet_memoryview: memoryview = None,
        dataframe: pd.DataFrame = None,
        arrow_table: pa.Table = None,
        use_memfd: bool = False
    ):
        """
        Initialize Table with one of multiple data source formats.

        Parameters:
        - parquet_path: Path to existing Parquet file
        - temp_parquet_path: Path to temporary Parquet file (auto-deleted)
        - parquet_memoryview: Parquet data as memory view
        - dataframe: pandas DataFrame
        - arrow_table: PyArrow Table
        - use_memfd: Use memory file descriptor on Linux (fallback to temp file)
        """

    def to_pandas(self) -> pd.DataFrame:
        """
        Convert table data to pandas DataFrame.

        Returns:
        pd.DataFrame: Data as pandas DataFrame

        Raises:
        ImportError: If pandas or pyarrow not available
        ValueError: If no data buffer available
        """

    def flush_to_disk(self):
        """
        Flush in-memory data to disk as temporary Parquet file.
        Frees memory by converting DataFrame/Arrow table to disk storage.
        """

    def rows_read(self) -> int:
        """Get number of rows processed in last query operation."""

    def bytes_read(self) -> int:
        """Get number of bytes processed in last query operation."""

    def elapsed(self) -> float:
        """Get elapsed time for last query operation in seconds."""
Static Query Methods

Execute SQL queries on DataFrames and tables with automatic table registration.

class Table:
    """Static query interface of Table (continuation of the class reference)."""

    @staticmethod
    def queryStatic(*args, **kwargs):
        """
        Execute SQL query on provided tables/DataFrames.

        Parameters:
        - sql: SQL query string with table references (__tbl1__, __tbl2__, __table__)
        - tbl1, tbl2, ...: DataFrames or Table objects referenced in query;
          each keyword name binds to the matching __name__ placeholder in the SQL
        - output_format: Output format ("DataFrame", "JSON", "CSV", etc.)

        Returns:
        Query result in specified format
        """

Module-Level Functions

Convenience functions for direct DataFrame querying.

def query(*args, **kwargs):
    """
    Execute SQL query on DataFrames/tables. Alias for Table.queryStatic().

    Parameters:
    - sql: SQL query string containing placeholders such as __tbl1__,
      __tbl2__, or __table__
    - **kwargs: Named table parameters (tbl1, tbl2, table, etc.); each
      keyword name binds to the matching __name__ placeholder in the SQL

    Returns:
    Query result or Table object (Table results can be queried again)
    """

def sql(*args, **kwargs):
    """Alias for query() function with identical functionality.

    Accepts the same arguments: the SQL string (positionally or via ``sql=``)
    plus named table keyword arguments bound to __name__ placeholders.
    """

def pandas_read_parquet(*args, **kwargs):
    """
    Enhanced pandas Parquet reader with optimizations.

    Parameters:
    Same as pandas.read_parquet()

    Returns:
    pd.DataFrame: Loaded DataFrame

    NOTE(review): the specific optimizations are not described here —
    presumably a drop-in replacement for pandas.read_parquet(); confirm
    against the chdb implementation.
    """

Usage Examples

Basic DataFrame Queries

import chdb.dataframe as cdf
import pandas as pd

# Two small example frames sharing an 'id' key.
people = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28],
})

jobs = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'department': ['Engineering', 'Sales', 'Marketing', 'Engineering'],
    'salary': [75000, 65000, 70000, 80000],
})

# Filter a single DataFrame with SQL; tbl1= binds to __tbl1__.
result = cdf.query(
    sql="SELECT name, age FROM __tbl1__ WHERE age > 28",
    tbl1=people,
)
print(result.to_pandas())

# SQL join across two DataFrames supplied as separate keyword tables.
joined = cdf.query(
    sql="""
    SELECT t1.name, t1.age, t2.department, t2.salary
    FROM __tbl1__ t1
    JOIN __tbl2__ t2 ON t1.id = t2.id
    WHERE t2.salary > 70000
    ORDER BY t2.salary DESC
    """,
    tbl1=people,
    tbl2=jobs,
)
print(joined.to_pandas())

Working with Table Objects

import chdb.dataframe as cdf
import pandas as pd

# Create DataFrame
sales_data = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'product': ['Widget A', 'Widget B', 'Widget A'],
    'quantity': [100, 150, 75],
    'price': [9.99, 14.99, 9.99]
})

# Create Table object
sales_table = cdf.Table(dataframe=sales_data)

# Query the table; the 'table=' keyword binds to the __table__ placeholder
result_table = cdf.query(
    sql="""
    SELECT 
        product,
        SUM(quantity) as total_quantity,
        SUM(quantity * price) as total_revenue,
        AVG(price) as avg_price
    FROM __table__
    GROUP BY product
    ORDER BY total_revenue DESC
    """,
    table=sales_table
)

# Get results as DataFrame
summary_df = result_table.to_pandas()
print("Product Summary:")
print(summary_df)

# Check query performance metrics exposed on the result Table
print(f"Rows processed: {result_table.rows_read()}")
print(f"Bytes processed: {result_table.bytes_read()}")
print(f"Query time: {result_table.elapsed():.3f} seconds")

Chaining Queries on Table Results

import chdb.dataframe as cdf
import pandas as pd

# Initial data
orders = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5],
    'customer_id': [101, 102, 101, 103, 102],
    'amount': [250.0, 125.5, 89.99, 450.0, 200.0],
    'order_date': ['2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19']
})

# First query: Customer totals
customer_totals = cdf.query(
    sql="""
    SELECT 
        customer_id,
        COUNT(*) as order_count,
        SUM(amount) as total_spent,
        AVG(amount) as avg_order
    FROM __tbl1__
    GROUP BY customer_id
    """,
    tbl1=orders
)

# Second query: Query the results of the first query.
# Table results expose .query() directly; inside it, __table__ refers
# to the result set itself, so no table keyword is needed.
top_customers = customer_totals.query(
    sql="""
    SELECT 
        customer_id,
        total_spent,
        order_count
    FROM __table__
    WHERE total_spent > 200
    ORDER BY total_spent DESC
    """
)

print("Top customers (>$200):")
print(top_customers.to_pandas())

Working with Parquet Files

import chdb.dataframe as cdf

# Query Parquet file directly through Table
# (NOTE(review): presumably the file is read lazily by the engine rather
# than loaded here — confirm against the Table implementation)
parquet_table = cdf.Table(parquet_path="large_dataset.parquet")

# Execute complex analytical query
analysis = cdf.query(
    sql="""
    SELECT 
        DATE_TRUNC('month', date_column) as month,
        category,
        COUNT(*) as record_count,
        SUM(value_column) as total_value,
        AVG(value_column) as avg_value
    FROM __table__
    WHERE date_column >= '2024-01-01'
    GROUP BY month, category
    ORDER BY month, total_value DESC
    """,
    table=parquet_table
)

# Get results as DataFrame for further processing
monthly_analysis = analysis.to_pandas()
print("Monthly analysis:")
print(monthly_analysis)

# Performance metrics from the last query operation
print(f"Processed {analysis.rows_read():,} rows")
print(f"Data size: {analysis.bytes_read() / 1024 / 1024:.2f} MB")

Memory Management with Large DataFrames

import chdb.dataframe as cdf
import numpy as np  # fix: np.random is used below but numpy was never imported
import pandas as pd

# Create a large (1M row) DataFrame entirely in memory.
large_df = pd.DataFrame({
    'id': range(1000000),
    'value': np.random.randn(1000000),
    'category': np.random.choice(['A', 'B', 'C'], 1000000)
})

# Create Table and flush to disk to save memory
large_table = cdf.Table(dataframe=large_df, use_memfd=True)
large_table.flush_to_disk()  # DataFrame is now stored as temp Parquet file

# Query the data (loads only needed portions)
summary = cdf.query(
    sql="""
    SELECT 
        category,
        COUNT(*) as count,
        AVG(value) as avg_value,
        STDDEV(value) as std_value
    FROM __table__
    GROUP BY category
    ORDER BY category
    """,
    table=large_table
)

print("Large dataset summary:")
print(summary.to_pandas())

Mixed Data Sources

import chdb.dataframe as cdf
import pandas as pd

# Combine DataFrame, Parquet file, and Arrow table
df_sales = pd.DataFrame({
    'product_id': [1, 2, 3],
    'sales_q1': [1000, 1500, 800]
})

# Assuming we have Arrow table from another source
# arrow_inventory = pa.Table.from_pydict({
#     'product_id': [1, 2, 3, 4],
#     'inventory': [50, 75, 30, 100]
# })

parquet_products = cdf.Table(parquet_path="products.parquet")

# Join across different data sources.
# Arbitrary keyword names are allowed: products= binds to __products__
# and sales= binds to __sales__ in the SQL below.
comprehensive_report = cdf.query(
    sql="""
    SELECT 
        p.product_name,
        s.sales_q1,
        p.category,
        p.price
    FROM __products__ p
    JOIN __sales__ s ON p.product_id = s.product_id
    WHERE s.sales_q1 > 900
    ORDER BY s.sales_q1 DESC
    """,
    products=parquet_products,
    sales=df_sales
)

print("Comprehensive product report:")
print(comprehensive_report.to_pandas())

Using sql() Alias

import chdb.dataframe as cdf
import pandas as pd

scores = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [85, 92, 78],
})

# sql() is a drop-in alias for query(); the SQL string may be passed
# positionally instead of via the sql= keyword.
high_scores = cdf.sql(
    "SELECT name, score FROM __tbl1__ WHERE score > 80",
    tbl1=scores,
)

print(high_scores.to_pandas())

Error Handling

import chdb.dataframe as cdf
import pandas as pd
# ChdbError: exception raised on failed queries
# (NOTE(review): presumably the package's base error type — confirm)
from chdb import ChdbError

df = pd.DataFrame({'a': [1, 2, 3]})

try:
    # This will fail due to non-existent column
    result = cdf.query(
        "SELECT nonexistent_column FROM __tbl1__",
        tbl1=df
    )
except ChdbError as e:
    print(f"Query failed: {e}")

try:
    # This will fail due to missing table reference
    result = cdf.query("SELECT * FROM missing_table")
except ChdbError as e:
    print(f"Table reference error: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-chdb

docs

dataframe.md

dbapi.md

index.md

query-functions.md

sessions.md

udf.md

utils.md

tile.json