# DataFrame Integration

Seamless integration with pandas DataFrames and PyArrow Tables, enabling SQL queries on DataFrames, table joins, and efficient data conversion between formats. This module bridges Python data science workflows with SQL analytical capabilities.
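
A minimal end-to-end sketch of what this enables (the full API is documented below):

```python
import chdb.dataframe as cdf
import pandas as pd

df = pd.DataFrame({'city': ['NYC', 'LA', 'NYC'], 'sales': [100, 80, 120]})

# Run SQL directly against the in-memory DataFrame;
# __tbl1__ is bound to the tbl1 keyword argument
res = cdf.query(sql="SELECT city, SUM(sales) AS total FROM __tbl1__ GROUP BY city", tbl1=df)
print(res.to_pandas())
```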

## Capabilities

### Table Wrapper Class

Unified interface for working with multiple data formats including DataFrames, Arrow tables, and Parquet files.

```python { .api }
class Table:
    def __init__(
        self,
        parquet_path: str = None,
        temp_parquet_path: str = None,
        parquet_memoryview: memoryview = None,
        dataframe: pd.DataFrame = None,
        arrow_table: pa.Table = None,
        use_memfd: bool = False
    ):
        """
        Initialize Table with one of several data source formats.

        Parameters:
        - parquet_path: Path to an existing Parquet file
        - temp_parquet_path: Path to a temporary Parquet file (auto-deleted)
        - parquet_memoryview: Parquet data as a memoryview
        - dataframe: pandas DataFrame
        - arrow_table: PyArrow Table
        - use_memfd: Use a memory file descriptor on Linux (falls back to a temp file)
        """

    def to_pandas(self) -> pd.DataFrame:
        """
        Convert table data to a pandas DataFrame.

        Returns:
            pd.DataFrame: Data as a pandas DataFrame

        Raises:
            ImportError: If pandas or pyarrow is not available
            ValueError: If no data buffer is available
        """

    def flush_to_disk(self):
        """
        Flush in-memory data to disk as a temporary Parquet file.
        Frees memory by moving the DataFrame/Arrow table to disk storage.
        """

    def rows_read(self) -> int:
        """Get the number of rows processed by the last query operation."""

    def bytes_read(self) -> int:
        """Get the number of bytes processed by the last query operation."""

    def elapsed(self) -> float:
        """Get the elapsed time of the last query operation in seconds."""
```
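
For instance, the same wrapper can be built from any one of the supported sources listed above (a minimal sketch; `events.parquet` is a hypothetical path):

```python
import chdb.dataframe as cdf
import pandas as pd
import pyarrow as pa

# From an in-memory pandas DataFrame
t_df = cdf.Table(dataframe=pd.DataFrame({'x': [1, 2, 3]}))

# From a PyArrow Table
t_arrow = cdf.Table(arrow_table=pa.table({'x': [1, 2, 3]}))

# From an existing Parquet file on disk (hypothetical path)
t_parquet = cdf.Table(parquet_path="events.parquet")

print(t_df.to_pandas())
```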

### Static Query Methods

Execute SQL queries on DataFrames and tables with automatic table registration.

```python { .api }
class Table:
    @staticmethod
    def queryStatic(*args, **kwargs):
        """
        Execute a SQL query on the provided tables/DataFrames.

        Parameters:
        - sql: SQL query string with table references (__tbl1__, __tbl2__, __table__)
        - tbl1, tbl2, ...: DataFrames or Table objects referenced in the query
        - output_format: Output format ("DataFrame", "JSON", "CSV", etc.)

        Returns:
            Query result in the specified format
        """
```
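
Since the module-level query() (below) is an alias for this method, the two are interchangeable; a minimal sketch:

```python
import chdb.dataframe as cdf
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})

# Static form; equivalent to cdf.query(...)
res = cdf.Table.queryStatic(sql="SELECT SUM(x) AS total FROM __tbl1__", tbl1=df)
print(res.to_pandas())
```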

### Module-Level Functions

Convenience functions for direct DataFrame querying.

```python { .api }
def query(*args, **kwargs):
    """
    Execute SQL query on DataFrames/tables. Alias for Table.queryStatic().

    Parameters:
    - sql: SQL query string
    - **kwargs: Named table parameters (tbl1, tbl2, etc.)

    Returns:
        Query result or Table object
    """

def sql(*args, **kwargs):
    """Alias for query() function with identical functionality."""

def pandas_read_parquet(*args, **kwargs):
    """
    Enhanced pandas Parquet reader with optimizations.

    Parameters:
        Same as pandas.read_parquet()

    Returns:
        pd.DataFrame: Loaded DataFrame
    """
```
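
A quick sketch of the Parquet reader (assuming a local `data.parquet` file exists; the arguments mirror pandas.read_parquet):

```python
import chdb.dataframe as cdf

# Drop-in use in place of pandas.read_parquet (hypothetical file path)
df = cdf.pandas_read_parquet("data.parquet")
print(df.head())
```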

## Usage Examples

### Basic DataFrame Queries

```python
import chdb.dataframe as cdf
import pandas as pd

# Create sample DataFrames
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age': [25, 30, 35, 28]
})

df2 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'department': ['Engineering', 'Sales', 'Marketing', 'Engineering'],
    'salary': [75000, 65000, 70000, 80000]
})

# Query single DataFrame
result = cdf.query(
    sql="SELECT name, age FROM __tbl1__ WHERE age > 28",
    tbl1=df1
)
print(result.to_pandas())

# Join multiple DataFrames
joined = cdf.query(
    sql="""
    SELECT t1.name, t1.age, t2.department, t2.salary
    FROM __tbl1__ t1
    JOIN __tbl2__ t2 ON t1.id = t2.id
    WHERE t2.salary > 70000
    ORDER BY t2.salary DESC
    """,
    tbl1=df1,
    tbl2=df2
)
print(joined.to_pandas())
```

### Working with Table Objects

```python
import chdb.dataframe as cdf
import pandas as pd

# Create DataFrame
sales_data = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'product': ['Widget A', 'Widget B', 'Widget A'],
    'quantity': [100, 150, 75],
    'price': [9.99, 14.99, 9.99]
})

# Create Table object
sales_table = cdf.Table(dataframe=sales_data)

# Query the table
result_table = cdf.query(
    sql="""
    SELECT
        product,
        SUM(quantity) as total_quantity,
        SUM(quantity * price) as total_revenue,
        AVG(price) as avg_price
    FROM __table__
    GROUP BY product
    ORDER BY total_revenue DESC
    """,
    table=sales_table
)

# Get results as DataFrame
summary_df = result_table.to_pandas()
print("Product Summary:")
print(summary_df)

# Check query performance metrics
print(f"Rows processed: {result_table.rows_read()}")
print(f"Bytes processed: {result_table.bytes_read()}")
print(f"Query time: {result_table.elapsed():.3f} seconds")
```

### Chaining Queries on Table Results

```python
import chdb.dataframe as cdf
import pandas as pd

# Initial data
orders = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5],
    'customer_id': [101, 102, 101, 103, 102],
    'amount': [250.0, 125.5, 89.99, 450.0, 200.0],
    'order_date': ['2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19']
})

# First query: Customer totals
customer_totals = cdf.query(
    sql="""
    SELECT
        customer_id,
        COUNT(*) as order_count,
        SUM(amount) as total_spent,
        AVG(amount) as avg_order
    FROM __tbl1__
    GROUP BY customer_id
    """,
    tbl1=orders
)

# Second query: Query the results of the first query
top_customers = customer_totals.query(
    sql="""
    SELECT
        customer_id,
        total_spent,
        order_count
    FROM __table__
    WHERE total_spent > 200
    ORDER BY total_spent DESC
    """
)

print("Top customers (>$200):")
print(top_customers.to_pandas())
```

### Working with Parquet Files

```python
import chdb.dataframe as cdf

# Query Parquet file directly through Table
parquet_table = cdf.Table(parquet_path="large_dataset.parquet")

# Execute complex analytical query
analysis = cdf.query(
    sql="""
    SELECT
        DATE_TRUNC('month', date_column) as month,
        category,
        COUNT(*) as record_count,
        SUM(value_column) as total_value,
        AVG(value_column) as avg_value
    FROM __table__
    WHERE date_column >= '2024-01-01'
    GROUP BY month, category
    ORDER BY month, total_value DESC
    """,
    table=parquet_table
)

# Get results as DataFrame for further processing
monthly_analysis = analysis.to_pandas()
print("Monthly analysis:")
print(monthly_analysis)

# Performance metrics
print(f"Processed {analysis.rows_read():,} rows")
print(f"Data size: {analysis.bytes_read() / 1024 / 1024:.2f} MB")
```

### Memory Management with Large DataFrames

```python
import chdb.dataframe as cdf
import numpy as np
import pandas as pd

# Create large DataFrame
large_df = pd.DataFrame({
    'id': range(1000000),
    'value': np.random.randn(1000000),
    'category': np.random.choice(['A', 'B', 'C'], 1000000)
})

# Create Table and flush to disk to save memory
large_table = cdf.Table(dataframe=large_df, use_memfd=True)
large_table.flush_to_disk()  # DataFrame is now stored as a temp Parquet file

# Query the data (loads only the needed portions)
summary = cdf.query(
    sql="""
    SELECT
        category,
        COUNT(*) as count,
        AVG(value) as avg_value,
        STDDEV(value) as std_value
    FROM __table__
    GROUP BY category
    ORDER BY category
    """,
    table=large_table
)

print("Large dataset summary:")
print(summary.to_pandas())
```

### Mixed Data Sources

```python
import chdb.dataframe as cdf
import pandas as pd

# Combine DataFrame, Parquet file, and Arrow table
df_sales = pd.DataFrame({
    'product_id': [1, 2, 3],
    'sales_q1': [1000, 1500, 800]
})

# Assuming we have an Arrow table from another source
# arrow_inventory = pa.Table.from_pydict({
#     'product_id': [1, 2, 3, 4],
#     'inventory': [50, 75, 30, 100]
# })

parquet_products = cdf.Table(parquet_path="products.parquet")

# Join across different data sources
comprehensive_report = cdf.query(
    sql="""
    SELECT
        p.product_name,
        s.sales_q1,
        p.category,
        p.price
    FROM __products__ p
    JOIN __sales__ s ON p.product_id = s.product_id
    WHERE s.sales_q1 > 900
    ORDER BY s.sales_q1 DESC
    """,
    products=parquet_products,
    sales=df_sales
)

print("Comprehensive product report:")
print(comprehensive_report.to_pandas())
```

### Using sql() Alias

```python
import chdb.dataframe as cdf
import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [85, 92, 78]
})

# sql() works identically to query()
high_scores = cdf.sql(
    "SELECT name, score FROM __tbl1__ WHERE score > 80",
    tbl1=df
)

print(high_scores.to_pandas())
```

### Error Handling

```python
import chdb.dataframe as cdf
import pandas as pd
from chdb import ChdbError

df = pd.DataFrame({'a': [1, 2, 3]})

try:
    # This will fail due to non-existent column
    result = cdf.query(
        "SELECT nonexistent_column FROM __tbl1__",
        tbl1=df
    )
except ChdbError as e:
    print(f"Query failed: {e}")

try:
    # This will fail due to missing table reference
    result = cdf.query("SELECT * FROM missing_table")
except ChdbError as e:
    print(f"Table reference error: {e}")
```