Blazingly fast DataFrame library for legacy CPUs without AVX2 support
—
Native SQL support allowing you to query DataFrames and LazyFrames using familiar SQL syntax with full integration into the polars ecosystem. Execute SQL queries directly on your data structures or within the context of broader data pipelines.
Manage and execute SQL queries across multiple named DataFrames and LazyFrames.
class SQLContext:
def __init__(self, frames: dict[str, DataFrame | LazyFrame] | None = None):
"""
Create SQL context for managing multiple data sources.
Parameters:
- frames: Dictionary of {name: DataFrame/LazyFrame} to register
"""
def register(self, name: str, frame: DataFrame | LazyFrame) -> SQLContext:
"""
Register DataFrame or LazyFrame with a name for SQL queries.
Parameters:
- name: Table name to use in SQL queries
- frame: DataFrame or LazyFrame to register
Returns:
- SQLContext: Self for method chaining
"""
def register_globals(self, n: int = 1) -> SQLContext:
"""
Register DataFrames/LazyFrames from global namespace.
Parameters:
- n: Number of stack frames to look back
Returns:
- SQLContext: Self for method chaining
"""
def register_many(self, frames: dict[str, DataFrame | LazyFrame]) -> SQLContext:
"""
Register multiple frames at once.
Parameters:
- frames: Dictionary of {name: frame} pairs
Returns:
- SQLContext: Self for method chaining
"""
def unregister(self, name: str) -> SQLContext:
"""
Unregister a table from the context.
Parameters:
- name: Table name to unregister
Returns:
- SQLContext: Self for method chaining
"""
def execute(
self,
query: str,
*,
eager: bool = False
) -> DataFrame | LazyFrame:
"""
Execute SQL query and return result.
Parameters:
- query: SQL query string
- eager: Return DataFrame (True) or LazyFrame (False)
Returns:
- DataFrame | LazyFrame: Query result
"""
def tables(self) -> list[str]:
"""
Get list of registered table names.
Returns:
- list[str]: Registered table names
"""Execute SQL queries directly on DataFrames and LazyFrames.
def sql(
query: str,
*,
eager: bool = True,
**frames: DataFrame | LazyFrame
) -> DataFrame | LazyFrame:
"""
Execute SQL query on provided frames.
Parameters:
- query: SQL query string
- eager: Return DataFrame (True) or LazyFrame (False)
- frames: Keyword arguments mapping table names to DataFrames/LazyFrames
Returns:
- DataFrame | LazyFrame: Query result
"""Execute SQL queries directly on individual frames.
# Available on DataFrame and LazyFrame classes
def sql(self, query: str, *, table_name: str = "self") -> DataFrame | LazyFrame:
"""
Execute SQL query with this frame as the primary table.
Parameters:
- query: SQL query string (can reference table_name)
- table_name: Name to use for this frame in the query
Returns:
- DataFrame | LazyFrame: Query result (same type as caller)
"""Create expressions from SQL fragments for use in select, filter, etc.
def sql_expr(sql: str) -> Expr:
"""
Create expression from SQL fragment.
Parameters:
- sql: SQL expression string
Returns:
- Expr: Expression from SQL fragment
"""import polars as pl
# Create sample data
df = pl.DataFrame({
"id": [1, 2, 3, 4, 5],
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
"age": [25, 30, 35, 28, 32],
"salary": [50000, 60000, 70000, 55000, 65000],
"department": ["IT", "HR", "IT", "Finance", "IT"]
})
# Simple SELECT query
result = df.sql("SELECT name, age, salary FROM self WHERE age > 30")
print(result)
# Aggregation query
agg_result = df.sql("""
SELECT
department,
COUNT(*) as employee_count,
AVG(salary) as avg_salary,
MAX(age) as max_age
FROM self
GROUP BY department
ORDER BY avg_salary DESC
""")# Create multiple DataFrames
employees = pl.DataFrame({
"emp_id": [1, 2, 3, 4],
"name": ["Alice", "Bob", "Charlie", "Diana"],
"dept_id": [1, 2, 1, 3]
})
departments = pl.DataFrame({
"dept_id": [1, 2, 3],
"dept_name": ["IT", "HR", "Finance"],
"budget": [100000, 50000, 75000]
})
# Create SQL context and register tables
ctx = pl.SQLContext()
ctx.register("employees", employees)
ctx.register("departments", departments)
# Execute join query
result = ctx.execute("""
SELECT
e.name,
d.dept_name,
d.budget
FROM employees e
JOIN departments d ON e.dept_id = d.dept_id
ORDER BY d.budget DESC
""")# Execute SQL with keyword arguments
result = pl.sql(
"""
SELECT
e.name,
d.dept_name,
e.salary / d.budget * 100 as salary_percentage
FROM emp e
JOIN dept d ON e.dept_id = d.dept_id
""",
emp=employees,
dept=departments
)sales_data = pl.DataFrame({
"product": ["A", "B", "A", "C", "B", "A"],
"quarter": ["Q1", "Q1", "Q2", "Q2", "Q3", "Q3"],
"revenue": [1000, 1500, 1200, 800, 1800, 1100]
})
result = sales_data.sql("""
WITH quarterly_totals AS (
SELECT
quarter,
SUM(revenue) as total_revenue,
COUNT(*) as product_count
FROM self
GROUP BY quarter
),
product_performance AS (
SELECT
product,
SUM(revenue) as total_product_revenue,
AVG(revenue) as avg_product_revenue
FROM self
GROUP BY product
)
SELECT
qt.quarter,
qt.total_revenue,
pp.product,
pp.avg_product_revenue
FROM quarterly_totals qt
CROSS JOIN product_performance pp
WHERE pp.avg_product_revenue > 1000
ORDER BY qt.quarter, pp.avg_product_revenue DESC
""")time_series = pl.DataFrame({
"date": pl.date_range(pl.date(2023, 1, 1), pl.date(2023, 1, 10), "1d", eager=True),
"value": [100, 105, 102, 108, 110, 107, 112, 115, 109, 118]
})
result = time_series.sql("""
SELECT
date,
value,
LAG(value, 1) OVER (ORDER BY date) as prev_value,
value - LAG(value, 1) OVER (ORDER BY date) as change,
AVG(value) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_avg_3,
ROW_NUMBER() OVER (ORDER BY value DESC) as value_rank
FROM self
ORDER BY date
""")large_dataset = pl.DataFrame({
"category": ["A", "B", "A", "C", "B", "A", "C", "B"],
"value": [10, 20, 15, 25, 30, 12, 28, 22],
"region": ["North", "South", "North", "West", "South", "West", "North", "West"]
})
result = large_dataset.sql("""
SELECT category, region, value
FROM self
WHERE value > (
SELECT AVG(value)
FROM self s2
WHERE s2.category = self.category
)
UNION ALL
SELECT category, region, value
FROM self
WHERE region = 'North' AND value > 20
ORDER BY category, value DESC
""")# Use SQL expressions within regular Polars operations
df_with_sql_expr = df.select([
pl.col("name"),
pl.sql_expr("age * 12").alias("age_in_months"),
pl.sql_expr("CASE WHEN salary > 60000 THEN 'High' ELSE 'Low' END").alias("salary_category"),
pl.sql_expr("UPPER(department)").alias("dept_upper")
])
# Filter using SQL expressions
filtered_df = df.filter(
pl.sql_expr("age BETWEEN 25 AND 35 AND department IN ('IT', 'Finance')")
)# Create LazyFrame with SQL
lazy_df = pl.scan_csv("large_dataset.csv")
# Execute SQL on LazyFrame (returns LazyFrame)
lazy_result = lazy_df.sql("""
SELECT
category,
COUNT(*) as count,
AVG(value) as avg_value,
SUM(CASE WHEN value > 100 THEN 1 ELSE 0 END) as high_value_count
FROM self
WHERE date >= '2023-01-01'
GROUP BY category
HAVING avg_value > 50
ORDER BY avg_value DESC
""")
# Collect the lazy result
final_result = lazy_result.collect()# Register DataFrames from global namespace
df1 = pl.DataFrame({"a": [1, 2, 3]})
df2 = pl.DataFrame({"b": [4, 5, 6]})
ctx = pl.SQLContext()
ctx.register_globals() # Automatically registers df1 and df2
# Register many frames at once
additional_frames = {
"customers": customer_df,
"orders": order_df,
"products": product_df
}
ctx.register_many(additional_frames)
# Complex multi-table query
result = ctx.execute("""
SELECT
c.customer_name,
p.product_name,
o.quantity,
o.price * o.quantity as total_amount,
RANK() OVER (PARTITION BY c.customer_id ORDER BY o.price * o.quantity DESC) as purchase_rank
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
JOIN products p ON o.product_id = p.product_id
WHERE o.order_date >= '2023-01-01'
AND p.category = 'Electronics'
ORDER BY c.customer_name, purchase_rank
""")
# Get list of registered tables
tables_list = ctx.tables()
print(f"Registered tables: {tables_list}")Install with Tessl CLI
npx tessl i tessl/pypi-polars-lts-cpu