Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows
—
SQL query interface allowing standard SQL operations on DataFrames and integration with existing SQL workflows. Polars provides a comprehensive SQL engine that supports most standard SQL features while maintaining the performance characteristics of the native API.
Context manager for executing SQL queries against registered DataFrames and LazyFrames.
class SQLContext:
def __init__(self, frames: dict[str, DataFrame | LazyFrame] | None = None, **named_frames: DataFrame | LazyFrame):
"""
Create SQL execution context.
Parameters:
- frames: Dictionary mapping table names to DataFrames/LazyFrames
- named_frames: Named DataFrames/LazyFrames as keyword arguments
Examples:
ctx = SQLContext({"users": df1, "orders": df2})
ctx = SQLContext(users=df1, orders=df2)
"""
def execute(self, query: str, *, eager: bool = True) -> DataFrame | LazyFrame:
"""
Execute SQL query.
Parameters:
- query: SQL query string
- eager: Return DataFrame (True) or LazyFrame (False)
Returns:
Query result as DataFrame or LazyFrame
Examples:
result = ctx.execute("SELECT * FROM users WHERE age > 25")
lazy_result = ctx.execute("SELECT * FROM users", eager=False)
"""
def register(self, name: str, frame: DataFrame | LazyFrame) -> SQLContext:
"""
Register DataFrame/LazyFrame as table.
Parameters:
- name: Table name for SQL queries
- frame: DataFrame or LazyFrame to register
Returns:
Self for method chaining
Examples:
ctx.register("customers", customer_df)
"""
def unregister(self, name: str) -> SQLContext:
"""
Remove registered table.
Parameters:
- name: Table name to remove
Returns:
Self for method chaining
Examples:
ctx.unregister("temp_table")
"""
def tables(self) -> list[str]:
"""
Get list of registered table names.
Returns:
List of table names
"""Standalone function for executing SQL queries with inline table registration.
def sql(query: str, *, eager: bool = True, **named_frames: DataFrame | LazyFrame) -> DataFrame | LazyFrame:
"""
Execute SQL query with inline table registration.
Parameters:
- query: SQL query string
- eager: Return DataFrame (True) or LazyFrame (False)
- named_frames: DataFrames/LazyFrames to use as tables
Returns:
Query result as DataFrame or LazyFrame
Examples:
result = pl.sql("SELECT * FROM users WHERE age > 25", users=df)
result = pl.sql("SELECT u.name, o.amount FROM users u JOIN orders o ON u.id = o.user_id", users=users_df, orders=orders_df)
"""
def sql_expr(sql: str) -> Expr:
"""
Create expression from SQL fragment.
Parameters:
- sql: SQL expression string
Returns:
Expression object
Examples:
expr = pl.sql_expr("CASE WHEN age > 18 THEN 'Adult' ELSE 'Minor' END")
df.with_columns(expr.alias("category"))
"""Standard SELECT syntax with column selection, aliases, and expressions.
-- Basic selection
SELECT name, age FROM users;
-- Column aliases
SELECT name AS full_name, age * 12 AS age_months FROM users;
-- Expressions and functions
SELECT
name,
UPPER(name) AS name_upper,
age + 1 AS next_year_age,
CASE WHEN age >= 18 THEN 'Adult' ELSE 'Minor' END AS category
FROM users;
-- All columns
SELECT * FROM users;
-- Distinct values
SELECT DISTINCT department FROM employees;Filtering with various conditions and operators.
-- Simple conditions
SELECT * FROM users WHERE age > 25;
SELECT * FROM users WHERE name = 'John';
SELECT * FROM users WHERE active = true;
-- Multiple conditions
SELECT * FROM users WHERE age > 18 AND age < 65;
SELECT * FROM users WHERE department = 'Sales' OR department = 'Marketing';
-- IN and NOT IN
SELECT * FROM users WHERE department IN ('Sales', 'Marketing', 'Support');
SELECT * FROM users WHERE age NOT IN (25, 30, 35);
-- BETWEEN
SELECT * FROM sales WHERE amount BETWEEN 1000 AND 5000;
-- LIKE patterns
SELECT * FROM users WHERE name LIKE 'John%';
SELECT * FROM users WHERE email LIKE '%@company.com';
-- NULL handling
SELECT * FROM users WHERE phone IS NOT NULL;
SELECT * FROM users WHERE notes IS NULL;Various types of joins between tables.
-- INNER JOIN
SELECT u.name, o.amount
FROM users u
INNER JOIN orders o ON u.id = o.user_id;
-- LEFT JOIN
SELECT u.name, o.amount
FROM users u
LEFT JOIN orders o ON u.id = o.user_id;
-- RIGHT JOIN
SELECT u.name, o.amount
FROM users u
RIGHT JOIN orders o ON u.id = o.user_id;
-- FULL OUTER JOIN
SELECT u.name, o.amount
FROM users u
FULL OUTER JOIN orders o ON u.id = o.user_id;
-- Multiple joins
SELECT u.name, o.amount, p.name AS product_name
FROM users u
JOIN orders o ON u.id = o.user_id
JOIN products p ON o.product_id = p.id;
-- Self join
SELECT u1.name AS user1, u2.name AS user2
FROM users u1
JOIN users u2 ON u1.manager_id = u2.id;Grouping data with aggregate functions.
-- Basic grouping
SELECT department, COUNT(*) AS employee_count
FROM employees
GROUP BY department;
-- Multiple aggregations
SELECT
department,
COUNT(*) AS count,
AVG(salary) AS avg_salary,
MAX(salary) AS max_salary,
MIN(salary) AS min_salary,
SUM(salary) AS total_salary
FROM employees
GROUP BY department;
-- Multiple grouping columns
SELECT department, location, COUNT(*) AS count
FROM employees
GROUP BY department, location;
-- HAVING clause
SELECT department, AVG(salary) AS avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 50000;Sorting query results.
-- Single column ascending
SELECT * FROM users ORDER BY age;
-- Single column descending
SELECT * FROM users ORDER BY age DESC;
-- Multiple columns
SELECT * FROM users ORDER BY department, age DESC;
-- Order by expression
SELECT name, age, salary FROM employees ORDER BY salary / age DESC;
-- Order by column position
SELECT name, age FROM users ORDER BY 2 DESC;Limiting and paginating results.
-- Limit results
SELECT * FROM users LIMIT 10;
-- Offset and limit (pagination)
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 20;
-- Top N results
SELECT * FROM sales ORDER BY amount DESC LIMIT 5;Nested queries for complex operations.
-- Subquery in WHERE
SELECT * FROM users
WHERE department IN (
SELECT department FROM departments WHERE budget > 100000
);
-- Correlated subquery
SELECT u.name, u.salary
FROM users u
WHERE u.salary > (
SELECT AVG(salary) FROM users u2 WHERE u2.department = u.department
);
-- Subquery in SELECT
SELECT
name,
salary,
(SELECT AVG(salary) FROM users) AS company_avg
FROM users;
-- EXISTS
SELECT * FROM users u
WHERE EXISTS (
SELECT 1 FROM orders o WHERE o.user_id = u.id
);Advanced analytical functions with window specifications.
-- Row number
SELECT
name,
salary,
ROW_NUMBER() OVER (ORDER BY salary DESC) AS salary_rank
FROM employees;
-- Partition by
SELECT
name,
department,
salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank
FROM employees;
-- Aggregate window functions
SELECT
name,
salary,
SUM(salary) OVER (ORDER BY salary) AS running_total,
AVG(salary) OVER (PARTITION BY department) AS dept_avg
FROM employees;
-- LAG and LEAD
SELECT
date,
sales,
LAG(sales, 1) OVER (ORDER BY date) AS prev_sales,
LEAD(sales, 1) OVER (ORDER BY date) AS next_sales
FROM daily_sales;WITH clauses for complex queries.
-- Simple CTE
WITH high_earners AS (
SELECT * FROM employees WHERE salary > 80000
)
SELECT department, COUNT(*)
FROM high_earners
GROUP BY department;
-- Multiple CTEs
WITH
sales_summary AS (
SELECT user_id, SUM(amount) AS total_sales
FROM orders
GROUP BY user_id
),
user_categories AS (
SELECT
user_id,
CASE
WHEN total_sales > 10000 THEN 'VIP'
WHEN total_sales > 5000 THEN 'Premium'
ELSE 'Standard'
END AS category
FROM sales_summary
)
SELECT category, COUNT(*) AS user_count
FROM user_categories
GROUP BY category;
-- Recursive CTE (if supported)
WITH RECURSIVE subordinates AS (
SELECT id, name, manager_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
SELECT e.id, e.name, e.manager_id, s.level + 1
FROM employees e
JOIN subordinates s ON e.manager_id = s.id
)
SELECT * FROM subordinates;SQL functions and type operations.
-- String functions
SELECT
UPPER(name) AS name_upper,
LOWER(name) AS name_lower,
LENGTH(name) AS name_length,
SUBSTRING(name, 1, 3) AS name_prefix,
CONCAT(first_name, ' ', last_name) AS full_name
FROM users;
-- Numeric functions
SELECT
ROUND(salary, -3) AS salary_rounded,
ABS(balance) AS balance_abs,
CEIL(rating) AS rating_ceil,
FLOOR(rating) AS rating_floor
FROM accounts;
-- Date functions
SELECT
EXTRACT(YEAR FROM created_at) AS year,
EXTRACT(MONTH FROM created_at) AS month,
DATE_TRUNC('month', created_at) AS month_start,
created_at + INTERVAL '30 days' AS future_date
FROM orders;
-- Conditional functions
SELECT
name,
CASE
WHEN age < 18 THEN 'Minor'
WHEN age < 65 THEN 'Adult'
ELSE 'Senior'
END AS age_category,
COALESCE(phone, email, 'No contact') AS contact_info,
NULLIF(status, 'inactive') AS active_status
FROM users;import polars as pl
# Create sample data
users = pl.DataFrame({
"id": [1, 2, 3, 4, 5],
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
"age": [25, 30, 35, 28, 32],
"department": ["Sales", "Engineering", "Sales", "Marketing", "Engineering"]
})
orders = pl.DataFrame({
"id": [101, 102, 103, 104, 105],
"user_id": [1, 2, 1, 3, 2],
"amount": [100, 250, 150, 300, 200],
"date": ["2023-01-15", "2023-01-16", "2023-01-17", "2023-01-18", "2023-01-19"]
})
# Simple SQL query
result = pl.sql("""
SELECT name, age, department
FROM users
WHERE age > 30
ORDER BY age DESC
""", users=users)
print(result)# Create SQL context with multiple tables
ctx = pl.SQLContext({
"users": users,
"orders": orders
})
# Complex query with joins
result = ctx.execute("""
SELECT
u.name,
u.department,
COUNT(o.id) as order_count,
SUM(o.amount) as total_spent,
AVG(o.amount) as avg_order
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name, u.department
ORDER BY total_spent DESC
""")
# Register additional tables
products = pl.DataFrame({
"id": [1, 2, 3],
"name": ["Widget", "Gadget", "Tool"],
"category": ["A", "B", "A"]
})
ctx.register("products", products)# Window functions
result = ctx.execute("""
SELECT
name,
department,
age,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY age) as dept_rank,
AVG(age) OVER (PARTITION BY department) as dept_avg_age,
age - AVG(age) OVER (PARTITION BY department) as age_diff
FROM users
""")
# CTE with aggregations
result = ctx.execute("""
WITH department_stats AS (
SELECT
department,
COUNT(*) as emp_count,
AVG(age) as avg_age,
MAX(age) as max_age
FROM users
GROUP BY department
),
order_stats AS (
SELECT
u.department,
SUM(o.amount) as dept_sales
FROM users u
JOIN orders o ON u.id = o.user_id
GROUP BY u.department
)
SELECT
ds.department,
ds.emp_count,
ds.avg_age,
COALESCE(os.dept_sales, 0) as total_sales,
COALESCE(os.dept_sales, 0) / ds.emp_count as sales_per_employee
FROM department_stats ds
LEFT JOIN order_stats os ON ds.department = os.department
ORDER BY sales_per_employee DESC
""")# Start with SQL, continue with Polars API
lazy_result = pl.sql("""
SELECT u.name, u.department, o.amount, o.date
FROM users u
JOIN orders o ON u.id = o.user_id
WHERE u.age > 25
""", users=users, orders=orders, eager=False)
# Continue with Polars operations
final_result = (lazy_result
.with_columns([
pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
pl.col("amount").cast(pl.Float64)
])
.filter(pl.col("amount") > 150)
.group_by("department")
.agg([
pl.col("amount").sum().alias("total_sales"),
pl.col("name").n_unique().alias("unique_customers")
])
.collect()
)# Use SQL expressions in Polars operations
df = users.with_columns([
pl.sql_expr("CASE WHEN age >= 30 THEN 'Senior' ELSE 'Junior' END").alias("seniority"),
pl.sql_expr("UPPER(name)").alias("name_upper"),
pl.sql_expr("age * 12").alias("age_months")
])
# Complex SQL expressions
df = users.with_columns([
pl.sql_expr("""
CASE
WHEN department = 'Engineering' AND age > 30 THEN 'Senior Engineer'
WHEN department = 'Engineering' THEN 'Engineer'
WHEN department = 'Sales' AND age > 28 THEN 'Senior Sales'
WHEN department = 'Sales' THEN 'Sales Rep'
ELSE 'Other'
END
""").alias("role")
])# Lazy execution with SQL
lazy_query = pl.sql("""
SELECT
department,
COUNT(*) as count,
AVG(age) as avg_age
FROM users
WHERE age > 25
GROUP BY department
""", users=users.lazy(), eager=False)
# Check execution plan
print(lazy_query.explain())
# Collect with optimizations
result = lazy_query.collect(
predicate_pushdown=True,
projection_pushdown=True
)try:
result = ctx.execute("SELECT * FROM nonexistent_table")
except pl.SQLSyntaxError as e:
print(f"SQL syntax error: {e}")
except pl.SQLInterfaceError as e:
print(f"SQL interface error: {e}")
except Exception as e:
print(f"Other error: {e}").collect() optimizationInstall with Tessl CLI
npx tessl i tessl/pypi-polars-u64-idx@1.33.1