CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-sqlglot

A comprehensive SQL parser, transpiler, optimizer, and engine supporting 30+ dialects

Overview
Eval results
Files

docs/dialects.md

SQL Dialects

Support for 30+ SQL dialects with dialect-specific parsing, generation, and transformation rules. Each dialect handles unique syntax, functions, data types, and SQL features specific to different database systems.

Capabilities

Base Dialect System

Core dialect functionality that provides the foundation for all SQL dialect implementations.

class Dialect:
    """Base dialect class providing core SQL processing functionality.

    Every concrete dialect documented on this page (BigQuery, Snowflake,
    Postgres, ...) subclasses Dialect; the methods below are the shared
    dialect-facing API surface.
    """

    def parse(self, sql: str, **opts) -> List[Optional[Expression]]:
        """Parse SQL string using dialect-specific grammar rules.

        Returns one expression tree per statement in ``sql``; per the
        return annotation, individual entries may be None.
        """

    def generate(self, expression: Expression, **opts) -> str:
        """Generate SQL string from expression using dialect-specific syntax."""

    def tokenize(self, sql: str) -> List[Token]:
        """Tokenize SQL string using dialect-specific keywords and operators."""

    @classmethod
    def get_or_raise(cls, dialect: str) -> Dialect:
        """Get dialect instance by name or raise error if not found.

        NOTE(review): upstream sqlglot accepts the broader DialectType
        (name, instance, class, or None) here, not only str — confirm
        against the installed version.
        """

    @classmethod
    def get(cls, dialect: str) -> Optional[Dialect]:
        """Get dialect by name, returns None if not found."""

Cloud Data Warehouse Dialects

Modern cloud-based data warehouse platforms with advanced analytics features.

# Cloud data-warehouse dialects: each subclasses Dialect and inherits the
# parse/generate/tokenize API; the subclass supplies platform-specific rules.
class BigQuery(Dialect):
    """Google BigQuery dialect with standard SQL and legacy SQL support."""

class Snowflake(Dialect):
    """Snowflake dialect with cloud data warehouse features."""

class Redshift(Dialect):
    """Amazon Redshift dialect with PostgreSQL-based syntax."""

class Databricks(Dialect):
    """Databricks dialect based on Apache Spark SQL."""

class Fabric(Dialect):
    """Microsoft Fabric dialect for data analytics."""

Open Source Analytics Engines

Popular open-source SQL engines for data processing and analytics.

# Open-source analytics engines, all exposed through the same Dialect API.
class Spark(Dialect):
    """Apache Spark SQL dialect for distributed data processing."""

class Spark2(Dialect):
    """Apache Spark 2.x dialect with legacy compatibility."""

class DuckDB(Dialect):
    """DuckDB dialect for embedded analytical processing."""

class ClickHouse(Dialect):
    """ClickHouse dialect for real-time analytics."""

class Presto(Dialect):
    """Presto dialect for distributed query processing."""

class Trino(Dialect):
    """Trino dialect (formerly PrestoSQL) for federated queries."""

class Hive(Dialect):
    """Apache Hive dialect for Hadoop data warehouse."""

class Drill(Dialect):
    """Apache Drill dialect for schema-free SQL queries."""

class Doris(Dialect):
    """Apache Doris dialect for modern data warehouse."""

class StarRocks(Dialect):
    """StarRocks dialect for real-time analytics."""

Traditional Database Systems

Established relational database management systems.

# Traditional relational database systems.
class MySQL(Dialect):
    """MySQL dialect with MySQL-specific syntax and functions."""

class Postgres(Dialect):
    """PostgreSQL dialect with advanced SQL features."""

class Oracle(Dialect):
    """Oracle Database dialect with enterprise features."""

class SQLite(Dialect):
    """SQLite dialect for embedded databases."""

class TSQL(Dialect):
    """Microsoft SQL Server T-SQL dialect."""

class Teradata(Dialect):
    """Teradata dialect for enterprise data warehousing."""

Specialized and Emerging Platforms

Newer and specialized SQL platforms for specific use cases.

# Specialized and emerging SQL platforms.
class Athena(Dialect):
    """Amazon Athena dialect for serverless queries."""

class Dremio(Dialect):
    """Dremio dialect for data lake analytics."""

class Exasol(Dialect):
    """Exasol dialect for high-performance analytics."""

class Materialize(Dialect):
    """Materialize dialect for streaming SQL."""

class RisingWave(Dialect):
    """RisingWave dialect for stream processing."""

class SingleStore(Dialect):
    """SingleStore dialect for distributed SQL."""

class Tableau(Dialect):
    """Tableau dialect for business intelligence."""

class Druid(Dialect):
    """Apache Druid dialect for real-time analytics."""

class Dune(Dialect):
    """Dune Analytics dialect for blockchain data."""

class PRQL(Dialect):
    """PRQL (Pipelined Relational Query Language) dialect."""

Usage Examples

Working with Specific Dialects

import sqlglot
from sqlglot.dialects import BigQuery, Snowflake, Spark

# Parse with a concrete dialect instance rather than a dialect name.
bq_dialect = BigQuery()
parsed = bq_dialect.parse("SELECT EXTRACT(YEAR FROM date_col) FROM table")

# Render the parsed tree back to SQL through a different dialect instance.
sf_dialect = Snowflake()
rendered = sf_dialect.generate(parsed[0])

# Alternatively, pass dialect names straight to the module-level helpers.
source = "SELECT explode(split('a,b,c', ',')) as item"
converted = sqlglot.transpile(source, read="spark", write="postgres")[0]

Dialect-Specific Features

import sqlglot

# BigQuery-only constructs (array OFFSET indexing, struct field access,
# PARSE_DATE), transpiled into PostgreSQL below.
bigquery_example = """
SELECT 
    arr[OFFSET(0)] as first_element,
    struct_col.field_name,
    PARSE_DATE('%Y-%m-%d', date_string)
FROM table_name
"""
standard_sql = sqlglot.transpile(bigquery_example, read="bigquery", write="postgres")[0]

# Snowflake-only constructs: VARIANT path extraction with a :: cast, FLATTEN.
snowflake_example = """
SELECT 
    variant_col:field::string as extracted_field,
    FLATTEN(array_col) as flattened
FROM table_name
"""

# Spark SQL window function with an explicit ROWS frame specification.
spark_example = """
SELECT 
    col1,
    LAG(col1, 1) OVER (PARTITION BY col2 ORDER BY col3 
                       ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as prev_val
FROM table
"""

Custom Dialect Creation

from sqlglot import exp
from sqlglot.dialects.dialect import Dialect
from sqlglot.generator import Generator
from sqlglot.tokens import Tokenizer, TokenType

class CustomDialect(Dialect):
    """Example of defining a new dialect by overriding the nested
    Tokenizer and Generator classes."""

    class Tokenizer(Tokenizer):
        # The base name here resolves to sqlglot.tokens.Tokenizer (imported
        # above): the nested class name is not bound while its body executes.
        QUOTES = ["'", '"']  # String delimiters
        IDENTIFIERS = ["`"]  # Identifier delimiters

        # Extend (not replace) the base keyword table, mapping the custom
        # type keywords onto existing token types.
        KEYWORDS = {
            **Tokenizer.KEYWORDS,
            "INT64": TokenType.BIGINT,
            "FLOAT64": TokenType.DOUBLE,
        }

    class Generator(Generator):
        # Custom SQL generation rules: render arrays as [a, b, ...].
        TRANSFORMS = {
            exp.Array: lambda self, e: f"[{self.expressions(e)}]",
        }

        # Data type mappings: collapse all integer widths to INT64 and
        # both float widths to FLOAT64.
        TYPE_MAPPING = {
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.DOUBLE: "FLOAT64",
        }

# Use custom dialect: instances expose the same parse/generate API as any
# built-in dialect.
custom = CustomDialect()
parsed = custom.parse("SELECT col FROM table WHERE int_col > 5")
generated = custom.generate(parsed[0])

Dialect Detection and Conversion

import sqlglot
from sqlglot.dialects import Dialect

# A sample of dialect names accepted by sqlglot's read/write arguments.
available_dialects = [
    "bigquery",
    "snowflake",
    "spark",
    "postgres",
    "mysql",
    "oracle",
    "duckdb",
    "clickhouse",
    "presto",
    "trino",
]

# Transpile a single BigQuery statement into several target dialects.
source_sql = "SELECT EXTRACT(YEAR FROM date_col) FROM table"

for target_dialect in ("postgres", "mysql", "snowflake"):
    try:
        converted = sqlglot.transpile(source_sql, read="bigquery", write=target_dialect)[0]
    except Exception as e:
        # Report the failure and continue with the remaining targets.
        print(f"Error converting to {target_dialect}: {e}")
    else:
        print(f"{target_dialect}: {converted}")

Dialect-Specific Optimization

import sqlglot
from sqlglot.optimizer import optimize

# Run the optimizer once per target dialect and print each pretty-printed
# result, keyed by a human-readable label.
sql = """
SELECT user_id, COUNT(*) as cnt
FROM users 
WHERE created_date >= '2023-01-01'
GROUP BY user_id
HAVING COUNT(*) > 5
"""

for label, dialect_name in (("BigQuery", "bigquery"), ("Snowflake", "snowflake")):
    optimized = optimize(sql, dialect=dialect_name)
    rendered = optimized.sql(dialect=dialect_name, pretty=True)
    print(f"{label}:", rendered)

Types

class Dialect:
    """Base dialect class with extensible components.

    The three nested classes are the per-dialect customization points:
    subclasses override their class-level tables to change how SQL is
    tokenized, parsed, and re-generated.
    """

    class Tokenizer:
        """Dialect-specific tokenization rules."""
        QUOTES: List[str]      # String quote characters
        IDENTIFIERS: List[str] # Identifier quote characters  
        KEYWORDS: Dict[str, TokenType] # Keyword mappings

    class Parser:
        """Dialect-specific parsing rules."""
        FUNCTIONS: Dict[str, Callable] # Function parsers

    class Generator:
        """Dialect-specific SQL generation rules."""
        TRANSFORMS: Dict[Type, Callable] # Expression transformers
        TYPE_MAPPING: Dict[Type, str]    # Data type mappings

    # Core dialect methods (signatures as in the "Base Dialect System"
    # section of this page).
    def parse(self, sql: str, **opts) -> List[Optional[Expression]]: ...
    def generate(self, expression: Expression, **opts) -> str: ...
    def tokenize(self, sql: str) -> List[Token]: ...

# Type alias for dialect specification: a dialect name ("duckdb"), a
# Dialect instance, a Dialect subclass, or None (use the default dialect).
# The trailing None matches upstream sqlglot's DialectType alias.
DialectType = Union[str, Dialect, Type[Dialect], None]

Install with Tessl CLI

npx tessl i tessl/pypi-sqlglot

docs

core-parsing.md

dialects.md

execution.md

expression-building.md

index.md

optimization.md

schema.md

utilities.md

tile.json