tessl/pypi-sqlglot

A comprehensive SQL parser, transpiler, optimizer, and engine supporting 30+ dialects

Overview

Eval results

Files

Utility Functions

Name: tessl/pypi-sqlglot
Author: tessl

Additional tools for SQL analysis, comparison, and manipulation including AST diffing, column lineage analysis, time parsing utilities, and error handling for comprehensive SQL processing workflows.

Capabilities

AST Diffing and Comparison

Compare SQL expressions and identify differences between query structures.

def diff(
    source: Expression, 
    target: Expression,
    **opts
) -> str:
    """
    Compare two SQL expressions and generate diff output.
    
    Args:
        source (Expression): Source expression for comparison
        target (Expression): Target expression for comparison  
        **opts: Diff formatting options
        
    Returns:
        str: Human-readable diff showing changes between expressions
    """

Column Lineage Analysis

Trace column dependencies and data flow through complex SQL queries.

def lineage(
    sql: str,
    schema: Optional[Schema] = None,
    **opts
) -> Dict:
    """
    Analyze column lineage and dependencies in SQL query.
    
    Args:
        sql (str): SQL query to analyze
        schema (Schema): Schema for column resolution
        **opts: Lineage analysis options
        
    Returns:
        Dict: Column lineage mapping showing data flow
    """

Error Handling Classes

Comprehensive error handling for SQL processing operations.

class SqlglotError(Exception):
    """Base exception class for all SQLGlot errors."""

class ParseError(SqlglotError):
    """Exception raised when SQL parsing fails."""
    
    def __init__(
        self,
        message: str,
        errors: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Initialize parse error with message and detailed error information.
        
        Args:
            message (str): Error message
            errors (List[Dict]): List of detailed error dictionaries
        """
    
    @classmethod
    def new(
        cls,
        message: str,
        description: Optional[str] = None,
        line: Optional[int] = None,
        col: Optional[int] = None,  
        start_context: Optional[str] = None,
        highlight: Optional[str] = None,
        end_context: Optional[str] = None,
        into_expression: Optional[str] = None
    ) -> ParseError:
        """Create new ParseError with detailed location information."""

class TokenError(SqlglotError):
    """Exception raised when tokenization fails."""

class UnsupportedError(SqlglotError):
    """Exception raised when encountering unsupported SQL features."""

class OptimizeError(SqlglotError):
    """Exception raised during query optimization."""

class SchemaError(SqlglotError):
    """Exception raised for schema-related issues."""

class ExecuteError(SqlglotError):
    """Exception raised during SQL execution."""

class ErrorLevel:
    """Error handling levels for parser and validator."""
    
    IGNORE: str    # Ignore all errors
    WARN: str      # Log errors but continue processing
    RAISE: str     # Collect all errors and raise single exception
    IMMEDIATE: str # Raise exception immediately on first error

Helper Utilities

General utility functions and classes for SQL processing.

class AutoName:
    """Base class for auto-naming enumerations."""

def dict_depth(mapping: Dict) -> int:
    """
    Calculate maximum nesting depth of dictionary.
    
    Args:
        mapping (Dict): Dictionary to analyze
        
    Returns:
        int: Maximum nesting depth
    """

def first(iterable) -> Any:
    """
    Return first element from iterable.
    
    Args:
        iterable: Iterable to get first element from
        
    Returns:
        Any: First element or None if empty
    """

def concat_messages(errors: List[Any], maximum: int) -> str:
    """
    Concatenate error messages with truncation.
    
    Args:
        errors (List): List of error objects
        maximum (int): Maximum number of errors to include
        
    Returns:
        str: Concatenated error message string
    """

def merge_errors(errors: List[ParseError]) -> List[Dict[str, Any]]:
    """
    Merge multiple ParseError instances into unified error list.
    
    Args:
        errors (List[ParseError]): Parse errors to merge
        
    Returns:
        List[Dict]: Merged error information dictionaries
    """

Time and Date Utilities

Specialized utilities for handling SQL time and date operations.

# Time parsing and formatting utilities for SQL time literals
# Located in sqlglot.time module - provides functions for:
# - Parsing time strings in various formats
# - Converting between time representations  
# - Formatting time values for different SQL dialects

Transformation Utilities

Various SQL transformation and manipulation utilities.

# SQL transformation utilities for advanced query manipulation
# Located in sqlglot.transforms module - provides functions for:
# - Custom expression transformations
# - Dialect-specific syntax conversions
# - Advanced AST manipulation patterns

JSON Path Support

JSON path expression support for SQL queries with JSON data.

# JSON path expression utilities for SQL JSON operations
# Located in sqlglot.jsonpath module - provides:
# - JSON path parsing and evaluation
# - Integration with SQL JSON functions
# - Cross-dialect JSON path support

Data Structure Utilities

Trie and other data structure utilities for efficient processing.

def new_trie() -> Dict:
    """
    Create new trie data structure.
    
    Returns:
        Dict: Empty trie structure
    """

def in_trie(trie: Dict, key: str) -> TrieResult:
    """
    Check if key exists in trie structure.
    
    Args:
        trie (Dict): Trie to search
        key (str): Key to search for
        
    Returns:
        TrieResult: Result indicating presence and type of match
    """

class TrieResult:
    """Result of trie lookup operation."""
    
    def __init__(self, exists: bool, prefix: bool = False):
        """
        Initialize trie result.
        
        Args:
            exists (bool): Whether key exists in trie
            prefix (bool): Whether key is a prefix of existing keys
        """

Serialization Utilities

Serialization and deserialization utilities for expressions.

# Serialization utilities for SQLGlot expressions
# Located in sqlglot.serde module - provides:
# - Expression serialization to JSON/dict format
# - Deserialization from stored representations
# - Cross-process expression transfer support

Usage Examples

Expression Diffing

import sqlglot
from sqlglot.diff import diff

# Compare two similar queries
query1 = sqlglot.parse_one("SELECT name, age FROM users WHERE age > 25")
query2 = sqlglot.parse_one("SELECT name, age, email FROM users WHERE age >= 25")

# Generate diff
diff_output = diff(query1, query2)
print(diff_output)

# Compare more complex queries
original = sqlglot.parse_one("""
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.active = 1
GROUP BY u.name
""")

modified = sqlglot.parse_one("""
SELECT u.name, u.email, COUNT(o.id) as total_orders
FROM users u
LEFT JOIN orders o ON u.id = o.user_id  
WHERE u.active = 1 AND u.verified = 1
GROUP BY u.name, u.email
""")

print(diff(original, modified))

Column Lineage Analysis

import sqlglot
from sqlglot.lineage import lineage
from sqlglot.schema import MappingSchema

# Define schema for lineage analysis
schema = MappingSchema({
    "raw_data": {
        "user_id": "INT",
        "first_name": "VARCHAR",
        "last_name": "VARCHAR", 
        "email": "VARCHAR",
        "signup_date": "DATE"
    },
    "orders": {
        "id": "INT",
        "user_id": "INT",
        "amount": "DECIMAL",
        "order_date": "DATE"
    }
})

# Analyze complex query lineage
sql = """
SELECT 
    CONCAT(u.first_name, ' ', u.last_name) as full_name,
    u.email,
    COUNT(o.id) as total_orders,
    SUM(o.amount) as total_spent,
    AVG(o.amount) as avg_order_value,
    MAX(o.order_date) as last_order_date
FROM raw_data u
LEFT JOIN orders o ON u.user_id = o.user_id
WHERE u.signup_date >= '2023-01-01'
GROUP BY u.user_id, u.first_name, u.last_name, u.email
"""

# Get lineage information
lineage_info = lineage(sql, schema=schema)
print("Column lineage:")
for output_col, source_cols in lineage_info.items():
    print(f"{output_col} <- {source_cols}")

Error Handling and Validation

import sqlglot
from sqlglot import ParseError, UnsupportedError, ErrorLevel

# Handle parsing errors gracefully
def safe_parse(sql_queries):
    results = []
    errors = []
    
    for sql in sql_queries:
        try:
            parsed = sqlglot.parse_one(sql)
            results.append(parsed)
        except ParseError as e:
            errors.append(f"Parse error in '{sql}': {e}")
        except UnsupportedError as e:
            errors.append(f"Unsupported feature in '{sql}': {e}")
    
    return results, errors

# Test with mixed valid/invalid SQL
test_queries = [
    "SELECT * FROM users",
    "SELECT FROM",  # Invalid
    "SELECT name FROM users WHERE age > 25",
    "INVALID SQL SYNTAX",  # Invalid
]

parsed_queries, parse_errors = safe_parse(test_queries)
print(f"Successfully parsed: {len(parsed_queries)} queries")
print(f"Errors: {len(parse_errors)}")
for error in parse_errors:
    print(f"  {error}")

# Use different error levels
try:
    # Parse with warning level - continues on errors
    expressions = sqlglot.parse(
        "SELECT 1; INVALID; SELECT 2;",
        error_level=ErrorLevel.WARN
    )
    print(f"Parsed {len([e for e in expressions if e])} valid expressions")
except Exception as e:
    print(f"Error: {e}")

Working with Helper Utilities

import sqlglot
from sqlglot.helper import dict_depth, first

# Calculate nested dictionary depth
schema_dict = {
    "database1": {
        "schema1": {
            "table1": {"col1": "INT", "col2": "VARCHAR"}
        }
    },
    "database2": {
        "schema1": {
            "table1": {"col1": "INT"},
            "table2": {"col1": "INT", "col2": "VARCHAR", "col3": "DATE"}
        }
    }
}

depth = dict_depth(schema_dict)
print(f"Schema depth: {depth}")

# Get first valid expression from parse results
sql_statements = "INVALID; SELECT 1; SELECT 2;"
expressions = sqlglot.parse(sql_statements, error_level=ErrorLevel.WARN)
first_valid = first(expr for expr in expressions if expr is not None)
if first_valid:
    print(f"First valid expression: {first_valid.sql()}")

Trie Data Structure Usage

from sqlglot.trie import new_trie, in_trie

# Create trie for keyword matching
keywords_trie = new_trie()

# Add keywords (this would typically be done internally)
keywords = ["SELECT", "FROM", "WHERE", "GROUP", "ORDER", "HAVING"]
# Note: Actual trie population would use internal SQLGlot methods

# Check for keyword presence
for word in ["SELECT", "SELEC", "SELECTED", "FROM", "TABLE"]:
    result = in_trie(keywords_trie, word.upper())
    if result.exists:
        print(f"'{word}' is a complete keyword")
    elif result.prefix:
        print(f"'{word}' is a keyword prefix")
    else:
        print(f"'{word}' is not a keyword or prefix")

Advanced Error Information

import sqlglot
from sqlglot import ParseError

# Parse with detailed error information
invalid_sql = """
SELECT name, age
FROM users
WHERE age > 25 AND
    status = 'active' AND
    created_date >= '2023-01-01'
    missing_clause
"""

try:
    parsed = sqlglot.parse_one(invalid_sql)
except ParseError as e:
    print(f"Parse error: {e}")
    
    # Access detailed error information
    for error_detail in e.errors:
        print(f"  Line: {error_detail.get('line')}")
        print(f"  Column: {error_detail.get('col')}")
        print(f"  Description: {error_detail.get('description')}")
        print(f"  Context: {error_detail.get('start_context')}")
        print(f"  Highlight: {error_detail.get('highlight')}")

Types

class SqlglotError(Exception):
    """Base exception for all SQLGlot errors."""

class ParseError(SqlglotError):
    """SQL parsing error with detailed information."""
    
    errors: List[Dict[str, Any]]  # Detailed error information
    
    def __init__(self, message: str, errors: Optional[List[Dict]] = None): ...

class TokenError(SqlglotError):
    """Tokenization error."""

class UnsupportedError(SqlglotError):
    """Unsupported feature error."""

class OptimizeError(SqlglotError):
    """Query optimization error."""

class SchemaError(SqlglotError):
    """Schema-related error."""

class ExecuteError(SqlglotError):
    """SQL execution error."""

class ErrorLevel:
    """Error handling level enumeration."""
    
    IGNORE: str    # Ignore all errors
    WARN: str      # Log errors but continue  
    RAISE: str     # Collect errors and raise exception
    IMMEDIATE: str # Raise immediately on first error

class TrieResult:
    """Result of trie lookup operation."""
    
    exists: bool   # Whether key exists completely in trie
    prefix: bool   # Whether key is prefix of existing keys
    
    def __init__(self, exists: bool, prefix: bool = False): ...

class AutoName:
    """Base class for auto-naming enumerations."""

Install with Tessl CLI