Python API for Apache Spark, providing distributed computing, data analysis, and machine learning capabilities
—
Quality: Pending — a best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Structured data processing with DataFrames, SQL queries, and comprehensive built-in functions. This is the primary high-level interface for working with structured and semi-structured data in PySpark, providing optimized query execution through the Catalyst optimizer.
Unified entry point for DataFrame and Dataset functionality, replacing SQLContext in newer versions.
class SparkSession:
    """Unified entry point for DataFrame and Dataset functionality.

    Replaces SQLContext in newer Spark versions.
    """

    def __init__(self, sparkContext, jsparkSession=None):
        """Create a new SparkSession.

        Parameters:
        - sparkContext: Underlying SparkContext.
        - jsparkSession: Optional existing JVM SparkSession to wrap.
        """

    @classmethod
    def builder(cls):
        """Create a Builder for constructing a SparkSession.

        Returns:
            SparkSession.Builder
        """

    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """Create a DataFrame from an RDD, list, pandas DataFrame, numpy array,
        or PyArrow Table.

        Parameters:
        - data: Input data (RDD, Iterable, pandas DataFrame, numpy array, PyArrow Table).
        - schema (StructType, str, list): Schema definition for the DataFrame.
        - samplingRatio (float): Sample ratio (0.0-1.0) for schema inference from an RDD.
        - verifySchema (bool): Verify that data conforms to the provided schema.

        Returns:
            DataFrame with the specified data and schema.
        """

    def sql(self, sqlQuery):
        """Return a DataFrame representing the result of the given query.

        Parameters:
        - sqlQuery (str): SQL query string.

        Returns:
            DataFrame with the query results.
        """

    def table(self, tableName):
        """Return the specified table as a DataFrame.

        Parameters:
        - tableName (str): Table name.

        Returns:
            DataFrame representing the table.
        """

    def range(self, start, end=None, step=1, numPartitions=None):
        """Create a DataFrame with a single pyspark.sql.types.LongType column
        named ``id``.

        Parameters:
        - start (int): Start value (inclusive).
        - end (int): End value (exclusive).
        - step (int): Step size.
        - numPartitions (int): Number of partitions.

        Returns:
            DataFrame with an ``id`` column.
        """

    def stop(self):
        """Stop the underlying SparkContext."""

    @property
    def read(self):
        """DataFrameReader for reading data into a DataFrame."""

    @property
    def readStream(self):
        """DataStreamReader for reading streaming data."""

    @property
    def catalog(self):
        """Interface to Spark's catalog of databases, tables and functions."""

    @property
    def udf(self):
        """UDFRegistration for registering user-defined functions."""

# Builder pattern for creating SparkSession instances with configuration.
class Builder:
    """Builder for constructing SparkSession instances with configuration."""

    def appName(self, name):
        """Set a name for the application.

        Parameters:
        - name (str): Application name.

        Returns:
            Builder
        """

    def master(self, master):
        """Set the Spark master URL.

        Parameters:
        - master (str): Master URL.

        Returns:
            Builder
        """

    def config(self, key=None, value=None, conf=None):
        """Set a single config option or a whole SparkConf.

        Parameters:
        - key (str): Configuration key.
        - value (str): Configuration value.
        - conf (SparkConf): SparkConf object.

        Returns:
            Builder
        """

    def enableHiveSupport(self):
        """Enable Hive support.

        Returns:
            Builder
        """

    def getOrCreate(self):
        """Get an existing SparkSession or create a new one.

        Returns:
            SparkSession
        """

# Distributed collection of data organized into named columns.
class DataFrame:
    """Distributed collection of data organized into named columns."""

    def select(self, *cols):
        """Project a set of expressions and return a new DataFrame.

        Parameters:
        - cols: Column expressions or names.
        """

    def filter(self, condition):
        """Return a new DataFrame keeping rows that satisfy *condition*
        (a Column expression or SQL string)."""

    def where(self, condition):
        """Filter rows using the given condition (alias for :meth:`filter`)."""

    def groupBy(self, *cols):
        """Group the DataFrame by the specified columns.

        Returns:
            GroupedData for aggregation.
        """

    def agg(self, *exprs):
        """Aggregate on the entire DataFrame without groups.

        Parameters:
        - exprs: Aggregation expressions.
        """

    def orderBy(self, *cols, **kwargs):
        """Sort the DataFrame by the specified columns.

        Parameters:
        - cols: Column names or expressions.
        - ascending (bool): Sort in ascending order.
        """

    def sort(self, *cols, **kwargs):
        """Sort the DataFrame by the specified columns (alias for :meth:`orderBy`).

        Parameters:
        - cols: Column names or expressions.
        - ascending (bool): Sort in ascending order.
        """

    def join(self, other, on=None, how=None):
        """Join with another DataFrame.

        Parameters:
        - other (DataFrame): DataFrame to join with.
        - on: Join condition (column names or expression).
        - how (str): Join type ("inner", "outer", "left", "right", "semi", "anti").
        """

    def union(self, other):
        """Return a new DataFrame containing the union of rows with *other*."""

    def unionByName(self, other, allowMissingColumns=False):
        """Return a new DataFrame containing the union of rows, matched by
        column names.

        Parameters:
        - other (DataFrame): Another DataFrame.
        - allowMissingColumns (bool): Allow missing columns.
        """

    def intersect(self, other):
        """Return a new DataFrame containing rows present in both DataFrames."""

    def subtract(self, other):
        """Return a new DataFrame with rows in this DataFrame but not in *other*."""

    def distinct(self):
        """Return a new DataFrame with distinct rows."""

    def dropDuplicates(self, subset=None):
        """Return a new DataFrame with duplicate rows removed.

        Parameters:
        - subset (list): Column names to consider for duplicates.
        """

    def drop(self, *cols):
        """Return a new DataFrame with the specified columns dropped."""

    def withColumn(self, colName, col):
        """Return a new DataFrame by adding or replacing a column.

        Parameters:
        - colName (str): Column name.
        - col (Column): Column expression.
        """

    def withColumnRenamed(self, existing, new):
        """Return a new DataFrame with column *existing* renamed to *new*."""

    def show(self, n=20, truncate=True, vertical=False):
        """Print the first *n* rows to the console.

        Parameters:
        - n (int): Number of rows to show.
        - truncate (bool or int): Truncate strings (True/False or max width).
        - vertical (bool): Print rows vertically instead of horizontally.
        """

    def collect(self):
        """Return all records as a list of Row objects."""

    def take(self, num):
        """Return the first *num* rows as a list of Row objects."""

    def first(self):
        """Return the first row as a Row."""

    def head(self, n=None):
        """Return the first *n* rows, or the first Row when *n* is None."""

    def count(self):
        """Return the number of rows in this DataFrame."""

    def describe(self, *cols):
        """Compute basic statistics for numeric and string columns."""

    def summary(self, *statistics):
        """Compute the specified statistics for numeric and string columns."""

    def cache(self):
        """Persist this DataFrame with the default storage level and return it."""

    # NOTE: the original default was `StorageLevel.MEMORY_AND_DISK`, but
    # StorageLevel is not defined in this file, which would raise NameError
    # at class-definition time. `None` is used as a sentinel instead.
    def persist(self, storageLevel=None):
        """Persist this DataFrame with the given storage level.

        Parameters:
        - storageLevel (StorageLevel): Storage level; defaults to
          StorageLevel.MEMORY_AND_DISK when None.

        Returns:
            This DataFrame.
        """

    def unpersist(self, blocking=False):
        """Mark the DataFrame as non-persistent.

        Parameters:
        - blocking (bool): Whether to block until complete.

        Returns:
            This DataFrame.
        """

    def coalesce(self, numPartitions):
        """Return a new DataFrame reduced to *numPartitions* partitions."""

    def repartition(self, numPartitions, *cols):
        """Return a new DataFrame partitioned by the given expressions.

        Parameters:
        - numPartitions (int): Number of partitions.
        - cols: Partitioning expressions.
        """

    def createOrReplaceTempView(self, name):
        """Create or replace a local temporary view named *name*."""

    def createGlobalTempView(self, name):
        """Create a global temporary view named *name*."""

    @property
    def write(self):
        """DataFrameWriter for saving the content of this DataFrame."""

    @property
    def writeStream(self):
        """DataStreamWriter for saving the content of a streaming DataFrame."""

    @property
    def schema(self):
        """StructType describing the schema of this DataFrame."""

    @property
    def columns(self):
        """All column names as a list."""

    @property
    def dtypes(self):
        """All column names and their data types as a list of (name, type) tuples."""

# Specialized function classes for handling missing data and statistical operations.
class DataFrameNaFunctions:
    """Missing-data helpers for DataFrames, reached through ``DataFrame.na``."""

    def drop(self, how="any", thresh=None, subset=None):
        """Remove rows containing null values.

        Parameters:
        - how (str): "any" drops rows with at least one null; "all" only
          drops rows where every value is null.
        - thresh (int): Minimum count of non-null values a row must have.
        - subset (list): Restrict the null check to these columns.

        Returns:
            DataFrame with the offending rows removed.
        """

    def fill(self, value, subset=None):
        """Substitute a value for nulls.

        Parameters:
        - value: Replacement for nulls (a dict gives per-column values).
        - subset (list): Restrict the fill to these columns.

        Returns:
            DataFrame with nulls filled in.
        """

    def replace(self, to_replace, value=None, subset=None):
        """Swap occurrences of given values for replacements.

        Parameters:
        - to_replace: Value(s) to be replaced.
        - value: Replacement value(s).
        - subset (list): Restrict the replacement to these columns.

        Returns:
            DataFrame with the values replaced.
        """
class DataFrameStatFunctions:
    """Statistical operations on DataFrames, accessed via ``DataFrame.stat``."""

    def approxQuantile(self, col, probabilities, relativeError):
        """Calculate approximate quantiles for numeric columns.

        Parameters:
        - col (str): Column name or list of column names.
        - probabilities (list): Quantile probabilities (0.0 to 1.0).
        - relativeError (float): Relative error tolerance.

        Returns:
            List of quantile values.
        """

    def corr(self, col1, col2, method="pearson"):
        """Calculate the correlation between two columns.

        Parameters:
        - col1 (str): First column name.
        - col2 (str): Second column name.
        - method (str): Correlation method ("pearson" or "spearman").

        Returns:
            Correlation coefficient as a float.
        """

    def cov(self, col1, col2):
        """Calculate the covariance between columns *col1* and *col2* as a float."""

    def crosstab(self, col1, col2):
        """Return a DataFrame with the cross-tabulation of *col1* and *col2*."""

    def freqItems(self, cols, support=None):
        """Find frequent items for the specified columns.

        Parameters:
        - cols (list): Column names.
        - support (float): Minimum support threshold.

        Returns:
            DataFrame with the frequent items.
        """

    def sampleBy(self, col, fractions, seed=None):
        """Stratified sampling by column values.

        Parameters:
        - col (str): Column to stratify by.
        - fractions (dict): Sampling fraction per stratum.
        - seed (int): Random seed.

        Returns:
            Sampled DataFrame.
        """

# Column expressions for DataFrame transformations.
class Column:
    """A column expression usable in DataFrame transformations."""

    def alias(self, *alias, **kwargs):
        """Return this column renamed with the given alias name(s)."""

    def cast(self, dataType):
        """Return this column converted to the given data type."""

    def contains(self, other):
        """Return a boolean Column: does this column contain *other*?"""

    def startswith(self, other):
        """Return a boolean Column: does this column start with string *other*?"""

    def endswith(self, other):
        """Return a boolean Column: does this column end with string *other*?"""

    def isNull(self):
        """Return a boolean Column that is true where this column is null."""

    def isNotNull(self):
        """Return a boolean Column that is true where this column is not null."""

    def isin(self, *cols):
        """Return a boolean Column: is this column's value in the given list?"""

    def between(self, lowerBound, upperBound):
        """Return a boolean Column: is this column between the two bounds?"""

    def when(self, condition, value):
        """Evaluate a list of conditions and return one of multiple possible
        result expressions.

        Parameters:
        - condition: Condition expression.
        - value: Value when the condition is true.

        Returns:
            Column with conditional logic.
        """

    def otherwise(self, value):
        """Default branch for a conditional expression built with :meth:`when`.

        Parameters:
        - value: Default value.

        Returns:
            Column with conditional logic.
        """

    def substr(self, startPos, length):
        """Return a substring Column starting at *startPos* with *length* chars."""

    def asc(self):
        """Return a sort expression based on ascending order of this column."""

    def desc(self):
        """Return a sort expression based on descending order of this column."""

# Window functions for analytical operations over groups of rows.
class Window:
    """Helpers for building window specifications over DataFrames."""

    # Frame-boundary markers used with rowsBetween / rangeBetween.
    unboundedPreceding: int  # unbounded preceding frame boundary
    unboundedFollowing: int  # unbounded following frame boundary
    currentRow: int          # current-row frame boundary

    @staticmethod
    def partitionBy(*cols):
        """Build a WindowSpec partitioned by the given columns/expressions.

        Returns:
            WindowSpec with the partitioning defined.
        """

    @staticmethod
    def orderBy(*cols):
        """Build a WindowSpec ordered by the given columns/expressions.

        Returns:
            WindowSpec with the ordering defined.
        """

    @staticmethod
    def rowsBetween(start, end):
        """Build a WindowSpec with a row-based frame.

        Parameters:
        - start (int): Start boundary (inclusive).
        - end (int): End boundary (inclusive).

        Returns:
            WindowSpec with the frame boundaries defined.
        """

    @staticmethod
    def rangeBetween(start, end):
        """Build a WindowSpec with a range-based frame.

        Parameters:
        - start: Start boundary value.
        - end: End boundary value.

        Returns:
            WindowSpec with the range frame boundaries defined.
        """
class WindowSpec:
    """Window specification defining partitioning, ordering, and frame boundaries."""

    # NOTE: the original definitions omitted `self`, unlike every other
    # instance method in this file; restored for consistency.
    def partitionBy(self, *cols):
        """Define the partitioning columns for this window.

        Parameters:
        - cols: Column names or expressions.

        Returns:
            WindowSpec with updated partitioning.
        """

    def orderBy(self, *cols):
        """Define the ordering columns for this window.

        Parameters:
        - cols: Column names or expressions with optional sort direction.

        Returns:
            WindowSpec with updated ordering.
        """

    def rowsBetween(self, start, end):
        """Define row-based frame boundaries for this window.

        Parameters:
        - start (int): Start row offset.
        - end (int): End row offset.

        Returns:
            WindowSpec with the frame boundaries defined.
        """

    def rangeBetween(self, start, end):
        """Define range-based frame boundaries for this window.

        Parameters:
        - start: Start range value.
        - end: End range value.

        Returns:
            WindowSpec with the range frame boundaries defined.
        """

# Interfaces for reading and writing data to various formats and sources.
class DataFrameReader:
    """Interface for loading a DataFrame from external storage systems."""

    def format(self, source):
        """Specify the input data source format (str) and return this reader."""

    def option(self, key, value):
        """Add a single input option for the underlying data source.

        Parameters:
        - key (str): Option key.
        - value: Option value.

        Returns:
            DataFrameReader
        """

    def options(self, **options):
        """Add input options for the underlying data source as keyword args."""

    def schema(self, schema):
        """Specify the input schema and return this reader."""

    def load(self, path=None, format=None, schema=None, **options):
        """Load data from a data source.

        Parameters:
        - path (str): File path.
        - format (str): Data source format.
        - schema: Schema definition.
        - options: Additional options.

        Returns:
            DataFrame
        """

    def csv(self, path, schema=None, sep=None, encoding=None, quote=None,
            escape=None, comment=None, header=None, inferSchema=None,
            ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
            nullValue=None, nanValue=None, positiveInf=None, negativeInf=None,
            dateFormat=None, timestampFormat=None, maxColumns=None,
            maxCharsPerColumn=None, maxMalformedLogPerPartition=None,
            mode=None, columnNameOfCorruptRecord=None, multiLine=None,
            charToEscapeQuoteEscaping=None, samplingRatio=None,
            enforceSchema=None, emptyValue=None, locale=None, lineSep=None,
            pathGlobFilter=None, recursiveFileLookup=None,
            modifiedBefore=None, modifiedAfter=None, unescapedQuoteHandling=None):
        """Load a CSV file and return the results as a DataFrame.

        Parameters:
        - path (str): CSV file path.
        - schema: Schema for the CSV file.
        - sep (str): Column separator.
        - encoding (str): Character encoding.
        - quote (str): Quote character.
        - escape (str): Escape character.
        - comment (str): Comment character.
        - header (bool): Whether the first line is a header.
        - inferSchema (bool): Automatically infer column types.
        - ignoreLeadingWhiteSpace (bool): Ignore leading whitespace.
        - ignoreTrailingWhiteSpace (bool): Ignore trailing whitespace.
        - nullValue (str): String representation of a null value.
        - nanValue (str): String representation of a NaN value.
        - positiveInf (str): String representation of positive infinity.
        - negativeInf (str): String representation of negative infinity.
        - dateFormat (str): Date format string.
        - timestampFormat (str): Timestamp format string.
        - maxColumns (int): Maximum number of columns.
        - maxCharsPerColumn (int): Maximum characters per column.
        - maxMalformedLogPerPartition (int): Max malformed records to log per partition.
        - mode (str): Parse mode ("PERMISSIVE", "DROPMALFORMED", "FAILFAST").
        - columnNameOfCorruptRecord (str): Column name for corrupt records.
        - multiLine (bool): Parse multi-line records.
        - charToEscapeQuoteEscaping (str): Character to escape quote escaping.
        - samplingRatio (float): Sampling ratio for schema inference.
        - enforceSchema (bool): Enforce the specified or inferred schema.
        - emptyValue (str): String representation of an empty value.
        - locale (str): Locale for parsing.
        - lineSep (str): Line separator.
        - pathGlobFilter (str): Path glob filter.
        - recursiveFileLookup (bool): Recursive file lookup.
        - modifiedBefore (str): Only files modified before this timestamp.
        - modifiedAfter (str): Only files modified after this timestamp.
        - unescapedQuoteHandling (str): How to handle unescaped quotes.

        Returns:
            DataFrame
        """

    def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
             allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
             mode=None, columnNameOfCorruptRecord=None, dateFormat=None,
             timestampFormat=None, multiLine=None, allowUnquotedControlChars=None,
             lineSep=None, samplingRatio=None, dropFieldIfAllNull=None,
             encoding=None, locale=None, pathGlobFilter=None, recursiveFileLookup=None,
             modifiedBefore=None, modifiedAfter=None):
        """Load JSON files and return the results as a DataFrame.

        Parameters:
        - path (str): JSON file path.
        - schema: Schema for the JSON data.
        - primitivesAsString (bool): Infer primitive types as strings.
        - prefersDecimal (bool): Prefer decimal type for numbers.
        - allowComments (bool): Allow Java/C++ style comments.
        - allowUnquotedFieldNames (bool): Allow unquoted field names.
        - allowSingleQuotes (bool): Allow single quotes.
        - allowNumericLeadingZero (bool): Allow leading zeros in numbers.
        - allowBackslashEscapingAnyCharacter (bool): Allow backslash escaping.
        - mode (str): Parse mode.
        - columnNameOfCorruptRecord (str): Column name for corrupt records.
        - dateFormat (str): Date format string.
        - timestampFormat (str): Timestamp format string.
        - multiLine (bool): Parse multi-line JSON records.
        - allowUnquotedControlChars (bool): Allow unquoted control characters.
        - lineSep (str): Line separator.
        - samplingRatio (float): Sampling ratio for schema inference.
        - dropFieldIfAllNull (bool): Drop fields with all null values.
        - encoding (str): Character encoding.
        - locale (str): Locale for parsing.
        - pathGlobFilter (str): Path glob filter.
        - recursiveFileLookup (bool): Recursive file lookup.
        - modifiedBefore (str): Only files modified before this timestamp.
        - modifiedAfter (str): Only files modified after this timestamp.

        Returns:
            DataFrame
        """

    def parquet(self, *paths, **options):
        """Load Parquet files from *paths* and return the results as a DataFrame."""

    def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None,
             recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None):
        """Load text files and return a DataFrame with a single string column.

        Parameters:
        - paths: Text file paths.
        - wholetext (bool): Read each file as a single record.
        - lineSep (str): Line separator.
        - pathGlobFilter (str): Path glob filter.
        - recursiveFileLookup (bool): Recursive file lookup.
        - modifiedBefore (str): Only files modified before this timestamp.
        - modifiedAfter (str): Only files modified after this timestamp.

        Returns:
            DataFrame
        """

    def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=None,
            modifiedBefore=None, modifiedAfter=None):
        """Load ORC files and return the results as a DataFrame.

        Parameters:
        - path (str): ORC file path.
        - mergeSchema (bool): Merge schemas from multiple files.
        - pathGlobFilter (str): Path glob filter.
        - recursiveFileLookup (bool): Recursive file lookup.
        - modifiedBefore (str): Only files modified before this timestamp.
        - modifiedAfter (str): Only files modified after this timestamp.

        Returns:
            DataFrame
        """

    def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None,
             numPartitions=None, predicates=None, properties=None):
        """Construct a DataFrame representing the given database table.

        Parameters:
        - url (str): JDBC database URL.
        - table (str): Table name.
        - column (str): Column name for partitioning.
        - lowerBound: Lower bound for the partitioning column.
        - upperBound: Upper bound for the partitioning column.
        - numPartitions (int): Number of partitions.
        - predicates (list): List of expressions for partitioning.
        - properties (dict): JDBC connection properties.

        Returns:
            DataFrame
        """

# Components for real-time data processing with streaming DataFrames.
class DataStreamReader:
    """Reader for streaming sources, reached through ``SparkSession.readStream``."""

    def format(self, source):
        """Select the source format ("kafka", "socket", "rate", ...).

        Returns:
            DataStreamReader with the format selected.
        """

    def option(self, key, value):
        """Attach one input option (*key* → *value*) to the streaming source.

        Returns:
            DataStreamReader with the option applied.
        """

    def schema(self, schema):
        """Declare the input schema (StructType) for the streaming data.

        Returns:
            DataStreamReader with the schema declared.
        """

    def load(self, path=None):
        """Materialize a streaming DataFrame from the configured source.

        Parameters:
        - path (str): Optional source path.

        Returns:
            Streaming DataFrame.
        """
class DataStreamWriter:
    """Writer for streaming sinks, reached through ``DataFrame.writeStream``."""

    def format(self, source):
        """Select the sink format ("console", "memory", "kafka", ...).

        Returns:
            DataStreamWriter with the format selected.
        """

    def outputMode(self, outputMode):
        """Choose how results are emitted: "append", "complete", or "update".

        Returns:
            DataStreamWriter with the output mode applied.
        """

    def trigger(self, **kwargs):
        """Configure when the streaming query fires
        (processingTime, once, continuous).

        Returns:
            DataStreamWriter with the trigger configured.
        """

    def start(self, path=None):
        """Launch the streaming query.

        Parameters:
        - path (str): Optional output path.

        Returns:
            StreamingQuery handle for the running query.
        """
class StreamingQuery:
    """Handle to a running streaming query execution."""

    @property
    def id(self):
        """Unique identifier of this query."""

    @property
    def isActive(self):
        """Whether this query is currently running."""

    def awaitTermination(self, timeout=None):
        """Block until the query terminates.

        Parameters:
        - timeout (int): Maximum seconds to wait.

        Returns:
            True when the query terminated, False on timeout.
        """

    def stop(self):
        """Stop this streaming query."""
class StreamingQueryManager:
    """Manager for streaming queries, accessed via ``SparkSession.streams``."""

    @property
    def active(self):
        """List of currently active streaming queries."""

    def get(self, id):
        """Get a streaming query by ID.

        Parameters:
        - id (str): Query ID.

        Returns:
            StreamingQuery, or None if not found.
        """

    def awaitAnyTermination(self, timeout=None):
        """Wait for any streaming query to terminate.

        Parameters:
        - timeout (int): Maximum time to wait (seconds).

        Returns:
            True if any query terminated, False on timeout.
        """

# Comprehensive built-in functions for data processing and transformation.
# --- Column creation and references ---

def col(col_name):
    """Build a Column that references the column named *col_name* (str)."""

def column(col_name):
    """Alias of ``col``: build a Column reference from *col_name* (str)."""

def lit(col_value):
    """Wrap the literal value *col_value* in a Column."""

def expr(str_expr):
    """Parse the SQL expression string *str_expr* into a Column."""
# --- Mathematical functions ---
# NOTE: abs/pow/round intentionally shadow Python builtins, matching the
# public pyspark.sql.functions API.

def abs(col):
    """Absolute value of a column."""

def acos(col):
    """Arc cosine of a column."""

def asin(col):
    """Arc sine of a column."""

def atan(col):
    """Arc tangent of a column."""

def cos(col):
    """Cosine of a column."""

def sin(col):
    """Sine of a column."""

def tan(col):
    """Tangent of a column."""

def sqrt(col):
    """Square root of a column."""

def exp(col):
    """Exponential of a column."""

def log(arg1, arg2=None):
    """Natural logarithm, or logarithm in the base given by the first argument."""

def pow(col1, col2):
    """*col1* raised to the power *col2*."""

def round(col, scale=0):
    """Value rounded to *scale* decimal places."""

def ceil(col):
    """Ceiling of a column."""

def floor(col):
    """Floor of a column."""
# --- String functions ---

def upper(col):
    """String column converted to uppercase."""

def lower(col):
    """String column converted to lowercase."""

def length(col):
    """Length of a string column."""

def trim(col):
    """String column with spaces stripped from both ends."""

def ltrim(col):
    """String column with spaces stripped from the left end."""

def rtrim(col):
    """String column with spaces stripped from the right end."""

def concat(*cols):
    """Concatenation of several string columns."""

def concat_ws(sep, *cols):
    """Concatenation of several string columns joined by *sep*."""

def substring(str, pos, len):
    """Substring of *str* starting at *pos* with the given *len*."""

def split(str, pattern, limit=-1):
    """*str* split around matches of *pattern*, up to *limit* pieces."""

def regexp_extract(str, pattern, idx):
    """Group *idx* of the match of *pattern* within *str*."""

def regexp_replace(str, pattern, replacement):
    """*str* with every match of *pattern* replaced by *replacement*."""
# --- Date and time functions ---

def current_date():
    """Current date as a date column."""

def current_timestamp():
    """Current timestamp as a timestamp column."""

def date_add(start, days):
    """Date that is *days* days after *start*."""

def date_sub(start, days):
    """Date that is *days* days before *start*."""

def datediff(end, start):
    """Number of days between *start* and *end*."""

def year(col):
    """Year extracted from a date/timestamp column."""

def month(col):
    """Month extracted from a date/timestamp column."""

def dayofmonth(col):
    """Day of month extracted from a date/timestamp column."""

def hour(col):
    """Hour extracted from a timestamp column."""

def minute(col):
    """Minute extracted from a timestamp column."""

def second(col):
    """Second extracted from a timestamp column."""

def date_format(date, format):
    """Date/timestamp rendered as a string using *format*."""

def to_date(col, format=None):
    """String column converted to a date column."""

def to_timestamp(col, format=None):
    """String column converted to a timestamp column."""
# --- Array functions ---

def array(*cols):
    """New array column built from the given columns."""

def array_contains(col, value):
    """True where the array column contains *value*."""

def size(col):
    """Number of elements in an array or map column."""

def sort_array(col, asc=True):
    """Input array sorted ascending (or descending when *asc* is False)."""

def reverse(col):
    """Reversed string or array."""

def slice(x, start, length):
    """Slice of array *x* beginning at *start* with the given *length*."""

def array_join(col, delimiter, null_replacement=None):
    """Array elements concatenated with *delimiter*."""

def explode(col):
    """One output row per element of the array column."""

def posexplode(col):
    """One output row per element of the array column, with its position."""
# --- Map functions ---

def create_map(*cols):
    """Map column built from alternating key/value columns."""

def map_keys(col):
    """Keys of a map column, as an array."""

def map_values(col):
    """Values of a map column, as an array."""

def map_from_arrays(col1, col2):
    """Map column built from a key array (*col1*) and a value array (*col2*)."""
# --- Aggregate functions ---
# NOTE: sum/max/min intentionally shadow Python builtins, matching the
# public pyspark.sql.functions API.

def count(col):
    """Number of items in a group, nulls included."""

def countDistinct(col, *cols):
    """Number of distinct items in a group."""

def sum(col):
    """Sum of the values in a group."""

def avg(col):
    """Average of the values in a group."""

def mean(col):
    """Average of the values in a group (same as ``avg``)."""

def max(col):
    """Largest value in a group."""

def min(col):
    """Smallest value in a group."""

def first(col, ignorenulls=False):
    """First value in a group."""

def last(col, ignorenulls=False):
    """Last value in a group."""

def stddev(col):
    """Sample standard deviation of the values in a group."""

def stddev_pop(col):
    """Population standard deviation of the values in a group."""

def variance(col):
    """Sample variance of the values in a group."""

def var_pop(col):
    """Population variance of the values in a group."""

def collect_list(col):
    """List of the values in a group, duplicates kept."""

def collect_set(col):
    """Set of the values in a group, duplicates removed."""
# Window functions
def row_number():
    """Return a sequential number starting at 1 within a window partition."""

def rank():
    """Return the rank of rows within a window partition (leaves gaps after ties)."""

def dense_rank():
    """Return the dense rank of rows within a window partition (no gaps after ties)."""

def percent_rank():
    """Return the relative rank of rows within a window partition."""

def cume_dist():
    """Return the cumulative distribution of values within a window partition."""

def lag(col, offset=1, default=None):
    """Return the value that is offset rows before the current row, or default when out of range."""

def lead(col, offset=1, default=None):
    """Return the value that is offset rows after the current row, or default when out of range."""

def ntile(n):
    """Return the ntile group id (1-indexed) within a window partition."""
# Conditional functions
def when(condition, value):
    """Evaluate a list of conditions and return one of multiple possible result expressions."""

def coalesce(*cols):
    """Return the first non-null value among the given columns."""

def greatest(*cols):
    """Return the greatest value among the given columns."""

def least(*cols):
    """Return the least value among the given columns."""

def isnull(col):
    """Return true if the column is null."""

def isnan(col):
    """Return true if the column is NaN."""
# JSON functions
def from_json(col, schema, options=None):
    """Parse a column containing a JSON string into the given schema."""

def to_json(col, options=None):
    """Convert a column containing a struct to a JSON string."""

def get_json_object(col, path):
    """Extract a JSON object from a JSON string at the given path."""

def json_tuple(col, *fields):
    """Return a tuple of JSON object values based on the given fields."""
# Type conversion functions
def cast(col, dataType):
    """Convert the column to a different data type.

    NOTE(review): in PySpark proper, cast is a Column method
    (Column.cast), not a top-level function — confirm intended placement.
    """

# Null handling functions
def isNull(col):
    """Return true if the column is null.

    NOTE(review): isNull/isNotNull are Column methods in PySpark — confirm.
    """

def isNotNull(col):
    """Return true if the column is not null."""

def dropna(how='any', thresh=None, subset=None):
    """Return a new DataFrame omitting rows with null values.

    NOTE(review): dropna/fillna are DataFrame methods in PySpark — confirm.
    """

def fillna(value, subset=None):
    """Replace null values with the specified value."""
def replace(to_replace, value=None, subset=None):
    """Replace values matching keys in to_replace with corresponding values.

    NOTE(review): replace is a DataFrame method in PySpark — confirm placement.
    """

# The original source fused the end of replace's docstring with the
# class Row header on one line (a syntax error); they are split here.
class Row:
    """A row of data in a DataFrame."""

    def __init__(self, **kwargs):
        """Create a row with named arguments."""

    def asDict(self, recursive=False):
        """
        Return as a dict.

        Parameters:
        - recursive (bool): Turn nested rows to dict recursively

        Returns:
        Dict representation of the row
        """
class GroupedData:
    """A set of methods for aggregations on a DataFrame."""

    def agg(self, *exprs):
        """Compute aggregates and return the result as a DataFrame."""

    def count(self):
        """Count the number of rows for each group."""

    def mean(self, *cols):
        """Compute the average value for each numeric column for each group."""

    def avg(self, *cols):
        """Compute the average value for each numeric column for each group (alias of mean)."""

    def max(self, *cols):
        """Compute the max value for each numeric column for each group."""

    def min(self, *cols):
        """Compute the min value for each numeric column for each group."""

    def sum(self, *cols):
        """Compute the sum for each numeric column for each group."""
from pyspark.sql.types import *
class DataType:
    """Base class for Spark SQL data types."""

class NullType(DataType):
    """Null data type."""

class StringType(DataType):
    """String data type."""

class BinaryType(DataType):
    """Binary data type."""

class BooleanType(DataType):
    """Boolean data type."""

class DateType(DataType):
    """Date data type."""

class TimestampType(DataType):
    """Timestamp data type."""

class DecimalType(DataType):
    """Decimal data type with precision (total digits) and scale (digits after the point)."""
    def __init__(self, precision=10, scale=0): ...

class DoubleType(DataType):
    """Double precision floating point data type."""

class FloatType(DataType):
    """Single precision floating point data type."""

class ByteType(DataType):
    """Byte integer data type."""

class IntegerType(DataType):
    """32-bit integer data type."""

class LongType(DataType):
    """64-bit integer data type."""

class ShortType(DataType):
    """16-bit integer data type."""

class ArrayType(DataType):
    """Array data type."""
    def __init__(self, elementType, containsNull=True): ...

class MapType(DataType):
    """Map data type."""
    def __init__(self, keyType, valueType, valueContainsNull=True): ...

class StructField:
    """A field in StructType."""
    def __init__(self, name, dataType, nullable=True, metadata=None): ...

class StructType(DataType):
    """Struct data type representing a row."""
    def __init__(self, fields=None): ...
    # Original line fused trailing web-page residue ("Install with Tessl
    # CLI") onto this def; the residue is removed here.
    def add(self, field, data_type=None, nullable=True, metadata=None): ...
# Install with Tessl CLI: npx tessl i tessl/pypi-pyspark