CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-delta-spark

Python APIs for using Delta Lake with Apache Spark

Pending
Overview
Eval results
Files

docs/table-operations.md

Table Operations

Core table management functionality for Delta Lake including creation, reading, updating, deleting, and table access patterns. Provides both path-based and catalog-based table operations with comprehensive CRUD support.

Capabilities

Table Access

Load existing Delta tables from filesystem paths or catalog names.

class DeltaTable:
    @classmethod
    def forPath(
        cls,
        spark: SparkSession, 
        path: str,
        hadoop_conf: Dict[str, str] = None
    ) -> DeltaTable:
        """
        Load Delta table from filesystem path.
        
        Parameters:
        - spark: SparkSession instance
        - path: Path to Delta table directory
        - hadoop_conf: Optional Hadoop configuration for file system access
        
        Returns:
        DeltaTable instance
        """
    
    @classmethod
    def forName(cls, spark: SparkSession, table_name: str) -> DeltaTable:
        """
        Load Delta table by catalog name.
        
        Parameters:
        - spark: SparkSession instance  
        - table_name: Table name in catalog (can be qualified: catalog.db.table)
        
        Returns:
        DeltaTable instance
        """
    
    @classmethod
    def isDeltaTable(cls, spark: SparkSession, identifier: str) -> bool:
        """
        Check if path or table identifier is a Delta table.
        
        Parameters:
        - spark: SparkSession instance
        - identifier: Path or table name to check
        
        Returns:
        True if identifier refers to a Delta table
        """
object DeltaTable {
  def forPath(spark: SparkSession, path: String): DeltaTable
  def forPath(
    spark: SparkSession, 
    path: String, 
    hadoopConf: java.util.Map[String, String]
  ): DeltaTable
  def forName(spark: SparkSession, tableName: String): DeltaTable
  def isDeltaTable(spark: SparkSession, identifier: String): Boolean
}

Data Reading

Convert Delta tables to DataFrames and apply aliases for query operations.

class DeltaTable:
    def toDF(self) -> DataFrame:
        """Get DataFrame representation of Delta table."""
    
    def alias(self, alias_name: str) -> DeltaTable:
        """
        Apply alias to Delta table for use in queries.
        
        Parameters:
        - alias_name: Alias name for the table
        
        Returns:
        DeltaTable with applied alias
        """
class DeltaTable {
  def toDF: Dataset[Row]
  def as(alias: String): DeltaTable  
  def alias(alias: String): DeltaTable
}

Data Deletion

Delete rows from Delta tables with optional filtering conditions.

class DeltaTable:
    def delete(self, condition: Optional[Union[str, Column]] = None) -> None:
        """
        Delete data matching condition.
        
        Parameters:
        - condition: Optional SQL condition string or Column expression for filtering rows to delete
        """
class DeltaTable {
  def delete(): Unit
  def delete(condition: String): Unit
  def delete(condition: Column): Unit
}

Usage examples:

# Delete all rows
delta_table.delete()

# Delete with condition
delta_table.delete("age < 18")
delta_table.delete(col("age") < 18)

Data Updates

Update existing rows in Delta tables with conditional logic and column mappings.

class DeltaTable:
    def update(
        self,
        condition: Optional[Union[str, Column]] = None,
        set: Optional[Dict[str, Union[str, Column]]] = None
    ) -> None:
        """
        Update rows based on condition and column mappings.
        
        Parameters:
        - condition: Optional SQL condition string or Column expression for filtering rows to update
        - set: Dictionary mapping column names to new values (SQL expressions or Column objects)
        
        Note: Uses method overloading - can be called as update(set=...) or update(condition, set)
        """
class DeltaTable {
  def update(set: Map[String, Column]): Unit
  def update(condition: Column, set: Map[String, Column]): Unit
  def updateExpr(set: Map[String, String]): Unit
  def updateExpr(condition: String, set: Map[String, String]): Unit
}

Usage examples:

# Update all rows
delta_table.update(set={"status": "'active'"})

# Conditional update with SQL expressions
delta_table.update(
    condition="department = 'engineering'",
    set={
        "salary": "salary * 1.1",
        "updated_at": "current_timestamp()"
    }
)

# Update with Column objects
from pyspark.sql.functions import col, current_timestamp
delta_table.update(
    condition=col("department") == "engineering",
    set={
        "salary": col("salary") * 1.1,
        "updated_at": current_timestamp()
    }
)

Table Conversion

Convert existing Parquet tables to Delta format.

class DeltaTable:
    @classmethod
    def convertToDelta(
        cls,
        spark: SparkSession,
        identifier: str,
        partition_schema: Optional[Union[str, StructType]] = None
    ) -> DeltaTable:
        """
        Convert existing Parquet table to Delta format.
        
        Parameters:
        - spark: SparkSession instance
        - identifier: Parquet table identifier (e.g., "parquet.`/path/to/table`")
        - partition_schema: Optional partition schema as DDL string or StructType
        
        Returns:
        DeltaTable instance for converted table
        """
object DeltaTable {
  def convertToDelta(spark: SparkSession, identifier: String): DeltaTable
  def convertToDelta(
    spark: SparkSession, 
    identifier: String, 
    partitionSchema: String
  ): DeltaTable
  def convertToDelta(
    spark: SparkSession,
    identifier: String, 
    partitionSchema: StructType
  ): DeltaTable
}

Usage examples:

# Convert unpartitioned table
delta_table = DeltaTable.convertToDelta(spark, "parquet.`/path/to/parquet/table`")

# Convert partitioned table
delta_table = DeltaTable.convertToDelta(
    spark,
    "parquet.`/path/to/partitioned/table`",
    "year int, month int"
)

Table Details

Get comprehensive information about Delta table structure and metadata.

class DeltaTable:
    def detail(self) -> DataFrame:
        """
        Get detailed information about the Delta table.
        
        Returns:
        DataFrame with table details including format, location, size, etc.
        """
class DeltaTable {
  def detail(): DataFrame
}

The detail operation returns information including:

  • Table format and version
  • Location and size
  • Number of files and rows
  • Partition columns
  • Table properties
  • Created/modified timestamps

Install with Tessl CLI

npx tessl i tessl/pypi-delta-spark

docs

configuration.md

index.md

merge-operations.md

optimization.md

table-management.md

table-operations.md

time-travel.md

tile.json