Python API for Apache Spark, providing distributed computing, data analysis, and machine learning capabilities
npx @tessl/cli install tessl/pypi-pyspark@4.0.0

PySpark is the Python API for Apache Spark, a unified analytics engine for large-scale data processing. It provides high-level APIs for distributed computing, data analysis, and machine learning across clusters, letting Python developers work with large-scale datasets through familiar Python syntax.
pip install pyspark

import pyspark
from pyspark import SparkContext, SparkConf

For SQL and DataFrame operations:
from pyspark.sql import SparkSession, DataFrame, Column, Row
from pyspark.sql.functions import col, lit, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

For window operations:

from pyspark.sql.window import Window, WindowSpec

For streaming operations:

from pyspark.sql.streaming import StreamingQuery, StreamingQueryManager

For machine learning:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()
# Create DataFrame from data
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)
# Basic DataFrame operations
df.show()
df.filter(df.age > 25).show()
# SQL operations
df.createOrReplaceTempView("people")
result = spark.sql("SELECT name FROM people WHERE age > 25")
result.show()
# Stop the session
spark.stop()

PySpark's architecture enables distributed computing through several key components, described in the capability sections below. This architecture allows PySpark to process large datasets by distributing computations across cluster nodes while providing fault tolerance, automatic optimization, and familiar Python APIs for data manipulation, SQL queries, machine learning, and streaming analytics.
Low-level distributed computing with Resilient Distributed Datasets (RDDs), broadcast variables, accumulators, and Spark configuration. Provides foundational distributed computing primitives.
class SparkContext:
    def __init__(self, master=None, appName=None, conf=None): ...
    def parallelize(self, c, numSlices=None): ...
    def textFile(self, name, minPartitions=None, use_unicode=True): ...

class RDD:
    def map(self, f): ...
    def filter(self, f): ...
    def collect(self): ...

class SparkConf:
    def setAppName(self, value): ...
    def setMaster(self, value): ...
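
A minimal usage sketch of these primitives, assuming a local master; the data and lambdas are illustrative:

from pyspark import SparkConf, SparkContext

# Configure and start a local SparkContext
conf = SparkConf().setAppName("rdd-example").setMaster("local[*]")
sc = SparkContext(conf=conf)

# Distribute a small Python list across two partitions
rdd = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)

# Lazy transformations: square each element, keep the even results
squares = rdd.map(lambda x: x * x)
evens = squares.filter(lambda x: x % 2 == 0)

# collect() is an action that triggers execution and returns a Python list
print(evens.collect())  # [4, 16]

sc.stop()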

Structured data processing with DataFrames, SQL queries, data I/O, and 500+ built-in functions. Provides the primary interface for structured data analysis and processing.

class SparkSession:
    def createDataFrame(self, data, schema=None): ...
    def sql(self, sqlQuery): ...
    @property
    def read(self) -> "DataFrameReader": ...
    def table(self, tableName): ...

class DataFrame:
    def select(self, *cols): ...
    def filter(self, condition): ...
    def groupBy(self, *cols): ...
    def show(self, n=20, truncate=True): ...
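
A short DataFrame sketch built on the signatures above; the column names and rows are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("sql-example").getOrCreate()

df = spark.createDataFrame(
    [("Alice", "HR", 25), ("Bob", "IT", 30), ("Charlie", "IT", 35)],
    ["name", "dept", "age"],
)

# Column expressions with filter/select, plus a conditional column
adults = df.filter(F.col("age") > 25).select(
    "name",
    "dept",
    F.when(F.col("age") >= 35, "senior").otherwise("junior").alias("level"),
)
adults.show()

# Aggregation with groupBy
df.groupBy("dept").agg(F.avg("age").alias("avg_age")).show()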

Modern machine learning pipeline API with estimators, transformers, and comprehensive algorithms for classification, regression, clustering, and feature processing.

class Pipeline:
    def __init__(self, stages=None): ...
    def fit(self, dataset): ...

class LogisticRegression:
    def __init__(self, featuresCol="features", labelCol="label"): ...
    def fit(self, dataset): ...

class VectorAssembler:
    def __init__(self, inputCols=None, outputCol=None): ...
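
A minimal pipeline sketch chaining VectorAssembler and LogisticRegression; the toy training data and column names are illustrative:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.appName("ml-example").getOrCreate()

# Toy training data: two numeric features and a binary label
train = spark.createDataFrame(
    [(1.0, 0.5, 1.0), (0.0, 0.1, 0.0), (2.0, 1.5, 1.0), (0.2, 0.3, 0.0)],
    ["f1", "f2", "label"],
)

# Assemble raw columns into a single features vector, then fit a classifier
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = Pipeline(stages=[assembler, lr]).fit(train)

# Applying the fitted pipeline adds prediction and probability columns
model.transform(train).select("f1", "f2", "prediction").show()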

Legacy machine learning library with RDD-based algorithms for classification, regression, clustering, and collaborative filtering. Maintained for backward compatibility.

class LogisticRegressionWithSGD:
    @classmethod
    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0): ...

class KMeans:
    @classmethod
    def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"): ...
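
A brief sketch of the RDD-based API using KMeans, which remains available for backward compatibility; the points and cluster count are illustrative:

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext.getOrCreate()

# RDD of feature vectors (plain Python lists are accepted by the legacy API)
points = sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.2]])

# Train a 2-cluster model with the RDD-based KMeans
model = KMeans.train(points, 2, maxIterations=10, initializationMode="k-means||")

print(model.clusterCenters)
print(model.predict([0.05, 0.05]))  # cluster index for a new point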

Pandas-compatible API for familiar pandas operations on distributed datasets. Enables seamless scaling of pandas workflows to large datasets.

class DataFrame:
    def head(self, n=5): ...
    def describe(self): ...
    def groupby(self, by=None): ...
    def merge(self, right, how="inner", on=None): ...

# Module-level functions
def read_csv(path, **kwargs): ...
def concat(objs, axis=0): ...
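
A small pandas-on-Spark sketch, assuming the optional pandas/pyarrow dependencies are installed; the in-memory data stands in for a real read_csv call, whose path would depend on your storage:

import pyspark.pandas as ps

# Build a pandas-on-Spark DataFrame in memory; the same frame could come
# from ps.read_csv("path/to/file.csv") on local or distributed storage
psdf = ps.DataFrame({
    "dept": ["HR", "IT", "IT", "HR"],
    "salary": [50, 80, 90, 55],
})

print(psdf.head(2))
print(psdf.describe())

# Familiar pandas-style grouping and concatenation, executed on Spark
print(psdf.groupby("dept").mean())
print(ps.concat([psdf, psdf]).shape)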

Real-time data processing for continuous data ingestion, processing, and output to various sinks. Structured Streaming (pyspark.sql.streaming) is the primary interface; the legacy DStream API is outlined below.

class StreamingContext:
    def __init__(self, sparkContext, batchDuration): ...
    def socketTextStream(self, hostname, port): ...
    def start(self): ...

class DStream:
    def map(self, f): ...
    def filter(self, f): ...
    def foreachRDD(self, func): ...
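
The sketch below uses Structured Streaming rather than the legacy DStream signatures above, with the built-in rate source so it runs without an external data source; the rate and timeout values are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("streaming-example").getOrCreate()

# The built-in "rate" source generates rows with timestamp and value columns
stream = spark.readStream.format("rate").option("rowsPerSecond", 5).load()

# Transformations on a streaming DataFrame look like batch transformations
evens = stream.filter(col("value") % 2 == 0)

# Write to the console sink; start() returns a StreamingQuery handle
query = evens.writeStream.format("console").outputMode("append").start()

query.awaitTermination(10)  # run for ~10 seconds in this sketch
query.stop()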

Resource allocation and management for Spark applications including task resources, executor resources, and resource profiles for optimized cluster utilization.

class ResourceProfile:
    def __init__(self, executorResources=None, taskResources=None): ...

class ExecutorResourceRequests:
    def cores(self, amount): ...
    def memory(self, amount): ...
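
A sketch using ResourceProfileBuilder from pyspark.resource; the resource amounts are illustrative, and custom profiles only take effect on cluster managers that support them:

from pyspark.resource import (
    ExecutorResourceRequests,
    TaskResourceRequests,
    ResourceProfileBuilder,
)

# Describe what each executor and each task should get (amounts illustrative)
executor_reqs = ExecutorResourceRequests().cores(4).memory("8g")
task_reqs = TaskResourceRequests().cpus(2)

# Combine the requests into an immutable ResourceProfile
profile = ResourceProfileBuilder().require(executor_reqs).require(task_reqs).build

# Attach the profile to a specific RDD computation
# (assumes an existing SparkContext `sc` on a supported cluster manager)
# rdd = sc.parallelize(range(100)).withResources(profile)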

class StorageLevel:
    DISK_ONLY: StorageLevel
    MEMORY_ONLY: StorageLevel
    MEMORY_AND_DISK: StorageLevel

class TaskContext:
    def partitionId(self): ...
    def stageId(self): ...
    def taskAttemptId(self): ...

class Row:
    def __init__(self, **kwargs): ...
    def asDict(self): ...
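
A short sketch of these helpers; it assumes a local session running in classic (non-Connect) mode, and the data is illustrative:

from pyspark import StorageLevel, TaskContext
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("core-classes").getOrCreate()

# Row objects behave like named tuples and convert to dicts
r = Row(name="Alice", age=25)
print(r.name, r.asDict())

# Persist a DataFrame with an explicit storage level
df = spark.createDataFrame([r, Row(name="Bob", age=30)])
df.persist(StorageLevel.MEMORY_AND_DISK)

# TaskContext is available inside tasks running on executors
def tag_partition(rows):
    pid = TaskContext.get().partitionId()
    return [(pid, row.name) for row in rows]

print(df.rdd.mapPartitions(tag_partition).collect())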

# Data Types
class DataType:
    """Base class for data types."""

class StringType(DataType):
    """String data type."""

class IntegerType(DataType):
    """Integer data type."""

class LongType(DataType):
    """Long integer data type."""

class FloatType(DataType):
    """Float data type."""

class DoubleType(DataType):
    """Double precision float data type."""

class BooleanType(DataType):
    """Boolean data type."""

class TimestampType(DataType):
    """Timestamp data type."""

class DateType(DataType):
    """Date data type."""

class ArrayType(DataType):
    """Array data type."""
    def __init__(self, elementType, containsNull=True): ...

class MapType(DataType):
    """Map data type."""
    def __init__(self, keyType, valueType, valueContainsNull=True): ...

class StructType(DataType):
    """Struct data type representing a row."""
    def __init__(self, fields=None): ...
    def add(self, field, data_type=None, nullable=True, metadata=None): ...

class StructField:
    """Field in a StructType."""
    def __init__(self, name, dataType, nullable=True, metadata=None): ...
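
A minimal sketch of building an explicit schema from these types; the field names and rows are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, ArrayType
)

spark = SparkSession.builder.appName("schema-example").getOrCreate()

# Explicit schema: nullable name, non-nullable age, optional list of tags
schema = StructType([
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=False),
    StructField("tags", ArrayType(StringType()), nullable=True),
])

df = spark.createDataFrame(
    [("Alice", 25, ["admin"]), ("Bob", 30, None)],
    schema,
)
df.printSchema()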

# Common Exception Types
class PySparkException(Exception):
    """Base exception for PySpark errors."""

class AnalysisException(PySparkException):
    """Exception thrown when analysis of a SQL query fails."""

class ParseException(PySparkException):
    """Exception thrown when parsing of a SQL query fails."""

class StreamingQueryException(PySparkException):
    """Exception thrown by streaming queries."""