Python API for Apache Spark, providing distributed computing, data analysis, and machine learning capabilities
—
Quality: Pending (Does it follow best practices?)
Impact: Pending (No eval scenarios have been run)
Low-level distributed computing functionality providing the foundational building blocks for Spark applications. This includes SparkContext for cluster coordination, RDDs for distributed data processing, broadcast variables for efficient data sharing, and accumulators for distributed counters and sums.
Main entry point for Spark functionality that coordinates the Spark application and manages cluster resources.
class SparkContext:
    """Main entry point for Spark functionality.

    Coordinates the Spark application and manages cluster resources.
    """

    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=0, serializer=CPickleSerializer(),
                 conf=None, gateway=None, jsc=None, profiler_cls=BasicProfiler):
        """
        Create a new SparkContext.

        Parameters:
        - master (str): Cluster URL to connect to (e.g. "local", "local[4]", or "spark://master:7077")
        - appName (str): Name of the application
        - sparkHome (str): Spark installation directory on cluster nodes
        - pyFiles (list): Python files to send to the cluster
        - environment (dict): Environment variables to set on worker nodes
        - batchSize (int): Number of Python objects represented as a single Java object
        - serializer: Serializer for RDDs
        - conf (SparkConf): SparkConf object with configuration
        - profiler_cls: Profiler class to use for profiling
        """

    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD.

        Parameters:
        - c: Collection to distribute (list, tuple, etc.)
        - numSlices (int): Number of partitions to create

        Returns:
        RDD containing the elements of the collection
        """

    def textFile(self, name, minPartitions=None, use_unicode=True):
        """
        Read a text file from HDFS/local filesystem/any Hadoop-supported filesystem.

        Parameters:
        - name (str): Path to the text file
        - minPartitions (int): Minimum number of partitions
        - use_unicode (bool): Whether to convert to unicode

        Returns:
        RDD of strings
        """

    def wholeTextFiles(self, path, minPartitions=None, use_unicode=True):
        """
        Read text files from a directory, returning each file as a (filename, content) pair.

        Parameters:
        - path (str): Directory path
        - minPartitions (int): Minimum number of partitions
        - use_unicode (bool): Whether to convert to unicode

        Returns:
        RDD of (filename, content) pairs
        """

    def broadcast(self, value):
        """
        Broadcast a read-only variable to the cluster.

        Parameters:
        - value: Value to broadcast

        Returns:
        Broadcast variable
        """

    def accumulator(self, value, accum_param=None):
        """
        Create an accumulator with the given initial value.

        Parameters:
        - value: Initial value
        - accum_param: AccumulatorParam object

        Returns:
        Accumulator
        """

    def stop(self):
        """Shut down the SparkContext."""

    def setCheckpointDir(self, dirName):
        """
        Set the directory under which RDDs are going to be checkpointed.

        Parameters:
        - dirName (str): Checkpoint directory path
        """

    def setLogLevel(self, logLevel):
        """
        Control the global logging level.

        Parameters:
        - logLevel (str): Log level ("ALL", "DEBUG", "ERROR", "FATAL", "INFO", "OFF", "TRACE", "WARN")
        """

# Fundamental distributed data abstraction that represents an immutable,
# partitioned collection of elements.
class RDD:
    """Resilient Distributed Dataset: an immutable, partitioned collection of elements."""

    def map(self, f):
        """
        Return a new RDD by applying a function to each element.

        Parameters:
        - f: Function to apply to each element

        Returns:
        New RDD with transformed elements
        """

    def filter(self, f):
        """
        Return a new RDD containing only elements that satisfy a predicate.

        Parameters:
        - f: Predicate function

        Returns:
        Filtered RDD
        """

    def flatMap(self, f):
        """
        Return a new RDD by first applying a function and then flattening the results.

        Parameters:
        - f: Function that returns a sequence

        Returns:
        Flattened RDD
        """

    def mapPartitions(self, f, preservesPartitioning=False):
        """
        Return a new RDD by applying a function to each partition.

        Parameters:
        - f: Function to apply to each partition iterator
        - preservesPartitioning (bool): Whether partitioning is preserved

        Returns:
        New RDD
        """

    def reduce(self, f):
        """
        Reduce the elements of the RDD using the specified commutative and associative binary operator.

        Parameters:
        - f: Binary function for reduction

        Returns:
        Single reduced value
        """

    def fold(self, zeroValue, op):
        """
        Aggregate the elements using a given associative function and a neutral "zero value".

        Parameters:
        - zeroValue: Neutral zero value
        - op: Associative function

        Returns:
        Aggregated result
        """

    def aggregate(self, zeroValue, seqOp, combOp):
        """
        Aggregate elements using given combine functions and a neutral "zero value".

        Parameters:
        - zeroValue: Neutral zero value
        - seqOp: Function to combine elements within partitions
        - combOp: Function to combine results from partitions

        Returns:
        Aggregated result
        """

    def collect(self):
        """
        Return all elements of the RDD as a list.

        Returns:
        List containing all RDD elements
        """

    def take(self, num):
        """
        Take the first num elements of the RDD.

        Parameters:
        - num (int): Number of elements to take

        Returns:
        List of first num elements
        """

    def first(self):
        """
        Return the first element of the RDD.

        Returns:
        First element
        """

    def count(self):
        """
        Return the number of elements in the RDD.

        Returns:
        Number of elements
        """

    def distinct(self, numPartitions=None):
        """
        Return a new RDD containing distinct elements.

        Parameters:
        - numPartitions (int): Number of partitions in result

        Returns:
        RDD with distinct elements
        """

    def union(self, other):
        """
        Return the union of this RDD and another one.

        Parameters:
        - other (RDD): Another RDD

        Returns:
        Union RDD
        """

    def intersection(self, other):
        """
        Return the intersection of this RDD and another one.

        Parameters:
        - other (RDD): Another RDD

        Returns:
        Intersection RDD
        """

    def groupBy(self, f, numPartitions=None):
        """
        Group RDD elements by a key function.

        Parameters:
        - f: Key function
        - numPartitions (int): Number of partitions

        Returns:
        RDD of grouped elements
        """

    def sortBy(self, keyfunc, ascending=True, numPartitions=None):
        """
        Sort the RDD by a key function.

        Parameters:
        - keyfunc: Function to compute key for sorting
        - ascending (bool): Sort in ascending order
        - numPartitions (int): Number of partitions

        Returns:
        Sorted RDD
        """

    def cache(self):
        """
        Persist this RDD with the default storage level (MEMORY_ONLY).

        Returns:
        This RDD
        """

    def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
        """
        Persist this RDD with the specified storage level.

        Parameters:
        - storageLevel (StorageLevel): Storage level

        Returns:
        This RDD
        """

    def checkpoint(self):
        """Mark this RDD for checkpointing."""

    def getNumPartitions(self):
        """
        Return the number of partitions of this RDD.

        Returns:
        Number of partitions
        """

    def coalesce(self, numPartitions, shuffle=False):
        """
        Return a new RDD with reduced number of partitions.

        Parameters:
        - numPartitions (int): Target number of partitions
        - shuffle (bool): Whether to shuffle data

        Returns:
        Coalesced RDD
        """

    def repartition(self, numPartitions):
        """
        Return a new RDD with exactly numPartitions partitions.

        Parameters:
        - numPartitions (int): Number of partitions

        Returns:
        Repartitioned RDD
        """

# Operations available on RDDs of key-value pairs.
class RDD:
    """Pair-RDD operations: methods available on RDDs of (key, value) tuples."""

    def groupByKey(self, numPartitions=None):
        """
        Group values for each key in the RDD into a single sequence.

        Parameters:
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, iterable of values) pairs
        """

    def reduceByKey(self, func, numPartitions=None):
        """
        Merge values for each key using an associative reduce function.

        Parameters:
        - func: Associative reduce function
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, reduced value) pairs
        """

    def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None):
        """
        Aggregate values for each key using given combine functions.

        Parameters:
        - zeroValue: Initial value for each key
        - seqFunc: Function to combine values within partitions
        - combFunc: Function to combine results across partitions
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, aggregated value) pairs
        """

    def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
        """
        Sort RDD by keys.

        Parameters:
        - ascending (bool): Sort in ascending order
        - numPartitions (int): Number of partitions
        - keyfunc: Function to compute sort key

        Returns:
        Sorted RDD
        """

    def join(self, other, numPartitions=None):
        """
        Return an RDD containing all pairs of elements with matching keys.

        Parameters:
        - other (RDD): Another RDD to join with
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, (value1, value2)) pairs
        """

    def leftOuterJoin(self, other, numPartitions=None):
        """
        Perform a left outer join of this RDD and another one.

        Parameters:
        - other (RDD): Another RDD to join with
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, (value1, Optional[value2])) pairs
        """

    def rightOuterJoin(self, other, numPartitions=None):
        """
        Perform a right outer join of this RDD and another one.

        Parameters:
        - other (RDD): Another RDD to join with
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, (Optional[value1], value2)) pairs
        """

    def fullOuterJoin(self, other, numPartitions=None):
        """
        Perform a full outer join of this RDD and another one.

        Parameters:
        - other (RDD): Another RDD to join with
        - numPartitions (int): Number of partitions

        Returns:
        RDD of (key, (Optional[value1], Optional[value2])) pairs
        """

# Read-only variables cached on each machine for efficient data sharing.
class Broadcast:
    """A read-only variable cached on each machine for efficient data sharing."""

    # NOTE(review): in the actual PySpark API, Broadcast.value is a property
    # (accessed as `b.value`, not `b.value()`), so it is declared as one here.
    @property
    def value(self):
        """
        Return the broadcasted value.

        Returns:
        The broadcasted value
        """

    def destroy(self):
        """Destroy all data and metadata related to this broadcast variable."""

    def unpersist(self, blocking=False):
        """
        Delete cached copies of this broadcast on the executors.

        Parameters:
        - blocking (bool): Whether to block until unpersisting is complete
        """

# Shared variables that can be accumulated across tasks.
class Accumulator:
    """A shared variable that tasks can only add to, using associative operations."""

    def add(self, term):
        """
        Add a term to this accumulator.

        Parameters:
        - term: Value to add
        """

    # NOTE(review): in the actual PySpark API, Accumulator.value is a property
    # (accessed as `a.value`, not `a.value()`), so it is declared as one here.
    @property
    def value(self):
        """
        Get the accumulator's value.

        Returns:
        Current accumulator value
        """
class AccumulatorParam:
    """Defines how values of a given data type are accumulated (zero element and merge)."""

    def zero(self, value):
        """
        Provide a "zero value" for the accumulator type.

        Parameters:
        - value: Sample value

        Returns:
        Zero value
        """

    def addInPlace(self, value1, value2):
        """
        Add two values of the accumulator's data type.

        Parameters:
        - value1: First value
        - value2: Second value

        Returns:
        Sum of the values
        """

# Configuration settings for Spark applications.
class SparkConf:
    """Configuration for a Spark application, as key-value pairs."""

    def __init__(self, loadDefaults=True, _jvm=None, _jconf=None):
        """
        Create a new Spark configuration.

        Parameters:
        - loadDefaults (bool): Whether to load default values
        """

    def setAppName(self, value):
        """
        Set application name.

        Parameters:
        - value (str): Application name

        Returns:
        This SparkConf object
        """

    def setMaster(self, value):
        """
        Set master URL.

        Parameters:
        - value (str): Master URL

        Returns:
        This SparkConf object
        """

    def set(self, key, value):
        """
        Set a configuration property.

        Parameters:
        - key (str): Configuration key
        - value (str): Configuration value

        Returns:
        This SparkConf object
        """

    def get(self, key, defaultValue=None):
        """
        Get a configuration value.

        Parameters:
        - key (str): Configuration key
        - defaultValue (str): Default value if key not found

        Returns:
        Configuration value
        """

    def setSparkHome(self, value):
        """
        Set Spark installation path.

        Parameters:
        - value (str): Spark home directory

        Returns:
        This SparkConf object
        """

    def setExecutorEnv(self, key=None, value=None, pairs=None):
        """
        Set environment variables for executor processes.

        Parameters:
        - key (str): Environment variable name
        - value (str): Environment variable value
        - pairs (list): List of (key, value) pairs

        Returns:
        This SparkConf object
        """


class StorageLevel:
    """Storage levels for RDD persistence."""

    # Annotations are quoted: they are forward references to this class,
    # which is not bound yet while the class body executes.
    DISK_ONLY: "StorageLevel"
    DISK_ONLY_2: "StorageLevel"
    MEMORY_ONLY: "StorageLevel"
    MEMORY_ONLY_2: "StorageLevel"
    MEMORY_ONLY_SER: "StorageLevel"
    MEMORY_ONLY_SER_2: "StorageLevel"
    MEMORY_AND_DISK: "StorageLevel"
    MEMORY_AND_DISK_2: "StorageLevel"
    MEMORY_AND_DISK_SER: "StorageLevel"
    MEMORY_AND_DISK_SER_2: "StorageLevel"
    OFF_HEAP: "StorageLevel"
class TaskContext:
    """Information about the task currently being executed."""

    def attemptNumber(self):
        """How many times this task has been attempted."""

    def partitionId(self):
        """The ID of the RDD partition that is computed by this task."""

    def stageId(self):
        """The ID of the stage that this task belongs to."""

    def taskAttemptId(self):
        """An ID that is unique to this task attempt."""
class StatusTracker:
    """Low-level status reporting APIs for monitoring job and stage progress."""

    def getJobIdsForGroup(self, jobGroup):
        """Return a list of all known jobs in a particular job group."""

    def getActiveStageIds(self):
        """Returns an array containing the ids of all active stages."""

    def getExecutorInfos(self):
        """Returns information about all known executors."""

# Install with Tessl CLI
npx tessl i tessl/pypi-pyspark