Python API for Apache Spark, providing distributed computing, data analysis, and machine learning capabilities
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Modern machine learning pipeline API providing estimators, transformers, and comprehensive algorithms for classification, regression, clustering, and feature processing. The ML package provides a high-level API built on DataFrames for constructing ML pipelines.
# Core abstractions for building machine learning workflows.


class Pipeline:
    """A simple pipeline that chains multiple Transformers and Estimators together."""

    def __init__(self, stages=None):
        """
        Initialize Pipeline.

        Parameters:
        - stages (list): List of pipeline stages (Transformers and Estimators)
        """

    def fit(self, dataset, params=None):
        """
        Fit the pipeline to training data.

        Parameters:
        - dataset (DataFrame): Training dataset
        - params (dict): Additional parameters

        Returns:
        PipelineModel: Fitted pipeline model
        """

    def setStages(self, value):
        """Set pipeline stages."""


class PipelineModel:
    """A fitted pipeline model."""

    def transform(self, dataset):
        """
        Transform the dataset using the fitted pipeline.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """

    def save(self, path):
        """Save the pipeline model to the given path."""

    @classmethod
    def load(cls, path):
        """Load a pipeline model from the given path."""


class Estimator:
    """Abstract class for estimators that can be fit on a DataFrame to produce a Model."""

    def fit(self, dataset, params=None):
        """
        Fit model to training data.

        Parameters:
        - dataset (DataFrame): Training dataset
        - params (dict): Additional parameters

        Returns:
        Model: Fitted model
        """


class Transformer:
    """Abstract class for transformers that transform DataFrames into DataFrames."""

    def transform(self, dataset):
        """
        Transform the dataset.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """


class Model:
    """Abstract class for models that are fitted by estimators."""

    def transform(self, dataset):
        """
        Transform the dataset using the fitted model.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """


class Predictor(Estimator):
    """Base class for predictors that make predictions on feature vectors."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction"):
        """
        Initialize Predictor.

        Parameters:
        - featuresCol (str): Features column name
        - labelCol (str): Label column name
        - predictionCol (str): Prediction column name
        """


class PredictionModel(Model):
    """Base class for prediction models."""

    def predict(self, value):
        """Make a prediction on a single feature vector."""

    def transform(self, dataset):
        """Transform dataset to include predictions."""
# Supervised learning algorithms for classification tasks.


class LogisticRegression(Predictor):
    """Logistic regression classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
                 threshold=0.5, thresholds=None, probabilityCol="probability",
                 rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
                 aggregationDepth=2, family="auto", lowerBoundsOnCoefficients=None,
                 upperBoundsOnCoefficients=None, lowerBoundsOnIntercepts=None,
                 upperBoundsOnIntercepts=None, maxBlockSizeInMB=0.0):
        """
        Initialize LogisticRegression.

        Parameters:
        - featuresCol (str): Features column name
        - labelCol (str): Label column name
        - predictionCol (str): Prediction column name
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - elasticNetParam (float): ElasticNet mixing parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - threshold (float): Binary classification threshold
        - thresholds (list): Thresholds for multiclass classification
        - probabilityCol (str): Probability column name
        - rawPredictionCol (str): Raw prediction column name
        - standardization (bool): Whether to standardize features
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - family (str): Name of family for GLM
        - lowerBoundsOnCoefficients (Matrix): Lower bounds on coefficients
        - upperBoundsOnCoefficients (Matrix): Upper bounds on coefficients
        - lowerBoundsOnIntercepts (Vector): Lower bounds on intercepts
        - upperBoundsOnIntercepts (Vector): Upper bounds on intercepts
        - maxBlockSizeInMB (float): Maximum memory for stacking input data
        """


class DecisionTreeClassifier(Predictor):
    """Decision tree classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None,
                 weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
        """
        Initialize DecisionTreeClassifier.

        Parameters:
        - maxDepth (int): Maximum depth of tree
        - maxBins (int): Maximum number of bins for discretizing continuous features
        - minInstancesPerNode (int): Minimum number of instances each child must have
        - minInfoGain (float): Minimum information gain for split
        - maxMemoryInMB (int): Maximum memory in MB allocated to histogram aggregation
        - cacheNodeIds (bool): Whether to cache node IDs
        - checkpointInterval (int): Checkpoint interval
        - impurity (str): Impurity measure ("gini" or "entropy")
        - seed (int): Random seed
        - weightCol (str): Weight column name
        - leafCol (str): Leaf index column name
        - minWeightFractionPerNode (float): Minimum weighted fraction of total weight
        """


class RandomForestClassifier(Predictor):
    """Random forest classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,
                 featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, weightCol=None,
                 leafCol="", minWeightFractionPerNode=0.0, bootstrap=True):
        """
        Initialize RandomForestClassifier.

        Parameters:
        - numTrees (int): Number of trees in the forest
        - featureSubsetStrategy (str): Number of features to consider for splits
        - subsamplingRate (float): Fraction of training data used for learning
        - bootstrap (bool): Whether bootstrap samples are used when building trees
        """


class GBTClassifier(Predictor):
    """Gradient-boosted tree classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 lossType="logistic", maxIter=20, stepSize=0.1, seed=None,
                 subsamplingRate=1.0, featureSubsetStrategy="all", validationTol=0.01,
                 validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,
                 weightCol=None):
        """
        Initialize GBTClassifier.

        Parameters:
        - lossType (str): Loss function type
        - maxIter (int): Maximum number of iterations
        - stepSize (float): Step size for gradient descent
        - subsamplingRate (float): Fraction of training data used for learning
        - featureSubsetStrategy (str): Number of features to consider for splits
        - validationTol (float): Validation tolerance for early stopping
        - validationIndicatorCol (str): Validation indicator column name
        """


class NaiveBayes(Predictor):
    """Naive Bayes classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
                 modelType="multinomial", thresholds=None, weightCol=None):
        """
        Initialize NaiveBayes.

        Parameters:
        - smoothing (float): Smoothing parameter
        - modelType (str): Model type ("multinomial" or "bernoulli")
        - thresholds (list): Thresholds for binary classification
        - weightCol (str): Weight column name
        """


class LinearSVC(Predictor):
    """Linear Support Vector Classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 rawPredictionCol="rawPrediction", maxIter=100, regParam=0.0, tol=1e-6,
                 fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
                 aggregationDepth=2, blockSize=1):
        """
        Initialize LinearSVC.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - standardization (bool): Whether to standardize features
        - threshold (float): Classification threshold
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - blockSize (int): Block size for stacking input data
        """


class MultilayerPerceptronClassifier(Predictor):
    """Multilayer perceptron classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
                 solver="l-bfgs", initialWeights=None, probabilityCol="probability",
                 rawPredictionCol="rawPrediction"):
        """
        Initialize MultilayerPerceptronClassifier.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - tol (float): Convergence tolerance
        - seed (int): Random seed
        - layers (list): Sizes of layers from input to output
        - blockSize (int): Block size for stacking input data
        - stepSize (float): Step size for gradient descent
        - solver (str): Solver algorithm ("l-bfgs" or "gd")
        - initialWeights (Vector): Initial weights
        - probabilityCol (str): Probability column name
        - rawPredictionCol (str): Raw prediction column name
        """
# Supervised learning algorithms for regression tasks.


class LinearRegression(Predictor):
    """Linear regression."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
                 loss="squaredError", epsilon=1.35):
        """
        Initialize LinearRegression.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - elasticNetParam (float): ElasticNet mixing parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - standardization (bool): Whether to standardize features
        - solver (str): Solver algorithm ("auto", "normal", "l-bfgs")
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - loss (str): Loss function ("squaredError" or "huber")
        - epsilon (float): Shape parameter for Huber loss
        """


class DecisionTreeRegressor(Predictor):
    """Decision tree regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 impurity="variance", seed=None, varianceCol=None, weightCol=None,
                 leafCol="", minWeightFractionPerNode=0.0):
        """
        Initialize DecisionTreeRegressor.

        Parameters:
        - impurity (str): Impurity measure ("variance")
        - varianceCol (str): Variance column name
        """


class RandomForestRegressor(Predictor):
    """Random forest regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=None,
                 subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0,
                 weightCol=None, bootstrap=True):
        """Initialize RandomForestRegressor."""


class GBTRegressor(Predictor):
    """Gradient-boosted tree regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 lossType="squared", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
                 featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,
                 leafCol="", minWeightFractionPerNode=0.0, weightCol=None):
        """Initialize GBTRegressor."""


class IsotonicRegression(Estimator):
    """Isotonic regression."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 weightCol=None, isotonic=True, featureIndex=0):
        """
        Initialize IsotonicRegression.

        Parameters:
        - isotonic (bool): Whether the output sequence should be isotonic/increasing
        - featureIndex (int): Index of the feature to use if featuresCol is a vector
        """
# Unsupervised learning algorithms for clustering tasks.


class KMeans(Estimator):
    """K-means clustering."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
                 initMode="k-means||", initSteps=2, tol=1e-4, maxIter=100, seed=None,
                 distanceMeasure="euclidean", weightCol=None):
        """
        Initialize KMeans.

        Parameters:
        - k (int): Number of clusters
        - initMode (str): Initialization algorithm ("k-means||" or "random")
        - initSteps (int): Number of steps for k-means|| initialization
        - tol (float): Convergence tolerance
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - distanceMeasure (str): Distance measure ("euclidean" or "cosine")
        - weightCol (str): Weight column name
        """


class BisectingKMeans(Estimator):
    """Bisecting k-means clustering."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=4,
                 maxIter=20, seed=None, minDivisibleClusterSize=1.0,
                 distanceMeasure="euclidean"):
        """
        Initialize BisectingKMeans.

        Parameters:
        - k (int): Number of clusters
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - minDivisibleClusterSize (float): Minimum divisible cluster size
        - distanceMeasure (str): Distance measure
        """


class GaussianMixture(Estimator):
    """Gaussian Mixture Model."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
                 probabilityCol="probability", tol=0.01, maxIter=100, seed=None,
                 aggregationDepth=2, weightCol=None):
        """
        Initialize GaussianMixture.

        Parameters:
        - k (int): Number of components
        - probabilityCol (str): Probability column name
        - tol (float): Convergence tolerance
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - weightCol (str): Weight column name
        """


class LDA(Estimator):
    """Latent Dirichlet Allocation."""

    def __init__(self, featuresCol="features", maxIter=100, seed=None, checkpointInterval=10,
                 k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
                 subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None,
                 topicConcentration=None, topicDistributionCol="topicDistribution",
                 keepLastCheckpoint=True):
        """
        Initialize LDA.

        Parameters:
        - k (int): Number of topics
        - optimizer (str): Optimizer ("online" or "em")
        - learningOffset (float): Learning offset for online optimizer
        - learningDecay (float): Learning decay rate
        - subsamplingRate (float): Subsampling rate for online optimizer
        - optimizeDocConcentration (bool): Whether to optimize document concentration
        - docConcentration (Vector): Document concentration parameters
        - topicConcentration (float): Topic concentration parameter
        - topicDistributionCol (str): Topic distribution column name
        - keepLastCheckpoint (bool): Whether to keep last checkpoint
        """
# Transformers for feature engineering and preprocessing.


class VectorAssembler(Transformer):
    """Combine multiple columns into a vector column."""

    def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"):
        """
        Initialize VectorAssembler.

        Parameters:
        - inputCols (list): Input column names
        - outputCol (str): Output column name
        - handleInvalid (str): How to handle invalid data ("error", "skip", "keep")
        """


class StandardScaler(Estimator):
    """Standardize features by removing mean and scaling to unit variance."""

    def __init__(self, inputCol=None, outputCol=None, withMean=False, withStd=True):
        """
        Initialize StandardScaler.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - withMean (bool): Whether to center data with mean
        - withStd (bool): Whether to scale to unit standard deviation
        """


class MinMaxScaler(Estimator):
    """Transform features by scaling to a given range."""

    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        Initialize MinMaxScaler.

        Parameters:
        - min (float): Lower bound after transformation
        - max (float): Upper bound after transformation
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        """


class StringIndexer(Estimator):
    """Encode string labels to label indices."""

    def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
                 handleInvalid="error", stringOrderType="frequencyDesc"):
        """
        Initialize StringIndexer.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        - handleInvalid (str): How to handle invalid data
        - stringOrderType (str): How to order labels ("frequencyDesc", "frequencyAsc",
          "alphabetDesc", "alphabetAsc")
        """


class IndexToString(Transformer):
    """Map label indices back to label strings."""

    def __init__(self, inputCol=None, outputCol=None, labels=None, inputCols=None,
                 outputCols=None):
        """
        Initialize IndexToString.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - labels (list): Ordered list of labels
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        """


class OneHotEncoder(Estimator):
    """One-hot encode categorical features."""

    def __init__(self, inputCols=None, outputCols=None, dropLast=True, handleInvalid="error",
                 inputCol=None, outputCol=None):
        """
        Initialize OneHotEncoder.

        Parameters:
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        - dropLast (bool): Whether to drop the last category
        - handleInvalid (str): How to handle invalid data
        - inputCol (str): Input column name (deprecated)
        - outputCol (str): Output column name (deprecated)
        """


class PCA(Estimator):
    """Principal component analysis dimensionality reduction."""

    def __init__(self, k=None, inputCol=None, outputCol=None):
        """
        Initialize PCA.

        Parameters:
        - k (int): Number of principal components
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        """


class Word2Vec(Estimator):
    """Word2Vec transforms a dataset of text documents to vectors."""

    def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
                 maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5,
                 maxSentenceLength=1000):
        """
        Initialize Word2Vec.

        Parameters:
        - vectorSize (int): Dimension of the code that maps words to
        - minCount (int): Minimum number of times a token must appear
        - numPartitions (int): Number of partitions for sentences
        - stepSize (float): Step size for gradient descent
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - windowSize (int): Window size for Word2Vec
        - maxSentenceLength (int): Maximum sentence length
        """


class CountVectorizer(Estimator):
    """Convert text documents to vectors of token counts."""

    def __init__(self, inputCol=None, outputCol=None, vocabSize=1 << 18, minDF=1.0,
                 maxDF=None, minTF=1.0, binary=False):
        """
        Initialize CountVectorizer.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - vocabSize (int): Maximum vocabulary size
        - minDF (float): Minimum document frequency
        - maxDF (float): Maximum document frequency
        - minTF (float): Minimum term frequency
        - binary (bool): Binary toggle to control term frequency counts
        """


class IDF(Estimator):
    """Compute Inverse Document Frequency (IDF) for TF-IDF."""

    def __init__(self, inputCol=None, outputCol=None, minDocFreq=0):
        """
        Initialize IDF.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - minDocFreq (int): Minimum document frequency
        """
# Evaluation metrics for assessing model performance.


class Evaluator:
    """Base class for evaluators."""

    def evaluate(self, dataset, params=None):
        """
        Evaluate the dataset and return a scalar metric.

        Parameters:
        - dataset (DataFrame): Dataset to evaluate
        - params (dict): Additional parameters

        Returns:
        float: Evaluation metric
        """


class BinaryClassificationEvaluator(Evaluator):
    """Evaluator for binary classification."""

    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                 metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        Initialize BinaryClassificationEvaluator.

        Parameters:
        - rawPredictionCol (str): Raw prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("areaUnderROC" or "areaUnderPR")
        - weightCol (str): Weight column name
        - numBins (int): Number of bins for ROC curve
        """


class MulticlassClassificationEvaluator(Evaluator):
    """Evaluator for multiclass classification."""

    def __init__(self, predictionCol="prediction", labelCol="label", metricName="f1",
                 metricLabel=0.0, beta=1.0, probabilityCol="probability", eps=1e-15):
        """
        Initialize MulticlassClassificationEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("f1", "accuracy", "weightedPrecision", etc.)
        - metricLabel (float): Label for metric calculation
        - beta (float): Beta value for F-beta score
        - probabilityCol (str): Probability column name
        - eps (float): Epsilon value to avoid division by zero
        """


class RegressionEvaluator(Evaluator):
    """Evaluator for regression."""

    def __init__(self, predictionCol="prediction", labelCol="label", metricName="rmse",
                 weightCol=None, throughOrigin=False):
        """
        Initialize RegressionEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("rmse", "mse", "r2", "mae", "var")
        - weightCol (str): Weight column name
        - throughOrigin (bool): Whether to fit line through origin for r2
        """


class ClusteringEvaluator(Evaluator):
    """Evaluator for clustering."""

    def __init__(self, predictionCol="prediction", featuresCol="features",
                 metricName="silhouette", distanceMeasure="squaredEuclidean",
                 weightCol=None):
        """
        Initialize ClusteringEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - featuresCol (str): Features column name
        - metricName (str): Metric name ("silhouette")
        - distanceMeasure (str): Distance measure
        - weightCol (str): Weight column name
        """
# Tools for hyperparameter optimization and model selection.


class ParamGridBuilder:
    """Builder for a param grid used in grid search-based model selection."""

    def __init__(self):
        """Initialize ParamGridBuilder."""

    def addGrid(self, param, values):
        """
        Add parameter values to the grid.

        Parameters:
        - param (Param): Parameter to tune
        - values (list): List of parameter values

        Returns:
        ParamGridBuilder
        """

    def build(self):
        """
        Build and return the parameter grid.

        Returns:
        list: List of parameter maps
        """


class CrossValidator(Estimator):
    """K-fold cross validation."""

    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
                 numFolds=3, seed=None, parallelism=1, collectSubModels=False,
                 foldCol=""):
        """
        Initialize CrossValidator.

        Parameters:
        - estimator (Estimator): Estimator to cross-validate
        - estimatorParamMaps (list): Parameter maps to evaluate
        - evaluator (Evaluator): Evaluator for model selection
        - numFolds (int): Number of folds for cross validation
        - seed (int): Random seed
        - parallelism (int): Number of threads to use for fitting models
        - collectSubModels (bool): Whether to collect sub-models
        - foldCol (str): Fold column name
        """


class TrainValidationSplit(Estimator):
    """Train-validation split for model selection."""

    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
                 trainRatio=0.75, seed=None, parallelism=1, collectSubModels=False):
        """
        Initialize TrainValidationSplit.

        Parameters:
        - estimator (Estimator): Estimator to tune
        - estimatorParamMaps (list): Parameter maps to evaluate
        - evaluator (Evaluator): Evaluator for model selection
        - trainRatio (float): Ratio of training data
        - seed (int): Random seed
        - parallelism (int): Number of threads to use for fitting models
        - collectSubModels (bool): Whether to collect sub-models
        """


# Linear algebra types used throughout the ML API (documented below).
from pyspark.ml.linalg import Vector, DenseVector, SparseVector, Vectors
from pyspark.ml.linalg import Matrix, DenseMatrix, SparseMatrix, Matrices
class Vector:
    """Abstract base class for ML vector types."""

    def toArray(self):
        """Convert to numpy array."""


class DenseVector(Vector):
    """Dense vector representation."""

    def __init__(self, ar):
        """Create from array-like object."""


class SparseVector(Vector):
    """Sparse vector representation."""

    def __init__(self, size, *args):
        """Create sparse vector."""


class Vectors:
    """Factory methods for creating vectors."""

    @staticmethod
    def dense(*values):
        """Create dense vector."""

    @staticmethod
    def sparse(size, *args):
        """Create sparse vector."""


class Matrix:
    """Abstract base class for ML matrix types."""

    def numRows(self):
        """Number of rows."""

    def numCols(self):
        """Number of columns."""


class DenseMatrix(Matrix):
    """Dense matrix representation."""


class SparseMatrix(Matrix):
    """Sparse matrix representation in CSC format."""


class Matrices:
    """Factory methods for creating matrices."""

    @staticmethod
    def dense(numRows, numCols, values):
        """Create dense matrix."""

    @staticmethod
    def sparse(numRows, numCols, colPtrs, rowIndices, values):
        """Create sparse matrix."""


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-pyspark