Python API for Apache Spark, providing distributed computing, data analysis, and machine learning capabilities
—
Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run.
Modern machine learning pipeline API providing estimators, transformers, and comprehensive algorithms for classification, regression, clustering, and feature processing. The ML package provides a high-level API built on DataFrames for constructing ML pipelines.
# Core abstractions for building machine learning workflows.


class Pipeline:
    """A simple pipeline that chains multiple Transformers and Estimators together."""

    def __init__(self, stages=None):
        """
        Initialize Pipeline.

        Parameters:
        - stages (list): List of pipeline stages (Transformers and Estimators)
        """

    def fit(self, dataset, params=None):
        """
        Fit the pipeline to training data.

        Parameters:
        - dataset (DataFrame): Training dataset
        - params (dict): Additional parameters

        Returns:
        PipelineModel: Fitted pipeline model
        """

    def setStages(self, value):
        """Set pipeline stages."""


class PipelineModel:
    """A fitted pipeline model."""

    def transform(self, dataset):
        """
        Transform the dataset using the fitted pipeline.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """

    def save(self, path):
        """Save the pipeline model to the given path."""

    @classmethod
    def load(cls, path):
        """Load a pipeline model from the given path."""


class Estimator:
    """Abstract class for estimators that can be fit on a DataFrame to produce a Model."""

    def fit(self, dataset, params=None):
        """
        Fit model to training data.

        Parameters:
        - dataset (DataFrame): Training dataset
        - params (dict): Additional parameters

        Returns:
        Model: Fitted model
        """


class Transformer:
    """Abstract class for transformers that transform DataFrames into DataFrames."""

    def transform(self, dataset):
        """
        Transform the dataset.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """


class Model:
    """Abstract class for models that are fitted by estimators."""

    def transform(self, dataset):
        """
        Transform the dataset using the fitted model.

        Parameters:
        - dataset (DataFrame): Dataset to transform

        Returns:
        DataFrame: Transformed dataset
        """


class Predictor(Estimator):
    """Base class for predictors that make predictions on feature vectors."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction"):
        """
        Initialize Predictor.

        Parameters:
        - featuresCol (str): Features column name
        - labelCol (str): Label column name
        - predictionCol (str): Prediction column name
        """


class PredictionModel(Model):
    """Base class for prediction models."""

    def predict(self, value):
        """Make a prediction on a single feature vector."""

    def transform(self, dataset):
        """Transform dataset to include predictions."""
# Supervised learning algorithms for classification tasks.


class LogisticRegression(Predictor):
    """Logistic regression classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
                 threshold=0.5, thresholds=None, probabilityCol="probability",
                 rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
                 aggregationDepth=2, family="auto", lowerBoundsOnCoefficients=None,
                 upperBoundsOnCoefficients=None, lowerBoundsOnIntercepts=None,
                 upperBoundsOnIntercepts=None, maxBlockSizeInMB=0.0):
        """
        Initialize LogisticRegression.

        Parameters:
        - featuresCol (str): Features column name
        - labelCol (str): Label column name
        - predictionCol (str): Prediction column name
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - elasticNetParam (float): ElasticNet mixing parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - threshold (float): Binary classification threshold
        - thresholds (list): Thresholds for multiclass classification
        - probabilityCol (str): Probability column name
        - rawPredictionCol (str): Raw prediction column name
        - standardization (bool): Whether to standardize features
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - family (str): Name of family for GLM
        - lowerBoundsOnCoefficients (Matrix): Lower bounds on coefficients
        - upperBoundsOnCoefficients (Matrix): Upper bounds on coefficients
        - lowerBoundsOnIntercepts (Vector): Lower bounds on intercepts
        - upperBoundsOnIntercepts (Vector): Upper bounds on intercepts
        - maxBlockSizeInMB (float): Maximum memory for stacking input data
        """


class DecisionTreeClassifier(Predictor):
    """Decision tree classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None,
                 weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
        """
        Initialize DecisionTreeClassifier.

        Parameters:
        - maxDepth (int): Maximum depth of tree
        - maxBins (int): Maximum number of bins for discretizing continuous features
        - minInstancesPerNode (int): Minimum number of instances each child must have
        - minInfoGain (float): Minimum information gain for split
        - maxMemoryInMB (int): Maximum memory in MB allocated to histogram aggregation
        - cacheNodeIds (bool): Whether to cache node IDs
        - checkpointInterval (int): Checkpoint interval
        - impurity (str): Impurity measure ("gini" or "entropy")
        - seed (int): Random seed
        - weightCol (str): Weight column name
        - leafCol (str): Leaf index column name
        - minWeightFractionPerNode (float): Minimum weighted fraction of total weight
        """


class RandomForestClassifier(Predictor):
    """Random forest classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5,
                 maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                 cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20,
                 featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, weightCol=None,
                 leafCol="", minWeightFractionPerNode=0.0, bootstrap=True):
        """
        Initialize RandomForestClassifier.

        Parameters:
        - numTrees (int): Number of trees in the forest
        - featureSubsetStrategy (str): Number of features to consider for splits
        - subsamplingRate (float): Fraction of training data used for learning
        - bootstrap (bool): Whether bootstrap samples are used when building trees
        """


class GBTClassifier(Predictor):
    """Gradient-boosted tree classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 lossType="logistic", maxIter=20, stepSize=0.1, seed=None,
                 subsamplingRate=1.0, featureSubsetStrategy="all", validationTol=0.01,
                 validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,
                 weightCol=None):
        """
        Initialize GBTClassifier.

        Parameters:
        - lossType (str): Loss function type
        - maxIter (int): Maximum number of iterations
        - stepSize (float): Step size for gradient descent
        - subsamplingRate (float): Fraction of training data used for learning
        - featureSubsetStrategy (str): Number of features to consider for splits
        - validationTol (float): Validation tolerance for early stopping
        - validationIndicatorCol (str): Validation indicator column name
        """


class NaiveBayes(Predictor):
    """Naive Bayes classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
                 modelType="multinomial", thresholds=None, weightCol=None):
        """
        Initialize NaiveBayes.

        Parameters:
        - smoothing (float): Smoothing parameter
        - modelType (str): Model type ("multinomial" or "bernoulli")
        - thresholds (list): Thresholds for binary classification
        - weightCol (str): Weight column name
        """


class LinearSVC(Predictor):
    """Linear Support Vector Classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 rawPredictionCol="rawPrediction", maxIter=100, regParam=0.0, tol=1e-6,
                 fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
                 aggregationDepth=2, blockSize=1):
        """
        Initialize LinearSVC.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - standardization (bool): Whether to standardize features
        - threshold (float): Classification threshold
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - blockSize (int): Block size for stacking input data
        """


class MultilayerPerceptronClassifier(Predictor):
    """Multilayer perceptron classifier."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
                 solver="l-bfgs", initialWeights=None, probabilityCol="probability",
                 rawPredictionCol="rawPrediction"):
        """
        Initialize MultilayerPerceptronClassifier.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - tol (float): Convergence tolerance
        - seed (int): Random seed
        - layers (list): Sizes of layers from input to output
        - blockSize (int): Block size for stacking input data
        - stepSize (float): Step size for gradient descent
        - solver (str): Solver algorithm ("l-bfgs" or "gd")
        - initialWeights (Vector): Initial weights
        - probabilityCol (str): Probability column name
        - rawPredictionCol (str): Raw prediction column name
        """
# Supervised learning algorithms for regression tasks.


class LinearRegression(Predictor):
    """Linear regression."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
                 standardization=True, solver="auto", weightCol=None, aggregationDepth=2,
                 loss="squaredError", epsilon=1.35):
        """
        Initialize LinearRegression.

        Parameters:
        - maxIter (int): Maximum number of iterations
        - regParam (float): Regularization parameter
        - elasticNetParam (float): ElasticNet mixing parameter
        - tol (float): Convergence tolerance
        - fitIntercept (bool): Whether to fit intercept
        - standardization (bool): Whether to standardize features
        - solver (str): Solver algorithm ("auto", "normal", "l-bfgs")
        - weightCol (str): Weight column name
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - loss (str): Loss function ("squaredError" or "huber")
        - epsilon (float): Shape parameter for Huber loss
        """


class DecisionTreeRegressor(Predictor):
    """Decision tree regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 impurity="variance", seed=None, varianceCol=None, weightCol=None,
                 leafCol="", minWeightFractionPerNode=0.0):
        """
        Initialize DecisionTreeRegressor.

        Parameters:
        - impurity (str): Impurity measure ("variance")
        - varianceCol (str): Variance column name
        """


class RandomForestRegressor(Predictor):
    """Random forest regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 impurity="variance", numTrees=20, featureSubsetStrategy="auto", seed=None,
                 subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0,
                 weightCol=None, bootstrap=True):
        """Initialize RandomForestRegressor."""


class GBTRegressor(Predictor):
    """Gradient-boosted tree regressor."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                 lossType="squared", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
                 featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,
                 leafCol="", minWeightFractionPerNode=0.0, weightCol=None):
        """Initialize GBTRegressor."""


class IsotonicRegression(Estimator):
    """Isotonic regression."""

    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
                 weightCol=None, isotonic=True, featureIndex=0):
        """
        Initialize IsotonicRegression.

        Parameters:
        - isotonic (bool): Whether the output sequence should be isotonic/increasing
        - featureIndex (int): Index of the feature to use if featuresCol is a vector
        """
# Unsupervised learning algorithms for clustering tasks.


class KMeans(Estimator):
    """K-means clustering."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
                 initMode="k-means||", initSteps=2, tol=1e-4, maxIter=100, seed=None,
                 distanceMeasure="euclidean", weightCol=None):
        """
        Initialize KMeans.

        Parameters:
        - k (int): Number of clusters
        - initMode (str): Initialization algorithm ("k-means||" or "random")
        - initSteps (int): Number of steps for k-means|| initialization
        - tol (float): Convergence tolerance
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - distanceMeasure (str): Distance measure ("euclidean" or "cosine")
        - weightCol (str): Weight column name
        """


class BisectingKMeans(Estimator):
    """Bisecting k-means clustering."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=4,
                 maxIter=20, seed=None, minDivisibleClusterSize=1.0,
                 distanceMeasure="euclidean"):
        """
        Initialize BisectingKMeans.

        Parameters:
        - k (int): Number of clusters
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - minDivisibleClusterSize (float): Minimum divisible cluster size
        - distanceMeasure (str): Distance measure
        """


class GaussianMixture(Estimator):
    """Gaussian Mixture Model."""

    def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
                 probabilityCol="probability", tol=0.01, maxIter=100, seed=None,
                 aggregationDepth=2, weightCol=None):
        """
        Initialize GaussianMixture.

        Parameters:
        - k (int): Number of components
        - probabilityCol (str): Probability column name
        - tol (float): Convergence tolerance
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - aggregationDepth (int): Aggregation depth for treeAggregate
        - weightCol (str): Weight column name
        """


class LDA(Estimator):
    """Latent Dirichlet Allocation."""

    def __init__(self, featuresCol="features", maxIter=100, seed=None, checkpointInterval=10,
                 k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
                 subsamplingRate=0.05, optimizeDocConcentration=True, docConcentration=None,
                 topicConcentration=None, topicDistributionCol="topicDistribution",
                 keepLastCheckpoint=True):
        """
        Initialize LDA.

        Parameters:
        - k (int): Number of topics
        - optimizer (str): Optimizer ("online" or "em")
        - learningOffset (float): Learning offset for online optimizer
        - learningDecay (float): Learning decay rate
        - subsamplingRate (float): Subsampling rate for online optimizer
        - optimizeDocConcentration (bool): Whether to optimize document concentration
        - docConcentration (Vector): Document concentration parameters
        - topicConcentration (float): Topic concentration parameter
        - topicDistributionCol (str): Topic distribution column name
        - keepLastCheckpoint (bool): Whether to keep last checkpoint
        """
# Transformers for feature engineering and preprocessing.


class VectorAssembler(Transformer):
    """Combine multiple columns into a vector column."""

    def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"):
        """
        Initialize VectorAssembler.

        Parameters:
        - inputCols (list): Input column names
        - outputCol (str): Output column name
        - handleInvalid (str): How to handle invalid data ("error", "skip", "keep")
        """


class StandardScaler(Estimator):
    """Standardize features by removing mean and scaling to unit variance."""

    def __init__(self, inputCol=None, outputCol=None, withMean=False, withStd=True):
        """
        Initialize StandardScaler.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - withMean (bool): Whether to center data with mean
        - withStd (bool): Whether to scale to unit standard deviation
        """


class MinMaxScaler(Estimator):
    """Transform features by scaling to a given range."""

    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        Initialize MinMaxScaler.

        Parameters:
        - min (float): Lower bound after transformation
        - max (float): Upper bound after transformation
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        """


class StringIndexer(Estimator):
    """Encode string labels to label indices."""

    def __init__(self, inputCol=None, outputCol=None, inputCols=None, outputCols=None,
                 handleInvalid="error", stringOrderType="frequencyDesc"):
        """
        Initialize StringIndexer.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        - handleInvalid (str): How to handle invalid data
        - stringOrderType (str): How to order labels ("frequencyDesc", "frequencyAsc",
          "alphabetDesc", "alphabetAsc")
        """


class IndexToString(Transformer):
    """Map label indices back to label strings."""

    def __init__(self, inputCol=None, outputCol=None, labels=None, inputCols=None,
                 outputCols=None):
        """
        Initialize IndexToString.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - labels (list): Ordered list of labels
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        """


class OneHotEncoder(Estimator):
    """One-hot encode categorical features."""

    def __init__(self, inputCols=None, outputCols=None, dropLast=True, handleInvalid="error",
                 inputCol=None, outputCol=None):
        """
        Initialize OneHotEncoder.

        Parameters:
        - inputCols (list): Input column names
        - outputCols (list): Output column names
        - dropLast (bool): Whether to drop the last category
        - handleInvalid (str): How to handle invalid data
        - inputCol (str): Input column name (deprecated)
        - outputCol (str): Output column name (deprecated)
        """


class PCA(Estimator):
    """Principal component analysis dimensionality reduction."""

    def __init__(self, k=None, inputCol=None, outputCol=None):
        """
        Initialize PCA.

        Parameters:
        - k (int): Number of principal components
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        """


class Word2Vec(Estimator):
    """Word2Vec transforms a dataset of text documents to vectors."""

    def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
                 maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5,
                 maxSentenceLength=1000):
        """
        Initialize Word2Vec.

        Parameters:
        - vectorSize (int): Dimension of the code that maps words to
        - minCount (int): Minimum number of times a token must appear
        - numPartitions (int): Number of partitions for sentences
        - stepSize (float): Step size for gradient descent
        - maxIter (int): Maximum number of iterations
        - seed (int): Random seed
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - windowSize (int): Window size for Word2Vec
        - maxSentenceLength (int): Maximum sentence length
        """


class CountVectorizer(Estimator):
    """Convert text documents to vectors of token counts."""

    def __init__(self, inputCol=None, outputCol=None, vocabSize=1 << 18, minDF=1.0,
                 maxDF=None, minTF=1.0, binary=False):
        """
        Initialize CountVectorizer.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - vocabSize (int): Maximum vocabulary size
        - minDF (float): Minimum document frequency
        - maxDF (float): Maximum document frequency
        - minTF (float): Minimum term frequency
        - binary (bool): Binary toggle to control term frequency counts
        """


class IDF(Estimator):
    """Compute Inverse Document Frequency (IDF) for TF-IDF."""

    def __init__(self, inputCol=None, outputCol=None, minDocFreq=0):
        """
        Initialize IDF.

        Parameters:
        - inputCol (str): Input column name
        - outputCol (str): Output column name
        - minDocFreq (int): Minimum document frequency
        """
# Evaluation metrics for assessing model performance.


class Evaluator:
    """Base class for evaluators."""

    def evaluate(self, dataset, params=None):
        """
        Evaluate the dataset and return a scalar metric.

        Parameters:
        - dataset (DataFrame): Dataset to evaluate
        - params (dict): Additional parameters

        Returns:
        float: Evaluation metric
        """


class BinaryClassificationEvaluator(Evaluator):
    """Evaluator for binary classification."""

    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                 metricName="areaUnderROC", weightCol=None, numBins=1000):
        """
        Initialize BinaryClassificationEvaluator.

        Parameters:
        - rawPredictionCol (str): Raw prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("areaUnderROC" or "areaUnderPR")
        - weightCol (str): Weight column name
        - numBins (int): Number of bins for ROC curve
        """


class MulticlassClassificationEvaluator(Evaluator):
    """Evaluator for multiclass classification."""

    def __init__(self, predictionCol="prediction", labelCol="label", metricName="f1",
                 metricLabel=0.0, beta=1.0, probabilityCol="probability", eps=1e-15):
        """
        Initialize MulticlassClassificationEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("f1", "accuracy", "weightedPrecision", etc.)
        - metricLabel (float): Label for metric calculation
        - beta (float): Beta value for F-beta score
        - probabilityCol (str): Probability column name
        - eps (float): Epsilon value to avoid division by zero
        """


class RegressionEvaluator(Evaluator):
    """Evaluator for regression."""

    def __init__(self, predictionCol="prediction", labelCol="label", metricName="rmse",
                 weightCol=None, throughOrigin=False):
        """
        Initialize RegressionEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - labelCol (str): Label column name
        - metricName (str): Metric name ("rmse", "mse", "r2", "mae", "var")
        - weightCol (str): Weight column name
        - throughOrigin (bool): Whether to fit line through origin for r2
        """


class ClusteringEvaluator(Evaluator):
    """Evaluator for clustering."""

    def __init__(self, predictionCol="prediction", featuresCol="features",
                 metricName="silhouette", distanceMeasure="squaredEuclidean",
                 weightCol=None):
        """
        Initialize ClusteringEvaluator.

        Parameters:
        - predictionCol (str): Prediction column name
        - featuresCol (str): Features column name
        - metricName (str): Metric name ("silhouette")
        - distanceMeasure (str): Distance measure
        - weightCol (str): Weight column name
        """
# Tools for hyperparameter optimization and model selection.


class ParamGridBuilder:
    """Builder for a param grid used in grid search-based model selection."""

    def __init__(self):
        """Initialize ParamGridBuilder."""

    def addGrid(self, param, values):
        """
        Add parameter values to the grid.

        Parameters:
        - param (Param): Parameter to tune
        - values (list): List of parameter values

        Returns:
        ParamGridBuilder
        """

    def build(self):
        """
        Build and return the parameter grid.

        Returns:
        list: List of parameter maps
        """


class CrossValidator(Estimator):
    """K-fold cross validation."""

    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
                 numFolds=3, seed=None, parallelism=1, collectSubModels=False,
                 foldCol=""):
        """
        Initialize CrossValidator.

        Parameters:
        - estimator (Estimator): Estimator to cross-validate
        - estimatorParamMaps (list): Parameter maps to evaluate
        - evaluator (Evaluator): Evaluator for model selection
        - numFolds (int): Number of folds for cross validation
        - seed (int): Random seed
        - parallelism (int): Number of threads to use for fitting models
        - collectSubModels (bool): Whether to collect sub-models
        - foldCol (str): Fold column name
        """


class TrainValidationSplit(Estimator):
    """Train-validation split for model selection."""

    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None,
                 trainRatio=0.75, seed=None, parallelism=1, collectSubModels=False):
        """
        Initialize TrainValidationSplit.

        Parameters:
        - estimator (Estimator): Estimator to tune
        - estimatorParamMaps (list): Parameter maps to evaluate
        - evaluator (Evaluator): Evaluator for model selection
        - trainRatio (float): Ratio of training data
        - seed (int): Random seed
        - parallelism (int): Number of threads to use for fitting models
        - collectSubModels (bool): Whether to collect sub-models
        """


# Linear algebra types used throughout the ML API (documented below).
from pyspark.ml.linalg import Vector, DenseVector, SparseVector, Vectors
from pyspark.ml.linalg import Matrix, DenseMatrix, SparseMatrix, Matrices
class Vector:
    """Abstract base class for ML vector types."""

    def toArray(self):
        """Convert to numpy array."""


class DenseVector(Vector):
    """Dense vector representation."""

    def __init__(self, ar):
        """Create from array-like object."""


class SparseVector(Vector):
    """Sparse vector representation."""

    def __init__(self, size, *args):
        """Create sparse vector."""


class Vectors:
    """Factory methods for creating vectors."""

    @staticmethod
    def dense(*values):
        """Create dense vector."""

    @staticmethod
    def sparse(size, *args):
        """Create sparse vector."""


class Matrix:
    """Abstract base class for ML matrix types."""

    def numRows(self):
        """Number of rows."""

    def numCols(self):
        """Number of columns."""


class DenseMatrix(Matrix):
    """Dense matrix representation."""


class SparseMatrix(Matrix):
    """Sparse matrix representation in CSC format."""


class Matrices:
    """Factory methods for creating matrices."""

    @staticmethod
    def dense(numRows, numCols, values):
        """Create dense matrix."""

    @staticmethod
    def sparse(numRows, numCols, colPtrs, rowIndices, values):
        """Create sparse matrix."""


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-pyspark