A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
Feature extraction utilities for converting raw data into numerical features suitable for machine learning algorithms. This includes text processing, image processing, and dictionary-based feature extraction.
Convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer(
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
analyzer: str = "word",
max_df: float | int = 1.0,
min_df: float | int = 1,
max_features: int | None = None,
vocabulary: dict | None = None,
binary: bool = False,
dtype: type = np.int64
)
Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVectorizer(
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
analyzer: str = "word",
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
max_df: float | int = 1.0,
min_df: float | int = 1,
max_features: int | None = None,
vocabulary: dict | None = None,
binary: bool = False,
dtype: type = np.float64,
norm: str = "l2",
use_idf: bool = True,
smooth_idf: bool = True,
sublinear_tf: bool = False
)
Transform a count matrix to a normalized tf or tf-idf representation.
from sklearn.feature_extraction.text import TfidfTransformer
TfidfTransformer(
norm: str = "l2",
use_idf: bool = True,
smooth_idf: bool = True,
sublinear_tf: bool = False
)
Convert a collection of text documents to a matrix of token occurrences using the hashing trick.
from sklearn.feature_extraction.text import HashingVectorizer
HashingVectorizer(
n_features: int = 2**20,
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
analyzer: str = "word",
binary: bool = False,
norm: str = "l2",
alternate_sign: bool = True,
dtype: type = np.float64
)
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode, strip_tags
def strip_accents_ascii(s: str) -> str: ...
def strip_accents_unicode(s: str) -> str: ...
def strip_tags(s: str) -> str: ...
Transform lists of feature-value mappings to vectors.
from sklearn.feature_extraction import DictVectorizer
DictVectorizer(
dtype: type = np.float64,
separator: str = "=",
sparse: bool = True,
sort: bool = True
)
Implements feature hashing for high-speed, low-memory vectorization.
from sklearn.feature_extraction import FeatureHasher
FeatureHasher(
n_features: int = 2**20,
input_type: str = "dict",
dtype: type = np.float64,
alternate_sign: bool = True
)
Convert images to graphs for machine learning applications.
from sklearn.feature_extraction.image import img_to_graph, grid_to_graph
def img_to_graph(
img: ndarray,
mask: ndarray | None = None,
return_as: type = np.ndarray,
dtype: type | None = None
) -> ndarray | csr_matrix: ...
def grid_to_graph(
n_x: int,
n_y: int,
n_z: int = 1,
mask: ndarray | None = None,
return_as: type = np.ndarray,
dtype: type = np.int32
) -> ndarray | csr_matrix: ...
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Basic count vectorization
corpus = ['This is the first document.',
'This document is the second document.',
'And this is the third one.']
# Count vectorizer
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
from sklearn.feature_extraction import DictVectorizer
# Convert list of dictionaries to feature vectors
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
]
vec = DictVectorizer()
X = vec.fit_transform(measurements)
print(vec.get_feature_names_out())
from sklearn.feature_extraction import FeatureHasher
# Hash features for large-scale learning
h = FeatureHasher(n_features=10)
D = [{'dog': 1, 'cat': 2, 'elephant': 4},
{'dog': 2, 'run': 5}]
f = h.transform(D)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
ENGLISH_STOP_WORDS: frozenset # Set of common English stop words
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10