CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-scikit-learn

A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.

87

0.98x
Overview
Eval results
Files

docs/feature-extraction.md

Feature Extraction

Feature extraction utilities for converting raw data into numerical features suitable for machine learning algorithms. This includes text processing, image processing, and dictionary-based feature extraction.

Text Feature Extraction

CountVectorizer

Convert a collection of text documents to a matrix of token counts.

from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer(
    input: str = "content",
    encoding: str = "utf-8",
    decode_error: str = "strict",
    strip_accents: str | None = None,
    lowercase: bool = True,
    preprocessor: callable | None = None,
    tokenizer: callable | None = None,
    stop_words: str | list | None = None,
    token_pattern: str = r"(?u)\b\w\w+\b",
    ngram_range: tuple = (1, 1),
    analyzer: str = "word",
    max_df: float | int = 1.0,
    min_df: float | int = 1,
    max_features: int | None = None,
    vocabulary: dict | None = None,
    binary: bool = False,
    dtype: type = np.int64
)

TfidfVectorizer

Convert a collection of raw documents to a matrix of TF-IDF features.

from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer(
    input: str = "content",
    encoding: str = "utf-8",
    decode_error: str = "strict",
    strip_accents: str | None = None,
    lowercase: bool = True,
    preprocessor: callable | None = None,
    tokenizer: callable | None = None,
    analyzer: str = "word",
    stop_words: str | list | None = None,
    token_pattern: str = r"(?u)\b\w\w+\b",
    ngram_range: tuple = (1, 1),
    max_df: float | int = 1.0,
    min_df: float | int = 1,
    max_features: int | None = None,
    vocabulary: dict | None = None,
    binary: bool = False,
    dtype: type = np.float64,
    norm: str = "l2",
    use_idf: bool = True,
    smooth_idf: bool = True,
    sublinear_tf: bool = False
)

TfidfTransformer

Transform a count matrix to a normalized tf or tf-idf representation.

from sklearn.feature_extraction.text import TfidfTransformer

TfidfTransformer(
    norm: str = "l2",
    use_idf: bool = True,
    smooth_idf: bool = True,
    sublinear_tf: bool = False
)

HashingVectorizer

Convert a collection of text documents to a matrix of token occurrences using the hashing trick.

from sklearn.feature_extraction.text import HashingVectorizer

HashingVectorizer(
    n_features: int = 2**20,
    input: str = "content",
    encoding: str = "utf-8",
    decode_error: str = "strict",
    strip_accents: str | None = None,
    lowercase: bool = True,
    preprocessor: callable | None = None,
    tokenizer: callable | None = None,
    stop_words: str | list | None = None,
    token_pattern: str = r"(?u)\b\w\w+\b",
    ngram_range: tuple = (1, 1),
    analyzer: str = "word",
    binary: bool = False,
    norm: str = "l2",
    alternate_sign: bool = True,
    dtype: type = np.float64
)

Text Preprocessing Functions

from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode, strip_tags

def strip_accents_ascii(s: str) -> str: ...
def strip_accents_unicode(s: str) -> str: ...
def strip_tags(s: str) -> str: ...

Dictionary Feature Extraction

DictVectorizer

Transform lists of feature-value mappings to vectors.

from sklearn.feature_extraction import DictVectorizer

DictVectorizer(
    dtype: type = np.float64,
    separator: str = "=",
    sparse: bool = True,
    sort: bool = True
)

Hashing Feature Extraction

FeatureHasher

Implements feature hashing for high-speed, low-memory vectorization.

from sklearn.feature_extraction import FeatureHasher

FeatureHasher(
    n_features: int = 2**20,
    input_type: str = "dict",
    dtype: type = np.float64,
    alternate_sign: bool = True
)

Image Feature Extraction

Image to Graph

Convert images to graphs for machine learning applications.

from sklearn.feature_extraction.image import img_to_graph, grid_to_graph

def img_to_graph(
    img: ndarray,
    mask: ndarray | None = None,
    return_as: type = scipy.sparse.coo_matrix,
    dtype: type | None = None
) -> ndarray | coo_matrix: ...

def grid_to_graph(
    n_x: int,
    n_y: int,
    n_z: int = 1,
    mask: ndarray | None = None,
    return_as: type = scipy.sparse.coo_matrix,
    dtype: type = int
) -> ndarray | coo_matrix: ...

Usage Examples

Text Vectorization

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Basic count vectorization
corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.']

# Count vectorizer
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())

# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

Dictionary Vectorization

from sklearn.feature_extraction import DictVectorizer

# Convert list of dictionaries to feature vectors
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

vec = DictVectorizer()
X = vec.fit_transform(measurements)
print(vec.get_feature_names_out())

Feature Hashing

from sklearn.feature_extraction import FeatureHasher

# Hash features for large-scale learning
h = FeatureHasher(n_features=10)
D = [{'dog': 1, 'cat': 2, 'elephant': 4},
     {'dog': 2, 'run': 5}]
f = h.transform(D)

Constants

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

ENGLISH_STOP_WORDS: frozenset  # Set of common English stop words

Install with Tessl CLI

npx tessl i tessl/pypi-scikit-learn

docs

datasets.md

feature-extraction.md

index.md

metrics.md

model-selection.md

neighbors.md

pipelines.md

preprocessing.md

supervised-learning.md

unsupervised-learning.md

utilities.md

tile.json