A comprehensive machine learning library providing supervised and unsupervised learning algorithms with consistent APIs and extensive tools for data preprocessing, model evaluation, and deployment.
87
Feature extraction utilities for converting raw data into numerical features suitable for machine learning algorithms. This includes text processing, image processing, and dictionary-based feature extraction.
Convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer(
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
analyzer: str = "word",
max_df: float | int = 1.0,
min_df: float | int = 1,
max_features: int | None = None,
vocabulary: dict | None = None,
binary: bool = False,
dtype: type = np.int64
)
Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVectorizer(
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
analyzer: str = "word",
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
max_df: float | int = 1.0,
min_df: float | int = 1,
max_features: int | None = None,
vocabulary: dict | None = None,
binary: bool = False,
dtype: type = np.float64,
norm: str = "l2",
use_idf: bool = True,
smooth_idf: bool = True,
sublinear_tf: bool = False
)
Transform a count matrix to a normalized tf or tf-idf representation.
from sklearn.feature_extraction.text import TfidfTransformer
TfidfTransformer(
norm: str = "l2",
use_idf: bool = True,
smooth_idf: bool = True,
sublinear_tf: bool = False
)
Convert a collection of text documents to a matrix of token occurrences using the hashing trick.
from sklearn.feature_extraction.text import HashingVectorizer
HashingVectorizer(
n_features: int = 2**20,
input: str = "content",
encoding: str = "utf-8",
decode_error: str = "strict",
strip_accents: str | None = None,
lowercase: bool = True,
preprocessor: callable | None = None,
tokenizer: callable | None = None,
stop_words: str | list | None = None,
token_pattern: str = r"(?u)\b\w\w+\b",
ngram_range: tuple = (1, 1),
analyzer: str = "word",
binary: bool = False,
norm: str = "l2",
alternate_sign: bool = True,
dtype: type = np.float64
)
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode, strip_tags
def strip_accents_ascii(s: str) -> str: ...
def strip_accents_unicode(s: str) -> str: ...
def strip_tags(s: str) -> str: ...
Transform lists of feature-value mappings to vectors.
from sklearn.feature_extraction import DictVectorizer
DictVectorizer(
dtype: type = np.float64,
separator: str = "=",
sparse: bool = True,
sort: bool = True
)
Implements feature hashing for high-speed, low-memory vectorization.
from sklearn.feature_extraction import FeatureHasher
FeatureHasher(
n_features: int = 2**20,
input_type: str = "dict",
dtype: type = np.float64,
alternate_sign: bool = True
)
Convert images to graphs for machine learning applications.
from sklearn.feature_extraction.image import img_to_graph, grid_to_graph
def img_to_graph(
img: ndarray,
mask: ndarray | None = None,
return_as: type = np.ndarray,
dtype: type | None = None
) -> ndarray | csr_matrix: ...
def grid_to_graph(
n_x: int,
n_y: int,
n_z: int = 1,
mask: ndarray | None = None,
return_as: type = np.ndarray,
dtype: type = np.int32
) -> ndarray | csr_matrix: ...
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Basic count vectorization
corpus = ['This is the first document.',
'This document is the second document.',
'And this is the third one.']
# Count vectorizer
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
from sklearn.feature_extraction import DictVectorizer
# Convert list of dictionaries to feature vectors
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
]
vec = DictVectorizer()
X = vec.fit_transform(measurements)
print(vec.get_feature_names_out())
from sklearn.feature_extraction import FeatureHasher
# Hash features for large-scale learning
h = FeatureHasher(n_features=10)
D = [{'dog': 1, 'cat': 2, 'elephant': 4},
{'dog': 2, 'run': 5}]
f = h.transform(D)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
ENGLISH_STOP_WORDS: frozenset # Set of common English stop words
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-learndocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10