HuggingFace community-driven open-source library of datasets for machine learning with one-line dataloaders, efficient preprocessing, and multi-framework support
—
Comprehensive type system for defining and validating dataset schemas, supporting primitive types, complex nested structures, and multimedia data. The Features system enables schema validation, data encoding/decoding, and seamless integration with Apache Arrow for efficient data storage.
The main schema container that defines the internal structure of a dataset as a dictionary mapping column names to feature types.
class Features(dict):
"""A special dictionary that defines the internal structure of a dataset."""
def __init__(self, *args, **kwargs): ...
@classmethod
def from_arrow_schema(cls, pa_schema) -> "Features": ...
@classmethod
def from_dict(cls, dic) -> "Features": ...
def to_dict(self) -> dict: ...
def encode_example(self, example: dict) -> dict: ...
def decode_example(self, example: dict) -> dict: ...
def encode_batch(self, batch: dict) -> dict: ...
def decode_batch(self, batch: dict) -> dict: ...
def flatten(self, max_depth: int = 16) -> "Features": ...
def copy(self) -> "Features": ...
def reorder_fields_as(self, other: "Features") -> "Features": ...
# Properties
@property
def type(self): ... # PyArrow DataType representation
@property
def arrow_schema(self): ... # PyArrow Schema with metadata

Usage Examples:
from datasets import Features, Value, ClassLabel, List
# Define dataset schema
features = Features({
'text': Value('string'),
'label': ClassLabel(names=['negative', 'positive']),
'embeddings': List(Value('float32')),
'metadata': {
'source': Value('string'),
'confidence': Value('float64')
}
})
# Encode data for Arrow storage
example = {'text': 'Hello world', 'label': 'positive', 'embeddings': [0.1, 0.2]}
encoded = features.encode_example(example)
# Decode data with feature-specific logic
decoded = features.decode_example(encoded)

Feature type for scalar values with support for all Arrow data types including numeric, temporal, string, and binary types.
class Value:
"""Scalar feature value of a particular data type."""
def __init__(self, dtype: str, id: Optional[str] = None): ...
def __call__(self): ... # Returns PyArrow type
def encode_example(self, value): ...

Supported Data Types:
- Integers: int8, int16, int32, int64, uint8, uint16, uint32, uint64
- Floating point: float16, float32, float64
- Temporal: time32[s|ms], time64[us|ns], timestamp[unit], date32, date64, duration[unit]
- Decimals: decimal128(precision, scale), decimal256(precision, scale)
- Binary: binary, large_binary
- Strings: string, large_string
- Other: null, bool

Usage Examples:
# Basic types
text_feature = Value('string')
integer_feature = Value('int64')
float_feature = Value('float32')
boolean_feature = Value('bool')
# Temporal types
timestamp_feature = Value('timestamp[ms]')
date_feature = Value('date32')
# High precision numbers
decimal_feature = Value('decimal128(10, 2)')

Feature type for integer class labels with automatic string-to-integer conversion and label name management.
class ClassLabel:
"""Feature type for integer class labels."""
def __init__(
self,
num_classes: Optional[int] = None,
names: Optional[List[str]] = None,
names_file: Optional[str] = None,
id: Optional[str] = None,
): ...
def str2int(self, values: Union[str, Iterable]) -> Union[int, Iterable]: ...
def int2str(self, values: Union[int, Iterable]) -> Union[str, Iterable]: ...
def encode_example(self, example_data): ...
def cast_storage(self, storage) -> pa.Int64Array: ...

Usage Examples:
# Define with explicit names
sentiment = ClassLabel(names=['negative', 'neutral', 'positive'])
# Define with number of classes (creates 0, 1, 2, ...)
digits = ClassLabel(num_classes=10)
# Define from file
categories = ClassLabel(names_file='categories.txt')
# Convert between strings and integers
label_int = sentiment.str2int('positive') # Returns 2
label_str = sentiment.int2str(2) # Returns 'positive'
# Batch conversion
labels = sentiment.str2int(['positive', 'negative', 'positive']) # [2, 0, 2]

Feature types for list data with support for both variable-length and fixed-length sequences, including multi-dimensional arrays.
class List:
"""Feature type for list data with 32-bit offsets."""
def __init__(
self,
feature: Any, # Child feature type
length: int = -1, # Fixed length (-1 = variable)
id: Optional[str] = None,
): ...
class LargeList:
"""Feature type for large list data with 64-bit offsets."""
def __init__(
self,
feature: Any, # Child feature type
id: Optional[str] = None,
): ...
class Sequence:
"""Utility for TensorFlow Datasets compatibility."""
def __new__(cls, feature=None, length=-1, **kwargs): ...
class Array2D:
"""Create a two-dimensional array."""
def __init__(self, shape: tuple, dtype: str): ...
class Array3D:
"""Create a three-dimensional array."""
def __init__(self, shape: tuple, dtype: str): ...
class Array4D:
"""Create a four-dimensional array."""
def __init__(self, shape: tuple, dtype: str): ...
class Array5D:
"""Create a five-dimensional array."""
def __init__(self, shape: tuple, dtype: str): ...

Usage Examples:
# Variable-length list of floats
embeddings = List(Value('float32'))
# Fixed-length list of 100 integers
fixed_sequence = List(Value('int32'), length=100)
# List of categorical labels
label_sequence = List(ClassLabel(names=['A', 'B', 'C']))
# Multi-dimensional arrays
image_array = Array3D(shape=(224, 224, 3), dtype='uint8')
feature_matrix = Array2D(shape=(50, 768), dtype='float32')
# Large lists for big data
large_embeddings = LargeList(Value('float64'))Feature type for audio data with automatic format handling and optional decoding control.
class Audio:
"""Audio Feature to extract audio data from files."""
def __init__(
self,
sampling_rate: Optional[int] = None,
decode: bool = True,
stream_index: Optional[int] = None,
id: Optional[str] = None,
): ...
def encode_example(self, value) -> dict: ...
def decode_example(self, value, token_per_repo_id=None): ...
def cast_storage(self, storage) -> pa.StructArray: ...
def embed_storage(self, storage) -> pa.StructArray: ...
def flatten(self) -> dict: ...

Input Formats:
- str: Absolute path to audio file
- dict: {"path": str, "bytes": bytes}
- dict: {"array": ndarray, "sampling_rate": int}

Usage Examples:
# Basic audio feature
audio = Audio()
# Audio with specific sampling rate
speech = Audio(sampling_rate=16000)
# Audio without decoding (store as bytes)
raw_audio = Audio(decode=False)
# Use in dataset features
features = Features({
'audio': Audio(sampling_rate=22050),
'transcript': Value('string')
})

Feature type for image data with automatic format handling and optional PIL mode conversion.
class Image:
"""Image Feature to read image data from files."""
def __init__(
self,
mode: Optional[str] = None, # PIL mode conversion
decode: bool = True,
id: Optional[str] = None,
): ...
def encode_example(self, value) -> dict: ...
def decode_example(self, value, token_per_repo_id=None): ...
def cast_storage(self, storage) -> pa.StructArray: ...
def embed_storage(self, storage) -> pa.StructArray: ...
def flatten(self): ...

Input Formats:
- str: Absolute path to image file
- dict: {"path": str, "bytes": bytes}
- np.ndarray: NumPy array representing image
- PIL.Image.Image: PIL image object

Usage Examples:
# Basic image feature
image = Image()
# Image with mode conversion
rgb_image = Image(mode='RGB')
# Image without decoding (store as bytes)
raw_image = Image(decode=False)
# Use in computer vision dataset
features = Features({
'image': Image(mode='RGB'),
'label': ClassLabel(names=['cat', 'dog']),
'bbox': List(Value('float32'), length=4)
})

Feature type for video data with TorchCodec integration and flexible decoding options.
class Video:
"""Video Feature to read video data from files."""
def __init__(
self,
decode: bool = True,
stream_index: Optional[int] = None,
dimension_order: str = "NCHW", # "NCHW" or "NHWC"
num_ffmpeg_threads: int = 1,
device: Optional[Union[str, "torch.device"]] = "cpu",
seek_mode: str = "exact", # "exact" or "approximate"
id: Optional[str] = None,
): ...
def encode_example(self, value): ...
def decode_example(self, value, token_per_repo_id=None): ...
def cast_storage(self, storage) -> pa.StructArray: ...
def flatten(self): ...

Usage Examples:
# Basic video feature
video = Video()
# Video with specific configuration
optimized_video = Video(
dimension_order="NHWC",
num_ffmpeg_threads=4,
device="cuda",
seek_mode="approximate"
)
# Video without decoding
raw_video = Video(decode=False)

Feature type for PDF document processing with pdfplumber integration.
class Pdf:
"""Pdf Feature to read PDF documents from files."""
def __init__(
self,
decode: bool = True,
id: Optional[str] = None,
): ...
def encode_example(self, value) -> dict: ...
def decode_example(self, value, token_per_repo_id=None): ...
def cast_storage(self, storage) -> pa.StructArray: ...
def embed_storage(self, storage) -> pa.StructArray: ...
def flatten(self): ...

Usage Examples:
# Basic PDF feature
pdf = Pdf()
# PDF without decoding (store as bytes)
raw_pdf = Pdf(decode=False)
# Use in document processing dataset
features = Features({
'document': Pdf(),
'title': Value('string'),
'summary': Value('string')
})Feature types for machine translation tasks with support for both fixed and variable language sets.
class Translation:
"""Feature for translations with fixed languages per example."""
def __init__(
self,
languages: List[str],
id: Optional[str] = None,
): ...
def flatten(self) -> dict: ...
class TranslationVariableLanguages:
"""Feature for translations with variable languages per example."""
def __init__(
self,
languages: Optional[List] = None,
num_languages: Optional[int] = None,
id: Optional[str] = None,
): ...
def encode_example(self, translation_dict): ...
def flatten(self) -> dict: ...

Usage Examples:
# Fixed languages translation
translation = Translation(languages=['en', 'fr', 'de'])
# Data format for fixed languages
example = {
'en': 'the cat',
'fr': 'le chat',
'de': 'die katze'
}
# Variable languages translation
var_translation = TranslationVariableLanguages(languages=['en', 'fr', 'de', 'es'])
# Input format (variable number of translations per language)
variable_example = {
'en': 'the cat',
'fr': ['le chat', 'la chatte'],
'de': 'die katze'
}
# Encoded output format
encoded = {
'language': ['en', 'de', 'fr', 'fr'],
'translation': ['the cat', 'die katze', 'la chatte', 'le chat']
}

# Complex nested schema
features = Features({
'metadata': {
'id': Value('string'),
'timestamp': Value('timestamp[ms]'),
'source': {
'name': Value('string'),
'version': Value('string')
}
},
'content': {
'text': Value('string'),
'tokens': List(Value('string')),
'entities': List({
'start': Value('int32'),
'end': Value('int32'),
'label': ClassLabel(names=['PERSON', 'ORG', 'LOC']),
'confidence': Value('float32')
})
},
'multimedia': {
'images': List(Image()),
'audio': Audio(sampling_rate=16000),
'video': Video(decode=False)
}
})

# Convert to Arrow schema
arrow_schema = features.arrow_schema
# Serialize for storage
features_dict = features.to_dict()
# Reconstruct from serialization
reconstructed = Features.from_dict(features_dict)
# Reconstruct from Arrow schema
from_arrow = Features.from_arrow_schema(arrow_schema)

# Batch processing with schema
batch = {
'text': ['Hello', 'World'],
'labels': ['positive', 'negative'],
'embeddings': [[0.1, 0.2], [0.3, 0.4]]
}
# Encode batch for Arrow storage
encoded_batch = features.encode_batch(batch)
# Decode batch for processing
decoded_batch = features.decode_batch(encoded_batch)

Best Practices:
- Use decode=False for multimedia features when raw bytes are sufficient
- Use encode_batch/decode_batch for better performance with large datasets

Install with Tessl CLI
npx tessl i tessl/pypi-datasets