Scalable Python data science, in an API compatible & lightning fast way.
Large-scale dataset handling with support for Hugging Face datasets and efficient data loading patterns. Xorbits datasets module provides scalable data loading and processing capabilities for machine learning workflows.
Core dataset class for handling large datasets with distributed computing support.
class Dataset:
"""
Dataset class for handling large datasets with distributed computing.
Provides efficient loading, processing, and manipulation of datasets
that exceed single-machine memory through distributed processing.
"""Direct integration with Hugging Face datasets for loading popular machine learning datasets.
def from_huggingface(dataset_name: str, **kwargs):
"""
Load datasets from Hugging Face Hub with distributed support.
Parameters:
- dataset_name: str, name of the dataset on Hugging Face Hub
- **kwargs: Additional parameters for dataset loading including:
- split: str, dataset split to load ('train', 'test', 'validation')
- streaming: bool, whether to stream the dataset
- cache_dir: str, directory to cache downloaded files
- revision: str, specific revision/version of the dataset
- use_auth_token: bool or str, authentication token for private datasets
- trust_remote_code: bool, whether to trust remote code execution
Returns:
- Dataset object with distributed computing capabilities
"""Usage Examples:
import xorbits
from xorbits.datasets import Dataset
xorbits.init()
# Create dataset from local data
dataset = Dataset.from_csv('large_dataset.csv')
# Basic dataset operations
filtered_dataset = dataset.filter(lambda x: x['value'] > 100)
mapped_dataset = dataset.map(lambda x: {'processed': x['value'] * 2})
# Dataset info
print(f"Dataset size: {len(dataset)}")
print(f"Dataset columns: {dataset.column_names}")
# Execute operations
result = xorbits.run(mapped_dataset.to_pandas())
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
xorbits.init()
# Load popular datasets from Hugging Face
# Text classification dataset
imdb_dataset = from_huggingface("imdb", split="train")
# Natural language inference dataset
glue_dataset = from_huggingface("glue", "mnli", split="train")
# Image classification dataset
cifar10_dataset = from_huggingface("cifar10", split="train")
# Question answering dataset
squad_dataset = from_huggingface("squad", split="train")
# Load with specific parameters
custom_dataset = from_huggingface(
"my_dataset",
split="train",
cache_dir="/tmp/datasets",
streaming=False,
trust_remote_code=True
)
# Process datasets with distributed computing
processed_imdb = imdb_dataset.map(
lambda example: {
'text_length': len(example['text']),
'label': example['label']
}
)
# Filter large datasets efficiently
long_texts = processed_imdb.filter(lambda x: x['text_length'] > 1000)
# Execute computations
results = xorbits.run(long_texts.to_pandas())
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
import xorbits.pandas as pd
xorbits.init()
# Load dataset
dataset = from_huggingface("imdb", split="train")
# Define preprocessing functions
def tokenize_text(example):
# Tokenization logic here
tokens = example['text'].split()
return {
'tokens': tokens,
'token_count': len(tokens),
'label': example['label']
}
def filter_by_length(example):
return 10 <= example['token_count'] <= 500
# Build preprocessing pipeline
processed_dataset = (dataset
.map(tokenize_text)
.filter(filter_by_length)
)
# Convert to pandas for further processing
df = processed_dataset.to_pandas()
# Additional pandas operations
analysis = df.groupby('label').agg({
'token_count': ['mean', 'std', 'min', 'max']
})
# Execute pipeline
results = xorbits.run(analysis)
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
from xorbits.datasets import Dataset
xorbits.init()
# Load multiple datasets
train_dataset = from_huggingface("imdb", split="train")
test_dataset = from_huggingface("imdb", split="test")
# Load local dataset
local_dataset = Dataset.from_json('local_reviews.json')
# Combine datasets
combined_dataset = Dataset.concatenate([
train_dataset,
test_dataset,
local_dataset
])
# Process combined dataset
def standardize_format(example):
return {
'text': example.get('text', example.get('review', '')),
'sentiment': example.get('label', example.get('sentiment', 0)),
'source': example.get('source', 'unknown')
}
standardized_dataset = combined_dataset.map(standardize_format)
# Analyze dataset composition
source_counts = standardized_dataset.to_pandas().groupby('source').size()
# Execute computation
results = xorbits.run(source_counts)
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
xorbits.init()
# Load large dataset with streaming for memory efficiency
large_dataset = from_huggingface(
"c4",
"en",
split="train",
streaming=True # Stream for very large datasets
)
# Process in batches for efficiency
def batch_process(batch):
# Process batch of examples
processed_batch = []
for example in batch:
processed_example = {
'text_length': len(example['text']),
'url_domain': example['url'].split('//')[1].split('/')[0] if '//' in example['url'] else 'unknown',
'timestamp': example['timestamp']
}
processed_batch.append(processed_example)
return processed_batch
# Apply batch processing
processed_dataset = large_dataset.map(
batch_process,
batched=True,
batch_size=1000
)
# Sample for analysis
sample_data = processed_dataset.take(10000)
# Convert sample to pandas for analysis
sample_df = sample_data.to_pandas()
domain_analysis = sample_df.groupby('url_domain').size().sort_values(ascending=False)
# Execute computation
results = xorbits.run(domain_analysis)
xorbits.shutdown()

Install with Tessl CLI
npx tessl i tessl/pypi-xorbits