Scalable Python data science, in an API compatible & lightning fast way.
Large-scale dataset handling with support for Hugging Face datasets and efficient data loading patterns. Xorbits datasets module provides scalable data loading and processing capabilities for machine learning workflows.
Core dataset class for handling large datasets with distributed computing support.
class Dataset:
"""
Dataset class for handling large datasets with distributed computing.
Provides efficient loading, processing, and manipulation of datasets
that exceed single-machine memory through distributed processing.
"""Direct integration with Hugging Face datasets for loading popular machine learning datasets.
def from_huggingface(dataset_name: str, **kwargs):
"""
Load datasets from Hugging Face Hub with distributed support.
Parameters:
- dataset_name: str, name of the dataset on Hugging Face Hub
- **kwargs: Additional parameters for dataset loading including:
- split: str, dataset split to load ('train', 'test', 'validation')
- streaming: bool, whether to stream the dataset
- cache_dir: str, directory to cache downloaded files
- revision: str, specific revision/version of the dataset
- use_auth_token: bool or str, authentication token for private datasets
- trust_remote_code: bool, whether to trust remote code execution
Returns:
- Dataset object with distributed computing capabilities
"""Usage Examples:
import xorbits
from xorbits.datasets import Dataset
xorbits.init()
# Create dataset from local data
dataset = Dataset.from_csv('large_dataset.csv')
# Basic dataset operations
filtered_dataset = dataset.filter(lambda x: x['value'] > 100)
mapped_dataset = dataset.map(lambda x: {'processed': x['value'] * 2})
# Dataset info
print(f"Dataset size: {len(dataset)}")
print(f"Dataset columns: {dataset.column_names}")
# Execute operations
result = xorbits.run(mapped_dataset.to_pandas())
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
xorbits.init()
# Load popular datasets from Hugging Face
# Text classification dataset
imdb_dataset = from_huggingface("imdb", split="train")
# Natural language inference dataset
glue_dataset = from_huggingface("glue", "mnli", split="train")
# Image classification dataset
cifar10_dataset = from_huggingface("cifar10", split="train")
# Question answering dataset
squad_dataset = from_huggingface("squad", split="train")
# Load with specific parameters
custom_dataset = from_huggingface(
"my_dataset",
split="train",
cache_dir="/tmp/datasets",
streaming=False,
trust_remote_code=True
)
# Process datasets with distributed computing
processed_imdb = imdb_dataset.map(
lambda example: {
'text_length': len(example['text']),
'label': example['label']
}
)
# Filter large datasets efficiently
long_texts = processed_imdb.filter(lambda x: x['text_length'] > 1000)
# Execute computations
results = xorbits.run(long_texts.to_pandas())
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
import xorbits.pandas as pd
xorbits.init()
# Load dataset
dataset = from_huggingface("imdb", split="train")
# Define preprocessing functions
def tokenize_text(example):
# Tokenization logic here
tokens = example['text'].split()
return {
'tokens': tokens,
'token_count': len(tokens),
'label': example['label']
}
def filter_by_length(example):
return 10 <= example['token_count'] <= 500
# Build preprocessing pipeline
processed_dataset = (dataset
.map(tokenize_text)
.filter(filter_by_length)
)
# Convert to pandas for further processing
df = processed_dataset.to_pandas()
# Additional pandas operations
analysis = df.groupby('label').agg({
'token_count': ['mean', 'std', 'min', 'max']
})
# Execute pipeline
results = xorbits.run(analysis)
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
from xorbits.datasets import Dataset
xorbits.init()
# Load multiple datasets
train_dataset = from_huggingface("imdb", split="train")
test_dataset = from_huggingface("imdb", split="test")
# Load local dataset
local_dataset = Dataset.from_json('local_reviews.json')
# Combine datasets
combined_dataset = Dataset.concatenate([
train_dataset,
test_dataset,
local_dataset
])
# Process combined dataset
def standardize_format(example):
return {
'text': example.get('text', example.get('review', '')),
'sentiment': example.get('label', example.get('sentiment', 0)),
'source': example.get('source', 'unknown')
}
standardized_dataset = combined_dataset.map(standardize_format)
# Analyze dataset composition
source_counts = standardized_dataset.to_pandas().groupby('source').size()
# Execute computation
results = xorbits.run(source_counts)
xorbits.shutdown()

import xorbits
from xorbits.datasets import from_huggingface
xorbits.init()
# Load large dataset with streaming for memory efficiency
large_dataset = from_huggingface(
"c4",
"en",
split="train",
streaming=True # Stream for very large datasets
)
# Process in batches for efficiency
def batch_process(batch):
# Process batch of examples
processed_batch = []
for example in batch:
processed_example = {
'text_length': len(example['text']),
'url_domain': example['url'].split('//')[1].split('/')[0] if '//' in example['url'] else 'unknown',
'timestamp': example['timestamp']
}
processed_batch.append(processed_example)
return processed_batch
# Apply batch processing
processed_dataset = large_dataset.map(
batch_process,
batched=True,
batch_size=1000
)
# Sample for analysis
sample_data = processed_dataset.take(10000)
# Convert sample to pandas for analysis
sample_df = sample_data.to_pandas()
domain_analysis = sample_df.groupby('url_domain').size().sort_values(ascending=False)
# Execute computation
results = xorbits.run(domain_analysis)
xorbits.shutdown()

Install with Tessl CLI
npx tessl i tessl/pypi-xorbits