# Datasets

Large-scale dataset handling with support for Hugging Face datasets and efficient data loading patterns. The Xorbits datasets module provides scalable loading and processing of datasets for machine learning workflows.

## Capabilities

### Dataset Class

Core dataset class for handling large datasets with distributed computing support.

```python { .api }
class Dataset:
    """
    Dataset class for handling large datasets with distributed computing.

    Provides efficient loading, processing, and manipulation of datasets
    that exceed single-machine memory through distributed processing.
    """
```
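
The usage examples below also rely on several `Dataset` constructors and methods. The following sketch lists them with signatures inferred from those examples rather than from a verified API reference; exact names and parameters may differ between Xorbits releases.

```python
class Dataset:
    # Constructors used in the examples below (inferred, not verified)
    @classmethod
    def from_csv(cls, path, **kwargs): ...
    @classmethod
    def from_json(cls, path, **kwargs): ...
    @classmethod
    def concatenate(cls, datasets): ...

    # Element-wise and batched transformations
    def map(self, fn, batched=False, batch_size=None): ...
    def filter(self, fn): ...

    # Inspection and materialization
    @property
    def column_names(self): ...
    def __len__(self): ...
    def take(self, n): ...
    def to_pandas(self): ...
```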

### Hugging Face Integration

Direct integration with Hugging Face datasets for loading popular machine learning datasets.

```python { .api }
def from_huggingface(dataset_name: str, **kwargs):
    """
    Load a dataset from the Hugging Face Hub with distributed support.

    Parameters:
    - dataset_name: str, name of the dataset on the Hugging Face Hub
    - **kwargs: additional parameters for dataset loading, including:
        - name: str, configuration name for datasets with multiple configurations
        - split: str, dataset split to load ('train', 'test', 'validation')
        - streaming: bool, whether to stream the dataset
        - cache_dir: str, directory to cache downloaded files
        - revision: str, specific revision/version of the dataset
        - use_auth_token: bool or str, authentication token for private datasets
        - trust_remote_code: bool, whether to trust remote code execution

    Returns:
    - Dataset object with distributed computing capabilities
    """
```
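
For datasets that define multiple configurations (for example the GLUE tasks), the configuration must be passed as a keyword argument, since the documented signature takes only the dataset name positionally. A minimal sketch, assuming extra keyword arguments such as `name` are forwarded to the underlying Hugging Face loader:

```python
from xorbits.datasets import from_huggingface

# "glue" hosts many configurations; select one via the (assumed) `name` keyword
mnli = from_huggingface("glue", name="mnli", split="train")
```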

**Usage Examples:**

### Basic Dataset Usage

```python
import xorbits
from xorbits.datasets import Dataset

xorbits.init()

# Create dataset from local data
dataset = Dataset.from_csv('large_dataset.csv')

# Basic dataset operations
filtered_dataset = dataset.filter(lambda x: x['value'] > 100)
mapped_dataset = dataset.map(lambda x: {'processed': x['value'] * 2})

# Dataset info
print(f"Dataset size: {len(dataset)}")
print(f"Dataset columns: {dataset.column_names}")

# Execute operations
result = xorbits.run(mapped_dataset.to_pandas())

xorbits.shutdown()
```
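
Note that Xorbits generally defers execution: the `filter` and `map` calls above record the operations, and the distributed work runs when a result is materialized, for example via `xorbits.run` or when converting to pandas.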

### Hugging Face Integration Examples

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load popular datasets from Hugging Face
# Text classification dataset
imdb_dataset = from_huggingface("imdb", split="train")

# Natural language inference dataset
glue_dataset = from_huggingface("glue", name="mnli", split="train")

# Image classification dataset
cifar10_dataset = from_huggingface("cifar10", split="train")

# Question answering dataset
squad_dataset = from_huggingface("squad", split="train")

# Load with specific parameters
custom_dataset = from_huggingface(
    "my_dataset",
    split="train",
    cache_dir="/tmp/datasets",
    streaming=False,
    trust_remote_code=True
)

# Process datasets with distributed computing
processed_imdb = imdb_dataset.map(
    lambda example: {
        'text_length': len(example['text']),
        'label': example['label']
    }
)

# Filter large datasets efficiently
long_texts = processed_imdb.filter(lambda x: x['text_length'] > 1000)

# Execute computations
results = xorbits.run(long_texts.to_pandas())

xorbits.shutdown()
```

### Dataset Preprocessing Pipeline

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load dataset
dataset = from_huggingface("imdb", split="train")

# Define preprocessing functions
def tokenize_text(example):
    # Simple whitespace tokenization
    tokens = example['text'].split()
    return {
        'tokens': tokens,
        'token_count': len(tokens),
        'label': example['label']
    }

def filter_by_length(example):
    return 10 <= example['token_count'] <= 500

# Build preprocessing pipeline
processed_dataset = (
    dataset
    .map(tokenize_text)
    .filter(filter_by_length)
)

# Convert to pandas for further processing
df = processed_dataset.to_pandas()

# Additional pandas operations
analysis = df.groupby('label').agg({
    'token_count': ['mean', 'std', 'min', 'max']
})

# Execute pipeline
results = xorbits.run(analysis)

xorbits.shutdown()
```

### Working with Multiple Datasets

```python
import xorbits
from xorbits.datasets import Dataset, from_huggingface

xorbits.init()

# Load multiple datasets
train_dataset = from_huggingface("imdb", split="train")
test_dataset = from_huggingface("imdb", split="test")

# Load local dataset
local_dataset = Dataset.from_json('local_reviews.json')

# Combine datasets
combined_dataset = Dataset.concatenate([
    train_dataset,
    test_dataset,
    local_dataset
])

# Process combined dataset
def standardize_format(example):
    return {
        'text': example.get('text', example.get('review', '')),
        'sentiment': example.get('label', example.get('sentiment', 0)),
        'source': example.get('source', 'unknown')
    }

standardized_dataset = combined_dataset.map(standardize_format)

# Analyze dataset composition
source_counts = standardized_dataset.to_pandas().groupby('source').size()

# Execute computation
results = xorbits.run(source_counts)

xorbits.shutdown()
```

### Large-Scale Data Processing

```python
import xorbits
from xorbits.datasets import from_huggingface

xorbits.init()

# Load large dataset with streaming for memory efficiency
large_dataset = from_huggingface(
    "c4",
    name="en",
    split="train",
    streaming=True  # Stream for very large datasets
)

# Process in batches for efficiency
def batch_process(batch):
    # Process a batch of examples
    processed_batch = []
    for example in batch:
        url = example['url']
        processed_example = {
            'text_length': len(example['text']),
            'url_domain': url.split('//')[1].split('/')[0] if '//' in url else 'unknown',
            'timestamp': example['timestamp']
        }
        processed_batch.append(processed_example)
    return processed_batch

# Apply batch processing
processed_dataset = large_dataset.map(
    batch_process,
    batched=True,
    batch_size=1000
)

# Sample for analysis
sample_data = processed_dataset.take(10000)

# Convert sample to pandas for analysis
sample_df = sample_data.to_pandas()
domain_analysis = sample_df.groupby('url_domain').size().sort_values(ascending=False)

# Execute computation
results = xorbits.run(domain_analysis)

xorbits.shutdown()
```