# Dataset Utilities

CatBoost includes built-in datasets for testing, learning, and benchmarking machine learning algorithms. These datasets cover various domains including classification, regression, and ranking tasks, with proper preprocessing and metadata.

## Capabilities

### Built-in Dataset Loading Functions

Pre-processed datasets ready for immediate use with CatBoost models.

```python { .api }
def titanic():
    """
    Load the famous Titanic survival dataset for binary classification.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with features and 'Survived' target
            - test_df: Test DataFrame with features (no target)

    Features:
        - Passenger class, sex, age, siblings/spouses, parents/children
        - Fare, embarked port, cabin, ticket information
        - Mixed categorical and numerical features
        - Target: Binary survival (0/1)
    """

def amazon():
    """
    Load Amazon employee access dataset for binary classification.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with features and 'ACTION' target
            - test_df: Test DataFrame with features (no target)

    Features:
        - Employee resource access request attributes
        - All categorical features (role, department, etc.)
        - Target: Binary access approval (0/1)
    """

def adult():
    """
    Load Adult (Census Income) dataset for binary classification.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with features and income target
            - test_df: Test DataFrame with features (no target)

    Features:
        - Demographics (age, workclass, education, marital status)
        - Work information (occupation, relationship, race, sex)
        - Financial information (capital gain/loss, hours per week)
        - Mixed categorical and numerical features
        - Target: Binary income level (<=50K, >50K)
    """

def epsilon():
    """
    Load Epsilon dataset for binary classification (large-scale dataset).

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame (400,000 samples)
            - test_df: Test DataFrame (100,000 samples)

    Features:
        - 2000 numerical features
        - Sparse feature representation
        - Target: Binary classification (0/1)
        - Commonly used for large-scale ML benchmarking
    """

def higgs():
    """
    Load HIGGS dataset for binary classification (physics domain).

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame (10.5M samples)
            - test_df: Test DataFrame (500K samples)

    Features:
        - 28 numerical features from particle physics simulations
        - High-energy physics particle collision data
        - Target: Binary classification (signal/background)
        - Benchmark for large-scale classification
    """
```
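Each loader returns a `(train_df, test_df)` pair of pandas DataFrames, so standard pandas inspection works before any CatBoost-specific code. A minimal sketch (using `adult()`; any of the loaders above behaves the same way):

```python
from catboost.datasets import adult

# Every loader returns a (train_df, test_df) pair of pandas DataFrames
train_df, test_df = adult()

# Inspect shapes and dtypes before wiring the data into a model
print(train_df.shape, test_df.shape)
print(train_df.dtypes.value_counts())
```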
### Text and Sentiment Datasets

Datasets specifically designed for text classification and sentiment analysis tasks.

```python { .api }
def imdb():
    """
    Load IMDB movie reviews dataset for sentiment classification.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with 'text' and 'label' columns
            - test_df: Test DataFrame with 'text' and 'label' columns

    Features:
        - Movie review text (strings)
        - Preprocessed and cleaned text data
        - Target: Binary sentiment (positive/negative)
        - Suitable for text feature processing in CatBoost
    """

def rotten_tomatoes():
    """
    Load Rotten Tomatoes movie reviews for sentiment classification.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with review text and sentiment
            - test_df: Test DataFrame with review text and sentiment

    Features:
        - Short movie review snippets
        - Text preprocessing for CatBoost text features
        - Target: Binary sentiment classification
        - Smaller dataset compared to IMDB
    """
```
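`rotten_tomatoes()` plugs into the same `Pool`/`text_features` workflow shown in the IMDB example later in this document. Since the docstring above does not pin down column names, a cautious first step is to inspect the frame; a minimal sketch:

```python
from catboost.datasets import rotten_tomatoes

train_df, test_df = rotten_tomatoes()

# The exact column layout is not specified above, so inspect it first,
# then pass the text column(s) via text_features as in the IMDB example
print(train_df.columns.tolist())
print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")
```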
### Ranking Datasets

Specialized datasets for learning-to-rank and information retrieval tasks.

```python { .api }
def msrank():
    """
    Load Microsoft Learning-to-Rank dataset (full version).

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with features, relevance, and query_id
            - test_df: Test DataFrame with features, relevance, and query_id

    Features:
        - 136 numerical features from web search
        - Query-document relevance scores (0-4 scale)
        - Query group identifiers for ranking evaluation
        - Standard benchmark for learning-to-rank algorithms
    """

def msrank_10k():
    """
    Load Microsoft Learning-to-Rank dataset (10K subset).

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame (subset of msrank)
            - test_df: Test DataFrame (subset of msrank)

    Features:
        - Same features as msrank() but smaller size
        - Suitable for quick testing and prototyping
        - Maintains query group structure for ranking
    """
```
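Before training a ranker, it helps to confirm the query-group structure, since CatBoost expects rows belonging to the same group to be contiguous when a `group_id` is supplied. A short sketch, assuming the `query_id` column name used in the docstrings above:

```python
from catboost.datasets import msrank_10k

train_df, _ = msrank_10k()

# Per-query document counts; ranking metrics like NDCG are computed
# within these groups
group_sizes = train_df.groupby('query_id').size()
print(f"Queries: {len(group_sizes)}")
print(f"Docs per query - min: {group_sizes.min()}, "
      f"median: {group_sizes.median()}, max: {group_sizes.max()}")
```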
### Synthetic and Mathematical Datasets

Datasets with known mathematical properties for algorithm testing.

```python { .api }
def monotonic1():
    """
    Load first monotonic regression dataset.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with monotonic relationships
            - test_df: Test DataFrame for evaluation

    Features:
        - Features with known monotonic relationships to target
        - Useful for testing monotonic constraints in CatBoost
        - Synthetic data with controlled properties
    """

def monotonic2():
    """
    Load second monotonic regression dataset.

    Returns:
        tuple: (train_df, test_df)
            - train_df: Training DataFrame with different monotonic patterns
            - test_df: Test DataFrame for evaluation

    Features:
        - Alternative monotonic feature patterns
        - Complementary to monotonic1() for comprehensive testing
        - Different complexity and noise levels
    """
```
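These datasets pair naturally with CatBoost's `monotone_constraints` parameter. The sketch below is illustrative only: it assumes the target can be located by inspecting the columns (the name is not fixed by the docstrings above), and it constrains every numeric feature to be non-decreasing, a choice made for the example rather than a known property of the data:

```python
from catboost import CatBoostRegressor
from catboost.datasets import monotonic1

train_df, test_df = monotonic1()
print(train_df.columns.tolist())  # locate the target column first

# Hypothetical assumption: the first column is the target
target_col = train_df.columns[0]
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

# Monotone constraints apply to numeric features: +1 forces a
# non-decreasing relationship, -1 non-increasing, 0 unconstrained
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()

model = CatBoostRegressor(
    iterations=100,
    monotone_constraints=[1] * len(num_cols),
    verbose=False
)
model.fit(X_train[num_cols], y_train)
```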
### Dataset Cache Management

Functions for managing dataset storage and caching.

```python { .api }
def set_cache_path(path):
    """
    Set the cache directory for downloaded datasets.

    Parameters:
        - path: Directory path for caching datasets (string)
            - Must be a writable directory
            - Datasets will be downloaded and stored here
            - Subsequent calls will use cached versions

    Example:
        set_cache_path('/path/to/dataset/cache')
    """
```
## Dataset Usage Examples

### Basic Dataset Loading

```python
from catboost.datasets import titanic
from catboost import CatBoostClassifier

# Load Titanic dataset
train_df, test_df = titanic()
print(f"Titanic - Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# Prepare features and target
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

# Declare every non-numeric column as categorical: CatBoost rejects
# object-dtype columns that are not listed in cat_features
cat_features = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Categorical features must not contain NaN, so fill missing values first
X_train[cat_features] = X_train[cat_features].fillna('Unknown')

# Train model
model = CatBoostClassifier(
    iterations=100,
    verbose=False,
    cat_features=cat_features
)

model.fit(X_train, y_train)
print("Model trained on Titanic dataset")
```
### Text Dataset Processing

```python
from catboost.datasets import imdb
from catboost import CatBoostClassifier, Pool

# Load IMDB dataset
train_df, test_df = imdb()
print(f"IMDB - Train shape: {train_df.shape}")

# Create pools with text features; drop the label column from the
# feature data so it does not leak into training
train_pool = Pool(
    data=train_df.drop(columns=['label']),
    label=train_df['label'],
    text_features=['text']  # Specify text column
)

test_pool = Pool(
    data=test_df.drop(columns=['label']),
    label=test_df['label'],
    text_features=['text']
)

# Train model with text processing
model = CatBoostClassifier(
    iterations=200,
    verbose=50,
    text_processing={
        'tokenizers': [{'tokenizer_id': 'Space', 'separator_type': 'ByDelimiter', 'delimiter': ' '}],
        'dictionaries': [{'dictionary_id': 'Word', 'max_dictionary_size': '50000'}],
        'feature_processing': {
            'default': [{'dictionaries_names': ['Word'], 'feature_calcers': ['BoW']}]
        }
    }
)

model.fit(train_pool, eval_set=test_pool)
print("Model trained on IMDB text data")
```
### Ranking Dataset Usage

```python
from catboost.datasets import msrank_10k
from catboost import CatBoostRanker, Pool

# Load ranking dataset
train_df, test_df = msrank_10k()
print(f"MSRank 10K - Train shape: {train_df.shape}")

# Extract features, labels, and group IDs
feature_cols = [col for col in train_df.columns if col not in ['label', 'query_id']]
X_train = train_df[feature_cols]
y_train = train_df['label']
group_id_train = train_df['query_id']

X_test = test_df[feature_cols]
y_test = test_df['label']
group_id_test = test_df['query_id']

# Create pools for ranking
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=group_id_train
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=group_id_test
)

# Train ranking model
ranker = CatBoostRanker(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='YetiRank',
    eval_metric='NDCG',
    verbose=50
)

ranker.fit(train_pool, eval_set=test_pool)
print("Ranking model trained on MSRank dataset")
```
### Large Dataset Handling

```python
from catboost.datasets import epsilon, set_cache_path
from catboost import CatBoostClassifier, Pool
import os

# Set cache directory for large datasets
cache_dir = '/tmp/catboost_datasets'
os.makedirs(cache_dir, exist_ok=True)
set_cache_path(cache_dir)

# Load large dataset (this may take time on first run)
print("Loading epsilon dataset...")
train_df, test_df = epsilon()
print(f"Epsilon - Train: {train_df.shape}, Test: {test_df.shape}")

# For very large datasets, consider using file-based training:
# save to files and construct Pool objects from file paths
train_df.to_csv('epsilon_train.tsv', sep='\t', index=False)
test_df.to_csv('epsilon_test.tsv', sep='\t', index=False)

# Create pools from files for memory efficiency. Without a column
# description file, CatBoost treats the first column as the label.
train_pool = Pool('epsilon_train.tsv', delimiter='\t', has_header=True)
test_pool = Pool('epsilon_test.tsv', delimiter='\t', has_header=True)

# Train with limited memory usage
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    verbose=25,
    used_ram_limit='4gb'  # Limit RAM usage
)

model.fit(train_pool, eval_set=test_pool)
print("Large dataset model training completed")
```
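If the label is not in the first column, or some columns are categorical, file-based pools need a column description (CD) file that maps column indices to roles. A brief sketch with a hypothetical layout (column 0 as target, column 5 categorical; epsilon itself is all-numeric, so this is for illustration only):

```python
from catboost import Pool

# Each CD line is "<column index>\t<role>"; roles include Label, Num,
# Categ, GroupId, and Auxiliary. Unlisted columns default to Num.
with open('train.cd', 'w') as f:
    f.write("0\tLabel\n")  # column 0 holds the target
    f.write("5\tCateg\n")  # hypothetical: column 5 is categorical

train_pool = Pool(
    'epsilon_train.tsv',
    column_description='train.cd',
    delimiter='\t',
    has_header=True
)
```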
### Dataset Comparison and Analysis

```python
from catboost.datasets import titanic, adult, amazon

def analyze_dataset(load_func, name, target_col):
    """Analyze a built-in CatBoost dataset."""
    train_df, test_df = load_func()

    print(f"\n{name} Dataset Analysis:")
    print(f"  Train shape: {train_df.shape}")
    print(f"  Test shape: {test_df.shape}")
    print(f"  Features: {train_df.shape[1] - 1}")  # Excluding target

    # Identify column types
    numeric_cols = train_df.select_dtypes(include=['number']).columns
    categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns

    print(f"  Numeric features: {len(numeric_cols)}")
    print(f"  Categorical features: {len(categorical_cols)}")

    # Target analysis
    if target_col in train_df.columns:
        target_unique = train_df[target_col].nunique()
        print(f"  Target classes: {target_unique}")
        print(f"  Target distribution: {dict(train_df[target_col].value_counts())}")

# Analyze multiple datasets, naming each target column explicitly
# (the target is not always the last column, e.g. Titanic's 'Survived')
datasets = [
    (titanic, "Titanic", "Survived"),
    (adult, "Adult", "income"),
    (amazon, "Amazon", "ACTION")
]

for load_func, name, target_col in datasets:
    analyze_dataset(load_func, name, target_col)
```
### Custom Dataset Cache Management

```python
from catboost.datasets import set_cache_path, titanic
import os

# Set custom cache location
custom_cache = "/home/user/ml_datasets"
os.makedirs(custom_cache, exist_ok=True)
set_cache_path(custom_cache)

print(f"Cache path set to: {custom_cache}")

# Load dataset (will cache in the new location)
train_df, test_df = titanic()

# List cached files
cache_files = os.listdir(custom_cache)
print(f"Cached files: {cache_files}")
```