# Machine Learning

Distributed machine learning capabilities through sklearn, XGBoost, and LightGBM integrations. Xorbits enables scalable model training and prediction on large datasets that exceed single-machine memory.

## Capabilities

### Scikit-learn Integration

Complete sklearn API with distributed computing capabilities across all major machine learning algorithms and utilities.

```python { .api }
# Core sklearn submodules - all classes and functions available via dynamic import
from xorbits.sklearn import cluster # Clustering algorithms
from xorbits.sklearn import datasets # Dataset loading utilities
from xorbits.sklearn import decomposition # Matrix decomposition algorithms
from xorbits.sklearn import ensemble # Ensemble methods
from xorbits.sklearn import linear_model # Linear models
from xorbits.sklearn import metrics # Model evaluation metrics
from xorbits.sklearn import model_selection # Model selection and validation
from xorbits.sklearn import neighbors # Nearest neighbors algorithms
from xorbits.sklearn import preprocessing # Data preprocessing
from xorbits.sklearn import semi_supervised # Semi-supervised learning
```

Example clustering algorithm:

```python { .api }
class KMeans:
    """
    K-Means clustering algorithm with distributed computing support.

    Provides the same API as sklearn.cluster.KMeans but enables
    clustering of large datasets across multiple workers.
    """
```

### XGBoost Integration

XGBoost distributed training and prediction with Xorbits data structures.

```python { .api }
# XGBoost classes and functions available via dynamic import
class DMatrix:
    """
    Data matrix for XGBoost with distributed computing support.

    Equivalent to xgboost.DMatrix but works with Xorbits
    distributed arrays and DataFrames.
    """

def train(params, dtrain, **kwargs):
    """
    Train XGBoost model with distributed data.

    Parameters:
    - params: dict, XGBoost parameters
    - dtrain: DMatrix, training data
    - **kwargs: Additional training parameters

    Returns:
    - Trained XGBoost model
    """

def predict(model, dtest, **kwargs):
    """
    Make predictions with XGBoost model.

    Parameters:
    - model: Trained XGBoost model
    - dtest: DMatrix, test data
    - **kwargs: Additional prediction parameters

    Returns:
    - Predictions array
    """
```

### LightGBM Integration

LightGBM distributed training with Xorbits data structures.

```python { .api }
# LightGBM classes and functions available via dynamic import
class Dataset:
    """
    LightGBM dataset with distributed computing support.

    Equivalent to lightgbm.Dataset but works with Xorbits
    distributed arrays and DataFrames.
    """

def train(params, train_set, **kwargs):
    """
    Train LightGBM model with distributed data.

    Parameters:
    - params: dict, LightGBM parameters
    - train_set: Dataset, training data
    - **kwargs: Additional training parameters

    Returns:
    - Trained LightGBM model
    """
```

**Usage Examples:**

### Scikit-learn Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.numpy as np
from xorbits.sklearn.cluster import KMeans
from xorbits.sklearn.model_selection import train_test_split
from xorbits.sklearn.preprocessing import StandardScaler
from xorbits.sklearn.linear_model import LinearRegression
from xorbits.sklearn.metrics import accuracy_score

xorbits.init()

# Load large dataset
data = pd.read_csv('large_dataset.csv')
X = data.drop('target', axis=1)
y = data['target']

# Preprocessing with distributed computing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Clustering
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(X_train)

# Linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, predictions > 0.5)

# Execute computations
results = xorbits.run(clusters, predictions, accuracy)

xorbits.shutdown()
```

### XGBoost Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.xgboost as xgb

xorbits.init()

# Load distributed data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Prepare XGBoost data matrices
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train model with distributed computing
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

# Make predictions
predictions = xgb.predict(model, dtest)

# Execute computation
computed_predictions = xorbits.run(predictions)

xorbits.shutdown()
```

### LightGBM Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.lightgbm as lgb

xorbits.init()

# Load distributed data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Prepare LightGBM datasets
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

train_dataset = lgb.Dataset(X_train, label=y_train)
test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)

# Set LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train model with distributed computing
model = lgb.train(
    params,
    train_dataset,
    valid_sets=[test_dataset],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10)]
)

# Make predictions
predictions = model.predict(X_test)

# Execute computation
computed_predictions = xorbits.run(predictions)

xorbits.shutdown()
```

### Advanced ML Pipeline Example

```python
import xorbits
import xorbits.pandas as pd
from xorbits.sklearn.pipeline import Pipeline
from xorbits.sklearn.preprocessing import StandardScaler, OneHotEncoder
from xorbits.sklearn.compose import ColumnTransformer
from xorbits.sklearn.ensemble import RandomForestClassifier
from xorbits.sklearn.model_selection import GridSearchCV, cross_val_score

xorbits.init()

# Load large dataset
data = pd.read_csv('large_ml_dataset.csv')
X = data.drop('target', axis=1)
y = data['target']

# Define preprocessing for different column types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Create ML pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning with distributed computing
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit with distributed computing
grid_search.fit(X, y)

# Cross-validation scores
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)

# Execute computations
results = xorbits.run(grid_search.best_params_, cv_scores)

xorbits.shutdown()
```