<!-- Source: docs/machine-learning.md -->

# Machine Learning

Distributed machine learning capabilities through sklearn, XGBoost, and LightGBM integrations. Xorbits enables scalable model training and prediction on large datasets that exceed single-machine memory.

## Capabilities

### Scikit-learn Integration

Complete sklearn API with distributed computing capabilities across all major machine learning algorithms and utilities.

```python { .api }
# Core sklearn submodules - all classes and functions available via dynamic import
from xorbits.sklearn import cluster          # Clustering algorithms
from xorbits.sklearn import datasets         # Dataset loading utilities
from xorbits.sklearn import decomposition    # Matrix decomposition algorithms
from xorbits.sklearn import ensemble         # Ensemble methods
from xorbits.sklearn import linear_model     # Linear models
from xorbits.sklearn import metrics          # Model evaluation metrics
from xorbits.sklearn import model_selection  # Model selection and validation
from xorbits.sklearn import neighbors        # Nearest neighbors algorithms
from xorbits.sklearn import preprocessing    # Data preprocessing
from xorbits.sklearn import semi_supervised  # Semi-supervised learning
```

Example clustering algorithm:

```python { .api }
class KMeans:
    """
    K-Means clustering algorithm with distributed computing support.

    Provides the same API as sklearn.cluster.KMeans but enables
    clustering of large datasets across multiple workers.
    """
```

### XGBoost Integration

XGBoost distributed training and prediction with Xorbits data structures.

```python { .api }
# XGBoost classes and functions available via dynamic import
class DMatrix:
    """
    Data matrix for XGBoost with distributed computing support.

    Equivalent to xgboost.DMatrix but works with Xorbits
    distributed arrays and DataFrames.
    """

def train(params, dtrain, **kwargs):
    """
    Train XGBoost model with distributed data.

    Parameters:
    - params: dict, XGBoost parameters
    - dtrain: DMatrix, training data
    - **kwargs: Additional training parameters

    Returns:
    - Trained XGBoost model
    """

def predict(model, dtest, **kwargs):
    """
    Make predictions with XGBoost model.

    Parameters:
    - model: Trained XGBoost model
    - dtest: DMatrix, test data
    - **kwargs: Additional prediction parameters

    Returns:
    - Predictions array
    """
```

### LightGBM Integration

LightGBM distributed training with Xorbits data structures.

```python { .api }
# LightGBM classes and functions available via dynamic import
class Dataset:
    """
    LightGBM dataset with distributed computing support.

    Equivalent to lightgbm.Dataset but works with Xorbits
    distributed arrays and DataFrames.
    """

def train(params, train_set, **kwargs):
    """
    Train LightGBM model with distributed data.

    Parameters:
    - params: dict, LightGBM parameters
    - train_set: Dataset, training data
    - **kwargs: Additional training parameters

    Returns:
    - Trained LightGBM model
    """
```

**Usage Examples:**

### Scikit-learn Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.numpy as np
from xorbits.sklearn.cluster import KMeans
from xorbits.sklearn.model_selection import train_test_split
from xorbits.sklearn.preprocessing import StandardScaler
from xorbits.sklearn.linear_model import LinearRegression
from xorbits.sklearn.metrics import accuracy_score

xorbits.init()

# Load large dataset
data = pd.read_csv('large_dataset.csv')
X = data.drop('target', axis=1)
y = data['target']

# Preprocessing with distributed computing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Clustering
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(X_train)

# Linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, predictions > 0.5)

# Execute computations
results = xorbits.run(clusters, predictions, accuracy)

xorbits.shutdown()
```

### XGBoost Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.xgboost as xgb

xorbits.init()

# Load distributed data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Prepare XGBoost data matrices
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train model with distributed computing
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

# Make predictions
predictions = xgb.predict(model, dtest)

# Execute computation
computed_predictions = xorbits.run(predictions)

xorbits.shutdown()
```

### LightGBM Examples

```python
import xorbits
import xorbits.pandas as pd
import xorbits.lightgbm as lgb

xorbits.init()

# Load distributed data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Prepare LightGBM datasets
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

train_dataset = lgb.Dataset(X_train, label=y_train)
test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)

# Set LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train model with distributed computing
model = lgb.train(
    params,
    train_dataset,
    valid_sets=[test_dataset],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10)]
)

# Make predictions
predictions = model.predict(X_test)

# Execute computation
computed_predictions = xorbits.run(predictions)

xorbits.shutdown()
```

### Advanced ML Pipeline Example

```python
import xorbits
import xorbits.pandas as pd
from xorbits.sklearn.pipeline import Pipeline
from xorbits.sklearn.preprocessing import StandardScaler, OneHotEncoder
from xorbits.sklearn.compose import ColumnTransformer
from xorbits.sklearn.ensemble import RandomForestClassifier
from xorbits.sklearn.model_selection import GridSearchCV, cross_val_score

xorbits.init()

# Load large dataset
data = pd.read_csv('large_ml_dataset.csv')
X = data.drop('target', axis=1)
y = data['target']

# Define preprocessing for different column types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Create ML pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning with distributed computing
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit with distributed computing
grid_search.fit(X, y)

# Cross-validation scores
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)

# Execute computations
results = xorbits.run(grid_search.best_params_, cv_scores)

xorbits.shutdown()
```