0
# Scikit-learn Interface
1
2
High-level, sklearn-compatible interface for gradient boosting tasks. These classes provide familiar scikit-learn APIs with automatic hyperparameter handling, data preprocessing, and integration with the broader sklearn ecosystem.
3
4
## Capabilities
5
6
### Base Model Interface
7
8
The foundational class that provides common functionality for all LightGBM sklearn-style estimators.
9
10
```python { .api }
11
class LGBMModel:
12
"""
13
Base class for LightGBM sklearn-style estimators.
14
15
Common parameters:
16
- boosting_type: str, default='gbdt' - Type of boosting ('gbdt', 'dart', 'goss', 'rf')
17
- num_leaves: int, default=31 - Maximum tree leaves for base learners
18
- max_depth: int, default=-1 - Maximum tree depth for base learners (-1 means no limit)
19
- learning_rate: float, default=0.1 - Boosting learning rate
20
- n_estimators: int, default=100 - Number of boosted trees to fit
21
- subsample_for_bin: int, default=200000 - Number of samples for constructing bins
22
- objective: str or callable, default=None - Specify the learning task and loss function
23
- class_weight: dict, 'balanced' or None, default=None - Weights associated with classes
24
- min_split_gain: float, default=0. - Minimum loss reduction required to make split
25
- min_child_weight: float, default=1e-3 - Minimum sum of instance weight in a child
26
- min_child_samples: int, default=20 - Minimum number of data needed in a child
27
- subsample: float, default=1. - Subsample ratio of the training instance
28
- subsample_freq: int, default=0 - Frequency of subsampling (bagging) in iterations; values <= 0 disable it
29
- colsample_bytree: float, default=1. - Subsample ratio of columns when constructing each tree
30
- reg_alpha: float, default=0. - L1 regularization term on weights
31
- reg_lambda: float, default=0. - L2 regularization term on weights
32
- random_state: int, RandomState object or None, default=None - Random number seed
33
- n_jobs: int, default=None - Number of parallel threads
34
- importance_type: str, default='split' - Feature importance type ('split', 'gain')
35
"""
36
37
def fit(self, X, y, sample_weight=None, init_score=None, eval_set=None,
38
eval_names=None, eval_sample_weight=None, eval_init_score=None,
39
eval_metric=None, feature_name='auto', categorical_feature='auto',
40
early_stopping_rounds=None, verbose=True, log_evaluation=None,
41
callbacks=None):
42
"""
43
Fit the gradient boosting model.
44
45
Parameters:
46
- X: array-like, shape=(n_samples, n_features) - Input features
47
- y: array-like, shape=(n_samples,) - Target values
48
- sample_weight: array-like, shape=(n_samples,), optional - Sample weights
49
- init_score: array-like, shape=(n_samples,), optional - Initial prediction scores
50
- eval_set: list of (X, y) tuples, optional - Evaluation datasets
51
- eval_names: list of strings, optional - Names for evaluation datasets
52
- eval_sample_weight: list of arrays, optional - Sample weights for evaluation sets
53
- eval_init_score: list of arrays, optional - Initial scores for evaluation sets
54
- eval_metric: str, list of str, or None, optional - Evaluation metrics
55
- feature_name: list of strings or 'auto', optional - Feature names
56
- categorical_feature: list of strings/ints or 'auto', optional - Categorical features
57
- early_stopping_rounds: int or None, optional - Early stopping rounds
58
- verbose: bool or int, optional - Controls verbosity of training
59
- log_evaluation: bool, int, or None, optional - Evaluation logging frequency
60
- callbacks: list of callback functions, optional - Custom callbacks
61
62
Returns:
63
- self: Returns self
64
"""
65
66
def predict(self, X, num_iteration=None, **kwargs):
67
"""
68
Make predictions on input data.
69
70
Parameters:
71
- X: array-like, shape=(n_samples, n_features) - Input features
72
- num_iteration: int or None, optional - Limit number of iterations for prediction
73
74
Returns:
75
- array-like, shape=(n_samples,) - Predicted values
76
"""
77
78
@property
79
def booster_(self):
80
"""Get the underlying Booster object."""
81
82
@property
83
def feature_importances_(self):
84
"""Get feature importances array."""
85
86
@property
87
def feature_name_(self):
88
"""Get feature names list."""
89
90
@property
91
def n_features_(self):
92
"""Get number of features."""
93
94
@property
95
def objective_(self):
96
"""Get the concrete objective used by this model."""
97
```
98
99
### Regression
100
101
LightGBM regressor for continuous target variables. Optimized for regression tasks with support for various loss functions and evaluation metrics.
102
103
```python { .api }
104
class LGBMRegressor(LGBMModel):
105
"""
106
LightGBM regressor for regression tasks.
107
108
Additional parameters:
109
- objective: str, default='regression' - Regression objective ('regression', 'regression_l1', 'huber', 'quantile', etc.)
110
"""
111
112
def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
113
learning_rate=0.1, n_estimators=100, subsample_for_bin=200000,
114
objective=None, class_weight=None, min_split_gain=0.,
115
min_child_weight=1e-3, min_child_samples=20, subsample=1.,
116
subsample_freq=0, colsample_bytree=1., reg_alpha=0.,
117
reg_lambda=0., random_state=None, n_jobs=None,
118
importance_type='split', **kwargs):
119
"""Initialize LGBMRegressor with regression-specific defaults."""
120
121
def fit(self, X, y, **kwargs):
122
"""Fit regression model. Inherits from LGBMModel.fit()."""
123
124
def predict(self, X, num_iteration=None, **kwargs):
125
"""
126
Predict regression target for X.
127
128
Returns:
129
- array-like, shape=(n_samples,) - Predicted regression values
130
"""
131
132
def score(self, X, y, sample_weight=None):
133
"""
134
Return the coefficient of determination R^2 of the prediction.
135
136
Parameters:
137
- X: array-like, shape=(n_samples, n_features) - Test samples
138
- y: array-like, shape=(n_samples,) - True values for X
139
- sample_weight: array-like, shape=(n_samples,), optional - Sample weights
140
141
Returns:
142
- float: R^2 of self.predict(X) wrt. y
143
"""
144
```
145
146
### Classification
147
148
LightGBM classifier for discrete target variables. Supports both binary and multiclass classification with probability estimation and class prediction.
149
150
```python { .api }
151
class LGBMClassifier(LGBMModel):
152
"""
153
LightGBM classifier for classification tasks.
154
155
Additional parameters:
156
- objective: str, default='binary' or 'multiclass' - Classification objective
157
"""
158
159
def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
160
learning_rate=0.1, n_estimators=100, subsample_for_bin=200000,
161
objective=None, class_weight=None, min_split_gain=0.,
162
min_child_weight=1e-3, min_child_samples=20, subsample=1.,
163
subsample_freq=0, colsample_bytree=1., reg_alpha=0.,
164
reg_lambda=0., random_state=None, n_jobs=None,
165
importance_type='split', **kwargs):
166
"""Initialize LGBMClassifier with classification-specific defaults."""
167
168
def fit(self, X, y, **kwargs):
169
"""Fit classification model. Inherits from LGBMModel.fit()."""
170
171
def predict(self, X, num_iteration=None, **kwargs):
172
"""
173
Predict class labels for X.
174
175
Returns:
176
- array-like, shape=(n_samples,) - Predicted class labels
177
"""
178
179
def predict_proba(self, X, num_iteration=None, **kwargs):
180
"""
181
Predict class probabilities for X.
182
183
Returns:
184
- array-like, shape=(n_samples, n_classes) - Class probabilities
185
"""
186
187
def score(self, X, y, sample_weight=None):
188
"""
189
Return the mean accuracy on the given test data and labels.
190
191
Parameters:
192
- X: array-like, shape=(n_samples, n_features) - Test samples
193
- y: array-like, shape=(n_samples,) - True labels for X
194
- sample_weight: array-like, shape=(n_samples,), optional - Sample weights
195
196
Returns:
197
- float: Mean accuracy of self.predict(X) wrt. y
198
"""
199
200
@property
201
def classes_(self):
202
"""Get unique class labels."""
203
204
@property
205
def n_classes_(self):
206
"""Get number of classes."""
207
```
208
209
### Ranking
210
211
LightGBM ranker for learning-to-rank tasks. Optimized for ranking scenarios where the goal is to order items rather than predict absolute values.
212
213
```python { .api }
214
class LGBMRanker(LGBMModel):
215
"""
216
LightGBM ranker for learning-to-rank tasks.
217
218
Additional parameters:
219
- objective: str, default='lambdarank' - Ranking objective ('lambdarank', 'rank_xendcg')
220
"""
221
222
def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
223
learning_rate=0.1, n_estimators=100, subsample_for_bin=200000,
224
objective=None, class_weight=None, min_split_gain=0.,
225
min_child_weight=1e-3, min_child_samples=20, subsample=1.,
226
subsample_freq=0, colsample_bytree=1., reg_alpha=0.,
227
reg_lambda=0., random_state=None, n_jobs=None,
228
importance_type='split', **kwargs):
229
"""Initialize LGBMRanker with ranking-specific defaults."""
230
231
def fit(self, X, y, group=None, **kwargs):
232
"""
233
Fit ranking model.
234
235
Parameters:
236
- X: array-like, shape=(n_samples, n_features) - Input features
237
- y: array-like, shape=(n_samples,) - Target ranking scores
238
- group: array-like, shape=(n_groups,) - Number of samples in each query group, in data order; the sizes must sum to n_samples
239
"""
240
241
def predict(self, X, num_iteration=None, **kwargs):
242
"""
243
Predict ranking scores for X.
244
245
Returns:
246
- array-like, shape=(n_samples,) - Predicted ranking scores
247
"""
248
249
def score(self, X, y, sample_weight=None):
250
"""
251
Return the ranking evaluation score.
252
253
Parameters:
254
- X: array-like, shape=(n_samples, n_features) - Test samples
255
- y: array-like, shape=(n_samples,) - True ranking scores for X
256
- sample_weight: array-like, shape=(n_samples,), optional - Sample weights
257
258
Returns:
259
- float: Ranking evaluation score
260
"""
261
```
262
263
## Usage Examples
264
265
### Regression Example
266
267
```python
268
import lightgbm as lgb
269
from sklearn.datasets import load_diabetes
270
from sklearn.model_selection import train_test_split
271
from sklearn.metrics import mean_squared_error, r2_score
272
273
# Load data (load_boston was removed in scikit-learn 1.2; the diabetes dataset ships with sklearn)
274
X, y = load_diabetes(return_X_y=True)
275
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
276
277
# Initialize and train regressor
278
regressor = lgb.LGBMRegressor(
279
objective='regression',
280
n_estimators=100,
281
learning_rate=0.1,
282
num_leaves=31,
283
random_state=42
284
)
285
286
regressor.fit(
287
X_train, y_train,
288
eval_set=[(X_test, y_test)],
289
eval_metric='l2',
290
early_stopping_rounds=10,
291
verbose=False
292
)
293
294
# Make predictions
295
predictions = regressor.predict(X_test)
296
print(f"R² Score: {r2_score(y_test, predictions):.4f}")
297
print(f"RMSE: {mean_squared_error(y_test, predictions) ** 0.5:.4f}")
298
```
299
300
### Classification Example
301
302
```python
303
import lightgbm as lgb
304
from sklearn.datasets import load_iris
305
from sklearn.model_selection import train_test_split
306
from sklearn.metrics import accuracy_score, classification_report
307
308
# Load data
309
X, y = load_iris(return_X_y=True)
310
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
311
312
# Initialize and train classifier
313
classifier = lgb.LGBMClassifier(
314
objective='multiclass',
315
n_estimators=100,
316
learning_rate=0.1,
317
num_leaves=31,
318
random_state=42
319
)
320
321
classifier.fit(
322
X_train, y_train,
323
eval_set=[(X_test, y_test)],
324
eval_metric='multi_logloss',
325
early_stopping_rounds=10,
326
verbose=False
327
)
328
329
# Make predictions
330
predictions = classifier.predict(X_test)
331
probabilities = classifier.predict_proba(X_test)
332
333
print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")
334
print(f"Classes: {classifier.classes_}")
335
print(f"Feature Importances: {classifier.feature_importances_}")
336
```
337
338
### Ranking Example
339
340
```python
341
import lightgbm as lgb
342
import numpy as np
343
from sklearn.datasets import make_regression
344
345
# Create ranking data
346
X, y = make_regression(n_samples=1000, n_features=10, random_state=42)
347
# Create groups for ranking (query sizes)
348
group = np.random.randint(10, 50, size=20)  # 20 queries with varying sizes (total is at most 980)
349
group = np.append(group, 1000 - group.sum())  # Final query takes the remainder: sizes must sum to n_samples
350
351
# Initialize and train ranker
352
ranker = lgb.LGBMRanker(
353
objective='rank_xendcg',
354
n_estimators=100,
355
learning_rate=0.1,
356
num_leaves=31,
357
random_state=42
358
)
359
360
ranker.fit(X, y, group=group)
361
362
# Make predictions
363
ranking_scores = ranker.predict(X)
364
print(f"Ranking scores shape: {ranking_scores.shape}")
365
print(f"Sample ranking scores: {ranking_scores[:10]}")
366
```