# Core Training

Low-level LightGBM interface providing direct access to the gradient boosting engine. This interface enables advanced model control, custom objectives, evaluation functions, and fine-tuned training procedures for users who need maximum flexibility.

## Capabilities

### Model Training and Management

The core Booster class provides direct access to LightGBM's gradient boosting engine with full control over training parameters and model behavior.

```python { .api }
class Booster:
    """
    Core LightGBM model class for advanced training and prediction control.
    """

    def __init__(self, params=None, train_set=None, model_file=None, model_str=None):
        """
        Initialize Booster object.

        Parameters:
        - params: dict or None - Training parameters
        - train_set: Dataset or None - Training dataset
        - model_file: str or None - Path to model file to load
        - model_str: str or None - Model string to load from
        """

    def add_valid(self, data, name):
        """
        Add validation dataset.

        Parameters:
        - data: Dataset - Validation dataset
        - name: str - Name for the validation set
        """

    def current_iteration(self):
        """
        Get current iteration index.

        Returns:
        - int: Current iteration number
        """

    def dump_model(self, num_iteration=None, start_iteration=0, importance_type='split'):
        """
        Export model structure to JSON format.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export
        - importance_type: str - Feature importance type ('split', 'gain')

        Returns:
        - dict: Model structure in JSON format
        """

    def eval(self, data, name, feval=None):
        """
        Evaluate model on given dataset.

        Parameters:
        - data: Dataset - Dataset to evaluate on
        - name: str - Name of the dataset
        - feval: callable or None - Custom evaluation function

        Returns:
        - list: Evaluation results
        """

    def eval_train(self, feval=None):
        """Evaluate model on training data."""

    def eval_valid(self, feval=None):
        """Evaluate model on validation data."""

    def feature_importance(self, importance_type='split', iteration=None):
        """
        Get feature importance scores.

        Parameters:
        - importance_type: str - Type of importance ('split', 'gain')
        - iteration: int or None - Iteration to get importance for

        Returns:
        - numpy.ndarray: Feature importance scores
        """

    def feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def free_dataset(self):
        """Free dataset memory."""

    def get_leaf_output(self, tree_id, leaf_id):
        """
        Get leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index

        Returns:
        - float: Leaf output value
        """

    def set_leaf_output(self, tree_id, leaf_id, val):
        """
        Set leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index
        - val: float - New leaf value
        """

    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """
        Get split value histogram for a feature.

        Parameters:
        - feature: int or str - Feature index or name
        - bins: int or None - Number of histogram bins
        - xgboost_style: bool - Whether to use XGBoost-style binning

        Returns:
        - tuple: (bin_edges, bin_counts)
        """

    def lower_bound(self):
        """Get prediction lower bound."""

    def upper_bound(self):
        """Get prediction upper bound."""

    def model_from_string(self, model_str):
        """
        Load model from string representation.

        Parameters:
        - model_str: str - String representation of model
        """

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """
        Export model to string representation.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export

        Returns:
        - str: String representation of model
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def num_model_per_iteration(self):
        """
        Get number of models per iteration.

        Returns:
        - int: Number of models per iteration
        """

    def num_trees(self):
        """
        Get total number of trees.

        Returns:
        - int: Total number of trees
        """

    def predict(self, data, start_iteration=0, num_iteration=None,
                pred_leaf=False, pred_contrib=False, **kwargs):
        """
        Make predictions on data.

        Parameters:
        - data: array-like, Dataset, or str - Input data or filename
        - start_iteration: int - Starting iteration for prediction
        - num_iteration: int or None - Number of iterations to use
        - pred_leaf: bool - Whether to predict leaf indices
        - pred_contrib: bool - Whether to predict feature contributions

        Returns:
        - numpy.ndarray: Predictions
        """

    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """
        Refit model with new data using online learning.

        Parameters:
        - data: array-like - New training data
        - label: array-like - New training labels
        - decay_rate: float - Decay rate for online learning
        """

    def reset_parameter(self, params):
        """
        Reset model parameters.

        Parameters:
        - params: dict - New parameters to set
        """

    def rollback_one_iter(self):
        """Rollback one iteration."""

    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """
        Save model to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        - start_iteration: int - Starting iteration to save
        """

    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """
        Setup distributed training network.

        Parameters:
        - machines: str - Machine list for distributed training
        - local_listen_port: int - Local listening port
        - listen_time_out: int - Listen timeout in seconds
        - num_machines: int - Number of machines
        """

    def free_network(self):
        """Free network resources."""

    def set_train_data_name(self, name):
        """
        Set training data name.

        Parameters:
        - name: str - Training data name
        """

    def shuffle_models(self, start_iter=0, end_iter=-1):
        """
        Shuffle model order.

        Parameters:
        - start_iter: int - Starting iteration
        - end_iter: int - Ending iteration (-1 for all)
        """

    def trees_to_dataframe(self):
        """
        Convert trees to pandas DataFrame format.

        Returns:
        - pandas.DataFrame: Tree structure as DataFrame
        """

    def update(self, train_set=None, fobj=None):
        """
        Update model for one iteration.

        Parameters:
        - train_set: Dataset or None - Training dataset
        - fobj: callable or None - Custom objective function

        Returns:
        - bool: True if updated successfully
        """
```

### Data Management

The Dataset class provides efficient data handling and preprocessing capabilities for LightGBM training.

```python { .api }
class Dataset:
    """
    LightGBM dataset wrapper for efficient data handling and preprocessing.
    """

    def __init__(self, data, label=None, reference=None, weight=None, group=None,
                 init_score=None, feature_name='auto', categorical_feature='auto',
                 params=None, free_raw_data=True, position=None):
        """
        Initialize Dataset object.

        Parameters:
        - data: array-like, pandas DataFrame, or str - Input data or filename
        - label: array-like or None - Target values
        - reference: Dataset or None - Reference dataset for validation
        - weight: array-like or None - Sample weights
        - group: array-like or None - Group/query sizes for ranking
        - init_score: array-like or None - Initial prediction scores
        - feature_name: list or 'auto' - Feature names
        - categorical_feature: list or 'auto' - Categorical feature indices/names
        - params: dict or None - Dataset parameters
        - free_raw_data: bool - Whether to free raw data after construction
        - position: array-like or None - Position information
        """

    def add_features_from(self, other):
        """
        Add features from another dataset.

        Parameters:
        - other: Dataset - Source dataset for additional features
        """

    def construct(self):
        """Lazy initialization of dataset."""

    def create_valid(self, data, label=None, weight=None, group=None,
                     init_score=None, position=None, **kwargs):
        """
        Create validation dataset with same parameters.

        Parameters:
        - data: array-like - Validation data
        - label: array-like or None - Validation labels
        - weight: array-like or None - Validation sample weights
        - group: array-like or None - Validation group sizes
        - init_score: array-like or None - Validation initial scores
        - position: array-like or None - Validation position info

        Returns:
        - Dataset: Validation dataset object
        """

    def feature_num_bin(self, feature):
        """
        Get number of bins for a feature.

        Parameters:
        - feature: int or str - Feature index or name

        Returns:
        - int: Number of bins for the feature
        """

    def get_data(self):
        """
        Get raw data reference.

        Returns:
        - Reference to raw data
        """

    def get_field(self, field_name):
        """
        Get dataset field value.

        Parameters:
        - field_name: str - Field name ('label', 'weight', 'group', etc.)

        Returns:
        - Field value
        """

    def get_feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def get_group(self):
        """Get group field."""

    def get_init_score(self):
        """Get initial score field."""

    def get_label(self):
        """Get label field."""

    def get_position(self):
        """Get position field."""

    def get_weight(self):
        """Get weight field."""

    def get_ref_chain(self, ref_limit=100):
        """
        Get reference dataset chain.

        Parameters:
        - ref_limit: int - Maximum reference chain length

        Returns:
        - list: Reference dataset chain
        """

    def num_data(self):
        """
        Get number of data points.

        Returns:
        - int: Number of data points
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def save_binary(self, filename):
        """
        Save dataset in binary format.

        Parameters:
        - filename: str - Output filename
        """

    def set_categorical_feature(self, categorical_feature):
        """
        Set categorical features.

        Parameters:
        - categorical_feature: list - Categorical feature indices/names
        """

    def set_feature_name(self, feature_name):
        """
        Set feature names.

        Parameters:
        - feature_name: list - Feature names
        """

    def set_field(self, field_name, data):
        """
        Set dataset field value.

        Parameters:
        - field_name: str - Field name
        - data: array-like - Field data
        """

    def set_group(self, group):
        """Set group field."""

    def set_init_score(self, init_score):
        """Set initial score field."""

    def set_label(self, label):
        """Set label field."""

    def set_position(self, position):
        """Set position field."""

    def set_weight(self, weight):
        """Set weight field."""

    def set_reference(self, reference):
        """
        Set reference dataset.

        Parameters:
        - reference: Dataset - Reference dataset
        """

    def subset(self, used_indices, **kwargs):
        """
        Create dataset subset.

        Parameters:
        - used_indices: array-like - Indices to include in subset

        Returns:
        - Dataset: Subset dataset
        """
```

### Training Functions

High-level training functions that provide convenient interfaces for model training and cross-validation.

```python { .api }
def train(params, train_set, num_boost_round=100, valid_sets=None,
          valid_names=None, feval=None, init_model=None, feature_name='auto',
          categorical_feature='auto', keep_training_booster=False, callbacks=None):
    """
    Train LightGBM model with specified parameters.

    Note: a custom objective function is passed via params['objective']
    (there is no separate `fobj` argument in this interface).

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - valid_sets: list or None - List of validation datasets
    - valid_names: list or None - Names for validation sets
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model for continued training
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - keep_training_booster: bool - Whether to keep training booster
    - callbacks: list or None - List of callback functions

    Returns:
    - Booster: Trained model
    """

def cv(params, train_set, num_boost_round=100, folds=None, nfold=5,
       stratified=True, shuffle=True, metrics=None, feval=None, init_model=None,
       fpreproc=None, feature_name='auto', categorical_feature='auto',
       seed=0, callbacks=None, eval_train_metric=False, return_cvbooster=False):
    """
    Perform k-fold cross-validation.

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - folds: generator or None - Custom cross-validation generator
    - nfold: int - Number of CV folds
    - stratified: bool - Whether to use stratified CV
    - shuffle: bool - Whether to shuffle data before splitting
    - metrics: str, list, or None - Evaluation metrics
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model
    - fpreproc: callable or None - Preprocessing function
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - seed: int - Random seed for CV splits
    - callbacks: list or None - List of callback functions
    - eval_train_metric: bool - Whether to evaluate training metric
    - return_cvbooster: bool - Whether to include a CVBooster in the results

    Returns:
    - dict: Metric history per evaluation set; when return_cvbooster=True,
      the dict additionally contains the trained CVBooster under the
      'cvbooster' key (the return type is always a dict)
    """

class CVBooster:
    """
    Container for cross-validation boosters and results.
    """

    def __init__(self, model_file=None):
        """
        Initialize CVBooster object.

        Parameters:
        - model_file: str or None - Model file to load from
        """

    def model_from_string(self, model_str):
        """
        Load CVBooster from string representation.

        Parameters:
        - model_str: str - String representation
        """

    def model_to_string(self):
        """
        Export CVBooster to string representation.

        Returns:
        - str: String representation
        """

    def save_model(self, filename, num_iteration=None):
        """
        Save CVBooster to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        """

    @property
    def boosters(self):
        """List of trained booster objects for each fold."""

    @property
    def best_iteration(self):
        """Best iteration number across all folds."""
```

### Data Interface

Abstract base class for implementing custom data sources.

```python { .api }
class Sequence:
    """
    Generic data access interface for custom data sources.

    This abstract base class allows you to implement custom data loading
    for scenarios where data cannot fit in memory or needs special handling.
    Subclasses must implement __getitem__ and __len__.
    """

    batch_size = 4096  # Default batch size used when reading data

    def __getitem__(self, idx):
        """
        Abstract method for data access by index.

        Parameters:
        - idx: int - Data index

        Returns:
        - Data item at the specified index

        Raises:
        - NotImplementedError: Always; subclasses must override
        """
        raise NotImplementedError()

    def __len__(self):
        """
        Abstract method returning sequence length.

        Returns:
        - int: Total number of items in sequence

        Raises:
        - NotImplementedError: Always; subclasses must override
        """
        raise NotImplementedError()
```

### Utility Functions

Additional utilities for logging and error handling.

```python { .api }
def register_logger(logger, info_method_name="info", warning_method_name="warning"):
    """
    Register custom logger for LightGBM messages.

    Parameters:
    - logger: Logger object - Custom logger instance
    - info_method_name: str - Name of info logging method
    - warning_method_name: str - Name of warning logging method
    """

class LightGBMError(Exception):
    """Custom exception for LightGBM-specific errors."""

class LGBMDeprecationWarning(UserWarning):
    """Custom deprecation warning for LightGBM."""
```

## Usage Examples

### Basic Training Example

```python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load and prepare data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets; the test set references the training set so
# both share the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1
}

# Train model with early stopping and periodic evaluation logging
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

# Make predictions (probabilities for binary objective)
predictions = model.predict(X_test)
binary_predictions = (predictions > 0.5).astype(int)

print(f"Accuracy: {(binary_predictions == y_test).mean():.4f}")
print(f"Feature importance: {model.feature_importance()[:5]}")
```

### Cross-Validation Example

```python
import lightgbm as lgb
from sklearn.datasets import load_diabetes

# Load data
X, y = load_diabetes(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# Perform cross-validation; stratified=False because this is regression
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=100,
    nfold=5,
    stratified=False,
    shuffle=True,
    seed=42,
    return_cvbooster=True,
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

print(f"Best CV score: {cv_results['valid rmse-mean'][-1]:.4f}")
print(f"Best iteration: {len(cv_results['valid rmse-mean'])}")

# Access individual fold models: lgb.cv always returns a dict; with
# return_cvbooster=True the CVBooster is stored under the 'cvbooster' key.
cvbooster = cv_results['cvbooster']
print(f"Number of fold models: {len(cvbooster.boosters)}")
```

### Custom Objective Function Example

```python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

# Create sample data
X, y = make_regression(n_samples=1000, n_features=10, random_state=42)
train_data = lgb.Dataset(X, label=y)

def custom_objective(preds, train_data):
    """Custom objective function (L1 loss).

    Native-API objectives receive (predictions, Dataset) and must return
    the gradient and hessian of the loss w.r.t. the raw predictions.
    """
    residual = preds - train_data.get_label()
    grad = np.sign(residual)
    hess = np.ones_like(residual)
    return grad, hess

def custom_eval(preds, eval_data):
    """Custom evaluation function.

    Native-API metrics receive (predictions, Dataset) and return
    (eval_name, eval_result, is_higher_better).
    """
    residual = preds - eval_data.get_label()
    mae = np.mean(np.abs(residual))
    return 'mae', mae, False

# Train with custom functions. Since LightGBM 4.0 the custom objective is
# passed via params['objective'] (lgb.train has no `fobj` argument), and
# feval is only evaluated when validation sets are provided.
model = lgb.train(
    {'objective': custom_objective, 'verbose': -1},
    train_data,
    num_boost_round=100,
    valid_sets=[train_data],
    valid_names=['train'],
    feval=custom_eval
)

predictions = model.predict(X)
print(f"Custom MAE: {np.mean(np.abs(predictions - y)):.4f}")
```