# Core Training

Low-level LightGBM interface providing direct access to the gradient boosting engine. This interface enables advanced model control, custom objectives, evaluation functions, and fine-tuned training procedures for users who need maximum flexibility.

## Capabilities

### Model Training and Management

The core Booster class provides direct access to LightGBM's gradient boosting engine with full control over training parameters and model behavior.

```python { .api }
class Booster:
    """
    Core LightGBM model class for advanced training and prediction control.
    """

    def __init__(self, params=None, train_set=None, model_file=None, model_str=None):
        """
        Initialize Booster object.

        Parameters:
        - params: dict or None - Training parameters
        - train_set: Dataset or None - Training dataset
        - model_file: str or None - Path to model file to load
        - model_str: str or None - Model string to load from
        """

    def add_valid(self, data, name):
        """
        Add validation dataset.

        Parameters:
        - data: Dataset - Validation dataset
        - name: str - Name for the validation set
        """

    def current_iteration(self):
        """
        Get current iteration index.

        Returns:
        - int: Current iteration number
        """

    def dump_model(self, num_iteration=None, start_iteration=0, importance_type='split'):
        """
        Export model structure to JSON format.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export
        - importance_type: str - Feature importance type ('split', 'gain')

        Returns:
        - dict: Model structure in JSON format
        """

    def eval(self, data, name, feval=None):
        """
        Evaluate model on given dataset.

        Parameters:
        - data: Dataset - Dataset to evaluate on
        - name: str - Name of the dataset
        - feval: callable or None - Custom evaluation function

        Returns:
        - list: Evaluation results
        """

    def eval_train(self, feval=None):
        """Evaluate model on training data."""

    def eval_valid(self, feval=None):
        """Evaluate model on validation data."""

    def feature_importance(self, importance_type='split', iteration=None):
        """
        Get feature importance scores.

        Parameters:
        - importance_type: str - Type of importance ('split', 'gain')
        - iteration: int or None - Iteration to get importance for

        Returns:
        - numpy.ndarray: Feature importance scores
        """

    def feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def free_dataset(self):
        """Free dataset memory."""

    def get_leaf_output(self, tree_id, leaf_id):
        """
        Get leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index

        Returns:
        - float: Leaf output value
        """

    def set_leaf_output(self, tree_id, leaf_id, val):
        """
        Set leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index
        - val: float - New leaf value
        """

    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """
        Get split value histogram for a feature.

        Parameters:
        - feature: int or str - Feature index or name
        - bins: int or None - Number of histogram bins
        - xgboost_style: bool - Whether to use XGBoost-style binning

        Returns:
        - tuple: (bin_edges, bin_counts)
        """

    def lower_bound(self):
        """Get prediction lower bound."""

    def upper_bound(self):
        """Get prediction upper bound."""

    def model_from_string(self, model_str):
        """
        Load model from string representation.

        Parameters:
        - model_str: str - String representation of model
        """

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """
        Export model to string representation.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export

        Returns:
        - str: String representation of model
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def num_model_per_iteration(self):
        """
        Get number of models per iteration.

        Returns:
        - int: Number of models per iteration
        """

    def num_trees(self):
        """
        Get total number of trees.

        Returns:
        - int: Total number of trees
        """

    def predict(self, data, start_iteration=0, num_iteration=None,
                pred_leaf=False, pred_contrib=False, **kwargs):
        """
        Make predictions on data.

        Parameters:
        - data: array-like, Dataset, or str - Input data or filename
        - start_iteration: int - Starting iteration for prediction
        - num_iteration: int or None - Number of iterations to use
        - pred_leaf: bool - Whether to predict leaf indices
        - pred_contrib: bool - Whether to predict feature contributions

        Returns:
        - numpy.ndarray: Predictions
        """

    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """
        Refit model with new data using online learning.

        Parameters:
        - data: array-like - New training data
        - label: array-like - New training labels
        - decay_rate: float - Decay rate for online learning
        """

    def reset_parameter(self, params):
        """
        Reset model parameters.

        Parameters:
        - params: dict - New parameters to set
        """

    def rollback_one_iter(self):
        """Rollback one iteration."""

    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """
        Save model to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        - start_iteration: int - Starting iteration to save
        """

    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """
        Setup distributed training network.

        Parameters:
        - machines: str - Machine list for distributed training
        - local_listen_port: int - Local listening port
        - listen_time_out: int - Listen timeout in seconds
        - num_machines: int - Number of machines
        """

    def free_network(self):
        """Free network resources."""

    def set_train_data_name(self, name):
        """
        Set training data name.

        Parameters:
        - name: str - Training data name
        """

    def shuffle_models(self, start_iter=0, end_iter=-1):
        """
        Shuffle model order.

        Parameters:
        - start_iter: int - Starting iteration
        - end_iter: int - Ending iteration (-1 for all)
        """

    def trees_to_dataframe(self):
        """
        Convert trees to pandas DataFrame format.

        Returns:
        - pandas.DataFrame: Tree structure as DataFrame
        """

    def update(self, train_set=None, fobj=None):
        """
        Update model for one iteration.

        Parameters:
        - train_set: Dataset or None - Training dataset
        - fobj: callable or None - Custom objective function

        Returns:
        - bool: True if updated successfully
        """
```

### Data Management

The Dataset class provides efficient data handling and preprocessing capabilities for LightGBM training.

```python { .api }
class Dataset:
    """
    LightGBM dataset wrapper for efficient data handling and preprocessing.
    """

    def __init__(self, data, label=None, reference=None, weight=None, group=None,
                 init_score=None, feature_name='auto', categorical_feature='auto',
                 params=None, free_raw_data=True, position=None):
        """
        Initialize Dataset object.

        Parameters:
        - data: array-like, pandas DataFrame, or str - Input data or filename
        - label: array-like or None - Target values
        - reference: Dataset or None - Reference dataset for validation
        - weight: array-like or None - Sample weights
        - group: array-like or None - Group/query sizes for ranking
        - init_score: array-like or None - Initial prediction scores
        - feature_name: list or 'auto' - Feature names
        - categorical_feature: list or 'auto' - Categorical feature indices/names
        - params: dict or None - Dataset parameters
        - free_raw_data: bool - Whether to free raw data after construction
        - position: array-like or None - Position information
        """

    def add_features_from(self, other):
        """
        Add features from another dataset.

        Parameters:
        - other: Dataset - Source dataset for additional features
        """

    def construct(self):
        """Lazy initialization of dataset."""

    def create_valid(self, data, label=None, weight=None, group=None,
                     init_score=None, position=None, **kwargs):
        """
        Create validation dataset with same parameters.

        Parameters:
        - data: array-like - Validation data
        - label: array-like or None - Validation labels
        - weight: array-like or None - Validation sample weights
        - group: array-like or None - Validation group sizes
        - init_score: array-like or None - Validation initial scores
        - position: array-like or None - Validation position info

        Returns:
        - Dataset: Validation dataset object
        """

    def feature_num_bin(self, feature):
        """
        Get number of bins for a feature.

        Parameters:
        - feature: int or str - Feature index or name

        Returns:
        - int: Number of bins for the feature
        """

    def get_data(self):
        """
        Get raw data reference.

        Returns:
        - Reference to raw data
        """

    def get_field(self, field_name):
        """
        Get dataset field value.

        Parameters:
        - field_name: str - Field name ('label', 'weight', 'group', etc.)

        Returns:
        - Field value
        """

    def get_feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def get_group(self):
        """Get group field."""

    def get_init_score(self):
        """Get initial score field."""

    def get_label(self):
        """Get label field."""

    def get_position(self):
        """Get position field."""

    def get_weight(self):
        """Get weight field."""

    def get_ref_chain(self, ref_limit=100):
        """
        Get reference dataset chain.

        Parameters:
        - ref_limit: int - Maximum reference chain length

        Returns:
        - list: Reference dataset chain
        """

    def num_data(self):
        """
        Get number of data points.

        Returns:
        - int: Number of data points
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def save_binary(self, filename):
        """
        Save dataset in binary format.

        Parameters:
        - filename: str - Output filename
        """

    def set_categorical_feature(self, categorical_feature):
        """
        Set categorical features.

        Parameters:
        - categorical_feature: list - Categorical feature indices/names
        """

    def set_feature_name(self, feature_name):
        """
        Set feature names.

        Parameters:
        - feature_name: list - Feature names
        """

    def set_field(self, field_name, data):
        """
        Set dataset field value.

        Parameters:
        - field_name: str - Field name
        - data: array-like - Field data
        """

    def set_group(self, group):
        """Set group field."""

    def set_init_score(self, init_score):
        """Set initial score field."""

    def set_label(self, label):
        """Set label field."""

    def set_position(self, position):
        """Set position field."""

    def set_weight(self, weight):
        """Set weight field."""

    def set_reference(self, reference):
        """
        Set reference dataset.

        Parameters:
        - reference: Dataset - Reference dataset
        """

    def subset(self, used_indices, **kwargs):
        """
        Create dataset subset.

        Parameters:
        - used_indices: array-like - Indices to include in subset

        Returns:
        - Dataset: Subset dataset
        """
```

### Training Functions

High-level training functions that provide convenient interfaces for model training and cross-validation.

```python { .api }
def train(params, train_set, num_boost_round=100, valid_sets=None,
          valid_names=None, feval=None, init_model=None, feature_name='auto',
          categorical_feature='auto', keep_training_booster=False, callbacks=None):
    """
    Train LightGBM model with specified parameters.

    Note: a custom objective function is passed via params['objective']
    (there is no separate `fobj` argument in this interface).

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - valid_sets: list or None - List of validation datasets
    - valid_names: list or None - Names for validation sets
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model for continued training
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - keep_training_booster: bool - Whether to keep training booster
    - callbacks: list or None - List of callback functions

    Returns:
    - Booster: Trained model
    """

def cv(params, train_set, num_boost_round=100, folds=None, nfold=5,
       stratified=True, shuffle=True, metrics=None, feval=None, init_model=None,
       fpreproc=None, feature_name='auto', categorical_feature='auto',
       seed=0, callbacks=None, eval_train_metric=False, return_cvbooster=False):
    """
    Perform k-fold cross-validation.

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - folds: generator or None - Custom cross-validation generator
    - nfold: int - Number of CV folds
    - stratified: bool - Whether to use stratified CV
    - shuffle: bool - Whether to shuffle data before splitting
    - metrics: str, list, or None - Evaluation metrics
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model
    - fpreproc: callable or None - Preprocessing function
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - seed: int - Random seed for CV splits
    - callbacks: list or None - List of callback functions
    - eval_train_metric: bool - Whether to evaluate training metric
    - return_cvbooster: bool - Whether to include a CVBooster in the results

    Returns:
    - dict: Metric history per evaluation set; when return_cvbooster=True,
      the dict additionally contains the trained CVBooster under the
      'cvbooster' key (the return type is always a dict)
    """

class CVBooster:
    """
    Container for cross-validation boosters and results.
    """

    def __init__(self, model_file=None):
        """
        Initialize CVBooster object.

        Parameters:
        - model_file: str or None - Model file to load from
        """

    def model_from_string(self, model_str):
        """
        Load CVBooster from string representation.

        Parameters:
        - model_str: str - String representation
        """

    def model_to_string(self):
        """
        Export CVBooster to string representation.

        Returns:
        - str: String representation
        """

    def save_model(self, filename, num_iteration=None):
        """
        Save CVBooster to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        """

    @property
    def boosters(self):
        """List of trained booster objects for each fold."""

    @property
    def best_iteration(self):
        """Best iteration number across all folds."""
```

### Data Interface

Abstract base class for implementing custom data sources.

```python { .api }
class Sequence:
    """
    Generic data access interface for custom data sources.

    This abstract base class allows you to implement custom data loading
    for scenarios where data cannot fit in memory or needs special handling.
    Subclasses must implement __getitem__ and __len__.
    """

    batch_size = 4096  # Default batch size used when reading data

    def __getitem__(self, idx):
        """
        Abstract method for data access by index.

        Parameters:
        - idx: int - Data index

        Returns:
        - Data item at the specified index

        Raises:
        - NotImplementedError: Always; subclasses must override
        """
        raise NotImplementedError()

    def __len__(self):
        """
        Abstract method returning sequence length.

        Returns:
        - int: Total number of items in sequence

        Raises:
        - NotImplementedError: Always; subclasses must override
        """
        raise NotImplementedError()
```

### Utility Functions

Additional utilities for logging and error handling.

```python { .api }
def register_logger(logger, info_method_name="info", warning_method_name="warning"):
    """
    Register custom logger for LightGBM messages.

    Parameters:
    - logger: Logger object - Custom logger instance
    - info_method_name: str - Name of info logging method
    - warning_method_name: str - Name of warning logging method
    """

class LightGBMError(Exception):
    """Custom exception for LightGBM-specific errors."""

class LGBMDeprecationWarning(UserWarning):
    """Custom deprecation warning for LightGBM."""
```

## Usage Examples

### Basic Training Example

```python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load and prepare data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets; the test set references the training set so
# both share the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1
}

# Train model with early stopping and periodic evaluation logging
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

# Make predictions (probabilities for binary objective)
predictions = model.predict(X_test)
binary_predictions = (predictions > 0.5).astype(int)

print(f"Accuracy: {(binary_predictions == y_test).mean():.4f}")
print(f"Feature importance: {model.feature_importance()[:5]}")
```

### Cross-Validation Example

```python
import lightgbm as lgb
from sklearn.datasets import load_diabetes

# Load data
X, y = load_diabetes(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# Perform cross-validation; stratified=False because this is regression
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=100,
    nfold=5,
    stratified=False,
    shuffle=True,
    seed=42,
    return_cvbooster=True,
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

print(f"Best CV score: {cv_results['valid rmse-mean'][-1]:.4f}")
print(f"Best iteration: {len(cv_results['valid rmse-mean'])}")

# Access individual fold models: lgb.cv always returns a dict; with
# return_cvbooster=True the CVBooster is stored under the 'cvbooster' key.
cvbooster = cv_results['cvbooster']
print(f"Number of fold models: {len(cvbooster.boosters)}")
```

### Custom Objective Function Example

```python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

# Create sample data
X, y = make_regression(n_samples=1000, n_features=10, random_state=42)
train_data = lgb.Dataset(X, label=y)

def custom_objective(preds, train_data):
    """Custom objective function (L1 loss).

    Native-API objectives receive (predictions, Dataset) and must return
    the gradient and hessian of the loss w.r.t. the raw predictions.
    """
    residual = preds - train_data.get_label()
    grad = np.sign(residual)
    hess = np.ones_like(residual)
    return grad, hess

def custom_eval(preds, eval_data):
    """Custom evaluation function.

    Native-API metrics receive (predictions, Dataset) and return
    (eval_name, eval_result, is_higher_better).
    """
    residual = preds - eval_data.get_label()
    mae = np.mean(np.abs(residual))
    return 'mae', mae, False

# Train with custom functions. Since LightGBM 4.0 the custom objective is
# passed via params['objective'] (lgb.train has no `fobj` argument), and
# feval is only evaluated when validation sets are provided.
model = lgb.train(
    {'objective': custom_objective, 'verbose': -1},
    train_data,
    num_boost_round=100,
    valid_sets=[train_data],
    valid_names=['train'],
    feval=custom_eval
)

predictions = model.predict(X)
print(f"Custom MAE: {np.mean(np.abs(predictions - y)):.4f}")
```