docs/
  core-training.md
  distributed-computing.md
  index.md
  sklearn-interface.md
  training-callbacks.md
  visualization.md

# Core Training

Low-level LightGBM interface providing direct access to the gradient boosting engine. This interface enables advanced model control, custom objectives, evaluation functions, and fine-tuned training procedures for users who need maximum flexibility.

## Capabilities

### Model Training and Management

The core Booster class provides direct access to LightGBM's gradient boosting engine with full control over training parameters and model behavior.

```python { .api }
class Booster:
    """
    Core LightGBM model class for advanced training and prediction control.
    """

    def __init__(self, params=None, train_set=None, model_file=None, model_str=None):
        """
        Initialize Booster object.

        Parameters:
        - params: dict or None - Training parameters
        - train_set: Dataset or None - Training dataset
        - model_file: str or None - Path to model file to load
        - model_str: str or None - Model string to load from
        """

    def add_valid(self, data, name):
        """
        Add validation dataset.

        Parameters:
        - data: Dataset - Validation dataset
        - name: str - Name for the validation set
        """

    def current_iteration(self):
        """
        Get current iteration index.

        Returns:
        - int: Current iteration number
        """

    def dump_model(self, num_iteration=None, start_iteration=0, importance_type='split'):
        """
        Export model structure to JSON format.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export
        - importance_type: str - Feature importance type ('split', 'gain')

        Returns:
        - dict: Model structure in JSON format
        """

    def eval(self, data, name, feval=None):
        """
        Evaluate model on given dataset.

        Parameters:
        - data: Dataset - Dataset to evaluate on
        - name: str - Name of the dataset
        - feval: callable or None - Custom evaluation function

        Returns:
        - list: Evaluation results
        """

    def eval_train(self, feval=None):
        """Evaluate model on training data."""

    def eval_valid(self, feval=None):
        """Evaluate model on validation data."""

    def feature_importance(self, importance_type='split', iteration=None):
        """
        Get feature importance scores.

        Parameters:
        - importance_type: str - Type of importance ('split', 'gain')
        - iteration: int or None - Iteration to get importance for

        Returns:
        - numpy.ndarray: Feature importance scores
        """

    def feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def free_dataset(self):
        """Free dataset memory."""

    def get_leaf_output(self, tree_id, leaf_id):
        """
        Get leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index

        Returns:
        - float: Leaf output value
        """

    def set_leaf_output(self, tree_id, leaf_id, val):
        """
        Set leaf output value.

        Parameters:
        - tree_id: int - Tree index
        - leaf_id: int - Leaf index
        - val: float - New leaf value
        """

    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """
        Get split value histogram for a feature.

        Parameters:
        - feature: int or str - Feature index or name
        - bins: int or None - Number of histogram bins
        - xgboost_style: bool - Whether to use XGBoost-style binning

        Returns:
        - tuple: (bin_edges, bin_counts)
        """

    def lower_bound(self):
        """Get prediction lower bound."""

    def upper_bound(self):
        """Get prediction upper bound."""

    def model_from_string(self, model_str):
        """
        Load model from string representation.

        Parameters:
        - model_str: str - String representation of model
        """

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """
        Export model to string representation.

        Parameters:
        - num_iteration: int or None - Number of iterations to export
        - start_iteration: int - Starting iteration to export

        Returns:
        - str: String representation of model
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def num_model_per_iteration(self):
        """
        Get number of models per iteration.

        Returns:
        - int: Number of models per iteration
        """

    def num_trees(self):
        """
        Get total number of trees.

        Returns:
        - int: Total number of trees
        """

    def predict(self, data, start_iteration=0, num_iteration=None,
                pred_leaf=False, pred_contrib=False, **kwargs):
        """
        Make predictions on data.

        Parameters:
        - data: array-like, Dataset, or str - Input data or filename
        - start_iteration: int - Starting iteration for prediction
        - num_iteration: int or None - Number of iterations to use
        - pred_leaf: bool - Whether to predict leaf indices
        - pred_contrib: bool - Whether to predict feature contributions

        Returns:
        - numpy.ndarray: Predictions
        """

    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """
        Refit model with new data using online learning.

        Parameters:
        - data: array-like - New training data
        - label: array-like - New training labels
        - decay_rate: float - Decay rate for online learning
        """

    def reset_parameter(self, params):
        """
        Reset model parameters.

        Parameters:
        - params: dict - New parameters to set
        """

    def rollback_one_iter(self):
        """Rollback one iteration."""

    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """
        Save model to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        - start_iteration: int - Starting iteration to save
        """

    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """
        Setup distributed training network.

        Parameters:
        - machines: str - Machine list for distributed training
        - local_listen_port: int - Local listening port
        - listen_time_out: int - Listen timeout in seconds
        - num_machines: int - Number of machines
        """

    def free_network(self):
        """Free network resources."""

    def set_train_data_name(self, name):
        """
        Set training data name.

        Parameters:
        - name: str - Training data name
        """

    def shuffle_models(self, start_iter=0, end_iter=-1):
        """
        Shuffle model order.

        Parameters:
        - start_iter: int - Starting iteration
        - end_iter: int - Ending iteration (-1 for all)
        """

    def trees_to_dataframe(self):
        """
        Convert trees to pandas DataFrame format.

        Returns:
        - pandas.DataFrame: Tree structure as DataFrame
        """

    def update(self, train_set=None, fobj=None):
        """
        Update model for one iteration.

        Parameters:
        - train_set: Dataset or None - Training dataset
        - fobj: callable or None - Custom objective function

        Returns:
        - bool: True if updated successfully
        """
```

### Data Management

The Dataset class provides efficient data handling and preprocessing capabilities for LightGBM training.

```python { .api }
class Dataset:
    """
    LightGBM dataset wrapper for efficient data handling and preprocessing.
    """

    def __init__(self, data, label=None, reference=None, weight=None, group=None,
                 init_score=None, feature_name='auto', categorical_feature='auto',
                 params=None, free_raw_data=True, position=None):
        """
        Initialize Dataset object.

        Parameters:
        - data: array-like, pandas DataFrame, or str - Input data or filename
        - label: array-like or None - Target values
        - reference: Dataset or None - Reference dataset for validation
        - weight: array-like or None - Sample weights
        - group: array-like or None - Group/query sizes for ranking
        - init_score: array-like or None - Initial prediction scores
        - feature_name: list or 'auto' - Feature names
        - categorical_feature: list or 'auto' - Categorical feature indices/names
        - params: dict or None - Dataset parameters
        - free_raw_data: bool - Whether to free raw data after construction
        - position: array-like or None - Position information
        """

    def add_features_from(self, other):
        """
        Add features from another dataset.

        Parameters:
        - other: Dataset - Source dataset for additional features
        """

    def construct(self):
        """Lazy initialization of dataset."""

    def create_valid(self, data, label=None, weight=None, group=None,
                     init_score=None, position=None, **kwargs):
        """
        Create validation dataset with same parameters.

        Parameters:
        - data: array-like - Validation data
        - label: array-like or None - Validation labels
        - weight: array-like or None - Validation sample weights
        - group: array-like or None - Validation group sizes
        - init_score: array-like or None - Validation initial scores
        - position: array-like or None - Validation position info

        Returns:
        - Dataset: Validation dataset object
        """

    def feature_num_bin(self, feature):
        """
        Get number of bins for a feature.

        Parameters:
        - feature: int or str - Feature index or name

        Returns:
        - int: Number of bins for the feature
        """

    def get_data(self):
        """
        Get raw data reference.

        Returns:
        - Reference to raw data
        """

    def get_field(self, field_name):
        """
        Get dataset field value.

        Parameters:
        - field_name: str - Field name ('label', 'weight', 'group', etc.)

        Returns:
        - Field value
        """

    def get_feature_name(self):
        """
        Get feature names.

        Returns:
        - list: Feature names
        """

    def get_group(self):
        """Get group field."""

    def get_init_score(self):
        """Get initial score field."""

    def get_label(self):
        """Get label field."""

    def get_position(self):
        """Get position field."""

    def get_weight(self):
        """Get weight field."""

    def get_ref_chain(self, ref_limit=100):
        """
        Get reference dataset chain.

        Parameters:
        - ref_limit: int - Maximum reference chain length

        Returns:
        - list: Reference dataset chain
        """

    def num_data(self):
        """
        Get number of data points.

        Returns:
        - int: Number of data points
        """

    def num_feature(self):
        """
        Get number of features.

        Returns:
        - int: Number of features
        """

    def save_binary(self, filename):
        """
        Save dataset in binary format.

        Parameters:
        - filename: str - Output filename
        """

    def set_categorical_feature(self, categorical_feature):
        """
        Set categorical features.

        Parameters:
        - categorical_feature: list - Categorical feature indices/names
        """

    def set_feature_name(self, feature_name):
        """
        Set feature names.

        Parameters:
        - feature_name: list - Feature names
        """

    def set_field(self, field_name, data):
        """
        Set dataset field value.

        Parameters:
        - field_name: str - Field name
        - data: array-like - Field data
        """

    def set_group(self, group):
        """Set group field."""

    def set_init_score(self, init_score):
        """Set initial score field."""

    def set_label(self, label):
        """Set label field."""

    def set_position(self, position):
        """Set position field."""

    def set_weight(self, weight):
        """Set weight field."""

    def set_reference(self, reference):
        """
        Set reference dataset.

        Parameters:
        - reference: Dataset - Reference dataset
        """

    def subset(self, used_indices, **kwargs):
        """
        Create dataset subset.

        Parameters:
        - used_indices: array-like - Indices to include in subset

        Returns:
        - Dataset: Subset dataset
        """
```

### Training Functions

High-level training functions that provide convenient interfaces for model training and cross-validation.

```python { .api }
def train(params, train_set, num_boost_round=100, valid_sets=None,
          valid_names=None, feval=None, init_model=None, feature_name='auto',
          categorical_feature='auto', keep_training_booster=False, callbacks=None):
    """
    Train LightGBM model with specified parameters.

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - valid_sets: list or None - List of validation datasets
    - valid_names: list or None - Names for validation sets
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model for continued training
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - keep_training_booster: bool - Whether to keep training booster
    - callbacks: list or None - List of callback functions

    Returns:
    - Booster: Trained model
    """

def cv(params, train_set, num_boost_round=100, folds=None, nfold=5,
       stratified=True, shuffle=True, metrics=None, feval=None, init_model=None,
       fpreproc=None, feature_name='auto', categorical_feature='auto',
       seed=0, callbacks=None, eval_train_metric=False, return_cvbooster=False):
    """
    Perform k-fold cross-validation.

    Parameters:
    - params: dict - Training parameters
    - train_set: Dataset - Training dataset
    - num_boost_round: int - Number of boosting iterations
    - folds: generator or None - Custom cross-validation generator
    - nfold: int - Number of CV folds
    - stratified: bool - Whether to use stratified CV
    - shuffle: bool - Whether to shuffle data before splitting
    - metrics: str, list, or None - Evaluation metrics
    - feval: callable or None - Custom evaluation function
    - init_model: str, Booster, or None - Initial model
    - fpreproc: callable or None - Preprocessing function
    - feature_name: list or 'auto' - Feature names
    - categorical_feature: list or 'auto' - Categorical features
    - seed: int - Random seed for CV splits
    - callbacks: list or None - List of callback functions
    - eval_train_metric: bool - Whether to evaluate training metric
    - return_cvbooster: bool - Whether to include the CVBooster in the results

    Returns:
    - dict: Evaluation history per metric; when return_cvbooster=True the
      dictionary additionally contains a 'cvbooster' entry holding the
      CVBooster object
    """

class CVBooster:
    """
    Container for cross-validation boosters and results.
    """

    def __init__(self, model_file=None):
        """
        Initialize CVBooster object.

        Parameters:
        - model_file: str or None - Model file to load from
        """

    def model_from_string(self, model_str):
        """
        Load CVBooster from string representation.

        Parameters:
        - model_str: str - String representation
        """

    def model_to_string(self):
        """
        Export CVBooster to string representation.

        Returns:
        - str: String representation
        """

    def save_model(self, filename, num_iteration=None):
        """
        Save CVBooster to file.

        Parameters:
        - filename: str - Output filename
        - num_iteration: int or None - Number of iterations to save
        """

    @property
    def boosters(self):
        """List of trained booster objects for each fold."""

    @property
    def best_iteration(self):
        """Best iteration number across all folds."""
```

### Data Interface

Abstract base class for implementing custom data sources.

```python { .api }
class Sequence:
    """
    Generic data access interface for custom data sources.

    This abstract base class allows you to implement custom data loading
    for scenarios where data cannot fit in memory or needs special handling.
    """

    batch_size = 4096  # Default batch size

    def __getitem__(self, idx):
        """
        Abstract method for data access by index.

        Parameters:
        - idx: int - Data index

        Returns:
        - Data item at the specified index
        """
        raise NotImplementedError()

    def __len__(self):
        """
        Abstract method returning sequence length.

        Returns:
        - int: Total number of items in sequence
        """
        raise NotImplementedError()
```

### Utility Functions

Additional utilities for logging and error handling.

```python { .api }
def register_logger(logger, info_method_name="info", warning_method_name="warning"):
    """
    Register custom logger for LightGBM messages.

    Parameters:
    - logger: Logger object - Custom logger instance
    - info_method_name: str - Name of info logging method
    - warning_method_name: str - Name of warning logging method
    """

class LightGBMError(Exception):
    """Custom exception for LightGBM-specific errors."""

class LGBMDeprecationWarning(UserWarning):
    """Custom deprecation warning for LightGBM."""
```

## Usage Examples

### Basic Training Example

```python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load and prepare data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1
}

# Train model
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

# Make predictions
predictions = model.predict(X_test)
binary_predictions = (predictions > 0.5).astype(int)

print(f"Accuracy: {(binary_predictions == y_test).mean():.4f}")
print(f"Feature importance: {model.feature_importance()[:5]}")
```

### Cross-Validation Example

```python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_diabetes

# Load data
X, y = load_diabetes(return_X_y=True)
train_data = lgb.Dataset(X, label=y)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# Perform cross-validation
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=100,
    nfold=5,
    stratified=False,
    shuffle=True,
    seed=42,
    return_cvbooster=True,
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)]
)

print(f"Best CV score: {cv_results['valid rmse-mean'][-1]:.4f}")
print(f"Best iteration: {len(cv_results['valid rmse-mean'])}")

# Access individual fold models; cv() always returns a dict, and with
# return_cvbooster=True the CVBooster is stored under the 'cvbooster' key
cvbooster = cv_results['cvbooster']
print(f"Number of fold models: {len(cvbooster.boosters)}")
```

### Custom Objective Function Example

```python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

# Create sample data
X, y = make_regression(n_samples=1000, n_features=10, random_state=42)
train_data = lgb.Dataset(X, label=y)

def custom_objective(preds, dataset):
    """Custom objective function (L1 loss).

    Engine-level objectives receive the raw predictions and the training
    Dataset, and return the gradient and hessian of the loss with respect
    to the predictions.
    """
    residual = preds - dataset.get_label()
    grad = np.sign(residual)
    hess = np.ones_like(residual)
    return grad, hess

def custom_eval(preds, dataset):
    """Custom evaluation function (mean absolute error)."""
    residual = preds - dataset.get_label()
    mae = np.mean(np.abs(residual))
    return 'mae', mae, False  # (eval_name, eval_result, is_higher_better)

# Train with custom functions: the custom objective is passed through
# params['objective'] and the custom metric through feval
model = lgb.train(
    {'objective': custom_objective, 'verbose': -1},
    train_data,
    num_boost_round=100,
    feval=custom_eval
)

predictions = model.predict(X)
print(f"Custom MAE: {np.mean(np.abs(predictions - y)):.4f}")
```