or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

feature-extraction.md, generation.md, index.md, models.md, optimization.md, pipelines.md, tokenization.md, training.md

docs/optimization.md

0

# Optimization

1

2

Advanced optimization techniques including quantization, mixed precision training, hardware acceleration, and memory efficiency improvements for both inference and training workflows.

3

4

## Capabilities

5

6

### Quantization

7

8

Reduce model memory footprint and increase inference speed through various quantization techniques.

9

10

```python { .api }

11

class BitsAndBytesConfig:

12

def __init__(

13

self,

14

load_in_8bit: bool = False,

15

load_in_4bit: bool = False,

16

llm_int8_threshold: float = 6.0,

17

llm_int8_skip_modules: Optional[List[str]] = None,

18

llm_int8_enable_fp32_cpu_offload: bool = False,

19

llm_int8_has_fp16_weight: bool = False,

20

bnb_4bit_compute_dtype: Optional[torch.dtype] = None,

21

bnb_4bit_quant_type: str = "fp4",

22

bnb_4bit_use_double_quant: bool = False,

23

bnb_4bit_quant_storage: Optional[torch.dtype] = None,

24

**kwargs

25

):

26

"""

27

Configuration for BitsAndBytes quantization.

28

29

Args:

30

load_in_8bit: Enable 8-bit quantization

31

load_in_4bit: Enable 4-bit quantization

32

llm_int8_threshold: Threshold for outlier detection

33

llm_int8_skip_modules: Modules to skip quantization

34

llm_int8_enable_fp32_cpu_offload: Offload fp32 weights to CPU

35

llm_int8_has_fp16_weight: Model has fp16 weights

36

bnb_4bit_compute_dtype: Compute dtype for 4-bit

37

bnb_4bit_quant_type: Quantization type ("fp4", "nf4")

38

bnb_4bit_use_double_quant: Use double quantization

39

bnb_4bit_quant_storage: Storage dtype for quantized weights

40

"""

41

42

class GPTQConfig:

43

def __init__(

44

self,

45

bits: int = 4,

46

tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,

47

dataset: Optional[Union[str, List[str]]] = None,

48

group_size: int = 128,

49

damp_percent: float = 0.1,

50

desc_act: bool = False,

51

static_groups: bool = False,

52

sym: bool = True,

53

true_sequential: bool = True,

54

model_name_or_path: Optional[str] = None,

55

model_seqlen: Optional[int] = None,

56

block_name_to_quantize: Optional[str] = None,

57

module_name_preceding_first_block: Optional[List[str]] = None,

58

batch_size: int = 1,

59

pad_token_id: Optional[int] = None,

60

use_exllama: Optional[bool] = None,

61

max_input_length: Optional[int] = None,

62

exllama_config: Optional[Dict[str, Any]] = None,

63

cache_block_outputs: bool = True,

64

modules_in_block_to_quantize: Optional[List[List[str]]] = None,

65

**kwargs

66

):

67

"""

68

Configuration for GPTQ quantization.

69

70

Args:

71

bits: Number of bits for quantization

72

tokenizer: Tokenizer for calibration dataset

73

dataset: Calibration dataset name or samples

74

group_size: Group size for quantization

75

damp_percent: Damping percentage

76

desc_act: Use descending activation order

77

static_groups: Use static groups

78

sym: Use symmetric quantization

79

true_sequential: Use true sequential quantization

80

model_seqlen: Model sequence length

81

batch_size: Batch size for calibration

82

use_exllama: Use ExLlama kernels

83

max_input_length: Maximum input length

84

"""

85

86

class AwqConfig:

87

def __init__(

88

self,

89

bits: int = 4,

90

group_size: int = 128,

91

zero_point: bool = True,

92

version: str = "GEMM",

93

backend: str = "autoawq",

94

do_fuse: Optional[bool] = None,

95

fuse_max_seq_len: Optional[int] = None,

96

modules_to_fuse: Optional[Dict] = None,

97

**kwargs

98

):

99

"""

100

Configuration for AWQ quantization.

101

102

Args:

103

bits: Number of bits for quantization

104

group_size: Group size for quantization

105

zero_point: Use zero point quantization

106

version: AWQ version ("GEMM", "GEMV")

107

backend: Backend implementation

108

do_fuse: Enable module fusion

109

fuse_max_seq_len: Maximum sequence length for fusion

110

modules_to_fuse: Specific modules to fuse

111

"""

112

```

113

114

### Mixed Precision Training

115

116

Optimize training speed and memory usage with mixed precision techniques.

117

118

```python { .api }

119

class TrainingArguments:

120

"""Training arguments with mixed precision options."""

121

122

def __init__(

123

self,

124

# Mixed precision options

125

fp16: bool = False,

126

bf16: bool = False,

127

fp16_opt_level: str = "O1",

128

fp16_backend: str = "auto",

129

fp16_full_eval: bool = False,

130

bf16_full_eval: bool = False,

131

tf32: Optional[bool] = None,

132

dataloader_pin_memory: bool = True,

133

# Other training args...

134

**kwargs

135

):

136

"""

137

Configure mixed precision training.

138

139

Args:

140

fp16: Enable 16-bit floating point training

141

bf16: Enable bfloat16 training (better numerical stability)

142

fp16_opt_level: Optimization level for Apex ("O0", "O1", "O2", "O3")

143

fp16_backend: Backend for fp16 ("auto", "apex", "amp")

144

fp16_full_eval: Use fp16 for evaluation

145

bf16_full_eval: Use bf16 for evaluation

146

tf32: Enable TensorFloat-32 on Ampere GPUs

147

dataloader_pin_memory: Pin memory for faster data transfer

148

"""

149

```

150

151

### Gradient Checkpointing

152

153

Trade computation for memory by recomputing activations during backward pass.

154

155

```python { .api }

156

class PreTrainedModel:

157

"""Model with gradient checkpointing support."""

158

159

def gradient_checkpointing_enable(

160

self,

161

gradient_checkpointing_kwargs: Optional[Dict] = None

162

) -> None:

163

"""

164

Enable gradient checkpointing for the model.

165

166

Args:

167

gradient_checkpointing_kwargs: Additional arguments for checkpointing

168

"""

169

170

def gradient_checkpointing_disable(self) -> None:

171

"""Disable gradient checkpointing."""

172

173

class TrainingArguments:

174

def __init__(

175

self,

176

gradient_checkpointing: bool = False,

177

gradient_checkpointing_kwargs: Optional[Dict] = None,

178

**kwargs

179

):

180

"""

181

Configure gradient checkpointing in training.

182

183

Args:

184

gradient_checkpointing: Enable gradient checkpointing

185

gradient_checkpointing_kwargs: Additional checkpointing options

186

"""

187

```

188

189

### Memory Optimization

190

191

Advanced memory management techniques for large models.

192

193

```python { .api }

194

def enable_memory_efficient_attention():

195

"""Enable memory-efficient attention implementations."""

196

197

def get_memory_footprint_mb(

198

model: torch.nn.Module,

199

return_buffers: bool = True

200

) -> int:

201

"""

202

Get model memory footprint in MB.

203

204

Args:

205

model: PyTorch model

206

return_buffers: Include buffer memory

207

208

Returns:

209

Memory footprint in megabytes

210

"""

211

212

class DeepSpeedConfig:

213

"""Configuration for DeepSpeed optimization."""

214

215

@staticmethod

216

def get_config(

217

stage: int = 2,

218

offload_optimizer: bool = False,

219

offload_param: bool = False,

220

reduce_bucket_size: int = 200000000,

221

stage3_prefetch_bucket_size: int = 200000000,

222

stage3_param_persistence_threshold: int = 1000000,

223

**kwargs

224

) -> Dict[str, Any]:

225

"""

226

Get DeepSpeed configuration dictionary.

227

228

Args:

229

stage: ZeRO stage (1, 2, or 3)

230

offload_optimizer: Offload optimizer states to CPU

231

offload_param: Offload parameters to CPU

232

reduce_bucket_size: Gradient reduction bucket size

233

stage3_prefetch_bucket_size: Parameter prefetch bucket size

234

stage3_param_persistence_threshold: Parameter persistence threshold

235

236

Returns:

237

DeepSpeed configuration dictionary

238

"""

239

```

240

241

### Hardware Acceleration

242

243

Optimize for specific hardware platforms and accelerators.

244

245

```python { .api }

246

class BetterTransformer:

247

"""Flash Attention and other optimized kernels."""

248

249

@staticmethod

250

def transform(

251

model: PreTrainedModel,

252

keep_original_model: bool = False,

253

**kwargs

254

) -> PreTrainedModel:

255

"""

256

Apply BetterTransformer optimizations.

257

258

Args:

259

model: Model to optimize

260

keep_original_model: Keep reference to original model

261

262

Returns:

263

Optimized model with fast attention

264

"""

265

266

@staticmethod

267

def reverse(model: PreTrainedModel) -> PreTrainedModel:

268

"""Reverse BetterTransformer optimizations."""

269

270

# PyTorch 2.0 Compilation

271

def torch_compile_model(

272

model: PreTrainedModel,

273

backend: str = "inductor",

274

mode: str = "default",

275

**kwargs

276

) -> PreTrainedModel:

277

"""

278

Compile model with PyTorch 2.0 torch.compile.

279

280

Args:

281

model: Model to compile

282

backend: Compilation backend ("inductor", "aot_eager", etc.)

283

mode: Compilation mode ("default", "reduce-overhead", "max-autotune")

284

285

Returns:

286

Compiled model

287

"""

288

289

class TrainingArguments:

290

def __init__(

291

self,

292

torch_compile: bool = False,

293

torch_compile_backend: Optional[str] = None,

294

torch_compile_mode: Optional[str] = None,

295

**kwargs

296

):

297

"""

298

Configure PyTorch compilation in training.

299

300

Args:

301

torch_compile: Enable torch.compile

302

torch_compile_backend: Compilation backend

303

torch_compile_mode: Compilation mode

304

"""

305

```

306

307

### Model Parallelism

308

309

Distribute large models across multiple devices.

310

311

```python { .api }

312

class TrainingArguments:

313

def __init__(

314

self,

315

# Data parallelism

316

local_rank: int = -1,

317

ddp_backend: Optional[str] = None,

318

ddp_timeout: Optional[int] = 1800,

319

ddp_find_unused_parameters: Optional[bool] = None,

320

321

# Sharded data parallelism (FSDP)

322

fsdp: str = "",

323

fsdp_min_num_params: int = 0,

324

fsdp_config: Optional[str] = None,

325

fsdp_transformer_layer_cls_to_wrap: Optional[str] = None,

326

327

# DeepSpeed integration (path to a DeepSpeed config file)

328

deepspeed: Optional[str] = None,

329

330

**kwargs

331

):

332

"""

333

Configure distributed training strategies.

334

335

Args:

336

local_rank: Local rank for distributed training

337

ddp_backend: Distributed data parallel backend

338

ddp_timeout: DDP timeout in seconds

339

fsdp: Fully Sharded Data Parallel configuration

340

fsdp_min_num_params: Minimum parameters for FSDP wrapping

341

deepspeed: DeepSpeed configuration file path

342

"""

343

344

def load_model_with_device_map(

345

model_name: str,

346

device_map: Union[str, Dict] = "auto",

347

max_memory: Optional[Dict] = None,

348

offload_folder: Optional[str] = None,

349

**kwargs

350

) -> PreTrainedModel:

351

"""

352

Load model with automatic device mapping.

353

354

Args:

355

model_name: Model name or path

356

device_map: Device mapping strategy or custom mapping

357

max_memory: Maximum memory per device

358

offload_folder: Folder for offloaded weights

359

360

Returns:

361

Model distributed across available devices

362

"""

363

```

364

365

### Inference Optimization

366

367

Optimize models specifically for inference workloads.

368

369

```python { .api }

370

def optimize_model_for_inference(

371

model: PreTrainedModel,

372

optimize_for_latency: bool = True,

373

optimize_for_throughput: bool = False,

374

use_bettertransformer: bool = True,

375

use_torch_compile: bool = True,

376

**kwargs

377

) -> PreTrainedModel:

378

"""

379

Apply inference-specific optimizations.

380

381

Args:

382

model: Model to optimize

383

optimize_for_latency: Optimize for low latency

384

optimize_for_throughput: Optimize for high throughput

385

use_bettertransformer: Apply BetterTransformer

386

use_torch_compile: Use torch.compile

387

388

Returns:

389

Optimized model for inference

390

"""

391

392

class StaticCache:

393

"""Static key-value cache for improved inference performance."""

394

395

def __init__(

396

self,

397

config: PretrainedConfig,

398

max_batch_size: int,

399

max_cache_len: int,

400

device: torch.device,

401

dtype: torch.dtype = torch.float16

402

):

403

"""

404

Initialize static cache.

405

406

Args:

407

config: Model configuration

408

max_batch_size: Maximum batch size

409

max_cache_len: Maximum cache length

410

device: Device for cache tensors

411

dtype: Data type for cache

412

"""

413

```

414

415

### Caching and Hub Integration

416

417

Optimize model loading and sharing through intelligent caching.

418

419

```python { .api }

420

def cached_file(

421

path_or_repo_id: Union[str, os.PathLike],

422

filename: str,

423

cache_dir: Union[str, os.PathLike] = None,

424

force_download: bool = False,

425

resume_download: bool = False,

426

proxies: Optional[Dict[str, str]] = None,

427

token: Union[bool, str] = None,

428

revision: Optional[str] = None,

429

local_files_only: bool = False,

430

**kwargs

431

) -> Optional[str]:

432

"""

433

Download and cache file from Hugging Face Hub.

434

435

Args:

436

path_or_repo_id: Repository ID or local path

437

filename: File to download

438

cache_dir: Custom cache directory

439

force_download: Force fresh download

440

resume_download: Resume interrupted download

441

token: Authentication token

442

revision: Model revision/branch

443

local_files_only: Only use local files

444

445

Returns:

446

Path to cached file

447

"""

448

449

def clean_files_cache(

450

cache_dir: Optional[Union[str, os.PathLike]] = None,

451

token: Union[bool, str] = None

452

) -> None:

453

"""Clean up cached files to free disk space."""

454

455

def scan_cache_dir(

456

cache_dir: Union[str, os.PathLike] = None

457

) -> Dict[str, Any]:

458

"""Scan cache directory and return usage statistics."""

459

```

460

461

## Optimization Examples

462

463

Common optimization patterns for different use cases:

464

465

```python

466

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

467

import torch

468

469

# 8-bit quantization

470

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

471

model = AutoModelForCausalLM.from_pretrained(

472

"facebook/opt-6.7b",

473

quantization_config=quantization_config,

474

device_map="auto"

475

)

476

477

# 4-bit quantization with NF4

478

quantization_config = BitsAndBytesConfig(

479

load_in_4bit=True,

480

bnb_4bit_quant_type="nf4",

481

bnb_4bit_use_double_quant=True,

482

bnb_4bit_compute_dtype=torch.bfloat16

483

)

484

485

model = AutoModelForCausalLM.from_pretrained(

486

"microsoft/DialoGPT-large",

487

quantization_config=quantization_config

488

)

489

490

# Mixed precision training

491

from transformers import TrainingArguments, Trainer

492

493

training_args = TrainingArguments(

494

output_dir="./results",

495

bf16=True, # Use bfloat16 for better stability

496

gradient_checkpointing=True, # Save memory

497

dataloader_pin_memory=True, # Faster data loading

498

torch_compile=True, # PyTorch 2.0 compilation

499

)

500

501

trainer = Trainer(

502

model=model,

503

args=training_args,

504

train_dataset=train_dataset

505

)

506

507

# BetterTransformer optimization

508

from optimum.bettertransformer import BetterTransformer  # provided by the `optimum` package, not `transformers`

509

510

# Apply optimizations

511

model = BetterTransformer.transform(model)

512

513

# Use optimized model for inference

514

outputs = model.generate(**inputs, max_new_tokens=50)

515

516

# DeepSpeed training

517

training_args = TrainingArguments(

518

output_dir="./results",

519

deepspeed="ds_config.json", # DeepSpeed config file

520

gradient_checkpointing=True,

521

fp16=True

522

)

523

524

# Device mapping for large models

525

model = AutoModelForCausalLM.from_pretrained(

526

"microsoft/DialoGPT-large",

527

device_map="auto", # Automatic device placement

528

max_memory={0: "10GB", 1: "10GB"}, # Memory limits per GPU

529

offload_folder="./offload" # Offload unused weights

530

)

531

532

# Static cache for faster inference

533

from transformers import StaticCache

534

535

cache = StaticCache(

536

config=model.config,

537

max_batch_size=4,

538

max_cache_len=512,

539

device=model.device,

540

dtype=torch.float16

541

)

542

543

# Use cache during generation

544

outputs = model.generate(

545

**inputs,

546

past_key_values=cache,

547

use_cache=True,

548

max_new_tokens=50

549

)

550

```

551

552

## Performance Recommendations

553

554

**For Training:**

555

- Use `bf16=True` for better numerical stability than `fp16`

556

- Enable `gradient_checkpointing=True` for large models

557

- Use `torch_compile=True` with PyTorch 2.0+ for speed improvements

558

- Consider DeepSpeed for very large models

559

560

**For Inference:**

561

- Apply BetterTransformer optimizations for attention speedup

562

- Use 8-bit or 4-bit quantization for memory-constrained environments

563

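- Compile the model with `torch.compile` (e.g. `use_torch_compile=True` in `optimize_model_for_inference`) for repeated inference patterns; `torch_compile=True` itself is a `TrainingArguments` option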

564

- Use static caching for batch inference scenarios

565

566

**For Memory Optimization:**

567

- Use `device_map="auto"` for automatic multi-GPU placement

568

- Set appropriate `max_memory` limits per device

569

- Enable gradient checkpointing during training

570

- Consider CPU offloading for extremely large models