or run:

```
npx @tessl/cli init
```
Log in

Version

Tile

Overview

Evals

Files

Files

docs

amazon-algorithms.md, automl.md, core-training.md, data-processing.md, debugging-profiling.md, experiments.md, framework-training.md, hyperparameter-tuning.md, index.md, model-monitoring.md, model-serving.md, remote-functions.md

docs/debugging-profiling.md

0

# Debugging and Profiling

1

2

Comprehensive model debugging and performance profiling tools including tensor analysis, system metrics collection, framework-specific profiling, and automated rule-based monitoring for optimizing model training performance.

3

4

## Capabilities

5

6

### Performance Profiling

7

8

Core profiling functionality for analyzing training job performance, system resource utilization, and identifying performance bottlenecks.

9

10

```python { .api }

11

class ProfilerConfig:

12

"""

13

Configuration for SageMaker performance profiler.

14

15

Parameters:

16

- s3_output_path (str, optional): S3 path for profiling output

17

- profiling_interval_millis (int, optional): Profiling interval in milliseconds

18

- profiling_parameters (dict, optional): Additional profiling parameters

19

"""

20

def __init__(self, s3_output_path: str = None,

21

profiling_interval_millis: int = None, **kwargs): ...

22

23

class Profiler:

24

"""

25

SageMaker performance profiler for training jobs.

26

27

Parameters:

28

- rules (List[ProfilerRule], optional): Profiling rules to apply

29

- system_monitor_interval_millis (int, optional): System monitoring interval

30

- framework_profile_params (dict, optional): Framework-specific parameters

31

"""

32

def __init__(self, rules: List['ProfilerRule'] = None, **kwargs): ...

33

```

34

35

### Framework-Specific Profiling

36

37

Specialized profiling configurations for different ML frameworks, each providing performance analysis tuned to that framework.

38

39

```python { .api }

40

class FrameworkProfile:

41

"""

42

Base class for framework-specific profiling configurations.

43

44

Parameters:

45

- start_step (int, optional): Step to start profiling

46

- num_steps (int, optional): Number of steps to profile

47

- start_unix_time (int, optional): Unix timestamp to start profiling

48

- duration_in_seconds (int, optional): Duration of profiling

49

"""

50

def __init__(self, start_step: int = None, num_steps: int = None, **kwargs): ...

51

52

class DataloaderProfilingConfig:

53

"""

54

Configuration for dataloader performance profiling.

55

56

Parameters:

57

- start_step (int, optional): Step to start dataloader profiling

58

- num_steps (int, optional): Number of steps to profile

59

- start_unix_time (int, optional): Unix timestamp to start

60

- duration_in_seconds (int, optional): Duration of profiling

61

"""

62

def __init__(self, **kwargs): ...

63

64

class DetailedProfilingConfig:

65

"""

66

Configuration for detailed performance profiling including memory and CPU usage.

67

68

Parameters:

69

- start_step (int, optional): Step to start detailed profiling

70

- num_steps (int, optional): Number of steps to profile

71

- start_unix_time (int, optional): Unix timestamp to start

72

- duration_in_seconds (int, optional): Duration of profiling

73

"""

74

def __init__(self, **kwargs): ...

75

76

class SMDataParallelProfilingConfig:

77

"""

78

Configuration for SageMaker data parallel profiling.

79

80

Parameters:

81

- start_step (int, optional): Step to start profiling

82

- num_steps (int, optional): Number of steps to profile

83

- start_unix_time (int, optional): Unix timestamp to start

84

- duration_in_seconds (int, optional): Duration of profiling

85

"""

86

def __init__(self, **kwargs): ...

87

88

class HorovodProfilingConfig:

89

"""

90

Configuration for Horovod distributed training profiling.

91

92

Parameters:

93

- start_step (int, optional): Step to start profiling

94

- num_steps (int, optional): Number of steps to profile

95

- start_unix_time (int, optional): Unix timestamp to start

96

- duration_in_seconds (int, optional): Duration of profiling

97

"""

98

def __init__(self, **kwargs): ...

99

100

class PythonProfilingConfig:

101

"""

102

Configuration for Python code profiling using cProfile.

103

104

Parameters:

105

- start_step (int, optional): Step to start profiling

106

- num_steps (int, optional): Number of steps to profile

107

- start_unix_time (int, optional): Unix timestamp to start

108

- duration_in_seconds (int, optional): Duration of profiling

109

- python_profiler (str, optional): Python profiler type ("cProfile" or "Pyinstrument")

110

- cprofile_timer (str, optional): Timer for cProfile profiling

111

"""

112

def __init__(self, python_profiler: str = "cProfile", **kwargs): ...

113

```

114

115

### Python Profiling Utilities

116

117

Utilities for Python-specific performance profiling and analysis.

118

119

```python { .api }

120

class PythonProfiler:

121

"""

122

Python profiler for analyzing Python code performance in training scripts.

123

124

Parameters:

125

- profiler_name (str): Name of the profiler ("cProfile" or "Pyinstrument")

126

- start_step (int, optional): Step to start profiling

127

- num_steps (int, optional): Number of steps to profile

128

"""

129

def __init__(self, profiler_name: str = "cProfile", **kwargs): ...

130

131

class cProfileTimer:

132

"""

133

Timer configuration for cProfile profiling.

134

135

Parameters:

136

- timer (str): Timer type for cProfile ("perf_counter", "process_time", "thread_time")

137

"""

138

def __init__(self, timer: str = "perf_counter"): ...

139

```

140

141

### Debugging Configuration

142

143

Configuration classes for model debugging including tensor analysis and gradient monitoring.

144

145

```python { .api }

146

class DebuggerHookConfig:

147

"""

148

Configuration for SageMaker debugger hooks in training scripts.

149

150

Parameters:

151

- s3_output_path (str): S3 path for debugger output

152

- container_local_output_path (str, optional): Local container path for output

153

- hook_parameters (dict, optional): Hook-specific parameters

154

- collection_configs (List[CollectionConfig], optional): Data collection configurations

155

"""

156

def __init__(self, s3_output_path: str, **kwargs): ...

157

158

class CollectionConfig:

159

"""

160

Configuration for collecting specific tensors or metrics during training.

161

162

Parameters:

163

- name (str): Name of the collection

164

- parameters (dict, optional): Collection-specific parameters

165

"""

166

def __init__(self, name: str, parameters: dict = None): ...

167

```

168

169

### Rule-Based Monitoring

170

171

Automated rule-based analysis for detecting common training issues and performance problems.

172

173

```python { .api }

174

class Rule:

175

"""

176

Base class for SageMaker debugger and profiler rules.

177

178

Parameters:

179

- name (str): Name of the rule

180

- image_uri (str): Docker image URI for the rule container

181

- instance_type (str, optional): Instance type for rule evaluation

182

- volume_size_in_gb (int, optional): Storage volume size

183

- rule_configuration_name (str, optional): Configuration name

184

- rule_parameters (dict, optional): Rule-specific parameters

185

"""

186

def __init__(self, name: str, image_uri: str, **kwargs): ...

187

188

class ProfilerRule(Rule):

189

"""

190

Rule for analyzing profiler data and detecting performance issues.

191

192

Parameters:

193

- name (str): Name of the profiler rule

194

- rule_parameters (dict, optional): Rule-specific parameters

195

- All Rule parameters

196

"""

197

def __init__(self, name: str, **kwargs): ...

198

199

class DebuggerRule(Rule):

200

"""

201

Rule for analyzing tensor data and detecting training issues.

202

203

Parameters:

204

- name (str): Name of the debugger rule

205

- collections_to_save (List[CollectionConfig], optional): Collections to analyze

206

- rule_parameters (dict, optional): Rule-specific parameters

207

- All Rule parameters

208

"""

209

def __init__(self, name: str, **kwargs): ...

210

```

211

212

### Built-in Rules

213

214

Pre-defined rules for common debugging and profiling scenarios.

215

216

```python { .api }

217

class ProfilerReport:

218

"""Built-in rule for generating comprehensive profiling reports."""

219

@staticmethod

220

def rule(**kwargs) -> ProfilerRule: ...

221

222

class BatchSize:

223

"""Built-in rule for analyzing batch size efficiency."""

224

@staticmethod

225

def rule(**kwargs) -> ProfilerRule: ...

226

227

class CPUBottleneck:

228

"""Built-in rule for detecting CPU bottlenecks."""

229

@staticmethod

230

def rule(**kwargs) -> ProfilerRule: ...

231

232

class GPUMemoryIncrease:

233

"""Built-in rule for detecting GPU memory increases."""

234

@staticmethod

235

def rule(**kwargs) -> ProfilerRule: ...

236

237

class IOBottleneck:

238

"""Built-in rule for detecting I/O bottlenecks."""

239

@staticmethod

240

def rule(**kwargs) -> ProfilerRule: ...

241

242

class LoadBalancing:

243

"""Built-in rule for analyzing load balancing across distributed training."""

244

@staticmethod

245

def rule(**kwargs) -> ProfilerRule: ...

246

247

class LowGPUUtilization:

248

"""Built-in rule for detecting low GPU utilization."""

249

@staticmethod

250

def rule(**kwargs) -> ProfilerRule: ...

251

252

class OverallSystemUsage:

253

"""Built-in rule for analyzing overall system resource usage."""

254

@staticmethod

255

def rule(**kwargs) -> ProfilerRule: ...

256

257

class StepOutlier:

258

"""Built-in rule for detecting training step outliers."""

259

@staticmethod

260

def rule(**kwargs) -> ProfilerRule: ...

261

262

# Debugger Rules

263

class VanishingGradient:

264

"""Built-in rule for detecting vanishing gradients."""

265

@staticmethod

266

def rule(**kwargs) -> DebuggerRule: ...

267

268

class ExplodingTensor:

269

"""Built-in rule for detecting exploding tensors."""

270

@staticmethod

271

def rule(**kwargs) -> DebuggerRule: ...

272

273

class PoorWeightInitialization:

274

"""Built-in rule for detecting poor weight initialization."""

275

@staticmethod

276

def rule(**kwargs) -> DebuggerRule: ...

277

278

class LossNotDecreasing:

279

"""Built-in rule for detecting when loss is not decreasing."""

280

@staticmethod

281

def rule(**kwargs) -> DebuggerRule: ...

282

283

class Overfit:

284

"""Built-in rule for detecting overfitting."""

285

@staticmethod

286

def rule(**kwargs) -> DebuggerRule: ...

287

288

class Overtraining:

289

"""Built-in rule for detecting overtraining."""

290

@staticmethod

291

def rule(**kwargs) -> DebuggerRule: ...

292

293

class SimilarAcrossRuns:

294

"""Built-in rule for comparing metrics across training runs."""

295

@staticmethod

296

def rule(**kwargs) -> DebuggerRule: ...

297

298

class TensorVariance:

299

"""Built-in rule for analyzing tensor variance."""

300

@staticmethod

301

def rule(**kwargs) -> DebuggerRule: ...

302

```

303

304

## Usage Examples

305

306

### Basic Performance Profiling

307

308

```python

309

from sagemaker.debugger import ProfilerConfig, ProfilerRule, Profiler

310

from sagemaker.debugger import ProfilerReport, CPUBottleneck, GPUMemoryIncrease

311

312

# Configure profiler

313

profiler_config = ProfilerConfig(

314

s3_output_path="s3://bucket/profiler-output",

315

profiling_interval_millis=500 # Profile every 500ms

316

)

317

318

# Define profiling rules

319

profiler_rules = [

320

ProfilerReport.rule(),

321

CPUBottleneck.rule(),

322

GPUMemoryIncrease.rule(

323

rule_parameters={"threshold": 20.0} # Alert if memory increases by 20%

324

)

325

]

326

327

# Create profiler

328

profiler = Profiler(rules=profiler_rules)

329

330

# Use with estimator

331

estimator = PyTorch(

332

entry_point="train.py",

333

framework_version="1.12.0",

334

py_version="py38",

335

instance_type="ml.p3.2xlarge",

336

role=role,

337

profiler_config=profiler_config,

338

rules=profiler_rules

339

)

340

341

estimator.fit(inputs)

342

```

343

344

### Advanced Framework Profiling

345

346

```python

347

from sagemaker.debugger import (

348

DetailedProfilingConfig, DataloaderProfilingConfig,

349

PythonProfilingConfig, SMDataParallelProfilingConfig

350

)

351

352

# Configure detailed profiling

353

detailed_config = DetailedProfilingConfig(

354

start_step=5, # Start after warmup

355

num_steps=10 # Profile 10 steps

356

)

357

358

# Configure dataloader profiling

359

dataloader_config = DataloaderProfilingConfig(

360

start_step=5,

361

num_steps=15

362

)

363

364

# Configure Python profiling

365

python_config = PythonProfilingConfig(

366

start_step=10,

367

num_steps=5,

368

python_profiler="cProfile"

369

)

370

371

# Configure distributed training profiling

372

sm_parallel_config = SMDataParallelProfilingConfig(

373

start_step=5,

374

num_steps=10

375

)

376

377

# Create comprehensive profiler config

378

profiler_config = ProfilerConfig(

379

s3_output_path="s3://bucket/detailed-profiling",

380

profiling_parameters={

381

"DataloaderProfilingConfig": dataloader_config,

382

"DetailedProfilingConfig": detailed_config,

383

"PythonProfilingConfig": python_config,

384

"SMDataParallelProfilingConfig": sm_parallel_config

385

}

386

)

387

```

388

389

### Model Debugging

390

391

```python

392

from sagemaker.debugger import (

393

DebuggerHookConfig, CollectionConfig, DebuggerRule,

394

VanishingGradient, ExplodingTensor, LossNotDecreasing

395

)

396

397

# Configure tensor collections

398

collection_configs = [

399

CollectionConfig(

400

name="weights",

401

parameters={"save_interval": "100"}

402

),

403

CollectionConfig(

404

name="gradients",

405

parameters={"save_interval": "100"}

406

),

407

CollectionConfig(

408

name="losses",

409

parameters={"save_interval": "10"}

410

)

411

]

412

413

# Configure debugger hook

414

debugger_config = DebuggerHookConfig(

415

s3_output_path="s3://bucket/debugger-output",

416

collection_configs=collection_configs

417

)

418

419

# Define debugging rules

420

debugger_rules = [

421

VanishingGradient.rule(),

422

ExplodingTensor.rule(

423

rule_parameters={"threshold": 20.0}

424

),

425

LossNotDecreasing.rule(

426

rule_parameters={

427

"mode": "TRAIN",

428

"patience": 10,

429

"min_delta": 0.01

430

}

431

)

432

]

433

434

# Use with estimator

435

estimator = TensorFlow(

436

entry_point="train.py",

437

framework_version="2.8.0",

438

py_version="py39",

439

instance_type="ml.p3.2xlarge",

440

role=role,

441

debugger_hook_config=debugger_config,

442

rules=debugger_rules

443

)

444

445

estimator.fit(inputs)

446

```

447

448

### Custom Rules and Analysis

449

450

```python

451

from sagemaker.debugger import Rule

452

453

# Create custom profiler rule

454

custom_profiler_rule = ProfilerRule(

455

name="CustomGPURule",

456

rule_parameters={

457

"gpu_utilization_threshold": 80.0,

458

"eval_frequency": 10

459

}

460

)

461

462

# Create custom debugger rule

463

custom_debugger_rule = DebuggerRule(

464

name="CustomGradientRule",

465

collections_to_save=[

466

CollectionConfig(name="gradients")

467

],

468

rule_parameters={

469

"gradient_threshold": 1.0

470

}

471

)

472

473

# Use custom rules

474

estimator = PyTorch(

475

entry_point="train.py",

476

framework_version="1.12.0",

477

py_version="py38",

478

instance_type="ml.p3.8xlarge",

479

role=role,

480

rules=[custom_profiler_rule, custom_debugger_rule],

481

debugger_hook_config=debugger_config,

482

profiler_config=profiler_config

483

)

484

485

estimator.fit(inputs)

486

487

# Monitor rule evaluation status

488

for rule in estimator.latest_training_job.rule_job_summary():

489

print(f"Rule: {rule['RuleConfigurationName']}")

490

print(f"Status: {rule['RuleEvaluationStatus']}")

491

if rule['RuleEvaluationStatus'] == 'IssuesFound':

492

print(f"Issues: {rule.get('StatusDetails', 'Check S3 output')}")

493

```