# Model Training

Comprehensive training capabilities supporting AutoML, custom training jobs, and distributed training, with full lifecycle management. The Vertex AI SDK provides both automated machine learning and flexible custom training across a wide range of ML tasks and frameworks.

## Capabilities

### AutoML Training Jobs

Automated machine learning with minimal configuration for tabular, image, text, video, and forecasting tasks.

```python { .api }
class AutoMLTabularTrainingJob:
    def __init__(
        self,
        display_name: str,
        optimization_prediction_type: str,
        optimization_objective: Optional[str] = None,
        column_specs: Optional[Dict[str, str]] = None,
        column_transformations: Optional[List[Dict[str, Any]]] = None,
        optimization_objective_recall_value: Optional[float] = None,
        optimization_objective_precision_value: Optional[float] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TabularDataset,
        target_column: str,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        predefined_split_column_name: Optional[str] = None,
        timestamp_split_column_name: Optional[str] = None,
        weight_column: Optional[str] = None,
        budget_milli_node_hours: int = 1000,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        model_id: Optional[str] = None,
        parent_model: Optional[str] = None,
        is_default_version: bool = True,
        model_version_aliases: Optional[Sequence[str]] = None,
        model_version_description: Optional[str] = None,
        disable_early_stopping: bool = False,
        export_evaluated_data_items: bool = False,
        export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None,
        export_evaluated_data_items_override_destination: bool = False,
        additional_experiments: Optional[List[str]] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        **kwargs
    ) -> Model: ...

    @property
    def state(self) -> JobState: ...

    @property
    def model(self) -> Optional[Model]: ...
```

#### Usage Examples

**AutoML tabular classification:**
```python
from google.cloud import aiplatform

aiplatform.init(project='my-project', location='us-central1')

# Create a tabular dataset from a CSV in Cloud Storage
dataset = aiplatform.TabularDataset.create(
    display_name="customer-data",
    gcs_source="gs://my-bucket/customer_data.csv"
)

# Create and run the training job
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="customer-classification",
    optimization_prediction_type="classification",
    optimization_objective="maximize-au-prc"
)

model = job.run(
    dataset=dataset,
    target_column="label",
    training_fraction_split=0.7,
    validation_fraction_split=0.15,
    test_fraction_split=0.15,
    budget_milli_node_hours=2000
)

print(f"Training completed. Model: {model.resource_name}")
```

### AutoML Forecasting

Specialized time series forecasting with multiple model architectures and automatic feature engineering.

```python { .api }
class AutoMLForecastingTrainingJob:
    def __init__(
        self,
        display_name: str,
        optimization_objective: Optional[str] = None,
        column_specs: Optional[Dict[str, str]] = None,
        column_transformations: Optional[List[Dict[str, Any]]] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        unavailable_at_forecast_columns: List[str],
        available_at_forecast_columns: List[str],
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        **kwargs
    ) -> Model: ...

class SequenceToSequencePlusForecastingTrainingJob:
    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        quantiles: Optional[List[float]] = None,
        validation_options: Optional[str] = None,
        **kwargs
    ) -> Model: ...

class TemporalFusionTransformerForecastingTrainingJob:
    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        quantiles: Optional[List[float]] = None,
        context_window: Optional[int] = None,
        **kwargs
    ) -> Model: ...
```
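
A minimal forecasting run can follow the same create-then-run pattern as the tabular example. The sketch below is illustrative: the BigQuery table, column names, and granularity values are assumptions about a hypothetical daily-sales dataset, not values from this document.

```python
# Hypothetical daily-sales forecasting job; data source and column names
# are illustrative assumptions.
dataset = aiplatform.TimeSeriesDataset.create(
    display_name="sales-history",
    bq_source="bq://my-project.sales.daily_sales"  # assumed BigQuery table
)

job = aiplatform.AutoMLForecastingTrainingJob(
    display_name="sales-forecast",
    optimization_objective="minimize-rmse"
)

model = job.run(
    dataset=dataset,
    target_column="sales",
    time_column="date",
    time_series_identifier_column="store_id",
    unavailable_at_forecast_columns=["sales"],           # unknown at prediction time
    available_at_forecast_columns=["date", "promo_flag"],  # known in advance
    forecast_horizon=30,          # predict 30 periods ahead
    data_granularity_unit="day",  # one row per day per store
    data_granularity_count=1
)
```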

### AutoML Vision and NLP

Automated training for image, text, and video understanding tasks.

```python { .api }
class AutoMLImageTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str = "classification",
        multi_label: bool = False,
        model_type: str = "CLOUD",
        base_model: Optional[Model] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: ImageDataset,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        budget_milli_node_hours: int = 8000,
        disable_early_stopping: bool = False,
        **kwargs
    ) -> Model: ...

class AutoMLTextTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str,
        multi_label: bool = False,
        sentiment_max: int = 10,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TextDataset,
        **kwargs
    ) -> Model: ...

class AutoMLVideoTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str = "classification",
        model_type: str = "CLOUD",
        **kwargs
    ): ...

    def run(
        self,
        dataset: VideoDataset,
        **kwargs
    ) -> Model: ...
```
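
As an illustration, an image classification job works the same way; the bucket path below is an assumption, and the import-schema constant is the one the SDK exposes for single-label image classification.

```python
# Hypothetical image-classification job; the GCS path is an assumption.
dataset = aiplatform.ImageDataset.create(
    display_name="flower-images",
    gcs_source="gs://my-bucket/image_import.csv",
    import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification
)

job = aiplatform.AutoMLImageTrainingJob(
    display_name="flower-classifier",
    prediction_type="classification",
    multi_label=False,
    model_type="CLOUD"
)

model = job.run(
    dataset=dataset,
    budget_milli_node_hours=8000  # the documented default budget
)
```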

### Custom Training Jobs

Flexible custom training with support for any ML framework and distributed training configurations.

```python { .api }
class CustomTrainingJob:
    def __init__(
        self,
        display_name: str,
        script_path: str,
        container_uri: str,
        requirements: Optional[Sequence[str]] = None,
        model_serving_container_image_uri: Optional[str] = None,
        model_serving_container_predict_route: Optional[str] = None,
        model_serving_container_health_route: Optional[str] = None,
        model_serving_container_command: Optional[Sequence[str]] = None,
        model_serving_container_args: Optional[Sequence[str]] = None,
        model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
        model_serving_container_ports: Optional[Sequence[int]] = None,
        model_description: Optional[str] = None,
        model_instance_schema_uri: Optional[str] = None,
        model_parameters_schema_uri: Optional[str] = None,
        model_prediction_schema_uri: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        annotation_schema_uri: Optional[str] = None,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        base_output_dir: Optional[str] = None,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        bigquery_destination: Optional[str] = None,
        args: Optional[List[str]] = None,
        environment_variables: Optional[Dict[str, str]] = None,
        replica_count: int = 1,
        machine_type: str = "n1-standard-4",
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        boot_disk_type: str = "pd-ssd",
        boot_disk_size_gb: int = 100,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        training_filter_split: Optional[str] = None,
        validation_filter_split: Optional[str] = None,
        test_filter_split: Optional[str] = None,
        predefined_split_column_name: Optional[str] = None,
        timestamp_split_column_name: Optional[str] = None,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        persistent_resource_id: Optional[str] = None,
        **kwargs
    ) -> Optional[Model]: ...
```

#### Usage Examples

**Custom script training:**
```python
job = aiplatform.CustomTrainingJob(
    display_name="custom-sklearn-training",
    script_path="train.py",
    container_uri="gcr.io/cloud-aiplatform/training/scikit-learn-cpu.0-23:latest",
    requirements=["scikit-learn==0.23.2", "pandas>=1.0.0"],
    model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-23:latest"
)

model = job.run(
    dataset=dataset,
    replica_count=1,
    machine_type="n1-standard-4",
    args=["--epochs", "100", "--batch-size", "32"],
    environment_variables={"LEARNING_RATE": "0.001"}
)
```
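
The training script itself is ordinary Python. Below is a minimal sketch of what a `train.py` might contain, assuming the `AIP_MODEL_DIR` environment variable that Vertex AI sets to tell the script where to write model artifacts (GCS buckets are FUSE-mounted under `/gcs/` inside training containers); the data loading is a placeholder, not part of this document's API.

```python
# train.py -- minimal sketch; the dataset here is a placeholder assumption.
import argparse
import os

import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=10)      # passed via args=[...]
parser.add_argument("--batch-size", type=int, default=32)  # unused by this toy model
args = parser.parse_args()

# Placeholder data; a real job would read the dataset passed to run()
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)

model = RandomForestClassifier(n_estimators=args.epochs)  # toy use of the flag
model.fit(X, y)

# Vertex AI sets AIP_MODEL_DIR to a gs:// URI; the bucket is also
# FUSE-mounted, so rewrite it to a local /gcs/ path for plain file IO.
model_dir = os.environ.get("AIP_MODEL_DIR", ".").replace("gs://", "/gcs/")
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, os.path.join(model_dir, "model.joblib"))
```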

### Custom Container Training

Training with custom Docker containers for maximum flexibility and framework support.

```python { .api }
class CustomContainerTrainingJob:
    def __init__(
        self,
        display_name: str,
        container_uri: str,
        model_serving_container_image_uri: Optional[str] = None,
        model_serving_container_predict_route: Optional[str] = None,
        model_serving_container_health_route: Optional[str] = None,
        model_serving_container_command: Optional[Sequence[str]] = None,
        model_serving_container_args: Optional[Sequence[str]] = None,
        model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
        model_serving_container_ports: Optional[Sequence[int]] = None,
        model_description: Optional[str] = None,
        model_instance_schema_uri: Optional[str] = None,
        model_parameters_schema_uri: Optional[str] = None,
        model_prediction_schema_uri: Optional[str] = None,
        explanation_metadata: Optional[explain.ExplanationMetadata] = None,
        explanation_parameters: Optional[explain.ExplanationParameters] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        base_output_dir: Optional[str] = None,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        bigquery_destination: Optional[str] = None,
        args: Optional[List[str]] = None,
        environment_variables: Optional[Dict[str, str]] = None,
        replica_count: int = 1,
        machine_type: str = "n1-standard-4",
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        boot_disk_type: str = "pd-ssd",
        boot_disk_size_gb: int = 100,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        enable_dashboard_access: bool = False,
        tensorboard: Optional[str] = None,
        reduce_image_size: bool = False,
        **kwargs
    ) -> Optional[Model]: ...
```
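
Usage mirrors `CustomTrainingJob`, except the training logic is baked into the image rather than injected as a script. In this sketch the training image URI, serving image URI, and output path are illustrative assumptions.

```python
# Hypothetical container-based job; image URIs and paths are assumptions.
job = aiplatform.CustomContainerTrainingJob(
    display_name="pytorch-container-training",
    container_uri="gcr.io/my-project/trainers/pytorch:latest",
    model_serving_container_image_uri="gcr.io/my-project/serving/pytorch:latest"
)

model = job.run(
    replica_count=1,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    args=["--epochs", "20"],
    base_output_dir="gs://my-bucket/training-output"  # assumed staging path
)
```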

### Custom Python Package Training

Training with Python source distributions uploaded to Cloud Storage.

```python { .api }
class CustomPythonPackageTrainingJob:
    def __init__(
        self,
        display_name: str,
        python_package_gcs_uri: str,
        python_module_name: str,
        container_uri: str,
        model_serving_container_image_uri: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        **kwargs
    ) -> Optional[Model]: ...
```
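
A sketch of a package-based run, assuming a source distribution already uploaded to GCS and an entry-point module named `trainer.task` (both are assumptions about your project layout):

```python
# Hypothetical packaged-trainer job; the package URI and module name
# are assumptions about your project layout.
job = aiplatform.CustomPythonPackageTrainingJob(
    display_name="packaged-tf-training",
    python_package_gcs_uri="gs://my-bucket/dist/trainer-0.1.tar.gz",
    python_module_name="trainer.task",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-8:latest"
)

model = job.run(
    replica_count=1,
    machine_type="n1-standard-4",
    args=["--epochs", "10"]
)
```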

### Hyperparameter Tuning

Automated hyperparameter optimization with various search algorithms and early stopping.

```python { .api }
class HyperparameterTuningJob:
    def __init__(
        self,
        display_name: str,
        custom_job: Union[CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob],
        metric_spec: Dict[str, str],
        parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
        max_trial_count: int,
        parallel_trial_count: int,
        max_failed_trial_count: int = 0,
        search_algorithm: Optional[str] = None,
        measurement_selection: str = "best",
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        **kwargs
    ) -> Optional[Model]: ...

    @property
    def trials(self) -> List[Trial]: ...

    @property
    def best_trial(self) -> Optional[Trial]: ...
```

#### Usage Examples

**Hyperparameter tuning:**
```python
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Define the underlying custom training job
custom_job = aiplatform.CustomTrainingJob(
    display_name="hyperparameter-tuning-job",
    script_path="train.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-8:latest"
)

# Define the metrics and search space
hp_job = aiplatform.HyperparameterTuningJob(
    display_name="tune-learning-rate",
    custom_job=custom_job,
    metric_spec={
        "accuracy": "maximize",
        "loss": "minimize"
    },
    parameter_spec={
        "learning_rate": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "batch_size": hpt.IntegerParameterSpec(min=16, max=128, scale="linear"),
        "epochs": hpt.DiscreteParameterSpec(values=[50, 100, 200], scale="linear")
    },
    max_trial_count=20,
    parallel_trial_count=3
)

# Run the tuning job
hp_job.run()
print(f"Best trial: {hp_job.best_trial}")
```
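
For the tuner to observe `accuracy` and `loss`, the training script must report them after each evaluation. One common approach, sketched here as an assumption rather than something this document prescribes, is the `cloudml-hypertune` helper package:

```python
# Inside train.py -- reporting a tuning metric with the cloudml-hypertune
# package (add "cloudml-hypertune" to the job's requirements).
import hypertune

hpt_reporter = hypertune.HyperTune()

# Report the metric named in metric_spec after each evaluation pass
hpt_reporter.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="accuracy",
    metric_value=0.93,  # placeholder; use your real evaluation result
    global_step=100
)
```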

### Distributed Training

Multi-node and multi-GPU training support with various distribution strategies.

```python { .api }
# Worker pool specification for distributed training
class WorkerPoolSpec:
    machine_spec: MachineSpec
    replica_count: int
    container_spec: ContainerSpec
    disk_spec: Optional[DiskSpec]

# Distributed training with multiple worker pools
def run_distributed_training(
    worker_pool_specs: List[WorkerPoolSpec],
    base_output_dir: str,
    **kwargs
) -> Optional[Model]: ...
```

#### Usage Examples

**Multi-GPU training:**
```python
# Define a worker pool with GPUs: 2 replicas, 4 V100s each
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-16",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 4
        },
        "replica_count": 2,
        "container_spec": {
            "image_uri": "gcr.io/my-project/training:latest",
            "args": ["--distributed", "--num-gpus=4"]
        }
    }
]

job = aiplatform.CustomJob(
    display_name="distributed-training",
    worker_pool_specs=worker_pool_specs
)

job.run()
```

## Types

```python { .api }
# Job state enumeration
class JobState(Enum):
    JOB_STATE_UNSPECIFIED = 0
    JOB_STATE_QUEUED = 1
    JOB_STATE_PENDING = 2
    JOB_STATE_RUNNING = 3
    JOB_STATE_SUCCEEDED = 4
    JOB_STATE_FAILED = 5
    JOB_STATE_CANCELLING = 6
    JOB_STATE_CANCELLED = 7
    JOB_STATE_PAUSED = 8
    JOB_STATE_EXPIRED = 9

# Training job base information
class TrainingJob:
    resource_name: str
    display_name: str
    state: JobState
    create_time: datetime
    start_time: Optional[datetime]
    end_time: Optional[datetime]
    error: Optional[Status]
    trial_count: Optional[int]

# Hyperparameter tuning trial
class Trial:
    id: str
    state: TrialState
    parameters: List[Parameter]
    final_measurement: Optional[Measurement]
    measurements: List[Measurement]
    start_time: datetime
    end_time: Optional[datetime]

# Parameter specification for hyperparameter tuning
class ParameterSpec:
    parameter_id: str
    scale_type: ScaleType
    conditional_parameter_specs: Optional[List[ConditionalParameterSpec]]

class DoubleParameterSpec(ParameterSpec):
    min_value: float
    max_value: float

class IntegerParameterSpec(ParameterSpec):
    min_value: int
    max_value: int

class CategoricalParameterSpec(ParameterSpec):
    values: List[str]

class DiscreteParameterSpec(ParameterSpec):
    values: List[float]
```
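
These states matter most with non-blocking runs. The sketch below assumes `run(sync=False)` returns immediately and that `state` and `model` behave as the API summary above describes; AutoML pipelines may surface an analogous pipeline state with the same naming pattern.

```python
import time

# Launch the earlier tabular job without blocking
job.run(
    dataset=dataset,
    target_column="label",
    sync=False
)

# Poll until the job reaches a terminal state
TERMINAL = {"JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}
while job.state.name not in TERMINAL:
    time.sleep(60)  # check once a minute

if job.state.name == "JOB_STATE_SUCCEEDED" and job.model is not None:
    print(f"Trained model: {job.model.resource_name}")
```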

## Advanced Features

### Early Stopping

AutoML terminates training automatically when performance plateaus, saving time and compute costs. Pass `disable_early_stopping=True` to `run()` to consume the full training budget instead.

```python
# Consume the full training budget instead of stopping early
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="training-no-early-stop",
    optimization_prediction_type="classification"
)

model = job.run(
    dataset=dataset,
    target_column="label",
    disable_early_stopping=True
)
```

### Training with TensorBoard

Integrated experiment tracking and visualization with Vertex AI TensorBoard.

```python
# Create a TensorBoard instance
tensorboard = aiplatform.Tensorboard.create(display_name="my-tensorboard")

# Attach it to a training job; training metrics are logged automatically.
# A service account with TensorBoard write access is typically required.
job.run(
    dataset=dataset,
    tensorboard=tensorboard.resource_name,
    service_account="trainer@my-project.iam.gserviceaccount.com"
)
```

### Custom Metrics and Objectives

Define custom optimization objectives for AutoML training.

```python
# Optimize precision at a fixed recall floor
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="custom-optimization",
    optimization_prediction_type="classification",
    optimization_objective="maximize-precision-at-recall",
    optimization_objective_recall_value=0.8  # precision at 80% recall
)
```

### Resource Management

Fine-grained control over compute resources, storage, and networking.

```python
# Training with specific resource requirements
job.run(
    replica_count=4,               # 4 training replicas
    machine_type="n1-highmem-8",   # high-memory machines
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=200,
    network="projects/my-project/global/networks/my-vpc"
)
```

Together, these capabilities span the full range of training needs, from fully automated AutoML to highly customized distributed training, with enterprise-grade features for production ML workflows.