Comprehensive Python client library for Google Cloud Vertex AI, offering machine learning tools, generative AI models, and MLOps capabilities
—
Comprehensive training capabilities supporting AutoML, custom training jobs, and distributed training with full lifecycle management. The Vertex AI SDK provides both automated machine learning and flexible custom training options for various ML tasks and frameworks.
Automated machine learning with minimal configuration for tabular, image, text, video, and forecasting tasks.
# AutoML training job for tabular data (classification / regression).
class AutoMLTabularTrainingJob:
def __init__(
self,
display_name: str,
# Task type, e.g. "classification" (see example usage below).
optimization_prediction_type: str,
# Objective to optimize, e.g. "maximize-au-prc"; service default when None.
optimization_objective: Optional[str] = None,
column_specs: Optional[Dict[str, str]] = None,
column_transformations: Optional[List[Dict[str, Any]]] = None,
# Used with "maximize-precision-at-recall" (recall value to hold fixed).
optimization_objective_recall_value: Optional[float] = None,
# Used with "maximize-recall-at-precision" (precision value to hold fixed).
optimization_objective_precision_value: Optional[float] = None,
project: Optional[str] = None,
location: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
# CMEK key names for the training pipeline and the produced model.
training_encryption_spec_key_name: Optional[str] = None,
model_encryption_spec_key_name: Optional[str] = None,
**kwargs
): ...
# Runs the training job and returns the trained Model.
def run(
self,
dataset: TabularDataset,
target_column: str,
# Data split fractions; when all three are given they should sum to 1.0.
training_fraction_split: Optional[float] = None,
validation_fraction_split: Optional[float] = None,
test_fraction_split: Optional[float] = None,
predefined_split_column_name: Optional[str] = None,
timestamp_split_column_name: Optional[str] = None,
weight_column: Optional[str] = None,
# Training budget in milli node hours (1000 = 1 node hour).
budget_milli_node_hours: int = 1000,
model_display_name: Optional[str] = None,
model_labels: Optional[Dict[str, str]] = None,
model_id: Optional[str] = None,
# Model versioning: upload as new version of parent_model when given.
parent_model: Optional[str] = None,
is_default_version: bool = True,
model_version_aliases: Optional[Sequence[str]] = None,
model_version_description: Optional[str] = None,
disable_early_stopping: bool = False,
# Optionally export evaluated data items to BigQuery.
export_evaluated_data_items: bool = False,
export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None,
export_evaluated_data_items_override_destination: bool = False,
additional_experiments: Optional[List[str]] = None,
# When sync=False, run() returns without waiting for training to finish.
sync: bool = True,
create_request_timeout: Optional[float] = None,
**kwargs
) -> Model: ...
# Current state of the training job.
@property
def state(self) -> JobState: ...
@property
def model(self) -> Optional[Model]: ...

AutoML tabular classification:
import google.cloud.aiplatform as aiplatform

aiplatform.init(project='my-project', location='us-central1')

# Build a tabular dataset from a CSV file in Cloud Storage.
tabular_ds = aiplatform.TabularDataset.create(
display_name="customer-data",
gcs_source="gs://my-bucket/customer_data.csv",
)

# Configure an AutoML classification job optimizing AUC-PR.
training_job = aiplatform.AutoMLTabularTrainingJob(
display_name="customer-classification",
optimization_prediction_type="classification",
optimization_objective="maximize-au-prc",
)

# Launch training with a 70/15/15 split and a 2-node-hour budget.
model = training_job.run(
dataset=tabular_ds,
target_column="label",
training_fraction_split=0.7,
validation_fraction_split=0.15,
test_fraction_split=0.15,
budget_milli_node_hours=2000,
)
print(f"Training completed. Model: {model.resource_name}")

Specialized time series forecasting with multiple model architectures and automatic feature engineering.
# AutoML training job for time series forecasting.
class AutoMLForecastingTrainingJob:
def __init__(
self,
display_name: str,
optimization_objective: Optional[str] = None,
column_specs: Optional[Dict[str, str]] = None,
column_transformations: Optional[List[Dict[str, Any]]] = None,
**kwargs
): ...
# Runs forecasting training and returns the trained Model.
def run(
self,
dataset: TimeSeriesDataset,
target_column: str,
# Column holding the timestamp of each observation.
time_column: str,
# Column identifying the individual time series within the dataset.
time_series_identifier_column: str,
# Columns whose values are unknown / known at forecast time.
unavailable_at_forecast_columns: List[str],
available_at_forecast_columns: List[str],
# Number of future periods to predict, in data-granularity units.
forecast_horizon: int,
data_granularity_unit: str,
data_granularity_count: int,
**kwargs
) -> Model: ...
# Seq2Seq+ forecasting variant; same time-series contract as AutoML forecasting.
class SequenceToSequencePlusForecastingTrainingJob:
def run(
self,
dataset: TimeSeriesDataset,
target_column: str,
time_column: str,
time_series_identifier_column: str,
forecast_horizon: int,
data_granularity_unit: str,
data_granularity_count: int,
# Optional quantiles for probabilistic forecasts, e.g. [0.1, 0.5, 0.9].
quantiles: Optional[List[float]] = None,
validation_options: Optional[str] = None,
**kwargs
) -> Model: ...
# Temporal Fusion Transformer (TFT) forecasting variant.
class TemporalFusionTransformerForecastingTrainingJob:
def run(
self,
dataset: TimeSeriesDataset,
target_column: str,
time_column: str,
time_series_identifier_column: str,
forecast_horizon: int,
data_granularity_unit: str,
data_granularity_count: int,
quantiles: Optional[List[float]] = None,
# Number of past periods the model looks back over when predicting.
context_window: Optional[int] = None,
**kwargs
) -> Model: ...

Automated training for image, text, and video understanding tasks.
# AutoML training job for image tasks.
class AutoMLImageTrainingJob:
def __init__(
self,
display_name: str,
prediction_type: str = "classification",
# Allow multiple labels per image when True (multi-label classification).
multi_label: bool = False,
# Model type, e.g. "CLOUD" (cloud-hosted); see service docs for other values.
model_type: str = "CLOUD",
# Optional base model to warm-start from.
base_model: Optional[Model] = None,
**kwargs
): ...
def run(
self,
dataset: ImageDataset,
model_display_name: Optional[str] = None,
model_labels: Optional[Dict[str, str]] = None,
training_fraction_split: Optional[float] = None,
validation_fraction_split: Optional[float] = None,
test_fraction_split: Optional[float] = None,
# Default budget (8 node hours) is higher than tabular's 1 node hour.
budget_milli_node_hours: int = 8000,
disable_early_stopping: bool = False,
**kwargs
) -> Model: ...
# AutoML training job for text tasks.
class AutoMLTextTrainingJob:
def __init__(
self,
display_name: str,
prediction_type: str,
multi_label: bool = False,
# Maximum sentiment score — presumably for sentiment-analysis tasks only.
sentiment_max: int = 10,
**kwargs
): ...
def run(
self,
dataset: TextDataset,
**kwargs
) -> Model: ...
# AutoML training job for video tasks.
class AutoMLVideoTrainingJob:
def __init__(
self,
display_name: str,
prediction_type: str = "classification",
model_type: str = "CLOUD",
**kwargs
): ...
def run(
self,
dataset: VideoDataset,
**kwargs
) -> Model: ...

Flexible custom training with support for any ML framework and distributed training configurations.
# Custom training from a local Python script, packaged and run in a container.
class CustomTrainingJob:
def __init__(
self,
display_name: str,
# Local path to the training script to execute.
script_path: str,
# Container image in which the training script runs.
container_uri: str,
# Extra pip requirements installed before the script runs.
requirements: Optional[Sequence[str]] = None,
# Serving-container settings applied to the model produced by training.
model_serving_container_image_uri: Optional[str] = None,
model_serving_container_predict_route: Optional[str] = None,
model_serving_container_health_route: Optional[str] = None,
model_serving_container_command: Optional[Sequence[str]] = None,
model_serving_container_args: Optional[Sequence[str]] = None,
model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
model_serving_container_ports: Optional[Sequence[int]] = None,
model_description: Optional[str] = None,
model_instance_schema_uri: Optional[str] = None,
model_parameters_schema_uri: Optional[str] = None,
model_prediction_schema_uri: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
training_encryption_spec_key_name: Optional[str] = None,
model_encryption_spec_key_name: Optional[str] = None,
# Staging bucket for the packaged training code.
staging_bucket: Optional[str] = None,
**kwargs
): ...
# Runs training. Return is Optional[Model] — presumably None when no managed
# model is produced or when run asynchronously; confirm against SDK docs.
def run(
self,
dataset: Optional[Dataset] = None,
annotation_schema_uri: Optional[str] = None,
model_display_name: Optional[str] = None,
model_labels: Optional[Dict[str, str]] = None,
base_output_dir: Optional[str] = None,
service_account: Optional[str] = None,
network: Optional[str] = None,
bigquery_destination: Optional[str] = None,
# Command-line args and env vars forwarded to the training script.
args: Optional[List[str]] = None,
environment_variables: Optional[Dict[str, str]] = None,
# Compute shape: replicas, machine type, optional accelerators, boot disk.
replica_count: int = 1,
machine_type: str = "n1-standard-4",
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
# Split controls: fractions, filters, or column-based splitting.
training_fraction_split: Optional[float] = None,
validation_fraction_split: Optional[float] = None,
test_fraction_split: Optional[float] = None,
training_filter_split: Optional[str] = None,
validation_filter_split: Optional[str] = None,
test_filter_split: Optional[str] = None,
predefined_split_column_name: Optional[str] = None,
timestamp_split_column_name: Optional[str] = None,
# Resource name of a Tensorboard instance for metric logging.
tensorboard: Optional[str] = None,
sync: bool = True,
create_request_timeout: Optional[float] = None,
disable_retries: bool = False,
persistent_resource_id: Optional[str] = None,
**kwargs
) -> Optional[Model]: ...

Custom script training:
# Package train.py with its requirements and run it on managed infrastructure.
job = aiplatform.CustomTrainingJob(
display_name="custom-sklearn-training",
script_path="train.py",
container_uri="gcr.io/cloud-aiplatform/training/scikit-learn-cpu.0-23:latest",
requirements=["scikit-learn==0.23.2", "pandas>=1.0.0"],
model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-23:latest"
)
# The script receives the args and environment variables below.
model = job.run(
dataset=dataset,
replica_count=1,
machine_type="n1-standard-4",
args=["--epochs", "100", "--batch-size", "32"],
environment_variables={"LEARNING_RATE": "0.001"}
)

Training with custom Docker containers for maximum flexibility and framework support.
# Custom training where the user supplies a complete training container image.
class CustomContainerTrainingJob:
def __init__(
self,
display_name: str,
# Docker image containing the training code and its entrypoint.
container_uri: str,
# Serving-container settings applied to the model produced by training.
model_serving_container_image_uri: Optional[str] = None,
model_serving_container_predict_route: Optional[str] = None,
model_serving_container_health_route: Optional[str] = None,
model_serving_container_command: Optional[Sequence[str]] = None,
model_serving_container_args: Optional[Sequence[str]] = None,
model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
model_serving_container_ports: Optional[Sequence[int]] = None,
model_description: Optional[str] = None,
model_instance_schema_uri: Optional[str] = None,
model_parameters_schema_uri: Optional[str] = None,
model_prediction_schema_uri: Optional[str] = None,
# Optional Explainable AI configuration attached to the produced model.
explanation_metadata: Optional[explain.ExplanationMetadata] = None,
explanation_parameters: Optional[explain.ExplanationParameters] = None,
labels: Optional[Dict[str, str]] = None,
training_encryption_spec_key_name: Optional[str] = None,
model_encryption_spec_key_name: Optional[str] = None,
staging_bucket: Optional[str] = None,
**kwargs
): ...
# Runs the containerized training job; Optional return as in CustomTrainingJob.run.
def run(
self,
dataset: Optional[Dataset] = None,
model_display_name: Optional[str] = None,
model_labels: Optional[Dict[str, str]] = None,
base_output_dir: Optional[str] = None,
service_account: Optional[str] = None,
network: Optional[str] = None,
bigquery_destination: Optional[str] = None,
args: Optional[List[str]] = None,
environment_variables: Optional[Dict[str, str]] = None,
replica_count: int = 1,
machine_type: str = "n1-standard-4",
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: Optional[float] = None,
validation_fraction_split: Optional[float] = None,
test_fraction_split: Optional[float] = None,
sync: bool = True,
create_request_timeout: Optional[float] = None,
# Fault-tolerance: restart the job when a worker restarts.
restart_job_on_worker_restart: bool = False,
# Interactive web/dashboard access to the running training containers.
enable_web_access: bool = False,
enable_dashboard_access: bool = False,
tensorboard: Optional[str] = None,
reduce_image_size: bool = False,
**kwargs
) -> Optional[Model]: ...

Training with Python packages uploaded to the cloud.
# Custom training from a pre-built Python package stored in Cloud Storage.
class CustomPythonPackageTrainingJob:
def __init__(
self,
display_name: str,
# GCS URI of the packaged training code (e.g. an sdist/wheel).
python_package_gcs_uri: str,
# Module to execute, e.g. "trainer.task".
python_module_name: str,
container_uri: str,
model_serving_container_image_uri: Optional[str] = None,
**kwargs
): ...
def run(
self,
dataset: Optional[Dataset] = None,
**kwargs
) -> Optional[Model]: ...

Automated hyperparameter optimization with various search algorithms and early stopping.
# Hyperparameter tuning that runs many trials of an underlying custom job.
class HyperparameterTuningJob:
def __init__(
self,
display_name: str,
# NOTE(review): the runtime SDK's HyperparameterTuningJob accepts a CustomJob;
# confirm the *TrainingJob variants are supported as declared here.
custom_job: Union[CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob],
# Metric id -> goal ("maximize" / "minimize"), as in the example usage below.
metric_spec: Dict[str, str],
# Parameter id -> search-space spec built with hyperparameter_tuning helpers.
parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
max_trial_count: int,
parallel_trial_count: int,
# Default 0 — presumably "use the service default"; confirm semantics.
max_failed_trial_count: int = 0,
search_algorithm: Optional[str] = None,
# Which measurement represents a trial, e.g. "best"; see service docs.
measurement_selection: str = "best",
labels: Optional[Dict[str, str]] = None,
encryption_spec_key_name: Optional[str] = None,
**kwargs
): ...
def run(
self,
service_account: Optional[str] = None,
network: Optional[str] = None,
timeout: Optional[int] = None,
restart_job_on_worker_restart: bool = False,
enable_web_access: bool = False,
tensorboard: Optional[str] = None,
sync: bool = True,
create_request_timeout: Optional[float] = None,
**kwargs
) -> Optional[Model]: ...
# All trials executed so far.
@property
def trials(self) -> List[Trial]: ...
@property
def best_trial(self) -> Optional[Trial]: ...

Hyperparameter tuning:
from google.cloud.aiplatform import hyperparameter_tuning as hpt
# Define custom training job
custom_job = aiplatform.CustomTrainingJob(
display_name="hyperparameter-tuning-job",
script_path="train.py",
container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-8:latest"
)
# Define hyperparameter space
hp_job = aiplatform.HyperparameterTuningJob(
display_name="tune-learning-rate",
custom_job=custom_job,
# Metrics the trials report, each with its optimization goal.
metric_spec={
"accuracy": "maximize",
"loss": "minimize"
},
# Search space: log-scaled learning rate, linear batch size, discrete epochs.
# NOTE(review): the runtime SDK's DiscreteParameterSpec also takes a scale
# argument; confirm this call signature.
parameter_spec={
"learning_rate": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
"batch_size": hpt.IntegerParameterSpec(min=16, max=128, scale="linear"),
"epochs": hpt.DiscreteParameterSpec(values=[50, 100, 200])
},
# Up to 20 trials total, at most 3 running in parallel.
max_trial_count=20,
parallel_trial_count=3
)
# Run tuning
hp_job.run()
print(f"Best trial: {hp_job.best_trial}")

Multi-node and multi-GPU training support with various distribution strategies.
# Worker pool specifications for distributed training
class WorkerPoolSpec:
# Machine shape for this pool (type, accelerators).
machine_spec: MachineSpec
# Number of replicas running this spec.
replica_count: int
# Container image and args each replica runs.
container_spec: ContainerSpec
disk_spec: Optional[DiskSpec]
# Distributed training with multiple worker pools
def run_distributed_training(
worker_pool_specs: List[WorkerPoolSpec],
base_output_dir: str,
**kwargs
) -> Optional[Model]: ...

Multi-GPU training:
# Define worker pool with GPUs
worker_pool_specs = [
{
"machine_spec": {
"machine_type": "n1-standard-16",
"accelerator_type": "NVIDIA_TESLA_V100",
# 4 GPUs per replica.
"accelerator_count": 4
},
# Two replicas of this spec: 8 GPUs across the job.
"replica_count": 2,
"container_spec": {
"image_uri": "gcr.io/my-project/training:latest",
"args": ["--distributed", "--num-gpus=4"]
}
}
]
job = aiplatform.CustomJob(
display_name="distributed-training",
worker_pool_specs=worker_pool_specs
)
job.run()

# Job state enumeration
# Lifecycle states of a training / tuning job. Values presumably mirror the
# service's JobState proto enum — confirm against the API reference.
class JobState(Enum):
JOB_STATE_UNSPECIFIED = 0
JOB_STATE_QUEUED = 1
JOB_STATE_PENDING = 2
JOB_STATE_RUNNING = 3
JOB_STATE_SUCCEEDED = 4
JOB_STATE_FAILED = 5
JOB_STATE_CANCELLING = 6
JOB_STATE_CANCELLED = 7
JOB_STATE_PAUSED = 8
JOB_STATE_EXPIRED = 9
# Training job base information
class TrainingJob:
# Fully qualified resource name of the job.
resource_name: str
display_name: str
state: JobState
create_time: datetime
# start_time / end_time are None until the job starts / finishes.
start_time: Optional[datetime]
end_time: Optional[datetime]
# Populated when the job ends in error.
error: Optional[Status]
trial_count: Optional[int]
# Hyperparameter tuning trial
class Trial:
id: str
state: TrialState
# Parameter values chosen for this trial.
parameters: List[Parameter]
# Final (selected) measurement; None if the trial produced none.
final_measurement: Optional[Measurement]
# All intermediate measurements reported during the trial.
measurements: List[Measurement]
start_time: datetime
end_time: Optional[datetime]
# Parameter specification for hyperparameter tuning
class ParameterSpec:
parameter_id: str
# How the search scales the parameter (e.g. linear / log per example usage).
scale_type: ScaleType
# Child specs that apply only for certain values of this parameter.
conditional_parameter_specs: Optional[List[ConditionalParameterSpec]]
# Continuous range [min_value, max_value].
class DoubleParameterSpec(ParameterSpec):
min_value: float
max_value: float
# Integer range [min_value, max_value].
class IntegerParameterSpec(ParameterSpec):
min_value: int
max_value: int
# Choice among string categories.
class CategoricalParameterSpec(ParameterSpec):
values: List[str]
class DiscreteParameterSpec(ParameterSpec):
values: List[float]

Automatic training termination when performance plateaus, saving time and compute costs.
# AutoML with early stopping disabled
job = aiplatform.AutoMLTabularTrainingJob(
display_name="training-no-early-stop",
optimization_prediction_type="classification",
disable_early_stopping=True
)Integrated experiment tracking and visualization with Tensorboard.
# Create Tensorboard instance
tensorboard = aiplatform.Tensorboard.create(display_name="my-tensorboard")
# Use with a training job: pass the Tensorboard resource name to run()
job.run(
dataset=dataset,
tensorboard=tensorboard.resource_name,
# Training metrics will be automatically logged
)

Define custom optimization objectives for AutoML training.
# Custom optimization objective: maximize precision while holding recall at 0.8.
job = aiplatform.AutoMLTabularTrainingJob(
display_name="custom-optimization",
optimization_prediction_type="classification",
optimization_objective="maximize-precision-at-recall",
optimization_objective_recall_value=0.8 # Precision at 80% recall
)

Fine-grained control over compute resources, storage, and networking.
# Training with specific resource requirements
job.run(
replica_count=4, # 4 training replicas
machine_type="n1-highmem-8", # High memory machines
accelerator_type="NVIDIA_TESLA_T4",
accelerator_count=1, # one T4 per replica
boot_disk_type="pd-ssd",
boot_disk_size_gb=200,
# VPC network for private connectivity during training.
network="projects/my-project/global/networks/my-vpc"
)

This comprehensive training system supports the full spectrum of ML training needs from automated AutoML to highly customized distributed training with enterprise-grade features for production ML workflows.
Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-aiplatform