Model Training

Comprehensive training capabilities covering AutoML, custom training jobs, and distributed training, with full lifecycle management. The Vertex AI SDK offers both automated machine learning and flexible custom training options across a range of ML tasks and frameworks.

Capabilities

AutoML Training Jobs

Automated machine learning with minimal configuration for tabular, image, text, video, and forecasting tasks.

class AutoMLTabularTrainingJob:
    def __init__(
        self,
        display_name: str,
        optimization_prediction_type: str,
        optimization_objective: Optional[str] = None,
        column_specs: Optional[Dict[str, str]] = None,
        column_transformations: Optional[List[Dict[str, Any]]] = None,
        optimization_objective_recall_value: Optional[float] = None,
        optimization_objective_precision_value: Optional[float] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TabularDataset,
        target_column: str,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        predefined_split_column_name: Optional[str] = None,
        timestamp_split_column_name: Optional[str] = None,
        weight_column: Optional[str] = None,
        budget_milli_node_hours: int = 1000,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        model_id: Optional[str] = None,
        parent_model: Optional[str] = None,
        is_default_version: bool = True,
        model_version_aliases: Optional[Sequence[str]] = None,
        model_version_description: Optional[str] = None,
        disable_early_stopping: bool = False,
        export_evaluated_data_items: bool = False,
        export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None,
        export_evaluated_data_items_override_destination: bool = False,
        additional_experiments: Optional[List[str]] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        **kwargs
    ) -> Model: ...

    @property
    def state(self) -> JobState: ...
    @property
    def model(self) -> Optional[Model]: ...

Usage Examples

AutoML tabular classification:

import google.cloud.aiplatform as aiplatform

aiplatform.init(project='my-project', location='us-central1')

# Create dataset
dataset = aiplatform.TabularDataset.create(
    display_name="customer-data",
    gcs_source="gs://my-bucket/customer_data.csv"
)

# Create and run training job
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="customer-classification",
    optimization_prediction_type="classification",
    optimization_objective="maximize-au-prc"
)

model = job.run(
    dataset=dataset,
    target_column="label",
    training_fraction_split=0.7,
    validation_fraction_split=0.15,
    test_fraction_split=0.15,
    budget_milli_node_hours=2000
)

print(f"Training completed. Model: {model.resource_name}")

AutoML Forecasting

Specialized time series forecasting with multiple model architectures and automatic feature engineering.

class AutoMLForecastingTrainingJob:
    def __init__(
        self,
        display_name: str,
        optimization_objective: Optional[str] = None,
        column_specs: Optional[Dict[str, str]] = None,
        column_transformations: Optional[List[Dict[str, Any]]] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        unavailable_at_forecast_columns: List[str],
        available_at_forecast_columns: List[str],
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        **kwargs
    ) -> Model: ...

class SequenceToSequencePlusForecastingTrainingJob:
    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        quantiles: Optional[List[float]] = None,
        validation_options: Optional[str] = None,
        **kwargs
    ) -> Model: ...

class TemporalFusionTransformerForecastingTrainingJob:
    def run(
        self,
        dataset: TimeSeriesDataset,
        target_column: str,
        time_column: str,
        time_series_identifier_column: str,
        forecast_horizon: int,
        data_granularity_unit: str,
        data_granularity_count: int,
        quantiles: Optional[List[float]] = None,
        context_window: Optional[int] = None,
        **kwargs
    ) -> Model: ...
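
Usage Examples

AutoML forecasting (a sketch; sales_dataset and the column names sales, date, and store_id are hypothetical):

forecasting_job = aiplatform.AutoMLForecastingTrainingJob(
    display_name="sales-forecasting",
    optimization_objective="minimize-rmse"
)

model = forecasting_job.run(
    dataset=sales_dataset,
    target_column="sales",
    time_column="date",
    time_series_identifier_column="store_id",
    unavailable_at_forecast_columns=["sales"],
    available_at_forecast_columns=["date"],
    forecast_horizon=30,  # predict 30 periods ahead
    data_granularity_unit="day",
    data_granularity_count=1
)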

AutoML Vision and NLP

Automated training for image, text, and video understanding tasks.

class AutoMLImageTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str = "classification",
        multi_label: bool = False,
        model_type: str = "CLOUD",
        base_model: Optional[Model] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: ImageDataset,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        budget_milli_node_hours: int = 8000,
        disable_early_stopping: bool = False,
        **kwargs
    ) -> Model: ...

class AutoMLTextTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str,
        multi_label: bool = False,
        sentiment_max: int = 10,
        **kwargs
    ): ...

    def run(
        self,
        dataset: TextDataset,
        **kwargs
    ) -> Model: ...

class AutoMLVideoTrainingJob:
    def __init__(
        self,
        display_name: str,
        prediction_type: str = "classification",
        model_type: str = "CLOUD",
        **kwargs
    ): ...

    def run(
        self,
        dataset: VideoDataset,
        **kwargs
    ) -> Model: ...
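
Usage Examples

AutoML image classification (a sketch; flower_dataset is a hypothetical ImageDataset created beforehand):

image_job = aiplatform.AutoMLImageTrainingJob(
    display_name="flower-classification",
    prediction_type="classification",
    multi_label=False,
    model_type="CLOUD"
)

model = image_job.run(
    dataset=flower_dataset,
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    budget_milli_node_hours=8000
)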

Custom Training Jobs

Flexible custom training with support for any ML framework and distributed training configurations.

class CustomTrainingJob:
    def __init__(
        self,
        display_name: str,
        script_path: str,
        container_uri: str,
        requirements: Optional[Sequence[str]] = None,
        model_serving_container_image_uri: Optional[str] = None,
        model_serving_container_predict_route: Optional[str] = None,
        model_serving_container_health_route: Optional[str] = None,
        model_serving_container_command: Optional[Sequence[str]] = None,
        model_serving_container_args: Optional[Sequence[str]] = None,
        model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
        model_serving_container_ports: Optional[Sequence[int]] = None,
        model_description: Optional[str] = None,
        model_instance_schema_uri: Optional[str] = None,
        model_parameters_schema_uri: Optional[str] = None,
        model_prediction_schema_uri: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        annotation_schema_uri: Optional[str] = None,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        base_output_dir: Optional[str] = None,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        bigquery_destination: Optional[str] = None,
        args: Optional[List[str]] = None,
        environment_variables: Optional[Dict[str, str]] = None,
        replica_count: int = 1,
        machine_type: str = "n1-standard-4",
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        boot_disk_type: str = "pd-ssd",
        boot_disk_size_gb: int = 100,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        training_filter_split: Optional[str] = None,
        validation_filter_split: Optional[str] = None,
        test_filter_split: Optional[str] = None,
        predefined_split_column_name: Optional[str] = None,
        timestamp_split_column_name: Optional[str] = None,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        persistent_resource_id: Optional[str] = None,
        **kwargs
    ) -> Optional[Model]: ...

Usage Examples

Custom Python package training:

job = aiplatform.CustomTrainingJob(
    display_name="custom-sklearn-training",
    script_path="train.py",
    container_uri="gcr.io/cloud-aiplatform/training/scikit-learn-cpu.0-23:latest",
    requirements=["scikit-learn==0.23.2", "pandas>=1.0.0"],
    model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-23:latest"
)

model = job.run(
    dataset=dataset,
    replica_count=1,
    machine_type="n1-standard-4",
    args=["--epochs", "100", "--batch-size", "32"],
    environment_variables={"LEARNING_RATE": "0.001"}
)
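
The script named in script_path runs inside the training container, where Vertex AI injects environment variables such as AIP_MODEL_DIR (the Cloud Storage directory where model artifacts must be written). A minimal train.py sketch under that assumption; the data path and label column are hypothetical:

# train.py -- illustrative sketch; adapt the data loading and model to your task
import argparse
import os

import joblib
import pandas as pd
from google.cloud import storage
from sklearn.linear_model import LogisticRegression

parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch-size", type=int, default=32)
args = parser.parse_args()

# Passed via environment_variables in job.run()
learning_rate = float(os.environ.get("LEARNING_RATE", "0.01"))

# Vertex AI sets AIP_MODEL_DIR to a gs:// path where artifacts must land
model_dir = os.environ["AIP_MODEL_DIR"]

# Hypothetical training data; reading gs:// paths with pandas requires gcsfs
df = pd.read_csv("gs://my-bucket/customer_data.csv")
model = LogisticRegression(max_iter=1000).fit(df.drop(columns=["label"]), df["label"])

# Save locally, then copy into AIP_MODEL_DIR so Vertex AI can pick up the model
joblib.dump(model, "model.joblib")
bucket_name, _, prefix = model_dir.removeprefix("gs://").partition("/")
storage.Client().bucket(bucket_name).blob(prefix + "model.joblib").upload_from_filename("model.joblib")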

Custom Container Training

Training with custom Docker containers for maximum flexibility and framework support.

class CustomContainerTrainingJob:
    def __init__(
        self,
        display_name: str,
        container_uri: str,
        model_serving_container_image_uri: Optional[str] = None,
        model_serving_container_predict_route: Optional[str] = None,
        model_serving_container_health_route: Optional[str] = None,
        model_serving_container_command: Optional[Sequence[str]] = None,
        model_serving_container_args: Optional[Sequence[str]] = None,
        model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
        model_serving_container_ports: Optional[Sequence[int]] = None,
        model_description: Optional[str] = None,
        model_instance_schema_uri: Optional[str] = None,
        model_parameters_schema_uri: Optional[str] = None,
        model_prediction_schema_uri: Optional[str] = None,
        explanation_metadata: Optional[explain.ExplanationMetadata] = None,
        explanation_parameters: Optional[explain.ExplanationParameters] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        model_display_name: Optional[str] = None,
        model_labels: Optional[Dict[str, str]] = None,
        base_output_dir: Optional[str] = None,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        bigquery_destination: Optional[str] = None,
        args: Optional[List[str]] = None,
        environment_variables: Optional[Dict[str, str]] = None,
        replica_count: int = 1,
        machine_type: str = "n1-standard-4",
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        boot_disk_type: str = "pd-ssd",
        boot_disk_size_gb: int = 100,
        training_fraction_split: Optional[float] = None,
        validation_fraction_split: Optional[float] = None,
        test_fraction_split: Optional[float] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        enable_dashboard_access: bool = False,
        tensorboard: Optional[str] = None,
        reduce_image_size: bool = False,
        **kwargs
    ) -> Optional[Model]: ...
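
Usage Examples

Custom container training (a sketch; the Artifact Registry image URIs are hypothetical):

container_job = aiplatform.CustomContainerTrainingJob(
    display_name="pytorch-container-training",
    container_uri="us-docker.pkg.dev/my-project/training/pytorch:latest",
    model_serving_container_image_uri="us-docker.pkg.dev/my-project/serving/pytorch:latest"
)

model = container_job.run(
    replica_count=1,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    args=["--epochs", "50"]
)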

Custom Python Package Training

Training with a Python source distribution uploaded to Cloud Storage.

class CustomPythonPackageTrainingJob:
    def __init__(
        self,
        display_name: str,
        python_package_gcs_uri: str,
        python_module_name: str,
        container_uri: str,
        model_serving_container_image_uri: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        dataset: Optional[Dataset] = None,
        **kwargs
    ) -> Optional[Model]: ...
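
Usage Examples

Packaged training (a sketch; the GCS package URI and module name are hypothetical):

package_job = aiplatform.CustomPythonPackageTrainingJob(
    display_name="packaged-training",
    python_package_gcs_uri="gs://my-bucket/trainer-0.1.tar.gz",
    python_module_name="trainer.task",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-8:latest"
)

model = package_job.run(
    dataset=dataset,
    args=["--epochs", "20"]
)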

Hyperparameter Tuning

Automated hyperparameter optimization with various search algorithms and early stopping.

class HyperparameterTuningJob:
    def __init__(
        self,
        display_name: str,
        custom_job: Union[CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob],
        metric_spec: Dict[str, str],
        parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
        max_trial_count: int,
        parallel_trial_count: int,
        max_failed_trial_count: int = 0,
        search_algorithm: Optional[str] = None,
        measurement_selection: str = "best",
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
        **kwargs
    ): ...

    def run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        **kwargs
    ) -> None: ...

    @property
    def trials(self) -> List[Trial]: ...
    @property
    def best_trial(self) -> Optional[Trial]: ...

Usage Examples

Hyperparameter tuning:

from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Define custom training job
custom_job = aiplatform.CustomTrainingJob(
    display_name="hyperparameter-tuning-job",
    script_path="train.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-8:latest"
)

# Define hyperparameter space
hp_job = aiplatform.HyperparameterTuningJob(
    display_name="tune-learning-rate",
    custom_job=custom_job,
    metric_spec={
        "accuracy": "maximize",
        "loss": "minimize"
    },
    parameter_spec={
        "learning_rate": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "batch_size": hpt.IntegerParameterSpec(min=16, max=128, scale="linear"),
        "epochs": hpt.DiscreteParameterSpec(values=[50, 100, 200])
    },
    max_trial_count=20,
    parallel_trial_count=3
)

# Run tuning
hp_job.run()
print(f"Best trial: {hp_job.best_trial}")

Distributed Training

Multi-node and multi-GPU training support with various distribution strategies.

# Worker pool specifications for distributed training
class WorkerPoolSpec:
    machine_spec: MachineSpec
    replica_count: int
    container_spec: ContainerSpec
    disk_spec: Optional[DiskSpec]

# Distributed training with multiple worker pools
def run_distributed_training(
    worker_pool_specs: List[WorkerPoolSpec],
    base_output_dir: str,
    **kwargs
) -> Optional[Model]: ...

Usage Examples

Multi-GPU training:

# Define worker pool with GPUs
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-16",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 4
        },
        "replica_count": 2,
        "container_spec": {
            "image_uri": "gcr.io/my-project/training:latest",
            "args": ["--distributed", "--num-gpus=4"]
        }
    }
]

job = aiplatform.CustomJob(
    display_name="distributed-training",
    worker_pool_specs=worker_pool_specs
)

job.run()
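
Inside each replica, the container can discover its role from the CLUSTER_SPEC environment variable that Vertex AI injects into distributed jobs (a minimal sketch; see the Vertex AI documentation for the full JSON layout):

import json
import os

# CLUSTER_SPEC describes the worker pools and this replica's place in them
cluster_spec = json.loads(os.environ.get("CLUSTER_SPEC", "{}"))
task = cluster_spec.get("task", {})
print(f"Running as {task.get('type')} replica {task.get('index')}")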

Types

# Job state enumeration
class JobState(Enum):
    JOB_STATE_UNSPECIFIED = 0
    JOB_STATE_QUEUED = 1
    JOB_STATE_PENDING = 2
    JOB_STATE_RUNNING = 3
    JOB_STATE_SUCCEEDED = 4
    JOB_STATE_FAILED = 5
    JOB_STATE_CANCELLING = 6
    JOB_STATE_CANCELLED = 7
    JOB_STATE_PAUSED = 8
    JOB_STATE_EXPIRED = 9

# Training job base information
class TrainingJob:
    resource_name: str
    display_name: str
    state: JobState
    create_time: datetime
    start_time: Optional[datetime]
    end_time: Optional[datetime]
    error: Optional[Status]
    trial_count: Optional[int]

# Hyperparameter tuning trial
class Trial:
    id: str
    state: TrialState
    parameters: List[Parameter]
    final_measurement: Optional[Measurement]
    measurements: List[Measurement]
    start_time: datetime
    end_time: Optional[datetime]

# Parameter specification for hyperparameter tuning
class ParameterSpec:
    parameter_id: str
    scale_type: ScaleType
    conditional_parameter_specs: Optional[List[ConditionalParameterSpec]]

class DoubleParameterSpec(ParameterSpec):
    min_value: float
    max_value: float

class IntegerParameterSpec(ParameterSpec):
    min_value: int
    max_value: int

class CategoricalParameterSpec(ParameterSpec):
    values: List[str]

class DiscreteParameterSpec(ParameterSpec):
    values: List[float]
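
Non-blocking runs: with sync=False, run() returns without waiting, and the job can be polled through its state property using the JobState names above (a sketch, assuming a job configured as in the earlier examples):

import time

model = job.run(dataset=dataset, target_column="label", sync=False)

# Poll until the job reaches a terminal state
terminal = {"JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}
while job.state.name not in terminal:
    time.sleep(60)
print(f"Final state: {job.state.name}")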

Advanced Features

Early Stopping

Automatic training termination when performance plateaus, saving time and compute costs.

# disable_early_stopping is a run() parameter, not a constructor argument
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="training-no-early-stop",
    optimization_prediction_type="classification"
)

model = job.run(
    dataset=dataset,
    target_column="label",
    disable_early_stopping=True
)

Training with TensorBoard

Integrated experiment tracking and visualization with Vertex AI TensorBoard.

# Create a Vertex AI TensorBoard instance
tensorboard = aiplatform.Tensorboard.create(display_name="my-tensorboard")

# Use with a training job; a service account is required when tensorboard is set
job.run(
    dataset=dataset,
    tensorboard=tensorboard.resource_name,
    service_account="my-sa@my-project.iam.gserviceaccount.com",
    # The training script should write TensorBoard logs to the directory given
    # by the AIP_TENSORBOARD_LOG_DIR environment variable
)

Custom Metrics and Objectives

Define custom optimization objectives for AutoML training.

# Custom optimization objective
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="custom-optimization",
    optimization_prediction_type="classification",
    optimization_objective="maximize-precision-at-recall",
    optimization_objective_recall_value=0.8  # Precision at 80% recall
)

Resource Management

Fine-grained control over compute resources, storage, and networking.

# Training with specific resource requirements
job.run(
    replica_count=4,  # 4 training replicas
    machine_type="n1-highmem-8",  # High memory machines
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=200,
    network="projects/my-project/global/networks/my-vpc"
)

Together, these capabilities cover the full spectrum of ML training needs, from automated AutoML to highly customized distributed training, with enterprise-grade features for production ML workflows.
