tessl/pypi-apache-airflow-providers-google

Provider package for Google services integration with Apache Airflow, including Google Ads, Google Cloud (GCP), Google Firebase, Google LevelDB, Google Marketing Platform, and Google Workspace

docs/gcp-services.md

Google Cloud Platform Services

Integration with Google Cloud Platform, providing hooks, operators, and sensors for create, read, update, and delete operations, batch processing, real-time streaming, and machine learning across 40+ Google Cloud services.

Capabilities

BigQuery

Data warehouse and analytics platform integration with support for datasets, tables, jobs, and complex queries.

class BigQueryHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        use_legacy_sql: bool = True,
        location: Optional[str] = None,
        priority: str = "INTERACTIVE",
        job_id: Optional[str] = None,
        job_retry: Optional[Retry] = None,
        job_timeout: Optional[float] = None,
        **kwargs
    ): ...
    
    def get_service(self): ...
    def create_empty_table(
        self,
        project_id: str,
        dataset_id: str,
        table_id: str,
        schema_fields: Optional[List[Dict]] = None,
        time_partitioning: Optional[Dict] = None,
        cluster_fields: Optional[List[str]] = None,
        labels: Optional[Dict] = None,
        view: Optional[Dict] = None,
        materialized_view: Optional[Dict] = None,
        encryption_configuration: Optional[Dict] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        exists_ok: bool = False
    ): ...
    def create_empty_dataset(
        self,
        dataset_id: str,
        project_id: str,
        location: Optional[str] = None,
        dataset_reference: Optional[Dict] = None,
        exists_ok: bool = False
    ): ...
    def insert_job(
        self,
        configuration: Dict,
        project_id: str,
        location: Optional[str] = None,
        job_id: Optional[str] = None,
        timeout: Optional[float] = None,
        retry: Optional[Retry] = None,
        nowait: bool = False
    ): ...

class BigQueryAsyncHook(GoogleBaseAsyncHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    async def get_job_status(self, job_id: str, project_id: str, location: str): ...

class BigQueryCreateDatasetOperator(BaseOperator):
    def __init__(
        self,
        dataset_id: str,
        project_id: Optional[str] = None,
        dataset_reference: Optional[Dict] = None,
        location: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        exists_ok: bool = False,
        **kwargs
    ): ...

class BigQueryCreateEmptyTableOperator(BaseOperator):
    def __init__(
        self,
        dataset_id: str,
        table_id: str,
        project_id: Optional[str] = None,
        schema_fields: Optional[List] = None,
        gcs_schema_object: Optional[str] = None,
        time_partitioning: Optional[Dict] = None,
        bigquery_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class BigQueryInsertJobOperator(BaseOperator):
    def __init__(
        self,
        configuration: Dict,
        project_id: Optional[str] = None,
        location: Optional[str] = None,
        job_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class BigQueryTableExistenceSensor(BaseSensorOperator):
    def __init__(
        self,
        project_id: str,
        dataset_id: str,
        table_id: str,
        bigquery_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
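
For orientation, a minimal DAG sketch wiring the classes above together. Class names follow this document; module paths assume the provider's usual airflow.providers.google.cloud layout, and the project, dataset, and table IDs are placeholder assumptions:

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateDatasetOperator,
    BigQueryInsertJobOperator,
)
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTableExistenceSensor

with DAG("bigquery_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    # Create the target dataset, ignoring the call if it already exists
    create_dataset = BigQueryCreateDatasetOperator(
        task_id="create_dataset",
        dataset_id="analytics",
        project_id="my-project",  # placeholder project ID
        exists_ok=True,
    )

    # Run a query job; the configuration dict follows the BigQuery jobs API shape
    run_query = BigQueryInsertJobOperator(
        task_id="run_query",
        configuration={
            "query": {
                "query": "SELECT 1 AS ok",
                "useLegacySql": False,
            }
        },
        project_id="my-project",
    )

    # Block until the destination table is visible
    wait_for_table = BigQueryTableExistenceSensor(
        task_id="wait_for_table",
        project_id="my-project",
        dataset_id="analytics",
        table_id="events",
    )

    create_dataset >> run_query >> wait_for_table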

Google Cloud Storage (GCS)

Object storage service integration for bucket and object management, with support for lifecycle policies and access controls.

class GCSHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def get_conn(self): ...
    def list(
        self,
        bucket_name: str,
        versions: Optional[bool] = None,
        max_results: Optional[int] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None
    ): ...
    def exists(self, bucket_name: str, object_name: str): ...
    def upload(
        self,
        bucket_name: str,
        object_name: str,
        filename: Optional[str] = None,
        data: Optional[Union[str, bytes]] = None,
        mime_type: Optional[str] = None,
        gzip: bool = False,
        encoding: str = "utf-8",
        chunk_size: Optional[int] = None,
        timeout: Optional[float] = None,
        num_max_attempts: int = 1
    ): ...
    def download(
        self,
        bucket_name: str,
        object_name: str,
        filename: Optional[str] = None,
        chunk_size: int = 104857600,
        timeout: Optional[float] = None,
        num_max_attempts: int = 1
    ): ...

class GCSCreateBucketOperator(BaseOperator):
    def __init__(
        self,
        bucket_name: str,
        project_id: Optional[str] = None,
        storage_class: str = "MULTI_REGIONAL",
        location: str = "US",
        labels: Optional[Dict] = None,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class GCSDeleteBucketOperator(BaseOperator):
    def __init__(
        self,
        bucket_name: str,
        force: bool = True,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class GCSObjectExistenceSensor(BaseSensorOperator):
    def __init__(
        self,
        bucket: str,
        object: str,
        google_cloud_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
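
A short sketch combining the bucket operator, a direct GCSHook upload inside a TaskFlow task, and the existence sensor. Bucket and object names are placeholders, and module paths assume the standard provider layout:

from datetime import datetime

from airflow import DAG
from airflow.decorators import task
from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.providers.google.cloud.operators.gcs import GCSCreateBucketOperator
from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor

with DAG("gcs_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    create_bucket = GCSCreateBucketOperator(
        task_id="create_bucket",
        bucket_name="my-example-bucket",  # placeholder bucket
        storage_class="STANDARD",
        location="US",
    )

    @task
    def upload_report():
        # Upload an in-memory string as an object using the hook directly
        hook = GCSHook(gcp_conn_id="google_cloud_default")
        hook.upload(
            bucket_name="my-example-bucket",
            object_name="reports/daily.csv",
            data="id,value\n1,42\n",
            mime_type="text/csv",
        )

    wait_for_object = GCSObjectExistenceSensor(
        task_id="wait_for_object",
        bucket="my-example-bucket",
        object="reports/daily.csv",
    )

    create_bucket >> upload_report() >> wait_for_object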

Dataproc

Managed Apache Spark and Hadoop service integration for big data processing.

class DataprocHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def create_cluster(
        self,
        project_id: str,
        region: str,
        cluster_name: str,
        cluster_config: Dict,
        labels: Optional[Dict] = None,
        request_id: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def delete_cluster(
        self,
        project_id: str,
        region: str,
        cluster_name: str,
        cluster_uuid: Optional[str] = None,
        request_id: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def submit_job(
        self,
        project_id: str,
        region: str,
        job: Dict,
        request_id: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...

class DataprocCreateClusterOperator(BaseOperator):
    def __init__(
        self,
        cluster_name: str,
        project_id: Optional[str] = None,
        region: str = "global",
        cluster_config: Optional[Dict] = None,
        labels: Optional[Dict] = None,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class DataprocSubmitJobOperator(BaseOperator):
    def __init__(
        self,
        job: Dict,
        project_id: Optional[str] = None,
        region: str = "global",
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
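
A minimal sketch of a create-then-submit flow with the operators above. The cluster configuration and PySpark job dicts follow the Dataproc API resource shapes; project, bucket, and cluster names are placeholder assumptions:

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocCreateClusterOperator,
    DataprocSubmitJobOperator,
)

CLUSTER_CONFIG = {
    # Small cluster for illustration; size machine types for real workloads
    "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-2"},
    "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-2"},
}

PYSPARK_JOB = {
    "reference": {"project_id": "my-project"},
    "placement": {"cluster_name": "etl-cluster"},
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/jobs/transform.py"},
}

with DAG("dataproc_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        cluster_name="etl-cluster",
        project_id="my-project",
        region="us-central1",
        cluster_config=CLUSTER_CONFIG,
    )

    submit_job = DataprocSubmitJobOperator(
        task_id="submit_pyspark",
        job=PYSPARK_JOB,
        project_id="my-project",
        region="us-central1",
    )

    create_cluster >> submit_job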

Dataflow

Stream and batch data processing service integration using Apache Beam.

class DataflowHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def start_java_dataflow(
        self,
        job_name: str,
        variables: Dict,
        jar: str,
        project_id: str,
        job_class: Optional[str] = None,
        append_job_name: bool = True,
        multiple_jobs: bool = False,
        on_new_job_id_callback: Optional[Callable[[str], None]] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION
    ): ...
    def start_python_dataflow(
        self,
        job_name: str,
        variables: Dict,
        dataflow: str,
        py_options: List[str],
        project_id: str,
        append_job_name: bool = True,
        py_interpreter: str = "python3",
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        location: str = DEFAULT_DATAFLOW_LOCATION
    ): ...

class DataflowCreateJavaJobOperator(BaseOperator):
    def __init__(
        self,
        jar: str,
        job_name: str = "{{task.task_id}}",
        dataflow_default_options: Optional[Dict] = None,
        options: Optional[Dict] = None,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...

class DataflowCreatePythonJobOperator(BaseOperator):
    def __init__(
        self,
        py_file: str,
        job_name: str = "{{task.task_id}}",
        dataflow_default_options: Optional[Dict] = None,
        options: Optional[Dict] = None,
        py_interpreter: str = "python3",
        py_options: Optional[List[str]] = None,
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
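
A sketch of launching a Python (Apache Beam) pipeline with the operator above. The pipeline file, bucket, and project are placeholders, and the options shown are ordinary Beam pipeline options, not an exhaustive list:

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataflow import DataflowCreatePythonJobOperator

with DAG("dataflow_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    run_pipeline = DataflowCreatePythonJobOperator(
        task_id="run_beam_pipeline",
        py_file="gs://my-bucket/pipelines/wordcount.py",  # placeholder Beam pipeline
        job_name="wordcount-{{ ds_nodash }}",
        py_interpreter="python3",
        py_requirements=["apache-beam[gcp]"],
        py_system_site_packages=False,
        dataflow_default_options={
            "project": "my-project",
            "temp_location": "gs://my-bucket/tmp/",
        },
        location="us-central1",
    )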

Vertex AI

Google Cloud's unified ML platform integration for training, deployment, and management of machine learning models.

class VertexAIHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def create_custom_training_job(
        self,
        project_id: str,
        region: str,
        display_name: str,
        script_path: str,
        container_uri: str,
        requirements: Optional[Sequence[str]] = None,
        model_serving_container_uri: Optional[str] = None,
        model_serving_container_predict_route: Optional[str] = None,
        model_serving_container_health_route: Optional[str] = None,
        model_serving_container_command: Optional[Sequence[str]] = None,
        model_serving_container_args: Optional[Sequence[str]] = None,
        model_serving_container_environment_variables: Optional[Dict[str, str]] = None,
        model_serving_container_ports: Optional[Sequence[Dict[str, str]]] = None,
        model_description: Optional[str] = None,
        model_instance_schema_uri: Optional[str] = None,
        model_parameters_schema_uri: Optional[str] = None,
        model_prediction_schema_uri: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        training_encryption_spec_key_name: Optional[str] = None,
        model_encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        **kwargs
    ): ...

class CreateCustomTrainingJobOperator(BaseOperator):
    def __init__(
        self,
        project_id: str,
        region: str,
        display_name: str,
        script_path: str,
        container_uri: str,
        model_serving_container_uri: Optional[str] = None,
        requirements: Optional[Sequence[str]] = None,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
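
A hedged sketch of a custom training task using the operator as listed above. The training script path, container images, and project details are illustrative assumptions:

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.vertex_ai.custom_job import (
    CreateCustomTrainingJobOperator,
)

with DAG("vertex_ai_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    train_model = CreateCustomTrainingJobOperator(
        task_id="train_model",
        project_id="my-project",
        region="us-central1",
        display_name="taxi-fare-model",
        script_path="trainer/task.py",  # local training script, placeholder
        container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-12:latest",
        model_serving_container_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest",
        requirements=["pandas", "scikit-learn"],
    )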

Pub/Sub

Messaging service integration for real-time event streaming and asynchronous communication.

class PubSubHook(GoogleBaseHook):
    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def create_topic(
        self,
        project_id: str,
        topic: str,
        labels: Optional[Dict[str, str]] = None,
        message_retention_duration: Optional[str] = None,
        kms_key_name: Optional[str] = None,
        schema_settings: Optional[Dict] = None,
        message_storage_policy: Optional[Dict] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def create_subscription(
        self,
        project_id: str,
        topic: str,
        subscription: str,
        subscription_project_id: Optional[str] = None,
        ack_deadline_secs: int = 10,
        fail_if_exists: bool = False,
        push_config: Optional[Dict] = None,
        retain_acked_messages: Optional[bool] = None,
        message_retention_duration: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
        enable_message_ordering: bool = False,
        expiration_policy: Optional[Dict] = None,
        filter_: Optional[str] = None,
        dead_letter_policy: Optional[Dict] = None,
        retry_policy: Optional[Dict] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def publish(
        self,
        project_id: str,
        topic: str,
        messages: List[Dict],
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...

class PubSubPullSensor(BaseSensorOperator):
    def __init__(
        self,
        project_id: str,
        subscription: str,
        max_messages: int = 5,
        ack_messages: bool = False,
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
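
A sketch pairing the hook and sensor above: one task provisions a topic and subscription and publishes a message, while the sensor waits for it to arrive. Topic, subscription, and project names are placeholders:

from datetime import datetime

from airflow import DAG
from airflow.decorators import task
from airflow.providers.google.cloud.hooks.pubsub import PubSubHook
from airflow.providers.google.cloud.sensors.pubsub import PubSubPullSensor

with DAG("pubsub_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:

    @task
    def publish_events():
        hook = PubSubHook(gcp_conn_id="google_cloud_default")
        hook.create_topic(project_id="my-project", topic="orders")
        hook.create_subscription(
            project_id="my-project",
            topic="orders",
            subscription="orders-sub",
            ack_deadline_secs=30,
        )
        # Each message is a dict in PubsubMessage form; "data" carries the bytes payload
        hook.publish(
            project_id="my-project",
            topic="orders",
            messages=[{"data": b"order-created"}],
        )

    wait_for_message = PubSubPullSensor(
        task_id="wait_for_message",
        project_id="my-project",
        subscription="orders-sub",
        max_messages=5,
        ack_messages=True,
    )

    publish_events() >> wait_for_message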

Cloud SQL

Managed relational database service integration supporting MySQL, PostgreSQL, and SQL Server.

class CloudSQLHook(GoogleBaseHook):
    def __init__(
        self,
        api_version: str = "v1beta4",
        gcp_conn_id: str = "google_cloud_default",
        **kwargs
    ): ...
    
    def create_instance(
        self,
        project_id: str,
        body: Dict,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def patch_instance(
        self,
        project_id: str,
        body: Dict,
        instance: str,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def delete_instance(
        self,
        project_id: str,
        instance: str,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
    def create_database(
        self,
        project_id: str,
        instance: str,
        body: Dict,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None
    ): ...
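
A minimal sketch of driving the hook from a TaskFlow task. The request bodies follow the Cloud SQL Admin API resource shapes; instance name, tier, and project are placeholder assumptions:

from airflow.decorators import task
from airflow.providers.google.cloud.hooks.cloud_sql import CloudSQLHook

@task
def provision_postgres():
    hook = CloudSQLHook(api_version="v1beta4", gcp_conn_id="google_cloud_default")
    # Create a PostgreSQL instance, then a database inside it
    hook.create_instance(
        project_id="my-project",
        body={
            "name": "reporting-db",
            "databaseVersion": "POSTGRES_15",
            "region": "us-central1",
            "settings": {"tier": "db-custom-1-3840"},
        },
    )
    hook.create_database(
        project_id="my-project",
        instance="reporting-db",
        body={"name": "analytics"},
    )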

Types

from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
from google.api_core.retry import Retry
from airflow.models import BaseOperator
from airflow.sensors.base import BaseSensorOperator

# BigQuery specific types
BigQueryJob = Dict[str, Any]
BigQueryTable = Dict[str, Any]
BigQuerySchema = List[Dict[str, str]]

# GCS specific types
GCSObject = Dict[str, Any]
GCSBucket = Dict[str, Any]

# Dataproc specific types
DataprocCluster = Dict[str, Any]
DataprocJob = Dict[str, Any]

# Common GCP types
GcpResource = Dict[str, Any]
ResourceLabels = Dict[str, str]
OperationResult = Dict[str, Any]
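
For illustration, a value matching the BigQuerySchema alias as it would be passed to schema_fields; field names are placeholders:

# Standard BigQuery schema field entries: name, type, and mode
schema: BigQuerySchema = [
    {"name": "id", "type": "INTEGER", "mode": "REQUIRED"},
    {"name": "payload", "type": "STRING", "mode": "NULLABLE"},
]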

Install with Tessl CLI

npx tessl i tessl/pypi-apache-airflow-providers-google

docs

common-utilities.md
data-transfers.md
firebase.md
gcp-services.md
google-ads.md
google-workspace.md
index.md
leveldb.md
marketing-platform.md

tile.json