CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-bigquery

Google BigQuery API client library for Python, providing comprehensive data-warehouse and analytics capabilities.

Pending
Overview
Eval results
Files

dataset-management.mddocs/

Dataset Management

Dataset creation, configuration, access control, and metadata management. Datasets serve as logical containers for tables, views, models, and routines within BigQuery, providing organization and access control boundaries.

Capabilities

Dataset Definition

Create and configure BigQuery datasets with metadata and access controls.

class Dataset:
    """Mutable client-side representation of a BigQuery dataset.

    Bundles the dataset's reference together with configurable metadata
    (location, friendly name, description, labels, default expirations)
    and its access-control entries. Changes are persisted via the client
    (e.g. ``client.create_dataset`` / ``client.update_dataset``).
    """

    def __init__(self, dataset_ref: Union["DatasetReference", str]):
        """
        Initialize a Dataset.

        Args:
            dataset_ref: Dataset reference string or DatasetReference object.
        """

    @property
    def reference(self) -> "DatasetReference":
        """Dataset reference object."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def project(self) -> str:
        """Project ID containing the dataset."""

    @property
    def location(self) -> str:
        """Geographic location of the dataset."""

    @location.setter
    def location(self, value: str) -> None: ...

    @property
    def friendly_name(self) -> str:
        """Human-readable dataset name."""

    @friendly_name.setter
    def friendly_name(self, value: str) -> None: ...

    @property
    def description(self) -> str:
        """Dataset description."""

    @description.setter
    def description(self, value: str) -> None: ...

    @property
    def labels(self) -> Dict[str, str]:
        """Labels for organizing and filtering datasets."""

    @labels.setter
    def labels(self, value: Dict[str, str]) -> None: ...

    @property
    def access_entries(self) -> List["AccessEntry"]:
        """Access control entries for the dataset."""

    @access_entries.setter
    def access_entries(self, value: List["AccessEntry"]) -> None: ...

    @property
    def created(self) -> datetime.datetime:
        """Dataset creation timestamp."""

    @property
    def modified(self) -> datetime.datetime:
        """Dataset last modification timestamp."""

    @property
    def default_table_expiration_ms(self) -> int:
        """Default table expiration in milliseconds."""

    @default_table_expiration_ms.setter
    def default_table_expiration_ms(self, value: int) -> None: ...

    @property
    def default_partition_expiration_ms(self) -> int:
        """Default partition expiration in milliseconds."""

    @default_partition_expiration_ms.setter
    def default_partition_expiration_ms(self, value: int) -> None: ...

Dataset Reference

Reference datasets by project and dataset ID for API operations.

class DatasetReference:
    """Immutable pointer to a dataset, identified by project and dataset ID.

    Used wherever an API operation needs to name a dataset without
    carrying its full metadata.
    """

    def __init__(self, project: str, dataset_id: str):
        """
        Reference to a BigQuery dataset.

        Args:
            project: Project ID.
            dataset_id: Dataset ID.
        """

    @property
    def project(self) -> str:
        """Project ID."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def path(self) -> str:
        """Full dataset path (project:dataset_id)."""

    @classmethod
    def from_string(cls, dataset_id: str, default_project: Optional[str] = None) -> "DatasetReference":
        """
        Create DatasetReference from string.

        Args:
            dataset_id: Dataset ID, optionally prefixed with project.
            default_project: Default project if not specified in dataset_id.

        Returns:
            DatasetReference: Dataset reference object.
        """

Access Control

Manage dataset-level permissions and access control entries.

class AccessEntry:
    """One entry in a dataset's access-control list.

    Every field defaults to None; an entry typically populates a role plus
    exactly one entity field (user, group, domain, special group, view, or
    routine), so the corresponding getters may return None.
    """

    def __init__(
        self,
        role: Optional[str] = None,
        entity_type: Optional[str] = None,
        entity_id: Optional[str] = None,
        user_by_email: Optional[str] = None,
        group_by_email: Optional[str] = None,
        domain: Optional[str] = None,
        special_group: Optional[str] = None,
        view: Optional["TableReference"] = None,
        routine: Optional["RoutineReference"] = None,
    ):
        """
        Access control entry for dataset permissions.

        Args:
            role: IAM role (READER, WRITER, OWNER).
            entity_type: Type of entity (userByEmail, groupByEmail, etc.).
            entity_id: Entity identifier.
            user_by_email: User email address.
            group_by_email: Group email address.
            domain: Domain name.
            special_group: Special group (projectOwners, projectReaders, etc.).
            view: Authorized view reference.
            routine: Authorized routine reference.
        """

    @property
    def role(self) -> Optional[str]:
        """IAM role for this access entry."""

    @property
    def entity_type(self) -> Optional[str]:
        """Type of entity granted access."""

    @property
    def entity_id(self) -> Optional[str]:
        """Entity identifier."""

    @property
    def user_by_email(self) -> Optional[str]:
        """User email address."""

    @property
    def group_by_email(self) -> Optional[str]:
        """Group email address."""

    @property
    def domain(self) -> Optional[str]:
        """Domain name."""

    @property
    def special_group(self) -> Optional[str]:
        """Special group identifier."""

    @property
    def view(self) -> Optional["TableReference"]:
        """Authorized view reference."""

    @property
    def routine(self) -> Optional["RoutineReference"]:
        """Authorized routine reference."""

Usage Examples

Create and Configure Dataset

from google.cloud import bigquery

client = bigquery.Client()

# Fully-qualified dataset ID: "<project>.<dataset>".
dataset_id = "my_analytics_dataset"
dataset = bigquery.Dataset(f"{client.project}.{dataset_id}")

# Basic dataset metadata.
dataset.location = "US"
dataset.friendly_name = "Analytics Data"
dataset.description = "Dataset for analytics and reporting tables"

# Tables created in this dataset expire after 30 days by default.
THIRTY_DAYS_MS = 30 * 24 * 60 * 60 * 1000
dataset.default_table_expiration_ms = THIRTY_DAYS_MS

# Labels support cost tracking and dataset discovery.
dataset.labels = {
    "environment": "production",
    "team": "analytics",
    "cost_center": "engineering",
}

# Send the creation request; exists_ok makes the call idempotent.
dataset = client.create_dataset(dataset, exists_ok=True)
print(f"Created dataset {dataset.dataset_id}")

Access Control Management

# Assemble the dataset ACL as one list of AccessEntry objects.
access_entries = [
    # Read-only access for an individual analyst.
    bigquery.AccessEntry(
        role="READER",
        entity_type="userByEmail",
        entity_id="analyst@company.com"
    ),
    # Write access for the data-engineering group.
    bigquery.AccessEntry(
        role="WRITER",
        entity_type="groupByEmail",
        entity_id="data-engineers@company.com"
    ),
    # Ownership for every project owner.
    bigquery.AccessEntry(
        role="OWNER",
        entity_type="specialGroup",
        entity_id="projectOwners"
    ),
    # Read access for all users in the company domain.
    bigquery.AccessEntry(
        role="READER",
        entity_type="domain",
        entity_id="company.com"
    ),
]

# Assign the ACL and persist only the access_entries field.
dataset.access_entries = access_entries
dataset = client.update_dataset(dataset, ["access_entries"])

Authorized Views

# Create an authorized view for sensitive data access
sensitive_dataset = bigquery.Dataset(f"{client.project}.sensitive_data")
sensitive_dataset.location = "US"
sensitive_dataset = client.create_dataset(sensitive_dataset, exists_ok=True)

# Create a view in a public dataset that accesses sensitive data
public_dataset = bigquery.Dataset(f"{client.project}.public_reports")
public_dataset.location = "US"
public_dataset = client.create_dataset(public_dataset, exists_ok=True)

# Create the view
view_query = """
SELECT 
    user_id,
    anonymized_name,
    city,
    purchase_amount
FROM `{}.sensitive_data.customer_purchases`
WHERE purchase_amount > 100
""".format(client.project)

view = bigquery.Table(f"{client.project}.public_reports.high_value_purchases")
view.view_query = view_query
view = client.create_table(view)

# Authorize the view to access the sensitive dataset.
# NOTE(review): in the official google-cloud-bigquery library an
# authorized-view entry carries no role — confirm role="READER" is
# accepted by this API surface.
view_access = bigquery.AccessEntry(
    role="READER",
    entity_type="view",
    entity_id=view.reference
)

# The access_entries property returns a copy of the entry list, so
# appending to it in place is not reflected on the dataset. Copy,
# extend, then reassign so the new entry is actually sent to the API.
entries = list(sensitive_dataset.access_entries)
entries.append(view_access)
sensitive_dataset.access_entries = entries
sensitive_dataset = client.update_dataset(sensitive_dataset, ["access_entries"])

Dataset Listing and Discovery

# List all datasets in the project
print("Datasets in project:")
for dataset_item in client.list_datasets():
    # list_datasets() yields lightweight list items that omit fields such
    # as location; fetch the full dataset before reading them.
    dataset = client.get_dataset(dataset_item.reference)
    print(f"  {dataset.dataset_id}")
    print(f"    Location: {dataset.location}")
    print(f"    Tables: {len(list(client.list_tables(dataset)))}")

# List datasets with filtering
print("\nProduction datasets:")
for dataset_item in client.list_datasets():
    full_dataset = client.get_dataset(dataset_item.reference)
    if full_dataset.labels.get("environment") == "production":
        print(f"  {full_dataset.dataset_id}: {full_dataset.description}")

# Get detailed dataset information (bare dataset ID resolves against the
# client's default project)
dataset = client.get_dataset("my_analytics_dataset")
print(f"\nDataset: {dataset.dataset_id}")
print(f"Location: {dataset.location}")
print(f"Created: {dataset.created}")
print(f"Last Modified: {dataset.modified}")
print(f"Tables: {len(list(client.list_tables(dataset)))}")
print("Access Entries:")
for entry in dataset.access_entries:
    print(f"  {entry.role}: {entry.entity_type}({entry.entity_id})")

Dataset Metadata Management

# Fetch the current dataset metadata before editing it.
dataset = client.get_dataset("my_analytics_dataset")

# Refresh the description and merge two new labels into the existing set.
dataset.description = "Updated analytics dataset with customer data"
dataset.labels["updated"] = "2023-12-01"
dataset.labels["data_classification"] = "internal"

# Default expirations, expressed in milliseconds per day.
DAY_MS = 24 * 60 * 60 * 1000
dataset.default_table_expiration_ms = 60 * DAY_MS  # 60 days
dataset.default_partition_expiration_ms = 7 * DAY_MS  # 7 days

# Persist exactly the fields that changed.
changed_fields = [
    "description",
    "labels",
    "default_table_expiration_ms",
    "default_partition_expiration_ms",
]
dataset = client.update_dataset(dataset, changed_fields)

print("Dataset updated successfully")

Dataset Cleanup and Management

# List datasets older than 30 days with no recent activity
import datetime

# BigQuery `created`/`modified` timestamps are timezone-aware (UTC), so
# the cutoff must also be aware — comparing a naive datetime against an
# aware one raises TypeError.
cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)
unused_datasets = []

for dataset_list_item in client.list_datasets():
    dataset = client.get_dataset(dataset_list_item.reference)

    # Only consider datasets whose metadata hasn't changed in the window.
    if dataset.modified < cutoff_date:
        # An empty, stale dataset is immediately a candidate.
        tables = list(client.list_tables(dataset))
        if not tables:
            unused_datasets.append(dataset)
        else:
            # Otherwise require that every table is also stale.
            all_tables_old = True
            for table in tables:
                full_table = client.get_table(table.reference)
                if full_table.modified >= cutoff_date:
                    all_tables_old = False
                    break

            if all_tables_old:
                unused_datasets.append(dataset)

print(f"Found {len(unused_datasets)} potentially unused datasets")
for dataset in unused_datasets:
    print(f"  {dataset.dataset_id} (last modified: {dataset.modified})")

# Delete unused datasets (with per-dataset confirmation; destructive!)
for dataset in unused_datasets:
    confirm = input(f"Delete dataset {dataset.dataset_id}? (y/N): ")
    if confirm.lower() == 'y':
        client.delete_dataset(dataset, delete_contents=True)
        print(f"Deleted {dataset.dataset_id}")

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-bigquery

docs

client-operations.md

data-loading.md

database-api.md

dataset-management.md

index.md

models-routines.md

query-operations.md

query-parameters.md

schema-definition.md

table-operations.md

tile.json