Google BigQuery API client library for Python providing comprehensive data warehouse and analytics capabilities
—
Dataset creation, configuration, access control, and metadata management. Datasets serve as logical containers for tables, views, models, and routines within BigQuery, providing organization and access control boundaries.
Create and configure BigQuery datasets with metadata and access controls.
class Dataset:
def __init__(self, dataset_ref: Union[DatasetReference, str]):
"""
Initialize a Dataset.
Args:
dataset_ref: Dataset reference string or DatasetReference object.
"""
@property
def reference(self) -> DatasetReference:
"""Dataset reference object."""
@property
def dataset_id(self) -> str:
"""Dataset ID."""
@property
def project(self) -> str:
"""Project ID containing the dataset."""
@property
def location(self) -> str:
"""Geographic location of the dataset."""
@location.setter
def location(self, value: str): ...
@property
def friendly_name(self) -> str:
"""Human-readable dataset name."""
@friendly_name.setter
def friendly_name(self, value: str): ...
@property
def description(self) -> str:
"""Dataset description."""
@description.setter
def description(self, value: str): ...
@property
def labels(self) -> Dict[str, str]:
"""Labels for organizing and filtering datasets."""
@labels.setter
def labels(self, value: Dict[str, str]): ...
@property
def access_entries(self) -> List[AccessEntry]:
"""Access control entries for the dataset."""
@access_entries.setter
def access_entries(self, value: List[AccessEntry]): ...
@property
def created(self) -> datetime.datetime:
"""Dataset creation timestamp."""
@property
def modified(self) -> datetime.datetime:
"""Dataset last modification timestamp."""
@property
def default_table_expiration_ms(self) -> int:
"""Default table expiration in milliseconds."""
@default_table_expiration_ms.setter
def default_table_expiration_ms(self, value: int): ...
@property
def default_partition_expiration_ms(self) -> int:
"""Default partition expiration in milliseconds."""
@default_partition_expiration_ms.setter
def default_partition_expiration_ms(self, value: int): ...Reference datasets by project and dataset ID for API operations.
class DatasetReference:
def __init__(self, project: str, dataset_id: str):
"""
Reference to a BigQuery dataset.
Args:
project: Project ID.
dataset_id: Dataset ID.
"""
@property
def project(self) -> str:
"""Project ID."""
@property
def dataset_id(self) -> str:
"""Dataset ID."""
@property
def path(self) -> str:
"""Full dataset path (project:dataset_id)."""
@classmethod
def from_string(cls, dataset_id: str, default_project: str = None) -> DatasetReference:
"""
Create DatasetReference from string.
Args:
dataset_id: Dataset ID, optionally prefixed with project.
default_project: Default project if not specified in dataset_id.
Returns:
DatasetReference: Dataset reference object.
"""Manage dataset-level permissions and access control entries.
class AccessEntry:
    """A single dataset ACL entry: binds a role to a user, group, domain,
    special group, authorized view, or authorized routine."""

    def __init__(
        self,
        role: str = None,
        entity_type: str = None,
        entity_id: str = None,
        user_by_email: str = None,
        group_by_email: str = None,
        domain: str = None,
        special_group: str = None,
        view: TableReference = None,
        routine: RoutineReference = None,
    ):
        """Access control entry for dataset permissions.

        Args:
            role: IAM role (READER, WRITER, OWNER).
            entity_type: Type of entity (userByEmail, groupByEmail, etc.).
            entity_id: Entity identifier.
            user_by_email: User email address.
            group_by_email: Group email address.
            domain: Domain name.
            special_group: Special group (projectOwners, projectReaders, etc.).
            view: Authorized view reference.
            routine: Authorized routine reference.
        """

    @property
    def role(self) -> str:
        """IAM role for this access entry."""

    @property
    def entity_type(self) -> str:
        """Type of entity granted access."""

    @property
    def entity_id(self) -> str:
        """Entity identifier."""

    @property
    def user_by_email(self) -> str:
        """User email address."""

    @property
    def group_by_email(self) -> str:
        """Group email address."""

    @property
    def domain(self) -> str:
        """Domain name."""

    @property
    def special_group(self) -> str:
        """Special group identifier."""

    @property
    def view(self) -> TableReference:
        """Authorized view reference."""

    @property
    def routine(self) -> RoutineReference:
        """Authorized routine reference."""
from google.cloud import bigquery
client = bigquery.Client()

# Build the dataset object from a fully-qualified "project.dataset" ID.
dataset_id = "my_analytics_dataset"
dataset = bigquery.Dataset(f"{client.project}.{dataset_id}")

# Metadata configuration.
dataset.location = "US"
dataset.friendly_name = "Analytics Data"
dataset.description = "Dataset for analytics and reporting tables"

# Tables created in this dataset expire after 30 days by default.
dataset.default_table_expiration_ms = 30 * 24 * 60 * 60 * 1000

# Labels used for filtering and cost attribution.
dataset.labels = {
    "environment": "production",
    "team": "analytics",
    "cost_center": "engineering",
}

# exists_ok=True makes the creation call idempotent.
dataset = client.create_dataset(dataset, exists_ok=True)
print(f"Created dataset {dataset.dataset_id}")
# Set up access control entries
# Extend the dataset's existing ACL rather than replacing it: assigning a
# brand-new list to ``access_entries`` overwrites ALL current entries —
# including the default owner entries created with the dataset — which can
# revoke your own access to it.
access_entries = list(dataset.access_entries)
# Grant read access to a user
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="userByEmail",
    entity_id="analyst@company.com"
))
# Grant write access to a group
access_entries.append(bigquery.AccessEntry(
    role="WRITER",
    entity_type="groupByEmail",
    entity_id="data-engineers@company.com"
))
# Grant access to all project owners
access_entries.append(bigquery.AccessEntry(
    role="OWNER",
    entity_type="specialGroup",
    entity_id="projectOwners"
))
# Grant access to a domain
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="domain",
    entity_id="company.com"
))
# Assign through the property setter so update_dataset sends the new ACL.
dataset.access_entries = access_entries
dataset = client.update_dataset(dataset, ["access_entries"])
# Create an authorized view for sensitive data access
# Dataset holding the sensitive source table.
sensitive_dataset = bigquery.Dataset(f"{client.project}.sensitive_data")
sensitive_dataset.location = "US"
sensitive_dataset = client.create_dataset(sensitive_dataset, exists_ok=True)

# Public dataset that will expose a filtered view of the sensitive data.
public_dataset = bigquery.Dataset(f"{client.project}.public_reports")
public_dataset.location = "US"
public_dataset = client.create_dataset(public_dataset, exists_ok=True)

# Create the view
view_query = """
SELECT
user_id,
anonymized_name,
city,
purchase_amount
FROM `{}.sensitive_data.customer_purchases`
WHERE purchase_amount > 100
""".format(client.project)
view = bigquery.Table(f"{client.project}.public_reports.high_value_purchases")
view.view_query = view_query
view = client.create_table(view)

# Authorize the view: view entries carry NO role (BigQuery grants the view
# implicit read access; the API rejects an explicit role on a view entry).
view_access = bigquery.AccessEntry(
    role=None,
    entity_type="view",
    entity_id=view.reference
)
# Mutating the list returned by the ``access_entries`` property does not
# invoke its setter, so the change may never reach the resource. Build a
# new list and assign it explicitly.
entries = list(sensitive_dataset.access_entries)
entries.append(view_access)
sensitive_dataset.access_entries = entries
sensitive_dataset = client.update_dataset(sensitive_dataset, ["access_entries"])
# List all datasets in the project
print("Datasets in project:")
for dataset_item in client.list_datasets():
    # list_datasets() yields lightweight DatasetListItem objects that carry
    # no ``location`` (or description) — fetch the full resource for those.
    full = client.get_dataset(dataset_item.reference)
    print(f" {full.dataset_id}")
    print(f" Location: {full.location}")
    print(f" Tables: {len(list(client.list_tables(full)))}")

# List datasets with filtering
print("\nProduction datasets:")
for dataset_item in client.list_datasets():
    full_dataset = client.get_dataset(dataset_item.reference)
    if full_dataset.labels.get("environment") == "production":
        print(f" {dataset_item.dataset_id}: {full_dataset.description}")

# Get detailed dataset information
dataset = client.get_dataset("my_analytics_dataset")
print(f"\nDataset: {dataset.dataset_id}")
print(f"Location: {dataset.location}")
print(f"Created: {dataset.created}")
print(f"Last Modified: {dataset.modified}")
print(f"Tables: {len(list(client.list_tables(dataset)))}")
print("Access Entries:")
for entry in dataset.access_entries:
    print(f" {entry.role}: {entry.entity_type}({entry.entity_id})")
# Update dataset metadata
dataset = client.get_dataset("my_analytics_dataset")

# Update description and labels
dataset.description = "Updated analytics dataset with customer data"
# Reassign a new dict instead of mutating the one returned by the getter:
# only assignment through the ``labels`` setter is guaranteed to record the
# change on the resource for update_dataset to send.
labels = dict(dataset.labels)
labels.update({
    "updated": "2023-12-01",
    "data_classification": "internal"
})
dataset.labels = labels

# Update default expiration times
dataset.default_table_expiration_ms = 60 * 24 * 60 * 60 * 1000  # 60 days
dataset.default_partition_expiration_ms = 7 * 24 * 60 * 60 * 1000  # 7 days

# Apply updates — only the listed fields are patched server-side.
dataset = client.update_dataset(
    dataset,
    ["description", "labels", "default_table_expiration_ms", "default_partition_expiration_ms"]
)
print("Dataset updated successfully")
# List datasets older than 30 days with no recent activity
import datetime
cutoff_date = datetime.datetime.now() - datetime.timedelta(days=30)
unused_datasets = []
for dataset_list_item in client.list_datasets():
dataset = client.get_dataset(dataset_list_item.reference)
# Check if dataset has been modified recently
if dataset.modified < cutoff_date:
# Check if dataset has any tables
tables = list(client.list_tables(dataset))
if not tables:
unused_datasets.append(dataset)
else:
# Check if tables have been accessed recently
all_tables_old = True
for table in tables:
full_table = client.get_table(table.reference)
if full_table.modified >= cutoff_date:
all_tables_old = False
break
if all_tables_old:
unused_datasets.append(dataset)
print(f"Found {len(unused_datasets)} potentially unused datasets")
for dataset in unused_datasets:
print(f" {dataset.dataset_id} (last modified: {dataset.modified})")
# Delete unused datasets (with confirmation)
for dataset in unused_datasets:
confirm = input(f"Delete dataset {dataset.dataset_id}? (y/N): ")
if confirm.lower() == 'y':
client.delete_dataset(dataset, delete_contents=True)
print(f"Deleted {dataset.dataset_id}")Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-bigquery