# Dataset Management

Dataset creation, configuration, access control, and metadata management. Datasets serve as logical containers for tables, views, models, and routines within BigQuery, providing organization and access control boundaries.

## Capabilities

### Dataset Definition

Create and configure BigQuery datasets with metadata and access controls.

```python { .api }
class Dataset:
    def __init__(self, dataset_ref: Union[DatasetReference, str]):
        """
        Initialize a Dataset.

        Args:
            dataset_ref: Dataset reference string or DatasetReference object.
        """

    @property
    def reference(self) -> DatasetReference:
        """Dataset reference object."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def project(self) -> str:
        """Project ID containing the dataset."""

    @property
    def location(self) -> str:
        """Geographic location of the dataset."""

    @location.setter
    def location(self, value: str): ...

    @property
    def friendly_name(self) -> str:
        """Human-readable dataset name."""

    @friendly_name.setter
    def friendly_name(self, value: str): ...

    @property
    def description(self) -> str:
        """Dataset description."""

    @description.setter
    def description(self, value: str): ...

    @property
    def labels(self) -> Dict[str, str]:
        """Labels for organizing and filtering datasets."""

    @labels.setter
    def labels(self, value: Dict[str, str]): ...

    @property
    def access_entries(self) -> List[AccessEntry]:
        """Access control entries for the dataset."""

    @access_entries.setter
    def access_entries(self, value: List[AccessEntry]): ...

    @property
    def created(self) -> datetime.datetime:
        """Dataset creation timestamp."""

    @property
    def modified(self) -> datetime.datetime:
        """Dataset last modification timestamp."""

    @property
    def default_table_expiration_ms(self) -> int:
        """Default table expiration in milliseconds."""

    @default_table_expiration_ms.setter
    def default_table_expiration_ms(self, value: int): ...

    @property
    def default_partition_expiration_ms(self) -> int:
        """Default partition expiration in milliseconds."""

    @default_partition_expiration_ms.setter
    def default_partition_expiration_ms(self, value: int): ...
```

### Dataset Reference

Reference datasets by project and dataset ID for API operations.

```python { .api }
class DatasetReference:
    def __init__(self, project: str, dataset_id: str):
        """
        Reference to a BigQuery dataset.

        Args:
            project: Project ID.
            dataset_id: Dataset ID.
        """

    @property
    def project(self) -> str:
        """Project ID."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def path(self) -> str:
        """Full dataset path (project:dataset_id)."""

    @classmethod
    def from_string(cls, dataset_id: str, default_project: str = None) -> DatasetReference:
        """
        Create DatasetReference from string.

        Args:
            dataset_id: Dataset ID, optionally prefixed with project.
            default_project: Default project if not specified in dataset_id.

        Returns:
            DatasetReference: Dataset reference object.
        """
```

### Access Control

Manage dataset-level permissions and access control entries.

```python { .api }
class AccessEntry:
    def __init__(
        self,
        role: str = None,
        entity_type: str = None,
        entity_id: str = None,
        user_by_email: str = None,
        group_by_email: str = None,
        domain: str = None,
        special_group: str = None,
        view: TableReference = None,
        routine: RoutineReference = None,
    ):
        """
        Access control entry for dataset permissions.

        Args:
            role: IAM role (READER, WRITER, OWNER).
            entity_type: Type of entity (userByEmail, groupByEmail, etc.).
            entity_id: Entity identifier.
            user_by_email: User email address.
            group_by_email: Group email address.
            domain: Domain name.
            special_group: Special group (projectOwners, projectReaders, etc.).
            view: Authorized view reference.
            routine: Authorized routine reference.
        """

    @property
    def role(self) -> str:
        """IAM role for this access entry."""

    @property
    def entity_type(self) -> str:
        """Type of entity granted access."""

    @property
    def entity_id(self) -> str:
        """Entity identifier."""

    @property
    def user_by_email(self) -> str:
        """User email address."""

    @property
    def group_by_email(self) -> str:
        """Group email address."""

    @property
    def domain(self) -> str:
        """Domain name."""

    @property
    def special_group(self) -> str:
        """Special group identifier."""

    @property
    def view(self) -> TableReference:
        """Authorized view reference."""

    @property
    def routine(self) -> RoutineReference:
        """Authorized routine reference."""
```

## Usage Examples

### Create and Configure Dataset

```python
from google.cloud import bigquery

client = bigquery.Client()

# Create a new dataset
dataset_id = "my_analytics_dataset"
dataset = bigquery.Dataset(f"{client.project}.{dataset_id}")

# Configure dataset properties
dataset.location = "US"
dataset.friendly_name = "Analytics Data"
dataset.description = "Dataset for analytics and reporting tables"

# Set default table expiration (30 days)
dataset.default_table_expiration_ms = 30 * 24 * 60 * 60 * 1000

# Add labels for organization
dataset.labels = {
    "environment": "production",
    "team": "analytics",
    "cost_center": "engineering"
}

# Create the dataset
dataset = client.create_dataset(dataset, exists_ok=True)
print(f"Created dataset {dataset.dataset_id}")
```

### Access Control Management

```python
# Set up access control entries
access_entries = []

# Grant read access to a user
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="userByEmail",
    entity_id="analyst@company.com"
))

# Grant write access to a group
access_entries.append(bigquery.AccessEntry(
    role="WRITER",
    entity_type="groupByEmail",
    entity_id="data-engineers@company.com"
))

# Grant access to all project owners
access_entries.append(bigquery.AccessEntry(
    role="OWNER",
    entity_type="specialGroup",
    entity_id="projectOwners"
))

# Grant access to a domain
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="domain",
    entity_id="company.com"
))

# Apply access entries to dataset
dataset.access_entries = access_entries
dataset = client.update_dataset(dataset, ["access_entries"])
```
272
273
### Authorized Views
274
275
```python
276
# Create an authorized view for sensitive data access
277
sensitive_dataset = bigquery.Dataset(f"{client.project}.sensitive_data")
278
sensitive_dataset.location = "US"
279
sensitive_dataset = client.create_dataset(sensitive_dataset, exists_ok=True)
280
281
# Create a view in a public dataset that accesses sensitive data
282
public_dataset = bigquery.Dataset(f"{client.project}.public_reports")
283
public_dataset.location = "US"
284
public_dataset = client.create_dataset(public_dataset, exists_ok=True)
285
286
# Create the view
287
view_query = """
288
SELECT
289
user_id,
290
anonymized_name,
291
city,
292
purchase_amount
293
FROM `{}.sensitive_data.customer_purchases`
294
WHERE purchase_amount > 100
295
""".format(client.project)
296
297
view = bigquery.Table(f"{client.project}.public_reports.high_value_purchases")
298
view.view_query = view_query
299
view = client.create_table(view)
300
301
# Authorize the view to access sensitive dataset
302
view_access = bigquery.AccessEntry(
303
role="READER",
304
entity_type="view",
305
entity_id=view.reference
306
)
307
308
sensitive_dataset.access_entries.append(view_access)
309
sensitive_dataset = client.update_dataset(sensitive_dataset, ["access_entries"])
310
```
311
312
### Dataset Listing and Discovery
313
314
```python
315
# List all datasets in the project
316
print("Datasets in project:")
317
for dataset in client.list_datasets():
318
print(f" {dataset.dataset_id}")
319
print(f" Location: {dataset.location}")
320
print(f" Tables: {len(list(client.list_tables(dataset)))}")
321
322
# List datasets with filtering
323
print("\nProduction datasets:")
324
for dataset in client.list_datasets():
325
full_dataset = client.get_dataset(dataset.reference)
326
if full_dataset.labels.get("environment") == "production":
327
print(f" {dataset.dataset_id}: {full_dataset.description}")
328
329
# Get detailed dataset information
330
dataset = client.get_dataset("my_analytics_dataset")
331
print(f"\nDataset: {dataset.dataset_id}")
332
print(f"Location: {dataset.location}")
333
print(f"Created: {dataset.created}")
334
print(f"Last Modified: {dataset.modified}")
335
print(f"Tables: {len(list(client.list_tables(dataset)))}")
336
print("Access Entries:")
337
for entry in dataset.access_entries:
338
print(f" {entry.role}: {entry.entity_type}({entry.entity_id})")
339
```
340
341
### Dataset Metadata Management
342
343
```python
344
# Update dataset metadata
345
dataset = client.get_dataset("my_analytics_dataset")
346
347
# Update description and labels
348
dataset.description = "Updated analytics dataset with customer data"
349
dataset.labels.update({
350
"updated": "2023-12-01",
351
"data_classification": "internal"
352
})
353
354
# Update default expiration times
355
dataset.default_table_expiration_ms = 60 * 24 * 60 * 60 * 1000 # 60 days
356
dataset.default_partition_expiration_ms = 7 * 24 * 60 * 60 * 1000 # 7 days
357
358
# Apply updates
359
dataset = client.update_dataset(
360
dataset,
361
["description", "labels", "default_table_expiration_ms", "default_partition_expiration_ms"]
362
)
363
364
print("Dataset updated successfully")
365
```
366
367
### Dataset Cleanup and Management
368
369
```python
370
# List datasets older than 30 days with no recent activity
371
import datetime
372
373
cutoff_date = datetime.datetime.now() - datetime.timedelta(days=30)
374
unused_datasets = []
375
376
for dataset_list_item in client.list_datasets():
377
dataset = client.get_dataset(dataset_list_item.reference)
378
379
# Check if dataset has been modified recently
380
if dataset.modified < cutoff_date:
381
# Check if dataset has any tables
382
tables = list(client.list_tables(dataset))
383
if not tables:
384
unused_datasets.append(dataset)
385
else:
386
# Check if tables have been accessed recently
387
all_tables_old = True
388
for table in tables:
389
full_table = client.get_table(table.reference)
390
if full_table.modified >= cutoff_date:
391
all_tables_old = False
392
break
393
394
if all_tables_old:
395
unused_datasets.append(dataset)
396
397
print(f"Found {len(unused_datasets)} potentially unused datasets")
398
for dataset in unused_datasets:
399
print(f" {dataset.dataset_id} (last modified: {dataset.modified})")
400
401
# Delete unused datasets (with confirmation)
402
for dataset in unused_datasets:
403
confirm = input(f"Delete dataset {dataset.dataset_id}? (y/N): ")
404
if confirm.lower() == 'y':
405
client.delete_dataset(dataset, delete_contents=True)
406
print(f"Deleted {dataset.dataset_id}")
407
```