# Dataset Management

Dataset creation, configuration, access control, and metadata management. Datasets serve as logical containers for tables, views, models, and routines within BigQuery, providing organization and access control boundaries.

## Capabilities

### Dataset Definition

Create and configure BigQuery datasets with metadata and access controls.

```python { .api }
class Dataset:
    def __init__(self, dataset_ref: Union[DatasetReference, str]):
        """
        Initialize a Dataset.

        Args:
            dataset_ref: Dataset reference string or DatasetReference object.
        """

    @property
    def reference(self) -> DatasetReference:
        """Dataset reference object."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def project(self) -> str:
        """Project ID containing the dataset."""

    @property
    def location(self) -> str:
        """Geographic location of the dataset."""

    @location.setter
    def location(self, value: str): ...

    @property
    def friendly_name(self) -> str:
        """Human-readable dataset name."""

    @friendly_name.setter
    def friendly_name(self, value: str): ...

    @property
    def description(self) -> str:
        """Dataset description."""

    @description.setter
    def description(self, value: str): ...

    @property
    def labels(self) -> Dict[str, str]:
        """Labels for organizing and filtering datasets."""

    @labels.setter
    def labels(self, value: Dict[str, str]): ...

    @property
    def access_entries(self) -> List[AccessEntry]:
        """Access control entries for the dataset."""

    @access_entries.setter
    def access_entries(self, value: List[AccessEntry]): ...

    @property
    def created(self) -> datetime.datetime:
        """Dataset creation timestamp."""

    @property
    def modified(self) -> datetime.datetime:
        """Dataset last modification timestamp."""

    @property
    def default_table_expiration_ms(self) -> int:
        """Default table expiration in milliseconds."""

    @default_table_expiration_ms.setter
    def default_table_expiration_ms(self, value: int): ...

    @property
    def default_partition_expiration_ms(self) -> int:
        """Default partition expiration in milliseconds."""

    @default_partition_expiration_ms.setter
    def default_partition_expiration_ms(self, value: int): ...
```
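
Since `dataset_ref` accepts either type, a `Dataset` can be built from a plain string or an explicit reference; a minimal sketch (project and dataset names are placeholders):

```python
from google.cloud import bigquery

# Equivalent constructions; "my-project" and "my_dataset" are illustrative.
ds = bigquery.Dataset("my-project.my_dataset")
ds = bigquery.Dataset(bigquery.DatasetReference("my-project", "my_dataset"))
```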

### Dataset Reference

Reference datasets by project and dataset ID for API operations.

```python { .api }
class DatasetReference:
    def __init__(self, project: str, dataset_id: str):
        """
        Reference to a BigQuery dataset.

        Args:
            project: Project ID.
            dataset_id: Dataset ID.
        """

    @property
    def project(self) -> str:
        """Project ID."""

    @property
    def dataset_id(self) -> str:
        """Dataset ID."""

    @property
    def path(self) -> str:
        """URL path for the dataset (/projects/{project}/datasets/{dataset_id})."""

    @classmethod
    def from_string(cls, dataset_id: str, default_project: str = None) -> DatasetReference:
        """
        Create DatasetReference from string.

        Args:
            dataset_id: Dataset ID, optionally prefixed with project.
            default_project: Default project if not specified in dataset_id.

        Returns:
            DatasetReference: Dataset reference object.
        """
```
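
`from_string` accepts either a fully qualified ID or a bare dataset ID plus a default project; for example (IDs are illustrative):

```python
from google.cloud import bigquery

# Project is parsed from the qualified ID.
ref = bigquery.DatasetReference.from_string("my-project.my_dataset")

# Bare ID: the project comes from default_project.
ref = bigquery.DatasetReference.from_string("my_dataset", default_project="my-project")
print(ref.project, ref.dataset_id)  # my-project my_dataset
```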

### Access Control

Manage dataset-level permissions and access control entries.

```python { .api }
class AccessEntry:
    def __init__(
        self,
        role: str = None,
        entity_type: str = None,
        entity_id: str = None,
        user_by_email: str = None,
        group_by_email: str = None,
        domain: str = None,
        special_group: str = None,
        view: TableReference = None,
        routine: RoutineReference = None,
    ):
        """
        Access control entry for dataset permissions.

        Args:
            role: IAM role (READER, WRITER, OWNER).
            entity_type: Type of entity (userByEmail, groupByEmail, etc.).
            entity_id: Entity identifier.
            user_by_email: User email address.
            group_by_email: Group email address.
            domain: Domain name.
            special_group: Special group (projectOwners, projectReaders, etc.).
            view: Authorized view reference.
            routine: Authorized routine reference.
        """

    @property
    def role(self) -> str:
        """IAM role for this access entry."""

    @property
    def entity_type(self) -> str:
        """Type of entity granted access."""

    @property
    def entity_id(self) -> str:
        """Entity identifier."""

    @property
    def user_by_email(self) -> str:
        """User email address."""

    @property
    def group_by_email(self) -> str:
        """Group email address."""

    @property
    def domain(self) -> str:
        """Domain name."""

    @property
    def special_group(self) -> str:
        """Special group identifier."""

    @property
    def view(self) -> TableReference:
        """Authorized view reference."""

    @property
    def routine(self) -> RoutineReference:
        """Authorized routine reference."""
```
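
The usage examples below build entries with the generic `entity_type`/`entity_id` pair; the signature above also suggests convenience keywords for common entity types. A sketch assuming those keywords behave as the parameter docs describe:

```python
from google.cloud import bigquery

# Generic form, matching the usage examples below.
reader = bigquery.AccessEntry(
    role="READER",
    entity_type="userByEmail",
    entity_id="analyst@company.com",
)

# Convenience-keyword form implied by the signature above; assumed to be
# equivalent to the generic form for a userByEmail entry.
reader = bigquery.AccessEntry(role="READER", user_by_email="analyst@company.com")
```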

## Usage Examples

### Create and Configure Dataset

```python
from google.cloud import bigquery

client = bigquery.Client()

# Create a new dataset
dataset_id = "my_analytics_dataset"
dataset = bigquery.Dataset(f"{client.project}.{dataset_id}")

# Configure dataset properties
dataset.location = "US"
dataset.friendly_name = "Analytics Data"
dataset.description = "Dataset for analytics and reporting tables"

# Set default table expiration (30 days)
dataset.default_table_expiration_ms = 30 * 24 * 60 * 60 * 1000

# Add labels for organization
dataset.labels = {
    "environment": "production",
    "team": "analytics",
    "cost_center": "engineering",
}

# Create the dataset
dataset = client.create_dataset(dataset, exists_ok=True)
print(f"Created dataset {dataset.dataset_id}")
```
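
The millisecond arithmetic above can also be written with `datetime.timedelta`, which reads more clearly; an equivalent sketch:

```python
import datetime

# 30 days, expressed as milliseconds via timedelta.
dataset.default_table_expiration_ms = int(
    datetime.timedelta(days=30).total_seconds() * 1000
)
```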

### Access Control Management

```python
# Set up access control entries
access_entries = []

# Grant read access to a user
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="userByEmail",
    entity_id="analyst@company.com"
))

# Grant write access to a group
access_entries.append(bigquery.AccessEntry(
    role="WRITER",
    entity_type="groupByEmail",
    entity_id="data-engineers@company.com"
))

# Grant access to all project owners
access_entries.append(bigquery.AccessEntry(
    role="OWNER",
    entity_type="specialGroup",
    entity_id="projectOwners"
))

# Grant read access to a domain
access_entries.append(bigquery.AccessEntry(
    role="READER",
    entity_type="domain",
    entity_id="company.com"
))

# Apply access entries to the dataset (this replaces the existing entry list)
dataset.access_entries = access_entries
dataset = client.update_dataset(dataset, ["access_entries"])
```
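
Revoking access follows the same read-modify-write pattern: fetch the dataset, filter the entry out, and write the list back (the email address is illustrative):

```python
# Remove a user's entry, then write the modified list back.
dataset = client.get_dataset(dataset.reference)
dataset.access_entries = [
    entry for entry in dataset.access_entries
    if entry.entity_id != "analyst@company.com"
]
dataset = client.update_dataset(dataset, ["access_entries"])
```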

### Authorized Views

```python
# Create a dataset holding sensitive data
sensitive_dataset = bigquery.Dataset(f"{client.project}.sensitive_data")
sensitive_dataset.location = "US"
sensitive_dataset = client.create_dataset(sensitive_dataset, exists_ok=True)

# Create a view in a public dataset that accesses the sensitive data
public_dataset = bigquery.Dataset(f"{client.project}.public_reports")
public_dataset.location = "US"
public_dataset = client.create_dataset(public_dataset, exists_ok=True)

# Create the view
view_query = """
SELECT
    user_id,
    anonymized_name,
    city,
    purchase_amount
FROM `{}.sensitive_data.customer_purchases`
WHERE purchase_amount > 100
""".format(client.project)

view = bigquery.Table(f"{client.project}.public_reports.high_value_purchases")
view.view_query = view_query
view = client.create_table(view)

# Authorize the view to read the sensitive dataset. View entries carry no
# role, and access_entries returns a copy, so build a new list and assign it.
view_access = bigquery.AccessEntry(
    role=None,
    entity_type="view",
    entity_id=view.reference.to_api_repr()
)

entries = list(sensitive_dataset.access_entries)
entries.append(view_access)
sensitive_dataset.access_entries = entries
sensitive_dataset = client.update_dataset(sensitive_dataset, ["access_entries"])
```

### Dataset Listing and Discovery

```python
# List all datasets in the project. list_datasets returns lightweight
# items, so fetch the full dataset for location and other metadata.
print("Datasets in project:")
for dataset_item in client.list_datasets():
    dataset = client.get_dataset(dataset_item.reference)
    print(f"  {dataset.dataset_id}")
    print(f"    Location: {dataset.location}")
    print(f"    Tables: {len(list(client.list_tables(dataset)))}")

# List datasets with filtering
print("\nProduction datasets:")
for dataset_item in client.list_datasets():
    full_dataset = client.get_dataset(dataset_item.reference)
    if full_dataset.labels.get("environment") == "production":
        print(f"  {dataset_item.dataset_id}: {full_dataset.description}")

# Get detailed dataset information
dataset = client.get_dataset("my_analytics_dataset")
print(f"\nDataset: {dataset.dataset_id}")
print(f"Location: {dataset.location}")
print(f"Created: {dataset.created}")
print(f"Last Modified: {dataset.modified}")
print(f"Tables: {len(list(client.list_tables(dataset)))}")
print("Access Entries:")
for entry in dataset.access_entries:
    print(f"  {entry.role}: {entry.entity_type}({entry.entity_id})")
```
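
Label-based filtering can also be pushed server-side via the `filter` parameter of `list_datasets`, avoiding a `get_dataset` call per dataset:

```python
# Server-side label filter; only matching datasets are returned.
for item in client.list_datasets(filter="labels.environment:production"):
    print(item.dataset_id)
```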

### Dataset Metadata Management

```python
# Update dataset metadata
dataset = client.get_dataset("my_analytics_dataset")

# Update description and labels
dataset.description = "Updated analytics dataset with customer data"
dataset.labels.update({
    "updated": "2023-12-01",
    "data_classification": "internal"
})

# Update default expiration times
dataset.default_table_expiration_ms = 60 * 24 * 60 * 60 * 1000  # 60 days
dataset.default_partition_expiration_ms = 7 * 24 * 60 * 60 * 1000  # 7 days

# Apply updates
dataset = client.update_dataset(
    dataset,
    ["description", "labels", "default_table_expiration_ms", "default_partition_expiration_ms"]
)

print("Dataset updated successfully")
```
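
To remove a label entirely, set its value to `None` before updating; BigQuery treats a `None` value as a deletion:

```python
# A None value deletes the label on update.
labels = dataset.labels
labels["updated"] = None
dataset.labels = labels
dataset = client.update_dataset(dataset, ["labels"])
```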

### Dataset Cleanup and Management

```python
# List datasets older than 30 days with no recent activity
import datetime

# BigQuery timestamps are timezone-aware, so compare against an aware datetime.
cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)
unused_datasets = []

for dataset_list_item in client.list_datasets():
    dataset = client.get_dataset(dataset_list_item.reference)

    # Check whether the dataset has been modified recently
    if dataset.modified < cutoff_date:
        # Check whether the dataset has any tables
        tables = list(client.list_tables(dataset))
        if not tables:
            unused_datasets.append(dataset)
        else:
            # Check whether any table has been modified recently
            all_tables_old = True
            for table in tables:
                full_table = client.get_table(table.reference)
                if full_table.modified >= cutoff_date:
                    all_tables_old = False
                    break

            if all_tables_old:
                unused_datasets.append(dataset)

print(f"Found {len(unused_datasets)} potentially unused datasets")
for dataset in unused_datasets:
    print(f"  {dataset.dataset_id} (last modified: {dataset.modified})")

# Delete unused datasets (with confirmation)
for dataset in unused_datasets:
    confirm = input(f"Delete dataset {dataset.dataset_id}? (y/N): ")
    if confirm.lower() == "y":
        client.delete_dataset(dataset, delete_contents=True)
        print(f"Deleted {dataset.dataset_id}")
```
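
When a cleanup script may be re-run, `delete_dataset` also accepts `not_found_ok` so that a missing dataset is not treated as an error:

```python
# Idempotent delete: suppresses the 404 if the dataset is already gone.
client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
```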