0
# Schema Management
1
2
This document covers schema definition, schema evolution, and the type system for Delta Lake tables, including field definitions, data types, and the schema operations used to maintain table structure over time.
3
4
## Capabilities
5
6
### Schema Definition
7
8
```python { .api }
9
class Schema:
10
def __init__(self, fields: list[Field]): ...
11
12
@property
13
def fields(self) -> list[Field]: ...
14
15
def to_pyarrow(self) -> pyarrow.Schema: ...
16
17
def to_json(self) -> str: ...
18
19
@classmethod
20
def from_pyarrow(cls, schema: pyarrow.Schema) -> Schema: ...
21
22
@classmethod
23
def from_json(cls, json_str: str) -> Schema: ...
24
```
25
26
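`Schema` is the main class for defining table structure: it holds a list of `Field` objects and can be converted to and from PyArrow schemas and JSON strings.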
27
28
### Field Definition
29
30
```python { .api }
31
class Field:
32
def __init__(
33
self,
34
name: str,
35
data_type: DataType,
36
nullable: bool = True,
37
metadata: dict[str, Any] | None = None
38
): ...
39
40
@property
41
def name(self) -> str: ...
42
43
@property
44
def data_type(self) -> DataType: ...
45
46
@property
47
def nullable(self) -> bool: ...
48
49
@property
50
def metadata(self) -> dict[str, Any]: ...
51
52
def to_json(self) -> str: ...
53
54
@classmethod
55
def from_json(cls, json_str: str) -> Field: ...
56
```
57
58
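`Field` is an individual field definition within a schema: it carries the field's name, data type, nullability, and optional metadata, and can be serialized to and from JSON.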
59
60
### Data Types
61
62
```python { .api }
63
# Union type for all data types
64
DataType = Union[PrimitiveType, ArrayType, MapType, StructType]
65
66
class PrimitiveType:
67
def __init__(self, data_type: str): ...
68
69
@property
70
def data_type(self) -> str: ...
71
72
class ArrayType:
73
def __init__(self, element_type: DataType, contains_null: bool = True): ...
74
75
@property
76
def element_type(self) -> DataType: ...
77
78
@property
79
def contains_null(self) -> bool: ...
80
81
class MapType:
82
def __init__(
83
self,
84
key_type: DataType,
85
value_type: DataType,
86
value_contains_null: bool = True
87
): ...
88
89
@property
90
def key_type(self) -> DataType: ...
91
92
@property
93
def value_type(self) -> DataType: ...
94
95
@property
96
def value_contains_null(self) -> bool: ...
97
98
class StructType:
99
def __init__(self, fields: list[Field]): ...
100
101
@property
102
def fields(self) -> list[Field]: ...
103
```
104
105
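The type system supports primitive types (`PrimitiveType`), collections (`ArrayType`, `MapType`), and nested structures (`StructType`). Because element, key, value, and field types are all `DataType`, these types can be nested inside one another.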
106
107
## Usage Examples
108
109
### Basic Schema Creation
110
111
```python
112
from deltalake import Schema, Field
113
from deltalake.schema import PrimitiveType, ArrayType, MapType, StructType
114
115
# Simple schema with primitive types
116
schema = Schema([
117
Field("id", PrimitiveType("integer"), nullable=False),
118
Field("name", PrimitiveType("string"), nullable=True),
119
Field("age", PrimitiveType("integer"), nullable=True),
120
Field("salary", PrimitiveType("double"), nullable=True),
121
Field("is_active", PrimitiveType("boolean"), nullable=False),
122
Field("created_at", PrimitiveType("timestamp"), nullable=False)
123
])
124
125
# Print schema information
126
for field in schema.fields:
127
print(f"{field.name}: {field.data_type} (nullable: {field.nullable})")
128
```
129
130
### Complex Data Types
131
132
```python
133
# Array type
134
tags_field = Field(
135
"tags",
136
ArrayType(PrimitiveType("string"), contains_null=False),
137
nullable=True
138
)
139
140
# Map type for key-value pairs
141
metadata_field = Field(
142
"metadata",
143
MapType(
144
key_type=PrimitiveType("string"),
145
value_type=PrimitiveType("string"),
146
value_contains_null=True
147
),
148
nullable=True
149
)
150
151
# Nested struct type
152
address_struct = StructType([
153
Field("street", PrimitiveType("string")),
154
Field("city", PrimitiveType("string")),
155
Field("zipcode", PrimitiveType("string")),
156
Field("country", PrimitiveType("string"))
157
])
158
159
address_field = Field("address", address_struct, nullable=True)
160
161
# Combined complex schema
162
complex_schema = Schema([
163
Field("id", PrimitiveType("integer"), nullable=False),
164
Field("name", PrimitiveType("string"), nullable=False),
165
tags_field,
166
metadata_field,
167
address_field
168
])
169
```
170
171
### Schema from PyArrow
172
173
```python
174
import pyarrow as pa
175
176
# Create PyArrow schema
177
arrow_schema = pa.schema([
178
pa.field("id", pa.int64(), nullable=False),
179
pa.field("name", pa.string()),
180
pa.field("scores", pa.list_(pa.float64())),
181
pa.field("created_at", pa.timestamp('us'))
182
])
183
184
# Convert to Delta schema
185
delta_schema = Schema.from_pyarrow(arrow_schema)
186
187
# Convert back to PyArrow
188
converted_arrow = delta_schema.to_pyarrow()
189
```
190
191
### Schema Serialization
192
193
```python
194
# Convert schema to JSON
195
schema_json = schema.to_json()
196
print("Schema as JSON:")
197
print(schema_json)
198
199
# Recreate schema from JSON
200
recreated_schema = Schema.from_json(schema_json)
201
202
# Verify fields match
203
assert len(schema.fields) == len(recreated_schema.fields)
204
for original, recreated in zip(schema.fields, recreated_schema.fields):
205
assert original.name == recreated.name
206
assert original.nullable == recreated.nullable
207
```
208
209
### Working with Field Metadata
210
211
```python
212
# Field with metadata
213
documented_field = Field(
214
"user_id",
215
PrimitiveType("integer"),
216
nullable=False,
217
metadata={
218
"description": "Unique identifier for user",
219
"source_system": "user_management",
220
"pii": False,
221
"format": "int64"
222
}
223
)
224
225
# Access metadata
226
print(f"Field metadata: {documented_field.metadata}")
227
print(f"Description: {documented_field.metadata.get('description')}")
228
print(f"Contains PII: {documented_field.metadata.get('pii')}")
229
```
230
231
### Schema Evolution Examples
232
233
```python
234
from deltalake import DeltaTable, write_deltalake
235
236
# Original schema
237
original_schema = Schema([
238
Field("id", PrimitiveType("integer"), nullable=False),
239
Field("name", PrimitiveType("string"), nullable=True),
240
Field("age", PrimitiveType("integer"), nullable=True)
241
])
242
243
# Create table with original schema
244
dt = DeltaTable.create("path/to/evolving-table", schema=original_schema)
245
246
# Add data with additional column (schema evolution)
247
import pandas as pd
248
249
evolved_data = pd.DataFrame({
250
'id': [4, 5, 6],
251
'name': ['New Person 1', 'New Person 2', 'New Person 3'],
252
'age': [25, 30, 35],
253
'department': ['Engineering', 'Sales', 'Marketing'] # New column
254
})
255
256
# Write with schema merge mode
257
write_deltalake(
258
"path/to/evolving-table",
259
evolved_data,
260
mode="append",
261
schema_mode="merge" # Allow schema evolution
262
)
263
264
# Check evolved schema
265
dt = DeltaTable("path/to/evolving-table")
266
evolved_schema = dt.schema()
267
print("Evolved schema:")
268
for field in evolved_schema.fields:
269
print(f" {field.name}: {field.data_type}")
270
```
271
272
### Primitive Data Types
273
274
Available primitive types (each is the string passed to `PrimitiveType(...)`):
275
- `"boolean"` - Boolean values
276
- `"byte"` - 8-bit signed integer
277
- `"short"` - 16-bit signed integer
278
- `"integer"` - 32-bit signed integer
279
- `"long"` - 64-bit signed integer
280
- `"float"` - 32-bit floating point
281
- `"double"` - 64-bit floating point
282
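- `"decimal(precision, scale)"` - Fixed-precision decimal with an explicit precision and scale, e.g. `"decimal(10,2)"` (a bare `"decimal"` is not a valid type string)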
283
- `"string"` - UTF-8 string
284
- `"binary"` - Binary data
285
- `"date"` - Date (year, month, day)
286
- `"timestamp"` - Timestamp with microsecond precision
287
288
```python
289
# Examples of all primitive types
290
all_types_schema = Schema([
291
Field("bool_col", PrimitiveType("boolean")),
292
Field("byte_col", PrimitiveType("byte")),
293
Field("short_col", PrimitiveType("short")),
294
Field("int_col", PrimitiveType("integer")),
295
Field("long_col", PrimitiveType("long")),
296
Field("float_col", PrimitiveType("float")),
297
Field("double_col", PrimitiveType("double")),
298
Field("decimal_col", PrimitiveType("decimal(10,2)")),
299
Field("string_col", PrimitiveType("string")),
300
Field("binary_col", PrimitiveType("binary")),
301
Field("date_col", PrimitiveType("date")),
302
Field("timestamp_col", PrimitiveType("timestamp"))
303
])
304
```
305
306
### Validation and Constraints
307
308
```python
309
# Schema validation when creating tables
310
try:
311
# This will validate the schema structure
312
dt = DeltaTable.create(
313
"path/to/validated-table",
314
schema=complex_schema,
315
mode="error"
316
)
317
print("Schema validation passed")
318
except Exception as e:
319
print(f"Schema validation failed: {e}")
320
321
# Check schema compatibility
322
def schemas_compatible(schema1: Schema, schema2: Schema) -> bool:
323
"""Check if two schemas are compatible for merging"""
324
schema1_fields = {f.name: f for f in schema1.fields}
325
schema2_fields = {f.name: f for f in schema2.fields}
326
327
# Check common fields have compatible types
328
for name in schema1_fields.keys() & schema2_fields.keys():
329
field1 = schema1_fields[name]
330
field2 = schema2_fields[name]
331
332
# Simple type comparison (real implementation would be more complex)
333
if field1.data_type != field2.data_type:
334
return False
335
336
# Nullable compatibility: can't make nullable field non-nullable
337
if field1.nullable and not field2.nullable:
338
return False
339
340
return True
341
342
# Test compatibility
343
compatible = schemas_compatible(original_schema, evolved_schema)
344
print(f"Schemas are compatible: {compatible}")
345
```
346
347
## TableAlterer Class
348
349
The `TableAlterer` class provides schema and table modification operations. It is accessed through `DeltaTable.alter` on an existing table, for example `dt.alter.add_columns(...)`.
350
351
```python { .api }
352
class TableAlterer:
353
def add_feature(
354
self,
355
feature: TableFeatures | list[TableFeatures],
356
allow_protocol_versions_increase: bool = False,
357
commit_properties: CommitProperties | None = None,
358
post_commithook_properties: PostCommitHookProperties | None = None,
359
) -> None: ...
360
361
def add_columns(
362
self,
363
fields: Field | list[Field],
364
commit_properties: CommitProperties | None = None,
365
post_commithook_properties: PostCommitHookProperties | None = None,
366
) -> None: ...
367
368
def add_constraint(
369
self,
370
constraints: dict[str, str],
371
post_commithook_properties: PostCommitHookProperties | None = None,
372
commit_properties: CommitProperties | None = None,
373
) -> None: ...
374
375
def drop_constraint(
376
self,
377
name: str,
378
raise_if_not_exists: bool = True,
379
post_commithook_properties: PostCommitHookProperties | None = None,
380
commit_properties: CommitProperties | None = None,
381
) -> None: ...
382
383
def set_table_properties(
384
self,
385
properties: dict[str, str],
386
raise_if_not_exists: bool = True,
387
commit_properties: CommitProperties | None = None,
388
) -> None: ...
389
390
def set_table_name(
391
self,
392
name: str,
393
commit_properties: CommitProperties | None = None,
394
) -> None: ...
395
396
def set_table_description(
397
self,
398
description: str,
399
commit_properties: CommitProperties | None = None,
400
) -> None: ...
401
402
def set_column_metadata(
403
self,
404
column: str,
405
metadata: dict[str, str],
406
commit_properties: CommitProperties | None = None,
407
post_commithook_properties: PostCommitHookProperties | None = None,
408
) -> None: ...
409
```
410
411
Provides comprehensive table and schema alteration capabilities including feature management, column operations, constraints, and metadata modifications.