0
# Type System
1
2
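datatable provides a comprehensive type system with two layers: storage types (`stype`) describe the exact in-memory layout of a column, while logical types (`ltype`) describe the high-level category of the data. Together they give precise control over data types and enable efficient memory usage.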
3
4
## Capabilities
5
6
### Storage Types (stype)
7
8
```python { .api }
9
class stype(Enum):
10
"""Storage type enumeration for precise memory layout control"""
11
12
void = 0 # No data
13
bool8 = 1 # 8-bit boolean
14
int8 = 2 # 8-bit signed integer
15
int16 = 3 # 16-bit signed integer
16
int32 = 4 # 32-bit signed integer
17
int64 = 5 # 64-bit signed integer
18
float32 = 6 # 32-bit floating point
19
float64 = 7 # 64-bit floating point
20
str32 = 11 # String with 32-bit offsets
21
str64 = 12 # String with 64-bit offsets
22
arr32 = 13 # Array with 32-bit offsets
23
arr64 = 14 # Array with 64-bit offsets
24
date32 = 17 # Date (days since epoch)
25
time64 = 18 # Timestamp (nanoseconds since epoch)
26
obj64 = 21 # Python object references
27
cat8 = 22 # Categorical with 8-bit codes
28
cat16 = 23 # Categorical with 16-bit codes
29
cat32 = 24 # Categorical with 32-bit codes
30
31
@property
32
def code(self) -> str:
33
"""Two-character string representation"""
34
35
@property
36
def ltype(self) -> 'ltype':
37
"""Corresponding logical type"""
38
39
@property
40
def ctype(self):
41
"""ctypes class for C-level type"""
42
43
@property
44
def dtype(self):
45
"""numpy.dtype equivalent"""
46
47
@property
48
def min(self):
49
"""Minimum representable value"""
50
51
@property
52
def max(self):
53
"""Maximum representable value"""
54
```
55
56
### Logical Types (ltype)
57
58
```python { .api }
59
class ltype(Enum):
60
"""Logical type enumeration for high-level data categories"""
61
62
void = 0 # No data
63
bool = 1 # Boolean values
64
int = 2 # Integer values
65
real = 3 # Real/floating point values
66
str = 4 # String/text values
67
time = 5 # Date/time values
68
obj = 7 # Object values
69
invalid = 8 # Invalid/unsupported type
70
71
@property
72
def stypes(self) -> list:
73
"""List of stypes that represent this ltype"""
74
```
75
76
### Type Conversion
77
78
```python { .api }
79
def as_type(frame_or_column, new_type) -> Frame:
80
"""
81
Convert frame or column to specified type.
82
83
Parameters:
84
- frame_or_column: Frame or column expression to convert
85
- new_type: Target stype, ltype, or Type object
86
87
Returns:
88
Frame or expression with converted types
89
"""
90
91
class Type:
92
"""Type system helper for datatable operations"""
93
pass
94
95
def categories(column) -> Frame:
96
"""
97
Extract category labels from categorical column.
98
99
Parameters:
100
- column: Categorical column expression
101
102
Returns:
103
Frame with unique category labels
104
"""
105
106
def codes(column) -> FExpr:
107
"""
108
Extract category codes from categorical column.
109
110
Parameters:
111
- column: Categorical column expression
112
113
Returns:
114
Integer codes for categorical values
115
"""
116
```
117
118
## Type Examples
119
120
### Working with Storage Types
121
122
```python
123
import datatable as dt
from datatable import f
124
125
# Create Frame with specific types
126
DT = dt.Frame({
127
'small_int': [1, 2, 3],
128
'big_int': [1000000, 2000000, 3000000],
129
'text': ['a', 'b', 'c'],
130
'flag': [True, False, True]
131
}, stypes=[dt.int8, dt.int64, dt.str32, dt.bool8])
132
133
# Check types
134
print(DT.stypes) # (stype.int8, stype.int64, stype.str32, stype.bool8)
135
print(DT.ltypes) # (ltype.int, ltype.int, ltype.str, ltype.bool)
136
137
# Access type properties
138
print(dt.int8.min, dt.int8.max) # -127 127 (-128 is reserved for NA)
139
print(dt.int64.min, dt.int64.max) # Large integer bounds
140
print(dt.str32.code) # 's4'
141
```
142
143
### Type Conversion Examples
144
145
```python
146
# Convert specific columns
147
DT_converted = DT[:, dt.update(
148
small_as_big=dt.as_type(f.small_int, dt.int64),
149
big_as_float=dt.as_type(f.big_int, dt.float64),
150
text_as_cat=dt.as_type(f.text, dt.cat8)
151
)]
152
153
# Convert entire frame
154
DT_all_float = dt.as_type(DT, dt.float64)
155
156
# Convert with expressions
157
DT_conditional = DT[:, dt.update(
158
smart_type=dt.ifelse(f.big_int > 1500000,
159
dt.as_type(f.big_int, dt.float32),
160
dt.as_type(f.big_int, dt.int32))
161
)]
162
```
163
164
### Memory Optimization
165
166
```python
167
# Use smaller types for memory efficiency
168
large_data = dt.Frame({
169
'id': range(1000000), # Default int64
170
'category': ['A'] * 500000 + ['B'] * 500000, # Default str64
171
'flag': [True, False] * 500000, # Default bool8
172
'small_val': [x % 100 for x in range(1000000)] # Default int64
173
})
174
175
# Optimize memory usage
176
optimized = large_data[:, dt.update(
177
id=dt.as_type(f.id, dt.int32), # Sufficient for 1M records
178
category=dt.as_type(f.category, dt.cat8), # Categorical for repeated values
179
small_val=dt.as_type(f.small_val, dt.int8) # Values 0-99 fit in int8
180
)]
181
182
# Check memory savings
183
print(f"Original stypes: {large_data.stypes}")
184
print(f"Optimized stypes: {optimized.stypes}")
185
```
186
187
### Date and Time Types
188
189
```python
190
# Working with temporal data
191
dates = dt.Frame({
192
'date_str': ['2023-01-01', '2023-06-15', '2023-12-31'],
193
'timestamp_str': ['2023-01-01 12:30:45', '2023-06-15 09:15:20', '2023-12-31 23:59:59']
194
})
195
196
# Convert to temporal types
197
temporal = dates[:, dt.update(
198
date_val=dt.as_type(f.date_str, dt.date32),
199
timestamp_val=dt.as_type(f.timestamp_str, dt.time64)
200
)]
201
202
# Extract components
203
components = temporal[:, dt.update(
204
year=dt.time.year(f.timestamp_val),
205
month=dt.time.month(f.timestamp_val),
206
day=dt.time.day(f.timestamp_val),
207
hour=dt.time.hour(f.timestamp_val)
208
)]
209
```
210
211
### String Type Optimization
212
213
```python
214
# Choose appropriate string type based on data size
215
short_strings = dt.Frame({'text': ['a', 'bb', 'ccc']})
216
long_strings = dt.Frame({'text': ['very long string' * 100] * 1000})
217
218
# str32 for smaller datasets/strings
219
short_optimized = dt.as_type(short_strings, {'text': dt.str32})
220
221
# str64 for larger datasets/strings
222
long_optimized = dt.as_type(long_strings, {'text': dt.str64})
223
224
# Check string properties
225
print(f"str32 supports up to {2**31-1} characters")
226
print(f"str64 supports up to {2**63-1} characters")
227
```
228
229
### Categorical Types
230
231
```python
232
# Convert repeated strings to categorical
233
categories = dt.Frame({
234
'color': ['red', 'blue', 'green'] * 10000,
235
'size': ['small', 'medium', 'large'] * 10000
236
})
237
238
# Use categorical types for memory efficiency
239
categorical = categories[:, dt.update(
240
color_cat=dt.as_type(f.color, dt.cat8), # Up to 255 categories
241
size_cat=dt.as_type(f.size, dt.cat8)
242
)]
243
244
# Access categorical information
245
color_codes = categorical[:, dt.codes(f.color_cat)]
246
color_categories = categorical[:, dt.categories(f.color_cat)]
247
```
248
249
### Type Checking and Validation
250
251
```python
252
def validate_types(frame, expected_types):
253
"""Validate frame has expected types"""
254
actual_types = frame.stypes
255
for i, (actual, expected) in enumerate(zip(actual_types, expected_types)):
256
if actual != expected:
257
column_name = frame.names[i]
258
print(f"Column {column_name}: expected {expected}, got {actual}")
259
return False
260
return True
261
262
# Usage
263
DT = dt.Frame({'A': [1, 2, 3], 'B': [1.1, 2.2, 3.3]})
264
is_valid = validate_types(DT, [dt.int64, dt.float64])
265
```
266
267
### Automatic Type Detection
268
269
```python
270
# datatable automatically detects appropriate types
271
mixed_data = dt.Frame({
272
'integers': [1, 2, 3, 4],
273
'floats': [1.1, 2.2, 3.3, 4.4],
274
'strings': ['a', 'b', 'c', 'd'],
275
'booleans': [True, False, True, False],
276
'mixed_numbers': [1, 2.5, 3, 4.7] # Will be float64
277
})
278
279
print("Auto-detected types:", mixed_data.stypes)
280
281
# Override auto-detection
282
explicit_types = dt.Frame({
283
'integers': [1, 2, 3, 4],
284
'floats': [1.1, 2.2, 3.3, 4.4]
285
}, stypes=[dt.int32, dt.float32])
286
```
287
288
### Type Compatibility and Coercion
289
290
```python
291
# Type promotion in operations
292
int_col = dt.Frame({'x': [1, 2, 3]}, stype=dt.int32)
293
float_col = dt.Frame({'y': [1.1, 2.2, 3.3]}, stype=dt.float32)
294
295
# Operations promote to common type
296
combined = dt.cbind(int_col, float_col)
297
result = combined[:, f.x + f.y] # Result will be float64
298
299
# Explicit control over type promotion
300
result_controlled = combined[:,
301
dt.as_type(f.x, dt.float32) + f.y # Keep as float32
302
]
303
```
304
305
## Type Constants
306
307
The following type constants are available directly from the datatable module:
308
309
```python
310
# Each constant is accessed as dt.<name>, e.g. dt.int32
311
dt.void, dt.bool8
312
dt.int8, dt.int16, dt.int32, dt.int64
313
dt.float32, dt.float64
314
dt.str32, dt.str64
315
dt.obj64
316
```