datatable: a Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.

This document covers its comprehensive type system: storage types (stype) and logical types (ltype) provide precise control over data types and efficient memory usage.
class stype(Enum):
    """Storage type enumeration for precise memory layout control.

    Each member's value is datatable's numeric type id; the gaps in the
    numbering (8-10, 15-16, 19-20) are part of the published ids and must
    be preserved.
    """
    void = 0      # No data
    bool8 = 1     # 8-bit boolean
    int8 = 2      # 8-bit signed integer
    int16 = 3     # 16-bit signed integer
    int32 = 4     # 32-bit signed integer
    int64 = 5     # 64-bit signed integer
    float32 = 6   # 32-bit floating point
    float64 = 7   # 64-bit floating point
    str32 = 11    # String with 32-bit offsets
    str64 = 12    # String with 64-bit offsets
    arr32 = 13    # Array with 32-bit offsets
    arr64 = 14    # Array with 64-bit offsets
    date32 = 17   # Date (days since epoch)
    time64 = 18   # Timestamp (nanoseconds since epoch)
    obj64 = 21    # Python object references
    cat8 = 22     # Categorical with 8-bit codes
    cat16 = 23    # Categorical with 16-bit codes
    cat32 = 24    # Categorical with 32-bit codes

    # NOTE(review): the properties below are documentation stubs
    # (docstring-only bodies); the real implementations live in the
    # compiled datatable extension.

    @property
    def code(self) -> str:
        """Two-character string representation"""

    @property
    def ltype(self) -> 'ltype':
        """Corresponding logical type"""

    @property
    def ctype(self):
        """ctypes class for C-level type"""

    @property
    def dtype(self):
        """numpy.dtype equivalent"""

    @property
    def min(self):
        """Minimum representable value"""

    @property
    def max(self):
        """Maximum representable value"""


class ltype(Enum):
    """Logical type enumeration for high-level data categories."""
    void = 0     # No data
    bool = 1     # Boolean values
    int = 2      # Integer values
    real = 3     # Real/floating point values
    str = 4      # String/text values
    time = 5     # Date/time values
    obj = 7     # Object values
    invalid = 8  # Invalid/unsupported type

    @property
    def stypes(self) -> list:
        """List of stypes that represent this ltype"""


def as_type(frame_or_column, new_type) -> "Frame":
    """
    Convert frame or column to specified type.

    Parameters:
    - frame_or_column: Frame or column expression to convert
    - new_type: Target stype, ltype, or Type object

    Returns:
    Frame or expression with converted types
    """
class Type:
    """Helper class for datatable's type-system operations."""
def categories(column) -> "Frame":
    """
    Extract category labels from categorical column.

    Parameters:
    - column: Categorical column expression

    Returns:
    Frame with unique category labels
    """


def codes(column) -> "FExpr":
    """
    Extract category codes from categorical column.

    Parameters:
    - column: Categorical column expression

    Returns:
    Integer codes for categorical values
    """


import datatable as dt
# The examples below reference the column selector ``f`` (e.g. ``f.small_int``),
# which was never imported in the original — bring it into scope.
from datatable import f
# Create a Frame with explicitly chosen storage types (one stype per column,
# in the same order as the columns)
DT = dt.Frame({
    'small_int': [1, 2, 3],
    'big_int': [1000000, 2000000, 3000000],
    'text': ['a', 'b', 'c'],
    'flag': [True, False, True]
}, stypes=[dt.int8, dt.int64, dt.str32, dt.bool8])
# Check types: .stypes gives storage types, .ltypes the logical types
print(DT.stypes)  # (stype.int8, stype.int64, stype.str32, stype.bool8)
print(DT.ltypes)  # (ltype.int, ltype.int, ltype.str, ltype.bool)
# Access type properties
print(dt.int8.min, dt.int8.max)  # (-127, 127) — NOTE: min is -127, not -128; confirm against datatable docs
print(dt.int64.min, dt.int64.max)  # Large integer bounds
print(dt.str32.code)  # 's4'
# Convert specific columns, adding the converted values as new columns
DT_converted = DT[:, dt.update(
    small_as_big=dt.as_type(f.small_int, dt.int64),
    big_as_float=dt.as_type(f.big_int, dt.float64),
    text_as_cat=dt.as_type(f.text, dt.cat8)
)]
# Convert entire frame
DT_all_float = dt.as_type(DT, dt.float64)
# Convert with expressions: pick the target type per-row via ifelse
DT_conditional = DT[:, dt.update(
    smart_type=dt.ifelse(f.big_int > 1500000,
        dt.as_type(f.big_int, dt.float32),
        dt.as_type(f.big_int, dt.int32))
)]
# Use smaller types for memory efficiency
large_data = dt.Frame({
    'id': range(1000000),  # Default int64
    'category': ['A'] * 500000 + ['B'] * 500000,  # Default str64
    'flag': [True, False] * 500000,  # Default bool8
    'small_val': [x % 100 for x in range(1000000)]  # Default int64
})
# Optimize memory usage by downcasting to the narrowest sufficient type
optimized = large_data[:, dt.update(
    id=dt.as_type(f.id, dt.int32),  # Sufficient for 1M records
    category=dt.as_type(f.category, dt.cat8),  # Categorical for repeated values
    small_val=dt.as_type(f.small_val, dt.int8)  # Values 0-99 fit in int8
)]
# Check memory savings by comparing the before/after storage types
print(f"Original stypes: {large_data.stypes}")
print(f"Optimized stypes: {optimized.stypes}")
# Working with temporal data
dates = dt.Frame({
    'date_str': ['2023-01-01', '2023-06-15', '2023-12-31'],
    'timestamp_str': ['2023-01-01 12:30:45', '2023-06-15 09:15:20', '2023-12-31 23:59:59']
})
# Convert to temporal types: date32 = days since epoch, time64 = ns since epoch
temporal = dates[:, dt.update(
    date_val=dt.as_type(f.date_str, dt.date32),
    timestamp_val=dt.as_type(f.timestamp_str, dt.time64)
)]
# Extract individual components from the timestamp column
components = temporal[:, dt.update(
    year=dt.time.year(f.timestamp_val),
    month=dt.time.month(f.timestamp_val),
    day=dt.time.day(f.timestamp_val),
    hour=dt.time.hour(f.timestamp_val)
)]
# Choose appropriate string type based on data size
short_strings = dt.Frame({'text': ['a', 'bb', 'ccc']})
long_strings = dt.Frame({'text': ['very long string' * 100] * 1000})
# str32 for smaller datasets/strings
short_optimized = dt.as_type(short_strings, {'text': dt.str32})
# str64 for larger datasets/strings
long_optimized = dt.as_type(long_strings, {'text': dt.str64})
# Check string properties: the offset width bounds total character storage
print(f"str32 supports up to {2**31-1} characters")
print(f"str64 supports up to {2**63-1} characters")
# Convert repeated strings to categorical
categories = dt.Frame({
    'color': ['red', 'blue', 'green'] * 10000,
    'size': ['small', 'medium', 'large'] * 10000
})
# Use categorical types for memory efficiency: labels are stored once,
# rows hold small integer codes
categorical = categories[:, dt.update(
    color_cat=dt.as_type(f.color, dt.cat8),  # Up to 255 categories
    size_cat=dt.as_type(f.size, dt.cat8)
)]
# Access categorical information
color_codes = categorical[:, dt.codes(f.color_cat)]
color_categories = categorical[:, dt.categories(f.color_cat)]


def validate_types(frame, expected_types):
    """Return True if ``frame``'s column stypes match ``expected_types``.

    Prints a message describing the first mismatch and returns False.

    Parameters:
    - frame: datatable Frame to inspect (reads .stypes and .names)
    - expected_types: sequence of expected stypes, one per column
    """
    actual_types = frame.stypes
    # Guard against differing lengths: zip() silently truncates the longer
    # sequence, which would let a too-short expected list "validate".
    if len(actual_types) != len(expected_types):
        print(f"Expected {len(expected_types)} columns, got {len(actual_types)}")
        return False
    for i, (actual, expected) in enumerate(zip(actual_types, expected_types)):
        if actual != expected:
            column_name = frame.names[i]
            print(f"Column {column_name}: expected {expected}, got {actual}")
            return False
    return True


# Usage
DT = dt.Frame({'A': [1, 2, 3], 'B': [1.1, 2.2, 3.3]})
is_valid = validate_types(DT, [dt.int64, dt.float64])
# datatable automatically detects appropriate types from the Python values
mixed_data = dt.Frame({
    'integers': [1, 2, 3, 4],
    'floats': [1.1, 2.2, 3.3, 4.4],
    'strings': ['a', 'b', 'c', 'd'],
    'booleans': [True, False, True, False],
    'mixed_numbers': [1, 2.5, 3, 4.7]  # Will be float64
})
print("Auto-detected types:", mixed_data.stypes)
# Override auto-detection by passing explicit stypes (one per column, in order)
explicit_types = dt.Frame({
    'integers': [1, 2, 3, 4],
    'floats': [1.1, 2.2, 3.3, 4.4]
}, stypes=[dt.int32, dt.float32])
# Type promotion in operations.
# Note: ``stype=`` (singular) assigns one storage type to every column.
int_col = dt.Frame({'x': [1, 2, 3]}, stype=dt.int32)
float_col = dt.Frame({'y': [1.1, 2.2, 3.3]}, stype=dt.float32)
# Operations promote to common type
combined = dt.cbind(int_col, float_col)
result = combined[:, f.x + f.y]  # Result will be float64
# Explicit control over type promotion
result_controlled = combined[:,
    dt.as_type(f.x, dt.float32) + f.y  # Keep as float32
]
# The following type constants are available directly from the datatable module:
# Available as dt.typename
dt.void, dt.bool8
dt.int8, dt.int16, dt.int32, dt.int64
dt.float32, dt.float64
dt.str32, dt.str64
dt.obj64

Install with the Tessl CLI:

    npx tessl i tessl/pypi-datatable