datatable: a Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.

This document covers its comprehensive type system: storage types (stype) and logical types (ltype) provide precise control over data types and efficient memory usage.
class stype(Enum):
    """Storage type enumeration for precise memory layout control.

    Each member's value is datatable's numeric type id; the gaps in the
    numbering (8-10, 15-16, 19-20) are part of the published ids and must
    be preserved.
    """
    void = 0      # No data
    bool8 = 1     # 8-bit boolean
    int8 = 2      # 8-bit signed integer
    int16 = 3     # 16-bit signed integer
    int32 = 4     # 32-bit signed integer
    int64 = 5     # 64-bit signed integer
    float32 = 6   # 32-bit floating point
    float64 = 7   # 64-bit floating point
    str32 = 11    # String with 32-bit offsets
    str64 = 12    # String with 64-bit offsets
    arr32 = 13    # Array with 32-bit offsets
    arr64 = 14    # Array with 64-bit offsets
    date32 = 17   # Date (days since epoch)
    time64 = 18   # Timestamp (nanoseconds since epoch)
    obj64 = 21    # Python object references
    cat8 = 22     # Categorical with 8-bit codes
    cat16 = 23    # Categorical with 16-bit codes
    cat32 = 24    # Categorical with 32-bit codes

    # NOTE(review): the properties below are documentation stubs
    # (docstring-only bodies); the real implementations live in the
    # compiled datatable extension.

    @property
    def code(self) -> str:
        """Two-character string representation"""

    @property
    def ltype(self) -> 'ltype':
        """Corresponding logical type"""

    @property
    def ctype(self):
        """ctypes class for C-level type"""

    @property
    def dtype(self):
        """numpy.dtype equivalent"""

    @property
    def min(self):
        """Minimum representable value"""

    @property
    def max(self):
        """Maximum representable value"""


class ltype(Enum):
    """Logical type enumeration for high-level data categories."""
    void = 0     # No data
    bool = 1     # Boolean values
    int = 2      # Integer values
    real = 3     # Real/floating point values
    str = 4      # String/text values
    time = 5     # Date/time values
    obj = 7     # Object values
    invalid = 8  # Invalid/unsupported type

    @property
    def stypes(self) -> list:
        """List of stypes that represent this ltype"""


def as_type(frame_or_column, new_type) -> "Frame":
    """
    Convert frame or column to specified type.

    Parameters:
    - frame_or_column: Frame or column expression to convert
    - new_type: Target stype, ltype, or Type object

    Returns:
    Frame or expression with converted types
    """
class Type:
    """Helper class for datatable's type-system operations."""
def categories(column) -> "Frame":
    """
    Extract category labels from categorical column.

    Parameters:
    - column: Categorical column expression

    Returns:
    Frame with unique category labels
    """


def codes(column) -> "FExpr":
    """
    Extract category codes from categorical column.

    Parameters:
    - column: Categorical column expression

    Returns:
    Integer codes for categorical values
    """


import datatable as dt
# The examples below reference the column selector ``f`` (e.g. ``f.small_int``),
# which was never imported in the original — bring it into scope.
from datatable import f
# Create a Frame with explicitly chosen storage types (one stype per column,
# in the same order as the columns)
DT = dt.Frame({
    'small_int': [1, 2, 3],
    'big_int': [1000000, 2000000, 3000000],
    'text': ['a', 'b', 'c'],
    'flag': [True, False, True]
}, stypes=[dt.int8, dt.int64, dt.str32, dt.bool8])
# Check types: .stypes gives storage types, .ltypes the logical types
print(DT.stypes)  # (stype.int8, stype.int64, stype.str32, stype.bool8)
print(DT.ltypes)  # (ltype.int, ltype.int, ltype.str, ltype.bool)
# Access type properties
print(dt.int8.min, dt.int8.max)  # (-127, 127) — NOTE: min is -127, not -128; confirm against datatable docs
print(dt.int64.min, dt.int64.max)  # Large integer bounds
print(dt.str32.code)  # 's4'
# Convert specific columns, adding the converted values as new columns
DT_converted = DT[:, dt.update(
    small_as_big=dt.as_type(f.small_int, dt.int64),
    big_as_float=dt.as_type(f.big_int, dt.float64),
    text_as_cat=dt.as_type(f.text, dt.cat8)
)]
# Convert entire frame
DT_all_float = dt.as_type(DT, dt.float64)
# Convert with expressions: pick the target type per-row via ifelse
DT_conditional = DT[:, dt.update(
    smart_type=dt.ifelse(f.big_int > 1500000,
        dt.as_type(f.big_int, dt.float32),
        dt.as_type(f.big_int, dt.int32))
)]
# Use smaller types for memory efficiency
large_data = dt.Frame({
    'id': range(1000000),  # Default int64
    'category': ['A'] * 500000 + ['B'] * 500000,  # Default str64
    'flag': [True, False] * 500000,  # Default bool8
    'small_val': [x % 100 for x in range(1000000)]  # Default int64
})
# Optimize memory usage by downcasting to the narrowest sufficient type
optimized = large_data[:, dt.update(
    id=dt.as_type(f.id, dt.int32),  # Sufficient for 1M records
    category=dt.as_type(f.category, dt.cat8),  # Categorical for repeated values
    small_val=dt.as_type(f.small_val, dt.int8)  # Values 0-99 fit in int8
)]
# Check memory savings by comparing the before/after storage types
print(f"Original stypes: {large_data.stypes}")
print(f"Optimized stypes: {optimized.stypes}")
# Working with temporal data
dates = dt.Frame({
    'date_str': ['2023-01-01', '2023-06-15', '2023-12-31'],
    'timestamp_str': ['2023-01-01 12:30:45', '2023-06-15 09:15:20', '2023-12-31 23:59:59']
})
# Convert to temporal types: date32 = days since epoch, time64 = ns since epoch
temporal = dates[:, dt.update(
    date_val=dt.as_type(f.date_str, dt.date32),
    timestamp_val=dt.as_type(f.timestamp_str, dt.time64)
)]
# Extract individual components from the timestamp column
components = temporal[:, dt.update(
    year=dt.time.year(f.timestamp_val),
    month=dt.time.month(f.timestamp_val),
    day=dt.time.day(f.timestamp_val),
    hour=dt.time.hour(f.timestamp_val)
)]
# Choose appropriate string type based on data size
short_strings = dt.Frame({'text': ['a', 'bb', 'ccc']})
long_strings = dt.Frame({'text': ['very long string' * 100] * 1000})
# str32 for smaller datasets/strings
short_optimized = dt.as_type(short_strings, {'text': dt.str32})
# str64 for larger datasets/strings
long_optimized = dt.as_type(long_strings, {'text': dt.str64})
# Check string properties: the offset width bounds total character storage
print(f"str32 supports up to {2**31-1} characters")
print(f"str64 supports up to {2**63-1} characters")
# Convert repeated strings to categorical
categories = dt.Frame({
    'color': ['red', 'blue', 'green'] * 10000,
    'size': ['small', 'medium', 'large'] * 10000
})
# Use categorical types for memory efficiency: labels are stored once,
# rows hold small integer codes
categorical = categories[:, dt.update(
    color_cat=dt.as_type(f.color, dt.cat8),  # Up to 255 categories
    size_cat=dt.as_type(f.size, dt.cat8)
)]
# Access categorical information
color_codes = categorical[:, dt.codes(f.color_cat)]
color_categories = categorical[:, dt.categories(f.color_cat)]


def validate_types(frame, expected_types):
    """Return True if ``frame``'s column stypes match ``expected_types``.

    Prints a message describing the first mismatch and returns False.

    Parameters:
    - frame: datatable Frame to inspect (reads .stypes and .names)
    - expected_types: sequence of expected stypes, one per column
    """
    actual_types = frame.stypes
    # Guard against differing lengths: zip() silently truncates the longer
    # sequence, which would let a too-short expected list "validate".
    if len(actual_types) != len(expected_types):
        print(f"Expected {len(expected_types)} columns, got {len(actual_types)}")
        return False
    for i, (actual, expected) in enumerate(zip(actual_types, expected_types)):
        if actual != expected:
            column_name = frame.names[i]
            print(f"Column {column_name}: expected {expected}, got {actual}")
            return False
    return True


# Usage
DT = dt.Frame({'A': [1, 2, 3], 'B': [1.1, 2.2, 3.3]})
is_valid = validate_types(DT, [dt.int64, dt.float64])
# datatable automatically detects appropriate types from the Python values
mixed_data = dt.Frame({
    'integers': [1, 2, 3, 4],
    'floats': [1.1, 2.2, 3.3, 4.4],
    'strings': ['a', 'b', 'c', 'd'],
    'booleans': [True, False, True, False],
    'mixed_numbers': [1, 2.5, 3, 4.7]  # Will be float64
})
print("Auto-detected types:", mixed_data.stypes)
# Override auto-detection by passing explicit stypes (one per column, in order)
explicit_types = dt.Frame({
    'integers': [1, 2, 3, 4],
    'floats': [1.1, 2.2, 3.3, 4.4]
}, stypes=[dt.int32, dt.float32])
# Type promotion in operations.
# Note: ``stype=`` (singular) assigns one storage type to every column.
int_col = dt.Frame({'x': [1, 2, 3]}, stype=dt.int32)
float_col = dt.Frame({'y': [1.1, 2.2, 3.3]}, stype=dt.float32)
# Operations promote to common type
combined = dt.cbind(int_col, float_col)
result = combined[:, f.x + f.y]  # Result will be float64
# Explicit control over type promotion
result_controlled = combined[:,
    dt.as_type(f.x, dt.float32) + f.y  # Keep as float32
]
# The following type constants are available directly from the datatable module:
# Available as dt.typename
dt.void, dt.bool8
dt.int8, dt.int16, dt.int32, dt.int64
dt.float32, dt.float64
dt.str32, dt.str64
dt.obj64

Install with the Tessl CLI:

    npx tessl i tessl/pypi-datatable