or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-data-structures.md · data-manipulation.md · expression-system.md · file-io.md · index.md · mathematical-functions.md · reductions-aggregations.md · row-operations.md · set-operations.md · string-operations.md · time-operations.md · type-system.md

docs/type-system.md

0

# Type System

1

2

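datatable provides a comprehensive type system built on storage types (stype), which fix the in-memory layout of a column, and logical types (ltype), which group stypes into high-level categories. Together they give precise control over data types and memory usage.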

3

4

## Capabilities

5

6

### Storage Types (stype)

7

8

```python { .api }

9

class stype(Enum):

10

"""Storage type enumeration for precise memory layout control"""

11

12

void = 0 # No data

13

bool8 = 1 # 8-bit boolean

14

int8 = 2 # 8-bit signed integer

15

int16 = 3 # 16-bit signed integer

16

int32 = 4 # 32-bit signed integer

17

int64 = 5 # 64-bit signed integer

18

float32 = 6 # 32-bit floating point

19

float64 = 7 # 64-bit floating point

20

str32 = 11 # String with 32-bit offsets

21

str64 = 12 # String with 64-bit offsets

22

arr32 = 13 # Array with 32-bit offsets

23

arr64 = 14 # Array with 64-bit offsets

24

date32 = 17 # Date (days since epoch)

25

time64 = 18 # Timestamp (nanoseconds since epoch)

26

obj64 = 21 # Python object references

27

cat8 = 22 # Categorical with 8-bit codes

28

cat16 = 23 # Categorical with 16-bit codes

29

cat32 = 24 # Categorical with 32-bit codes

30

31

@property

32

def code(self) -> str:

33

"""Two-character string representation"""

34

35

@property

36

def ltype(self) -> 'ltype':

37

"""Corresponding logical type"""

38

39

@property

40

def ctype(self):

41

"""ctypes class for C-level type"""

42

43

@property

44

def dtype(self):

45

"""numpy.dtype equivalent"""

46

47

@property

48

def min(self):

49

"""Minimum representable value"""

50

51

@property

52

def max(self):

53

"""Maximum representable value"""

54

```

55

56

### Logical Types (ltype)

57

58

```python { .api }

59

class ltype(Enum):

60

"""Logical type enumeration for high-level data categories"""

61

62

void = 0 # No data

63

bool = 1 # Boolean values

64

int = 2 # Integer values

65

real = 3 # Real/floating point values

66

str = 4 # String/text values

67

time = 5 # Date/time values

68

obj = 7 # Object values

69

invalid = 8 # Invalid/unsupported type

70

71

@property

72

def stypes(self) -> list:

73

"""List of stypes that represent this ltype"""

74

```

75

76

### Type Conversion

77

78

```python { .api }

79

def as_type(frame_or_column, new_type) -> Frame:

80

"""

81

Convert frame or column to specified type.

82

83

Parameters:

84

- frame_or_column: Frame or column expression to convert

85

- new_type: Target stype, ltype, or Type object

86

87

Returns:

88

Frame or expression with converted types

89

"""

90

91

class Type:

92

"""Type system helper for datatable operations"""

93

pass

94

95

def categories(column) -> Frame:

96

"""

97

Extract category labels from categorical column.

98

99

Parameters:

100

- column: Categorical column expression

101

102

Returns:

103

Frame with unique category labels

104

"""

105

106

def codes(column) -> FExpr:

107

"""

108

Extract category codes from categorical column.

109

110

Parameters:

111

- column: Categorical column expression

112

113

Returns:

114

Integer codes for categorical values

115

"""

116

```

117

118

## Type Examples

119

120

### Working with Storage Types

121

122

```python

123

import datatable as dt

124

125

# Create Frame with specific types

126

DT = dt.Frame({

127

'small_int': [1, 2, 3],

128

'big_int': [1000000, 2000000, 3000000],

129

'text': ['a', 'b', 'c'],

130

'flag': [True, False, True]

131

}, stypes=[dt.int8, dt.int64, dt.str32, dt.bool8])

132

133

# Check types

134

print(DT.stypes) # (stype.int8, stype.int64, stype.str32, stype.bool8)

135

print(DT.ltypes) # (ltype.int, ltype.int, ltype.str, ltype.bool)

136

137

# Access type properties

138

print(dt.int8.min, dt.int8.max) # (-127, 127)

139

print(dt.int64.min, dt.int64.max) # Large integer bounds

140

print(dt.str32.code) # 's4'

141

```

142

143

### Type Conversion Examples

144

145

```python

146

# Convert specific columns (the `f` column selector requires `from datatable import f`)

147

DT_converted = DT[:, dt.update(

148

small_as_big=dt.as_type(f.small_int, dt.int64),

149

big_as_float=dt.as_type(f.big_int, dt.float64),

150

text_as_cat=dt.as_type(f.text, dt.cat8)

151

)]

152

153

# Convert entire frame

154

DT_all_float = dt.as_type(DT, dt.float64)

155

156

# Convert with expressions

157

DT_conditional = DT[:, dt.update(

158

smart_type=dt.ifelse(f.big_int > 1500000,

159

dt.as_type(f.big_int, dt.float32),

160

dt.as_type(f.big_int, dt.int32))

161

)]

162

```

163

164

### Memory Optimization

165

166

```python

167

# Use smaller types for memory efficiency

168

large_data = dt.Frame({

169

'id': range(1000000), # Auto-detected as int32 (values fit in 32 bits)

170

'category': ['A'] * 500000 + ['B'] * 500000, # Default str32

171

'flag': [True, False] * 500000, # Default bool8

172

'small_val': [x % 100 for x in range(1000000)] # Auto-detected as int32

173

})

174

175

# Optimize memory usage

176

optimized = large_data[:, dt.update(

177

id=dt.as_type(f.id, dt.int32), # Sufficient for 1M records

178

category=dt.as_type(f.category, dt.cat8), # Categorical for repeated values

179

small_val=dt.as_type(f.small_val, dt.int8) # Values 0-99 fit in int8

180

)]

181

182

# Check memory savings

183

print(f"Original stypes: {large_data.stypes}")

184

print(f"Optimized stypes: {optimized.stypes}")

185

```

186

187

### Date and Time Types

188

189

```python

190

# Working with temporal data

191

dates = dt.Frame({

192

'date_str': ['2023-01-01', '2023-06-15', '2023-12-31'],

193

'timestamp_str': ['2023-01-01 12:30:45', '2023-06-15 09:15:20', '2023-12-31 23:59:59']

194

})

195

196

# Convert to temporal types

197

temporal = dates[:, dt.update(

198

date_val=dt.as_type(f.date_str, dt.date32),

199

timestamp_val=dt.as_type(f.timestamp_str, dt.time64)

200

)]

201

202

# Extract components

203

components = temporal[:, dt.update(

204

year=dt.time.year(f.timestamp_val),

205

month=dt.time.month(f.timestamp_val),

206

day=dt.time.day(f.timestamp_val),

207

hour=dt.time.hour(f.timestamp_val)

208

)]

209

```

210

211

### String Type Optimization

212

213

```python

214

# Choose appropriate string type based on data size

215

short_strings = dt.Frame({'text': ['a', 'bb', 'ccc']})

216

long_strings = dt.Frame({'text': ['very long string' * 100] * 1000})

217

218

# str32 for smaller datasets/strings

219

short_optimized = dt.as_type(short_strings, {'text': dt.str32})

220

221

# str64 for larger datasets/strings

222

long_optimized = dt.as_type(long_strings, {'text': dt.str64})

223

224

# Check string properties

225

print(f"str32 supports up to {2**31-1} characters")

226

print(f"str64 supports up to {2**63-1} characters")

227

```

228

229

### Categorical Types

230

231

```python

232

# Convert repeated strings to categorical

233

categories = dt.Frame({

234

'color': ['red', 'blue', 'green'] * 10000,

235

'size': ['small', 'medium', 'large'] * 10000

236

})

237

238

# Use categorical types for memory efficiency

239

categorical = categories[:, dt.update(

240

color_cat=dt.as_type(f.color, dt.cat8), # Up to 255 categories

241

size_cat=dt.as_type(f.size, dt.cat8)

242

)]

243

244

# Access categorical information

245

color_codes = categorical[:, dt.codes(f.color_cat)]

246

color_categories = categorical[:, dt.categories(f.color_cat)]

247

```

248

249

### Type Checking and Validation

250

251

```python

252

def validate_types(frame, expected_types):

253

"""Validate frame has expected types"""

254

actual_types = frame.stypes

255

for i, (actual, expected) in enumerate(zip(actual_types, expected_types)):

256

if actual != expected:

257

column_name = frame.names[i]

258

print(f"Column {column_name}: expected {expected}, got {actual}")

259

return False

260

return True

261

262

# Usage

263

DT = dt.Frame({'A': [1, 2, 3], 'B': [1.1, 2.2, 3.3]})

264

is_valid = validate_types(DT, [dt.int64, dt.float64])

265

```

266

267

### Automatic Type Detection

268

269

```python

270

# datatable automatically detects appropriate types

271

mixed_data = dt.Frame({

272

'integers': [1, 2, 3, 4],

273

'floats': [1.1, 2.2, 3.3, 4.4],

274

'strings': ['a', 'b', 'c', 'd'],

275

'booleans': [True, False, True, False],

276

'mixed_numbers': [1, 2.5, 3, 4.7] # Will be float64

277

})

278

279

print("Auto-detected types:", mixed_data.stypes)

280

281

# Override auto-detection

282

explicit_types = dt.Frame({

283

'integers': [1, 2, 3, 4],

284

'floats': [1.1, 2.2, 3.3, 4.4]

285

}, stypes=[dt.int32, dt.float32])

286

```

287

288

### Type Compatibility and Coercion

289

290

```python

291

# Type promotion in operations

292

int_col = dt.Frame({'x': [1, 2, 3]}, stype=dt.int32)

293

float_col = dt.Frame({'y': [1.1, 2.2, 3.3]}, stype=dt.float32)

294

295

# Operations promote to common type

296

combined = dt.cbind(int_col, float_col)

297

result = combined[:, f.x + f.y] # Result will be float64

298

299

# Explicit control over type promotion

300

result_controlled = combined[:,

301

dt.as_type(f.x, dt.float32) + f.y # Keep as float32

302

]

303

```

304

305

## Type Constants

306

307

The following type constants are available directly from the datatable module:

308

309

```python

310

# Available as dt.typename

311

dt.void, dt.bool8

312

dt.int8, dt.int16, dt.int32, dt.int64

313

dt.float32, dt.float64

314

dt.str32, dt.str64

315

dt.obj64

316

```