# Tokenization & Text Processing

Convert between text and tokens, with support for different tokenization strategies and detokenization. Provides low-level access to model tokenization for debugging, analysis, and advanced prompt construction.

## Capabilities

### Text Tokenization

Convert text strings to token representations with flexible output options.

```python { .api }
class TokenizationRequest:
    prompt: str
    tokens: bool
    token_ids: bool
    """
    Request for text tokenization.

    Attributes:
    - prompt: Text string to tokenize
    - tokens: Return text representation of tokens
    - token_ids: Return numeric token IDs
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class TokenizationResponse:
    tokens: Optional[Sequence[str]] = None
    token_ids: Optional[Sequence[int]] = None
    """
    Response from tokenization request.

    Attributes:
    - tokens: Text tokens (if requested)
    - token_ids: Numeric token IDs (if requested)
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> TokenizationResponse:
        """Create response from JSON data."""

def tokenize(
    self,
    request: TokenizationRequest,
    model: str
) -> TokenizationResponse:
    """
    Tokenize text using model-specific tokenizer.

    Parameters:
    - request: Tokenization configuration
    - model: Model name for tokenizer selection

    Returns:
        TokenizationResponse with tokens and/or token IDs
    """
```
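
A minimal sketch of this request/response pair, assuming a `Client` with a valid API token and the `luminous-extended` model (both placeholders); the fuller examples later in this page cover more patterns:

```python
from aleph_alpha_client import Client, TokenizationRequest

# Minimal sketch: request only numeric IDs (tokens=False), which is enough for counting.
# The API token and model name are placeholders.
client = Client(token="your-api-token")
request = TokenizationRequest(prompt="Hello world!", tokens=False, token_ids=True)
response = client.tokenize(request, model="luminous-extended")

print(response.token_ids)             # numeric token IDs
print(len(response.token_ids or []))  # token count
```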

### Token Detokenization

Convert token IDs back to readable text with proper spacing and formatting.

```python { .api }
class DetokenizationRequest:
    token_ids: Sequence[int]
    """
    Request for token detokenization.

    Attributes:
    - token_ids: Sequence of token IDs to convert back to text
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class DetokenizationResponse:
    result: str
    """
    Response from detokenization request.

    Attributes:
    - result: Reconstructed text from token IDs
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> DetokenizationResponse:
        """Create response from JSON data."""

def detokenize(
    self,
    request: DetokenizationRequest,
    model: str
) -> DetokenizationResponse:
    """
    Convert token IDs back to text.

    Parameters:
    - request: Detokenization configuration with token IDs
    - model: Model name for tokenizer selection

    Returns:
        DetokenizationResponse with reconstructed text
    """
```
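
Detokenization pairs naturally with tokenization for tasks like trimming a prompt to a token budget and inspecting what survives the cut. A minimal sketch, assuming an existing `Client`; the model name and budget are placeholders:

```python
from aleph_alpha_client import Client, TokenizationRequest, DetokenizationRequest

client = Client(token="your-api-token")

def truncate_to_budget(text: str, model: str, max_tokens: int) -> str:
    """Trim text to at most max_tokens tokens and return the kept prefix as text."""
    tok = client.tokenize(
        TokenizationRequest(prompt=text, tokens=False, token_ids=True),
        model=model,
    )
    ids = list(tok.token_ids or [])
    if len(ids) <= max_tokens:
        return text
    # Detokenize only the kept prefix of token IDs.
    detok = client.detokenize(DetokenizationRequest(token_ids=ids[:max_tokens]), model=model)
    return detok.result

# Budget of 8 tokens is illustrative.
print(truncate_to_budget("A long prompt that may exceed the budget...", "luminous-extended", 8))
```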

### Tokenizer Access

Direct access to model tokenizers for advanced use cases and offline processing.

```python { .api }
def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model.

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """

async def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model (async).

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """
```
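
The returned tokenizer can then be used locally without further API calls. The sketch below assumes the object follows the Hugging Face `tokenizers` interface (`encode`/`decode`); treat those method names as an assumption if your client version exposes a different wrapper:

```python
from aleph_alpha_client import Client

client = Client(token="your-api-token")
tokenizer = client.tokenizer("luminous-extended")  # one API call to fetch the tokenizer

# Assumption: the returned object follows the Hugging Face `tokenizers` interface.
encoding = tokenizer.encode("Offline tokenization, no further API calls")
print(encoding.ids)                     # numeric token IDs
print(encoding.tokens)                  # text pieces
print(tokenizer.decode(encoding.ids))   # back to a string
```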

### Usage Examples

Comprehensive tokenization examples for debugging, analysis, and advanced prompt construction:

```python
from aleph_alpha_client import (
    Client, TokenizationRequest, DetokenizationRequest,
    Tokens, Prompt
)

client = Client(token="your-api-token")

# Basic tokenization - get both tokens and IDs
text = "Hello world! How are you today?"
request = TokenizationRequest(
    prompt=text,
    tokens=True,    # Get text tokens
    token_ids=True  # Get numeric IDs
)

response = client.tokenize(request, model="luminous-extended")

print(f"Original text: {text}")
print(f"Tokens: {response.tokens}")
print(f"Token IDs: {response.token_ids}")
print(f"Number of tokens: {len(response.token_ids) if response.token_ids else 0}")

# Analyze tokenization patterns
def analyze_tokenization(text: str, model: str):
    """Analyze how text gets tokenized."""
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model=model)

    print(f"\nText: '{text}'")
    print("Tokenization analysis:")

    if response.tokens and response.token_ids:
        for token, token_id in zip(response.tokens, response.token_ids):
            print(f"  '{token}' -> {token_id}")

    return response

# Test different text patterns
analyze_tokenization("machine learning", "luminous-extended")
analyze_tokenization("MachineLearning", "luminous-extended")
analyze_tokenization("machine_learning", "luminous-extended")
analyze_tokenization("🤖 AI robot", "luminous-extended")

# Token counting for cost estimation
def count_tokens(text: str, model: str) -> int:
    """Count tokens in text for cost estimation."""
    request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    response = client.tokenize(request, model=model)
    return len(response.token_ids) if response.token_ids else 0

texts = [
    "Short text",
    "This is a longer text that will have more tokens than the short one above.",
    "Very long text with multiple sentences. Each sentence adds tokens. More sentences mean more tokens and higher costs for API calls."
]

for text in texts:
    token_count = count_tokens(text, "luminous-extended")
    print(f"'{text[:30]}...': {token_count} tokens")

# Detokenization - convert tokens back to text
token_ids = [1234, 5678, 9012, 3456]  # Example token IDs
detok_request = DetokenizationRequest(token_ids=token_ids)
detok_response = client.detokenize(detok_request, model="luminous-extended")

print(f"Token IDs: {token_ids}")
print(f"Detokenized text: '{detok_response.result}'")

# Round-trip testing (tokenize then detokenize)
def test_round_trip(text: str, model: str):
    """Test tokenization -> detokenization round trip."""
    # Tokenize
    tok_request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        print("No token IDs returned")
        return

    # Detokenize
    detok_request = DetokenizationRequest(token_ids=tok_response.token_ids)
    detok_response = client.detokenize(detok_request, model=model)

    print(f"Original: '{text}'")
    print(f"Round-trip: '{detok_response.result}'")
    print(f"Match: {text == detok_response.result}")
    print()

test_round_trip("Hello world!", "luminous-extended")
test_round_trip("Python programming", "luminous-extended")

# Advanced: Build prompts with token-level control
def build_token_controlled_prompt(text: str, model: str, emphasis_tokens: list[int]):
    """Build prompt with token-level attention control."""
    # First tokenize to get token IDs
    tok_request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        return None

    # Create token controls for specified positions
    from aleph_alpha_client import TokenControl
    controls = [
        TokenControl(pos=pos, factor=2.0)
        for pos in emphasis_tokens
        if pos < len(tok_response.token_ids)
    ]

    # Build tokens object with controls
    tokens = Tokens(
        tokens=tok_response.token_ids,
        controls=controls
    )

    return Prompt([tokens])

# Emphasize tokens at positions 2 and 4
controlled_prompt = build_token_controlled_prompt(
    "Machine learning is fascinating technology",
    "luminous-extended",
    emphasis_tokens=[2, 4]
)

if controlled_prompt:
    print("Created prompt with token-level attention control")

# Multi-language tokenization comparison
multilingual_texts = {
    "English": "Hello, how are you?",
    "German": "Hallo, wie geht es dir?",
    "French": "Bonjour, comment allez-vous?",
    "Spanish": "Hola, ¿cómo estás?",
    "Japanese": "こんにちは、元気ですか?"
}

print("Multi-language tokenization comparison:")
for language, text in multilingual_texts.items():
    token_count = count_tokens(text, "luminous-extended")
    print(f"{language:10}: {token_count:2d} tokens - '{text}'")

# Direct tokenizer usage (if available)
try:
    tokenizer = client.tokenizer("luminous-extended")
    print(f"Got tokenizer: {tokenizer}")
    # Use tokenizer directly for offline processing
except Exception as e:
    print(f"Direct tokenizer access not available: {e}")

# Special token analysis
special_texts = [
    "<start>",        # Special tokens
    "[MASK]",         # Mask tokens
    "\n\n\n",         # Whitespace
    "word word",      # Repeated words
    "123456",         # Numbers
    "user@email.com"  # Email
]

print("\nSpecial token analysis:")
for text in special_texts:
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model="luminous-extended")

    token_count = len(response.token_ids) if response.token_ids else 0
    tokens_str = str(response.tokens) if response.tokens else "None"

    print(f"'{text:15}' -> {token_count:2d} tokens: {tokens_str}")
```
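
The same requests can also be issued through the asynchronous client for concurrent workloads. A minimal sketch, assuming `AsyncClient` is available in the installed client version and mirrors the synchronous call signatures:

```python
import asyncio
from aleph_alpha_client import AsyncClient, TokenizationRequest

# Sketch: the async client is used as a context manager and its calls are awaited.
# Assumes AsyncClient mirrors the synchronous tokenize signature shown above.
async def main() -> None:
    async with AsyncClient(token="your-api-token") as client:
        request = TokenizationRequest(prompt="Hello async world!", tokens=True, token_ids=True)
        response = await client.tokenize(request, model="luminous-extended")
        print(response.tokens, response.token_ids)

asyncio.run(main())
```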