# Tokenization & Text Processing

Convert between text and tokens, with support for different tokenization strategies and detokenization. Provides low-level access to model tokenization for debugging, analysis, and advanced prompt construction.

## Capabilities

### Text Tokenization

Convert text strings to token representations with flexible output options.

```python { .api }
class TokenizationRequest:
    prompt: str
    tokens: bool
    token_ids: bool
    """
    Request for text tokenization.

    Attributes:
    - prompt: Text string to tokenize
    - tokens: Return text representation of tokens
    - token_ids: Return numeric token IDs
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class TokenizationResponse:
    tokens: Optional[Sequence[str]] = None
    token_ids: Optional[Sequence[int]] = None
    """
    Response from tokenization request.

    Attributes:
    - tokens: Text tokens (if requested)
    - token_ids: Numeric token IDs (if requested)
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> TokenizationResponse:
        """Create response from JSON data."""

def tokenize(
    self,
    request: TokenizationRequest,
    model: str
) -> TokenizationResponse:
    """
    Tokenize text using model-specific tokenizer.

    Parameters:
    - request: Tokenization configuration
    - model: Model name for tokenizer selection

    Returns:
    TokenizationResponse with tokens and/or token IDs
    """
```
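
For instance, a minimal tokenization call might look like the sketch below; it assumes a `Client` authenticated with your API token and the `luminous-extended` model used throughout the usage examples further down.

```python
from aleph_alpha_client import Client, TokenizationRequest

client = Client(token="your-api-token")

# Request both the token strings and the numeric IDs for a short prompt
request = TokenizationRequest(prompt="Hello world!", tokens=True, token_ids=True)
response = client.tokenize(request, model="luminous-extended")

print(response.tokens)     # text pieces of the prompt
print(response.token_ids)  # corresponding numeric IDs
```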

### Token Detokenization

Convert token IDs back to readable text with proper spacing and formatting.

```python { .api }
class DetokenizationRequest:
    token_ids: Sequence[int]
    """
    Request for token detokenization.

    Attributes:
    - token_ids: Sequence of token IDs to convert back to text
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class DetokenizationResponse:
    result: str
    """
    Response from detokenization request.

    Attributes:
    - result: Reconstructed text from token IDs
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> DetokenizationResponse:
        """Create response from JSON data."""

def detokenize(
    self,
    request: DetokenizationRequest,
    model: str
) -> DetokenizationResponse:
    """
    Convert token IDs back to text.

    Parameters:
    - request: Detokenization configuration with token IDs
    - model: Model name for tokenizer selection

    Returns:
    DetokenizationResponse with reconstructed text
    """
```
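
As a quick sanity check, the two requests can be chained into a round trip; this sketch reuses the `client` from the snippet above and mirrors the fuller `test_round_trip` helper in the usage examples below.

```python
from aleph_alpha_client import TokenizationRequest, DetokenizationRequest

# Tokenize, then feed the IDs straight back into a detokenization request
tok = client.tokenize(
    TokenizationRequest(prompt="Hello world!", tokens=False, token_ids=True),
    model="luminous-extended",
)
detok = client.detokenize(
    DetokenizationRequest(token_ids=tok.token_ids),
    model="luminous-extended",
)

print(detok.result)  # should closely match the original prompt
```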

### Tokenizer Access

Direct access to model tokenizers for advanced use cases and offline processing.

```python { .api }
def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model.

    Parameters:
    - model: Model name

    Returns:
    Tokenizer object for direct use
    """

async def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model (async).

    Parameters:
    - model: Model name

    Returns:
    Tokenizer object for direct use
    """
```
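
The synchronous and asynchronous variants belong to the blocking and async clients respectively. Once you hold a tokenizer object, you can encode and decode locally without further API calls; the sketch below assumes the returned object exposes a Hugging Face `tokenizers`-style `encode`/`decode` interface, so verify this against the object you actually receive.

```python
# Offline use of the tokenizer returned by client.tokenizer(...).
# Assumption: the object behaves like a Hugging Face `tokenizers.Tokenizer`
# (encode() returning an Encoding with .ids/.tokens, plus decode()).
tokenizer = client.tokenizer("luminous-extended")

encoding = tokenizer.encode("Hello world!")  # no API round trip
print(encoding.tokens)  # token strings
print(encoding.ids)     # token IDs

print(tokenizer.decode(encoding.ids))  # back to text
```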

### Usage Examples

Comprehensive tokenization examples for debugging, analysis, and advanced prompt construction:

```python
from aleph_alpha_client import (
    Client, TokenizationRequest, DetokenizationRequest,
    Tokens, Prompt
)

client = Client(token="your-api-token")

# Basic tokenization - get both tokens and IDs
text = "Hello world! How are you today?"
request = TokenizationRequest(
    prompt=text,
    tokens=True,     # Get text tokens
    token_ids=True   # Get numeric IDs
)

response = client.tokenize(request, model="luminous-extended")

print(f"Original text: {text}")
print(f"Tokens: {response.tokens}")
print(f"Token IDs: {response.token_ids}")
print(f"Number of tokens: {len(response.token_ids) if response.token_ids else 0}")

# Analyze tokenization patterns
def analyze_tokenization(text: str, model: str):
    """Analyze how text gets tokenized."""
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model=model)

    print(f"\nText: '{text}'")
    print("Tokenization analysis:")

    if response.tokens and response.token_ids:
        for token, token_id in zip(response.tokens, response.token_ids):
            print(f" '{token}' -> {token_id}")

    return response

# Test different text patterns
analyze_tokenization("machine learning", "luminous-extended")
analyze_tokenization("MachineLearning", "luminous-extended")
analyze_tokenization("machine_learning", "luminous-extended")
analyze_tokenization("🤖 AI robot", "luminous-extended")

# Token counting for cost estimation
def count_tokens(text: str, model: str) -> int:
    """Count tokens in text for cost estimation."""
    request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    response = client.tokenize(request, model=model)
    return len(response.token_ids) if response.token_ids else 0

texts = [
    "Short text",
    "This is a longer text that will have more tokens than the short one above.",
    "Very long text with multiple sentences. Each sentence adds tokens. More sentences mean more tokens and higher costs for API calls."
]

for text in texts:
    token_count = count_tokens(text, "luminous-extended")
    print(f"'{text[:30]}...': {token_count} tokens")

# Detokenization - convert tokens back to text
token_ids = [1234, 5678, 9012, 3456]  # Example token IDs
detok_request = DetokenizationRequest(token_ids=token_ids)
detok_response = client.detokenize(detok_request, model="luminous-extended")

print(f"Token IDs: {token_ids}")
print(f"Detokenized text: '{detok_response.result}'")

# Round-trip testing (tokenize then detokenize)
def test_round_trip(text: str, model: str):
    """Test tokenization -> detokenization round trip."""
    # Tokenize
    tok_request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        print("No token IDs returned")
        return

    # Detokenize
    detok_request = DetokenizationRequest(token_ids=tok_response.token_ids)
    detok_response = client.detokenize(detok_request, model=model)

    print(f"Original: '{text}'")
    print(f"Round-trip: '{detok_response.result}'")
    print(f"Match: {text == detok_response.result}")
    print()

test_round_trip("Hello world!", "luminous-extended")
test_round_trip("Python programming", "luminous-extended")

# Advanced: Build prompts with token-level control
def build_token_controlled_prompt(text: str, model: str, emphasis_tokens: list[int]):
    """Build prompt with token-level attention control."""
    # First tokenize to get token IDs
    tok_request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        return None

    # Create token controls for specified positions
    from aleph_alpha_client import TokenControl
    controls = [
        TokenControl(pos=pos, factor=2.0)
        for pos in emphasis_tokens
        if pos < len(tok_response.token_ids)
    ]

    # Build tokens object with controls
    tokens = Tokens(
        tokens=tok_response.token_ids,
        controls=controls
    )

    return Prompt([tokens])

# Emphasize tokens at positions 2 and 4
controlled_prompt = build_token_controlled_prompt(
    "Machine learning is fascinating technology",
    "luminous-extended",
    emphasis_tokens=[2, 4]
)

if controlled_prompt:
    print("Created prompt with token-level attention control")

# Multi-language tokenization comparison
multilingual_texts = {
    "English": "Hello, how are you?",
    "German": "Hallo, wie geht es dir?",
    "French": "Bonjour, comment allez-vous?",
    "Spanish": "Hola, ¿cómo estás?",
    "Japanese": "こんにちは、元気ですか?"
}

print("Multi-language tokenization comparison:")
for language, text in multilingual_texts.items():
    token_count = count_tokens(text, "luminous-extended")
    print(f"{language:10}: {token_count:2d} tokens - '{text}'")

# Direct tokenizer usage (if available)
try:
    tokenizer = client.tokenizer("luminous-extended")
    print(f"Got tokenizer: {tokenizer}")
    # Use tokenizer directly for offline processing
except Exception as e:
    print(f"Direct tokenizer access not available: {e}")

# Special token analysis
special_texts = [
    "<start>",         # Special tokens
    "[MASK]",          # Mask tokens
    "\n\n\n",          # Whitespace
    "word word",       # Repeated words
    "123456",          # Numbers
    "user@email.com"   # Email
]

print("\nSpecial token analysis:")
for text in special_texts:
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model="luminous-extended")

    token_count = len(response.token_ids) if response.token_ids else 0
    tokens_str = str(response.tokens) if response.tokens else "None"

    print(f"'{text:15}' -> {token_count:2d} tokens: {tokens_str}")
```
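
The same requests also work with the asynchronous client; a minimal sketch, assuming `AsyncClient` can be used as an async context manager and mirrors the synchronous `tokenize` API shown above:

```python
import asyncio

from aleph_alpha_client import AsyncClient, TokenizationRequest

async def main() -> None:
    # Assumption: AsyncClient mirrors the synchronous Client API shown above.
    async with AsyncClient(token="your-api-token") as client:
        request = TokenizationRequest(prompt="Hello world!", tokens=True, token_ids=True)
        response = await client.tokenize(request, model="luminous-extended")
        print(response.tokens, response.token_ids)

asyncio.run(main())
```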