# Tokenization & Text Processing

Convert between text and tokens, with support for different tokenization strategies and detokenization. Provides low-level access to model tokenization for debugging, analysis, and advanced prompt construction.

## Capabilities

### Text Tokenization

Convert text strings to token representations with flexible output options.

```python { .api }
class TokenizationRequest:
    prompt: str
    tokens: bool
    token_ids: bool
    """
    Request for text tokenization.

    Attributes:
    - prompt: Text string to tokenize
    - tokens: Return text representation of tokens
    - token_ids: Return numeric token IDs
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class TokenizationResponse:
    tokens: Optional[Sequence[str]] = None
    token_ids: Optional[Sequence[int]] = None
    """
    Response from tokenization request.

    Attributes:
    - tokens: Text tokens (if requested)
    - token_ids: Numeric token IDs (if requested)
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> TokenizationResponse:
        """Create response from JSON data."""

def tokenize(
    self,
    request: TokenizationRequest,
    model: str
) -> TokenizationResponse:
    """
    Tokenize text using model-specific tokenizer.

    Parameters:
    - request: Tokenization configuration
    - model: Model name for tokenizer selection

    Returns:
    TokenizationResponse with tokens and/or token IDs
    """
```
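
For instance, a minimal tokenization call might look like the sketch below; it assumes a `Client` authenticated with your API token and the `luminous-extended` model used throughout the usage examples further down.

```python
from aleph_alpha_client import Client, TokenizationRequest

client = Client(token="your-api-token")

# Request both the token strings and the numeric IDs for a short prompt
request = TokenizationRequest(prompt="Hello world!", tokens=True, token_ids=True)
response = client.tokenize(request, model="luminous-extended")

print(response.tokens)     # text pieces of the prompt
print(response.token_ids)  # corresponding numeric IDs
```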

### Token Detokenization

Convert token IDs back to readable text with proper spacing and formatting.

```python { .api }
class DetokenizationRequest:
    token_ids: Sequence[int]
    """
    Request for token detokenization.

    Attributes:
    - token_ids: Sequence of token IDs to convert back to text
    """

    def to_json(self) -> Mapping[str, Any]:
        """Serialize request to JSON format."""

class DetokenizationResponse:
    result: str
    """
    Response from detokenization request.

    Attributes:
    - result: Reconstructed text from token IDs
    """

    @staticmethod
    def from_json(json: Dict[str, Any]) -> DetokenizationResponse:
        """Create response from JSON data."""

def detokenize(
    self,
    request: DetokenizationRequest,
    model: str
) -> DetokenizationResponse:
    """
    Convert token IDs back to text.

    Parameters:
    - request: Detokenization configuration with token IDs
    - model: Model name for tokenizer selection

    Returns:
    DetokenizationResponse with reconstructed text
    """
```
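
As a quick sanity check, the two requests can be chained into a round trip; this sketch reuses the `client` from the snippet above and mirrors the fuller `test_round_trip` helper in the usage examples below.

```python
from aleph_alpha_client import TokenizationRequest, DetokenizationRequest

# Tokenize, then feed the IDs straight back into a detokenization request
tok = client.tokenize(
    TokenizationRequest(prompt="Hello world!", tokens=False, token_ids=True),
    model="luminous-extended",
)
detok = client.detokenize(
    DetokenizationRequest(token_ids=tok.token_ids),
    model="luminous-extended",
)

print(detok.result)  # should closely match the original prompt
```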

### Tokenizer Access

Direct access to model tokenizers for advanced use cases and offline processing.

```python { .api }
def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model.

    Parameters:
    - model: Model name

    Returns:
    Tokenizer object for direct use
    """

async def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model (async).

    Parameters:
    - model: Model name

    Returns:
    Tokenizer object for direct use
    """
```
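
The synchronous and asynchronous variants belong to the blocking and async clients respectively. Once you hold a tokenizer object, you can encode and decode locally without further API calls; the sketch below assumes the returned object exposes a Hugging Face `tokenizers`-style `encode`/`decode` interface, so verify this against the object you actually receive.

```python
# Offline use of the tokenizer returned by client.tokenizer(...).
# Assumption: the object behaves like a Hugging Face `tokenizers.Tokenizer`
# (encode() returning an Encoding with .ids/.tokens, plus decode()).
tokenizer = client.tokenizer("luminous-extended")

encoding = tokenizer.encode("Hello world!")  # no API round trip
print(encoding.tokens)  # token strings
print(encoding.ids)     # token IDs

print(tokenizer.decode(encoding.ids))  # back to text
```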

### Usage Examples

Comprehensive tokenization examples for debugging, analysis, and advanced prompt construction:

```python
from aleph_alpha_client import (
    Client, TokenizationRequest, DetokenizationRequest,
    Tokens, Prompt
)

client = Client(token="your-api-token")

# Basic tokenization - get both tokens and IDs
text = "Hello world! How are you today?"
request = TokenizationRequest(
    prompt=text,
    tokens=True,     # Get text tokens
    token_ids=True   # Get numeric IDs
)

response = client.tokenize(request, model="luminous-extended")

print(f"Original text: {text}")
print(f"Tokens: {response.tokens}")
print(f"Token IDs: {response.token_ids}")
print(f"Number of tokens: {len(response.token_ids) if response.token_ids else 0}")

# Analyze tokenization patterns
def analyze_tokenization(text: str, model: str):
    """Analyze how text gets tokenized."""
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model=model)

    print(f"\nText: '{text}'")
    print("Tokenization analysis:")

    if response.tokens and response.token_ids:
        for token, token_id in zip(response.tokens, response.token_ids):
            print(f" '{token}' -> {token_id}")

    return response

# Test different text patterns
analyze_tokenization("machine learning", "luminous-extended")
analyze_tokenization("MachineLearning", "luminous-extended")
analyze_tokenization("machine_learning", "luminous-extended")
analyze_tokenization("🤖 AI robot", "luminous-extended")

# Token counting for cost estimation
def count_tokens(text: str, model: str) -> int:
    """Count tokens in text for cost estimation."""
    request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    response = client.tokenize(request, model=model)
    return len(response.token_ids) if response.token_ids else 0

texts = [
    "Short text",
    "This is a longer text that will have more tokens than the short one above.",
    "Very long text with multiple sentences. Each sentence adds tokens. More sentences mean more tokens and higher costs for API calls."
]

for text in texts:
    token_count = count_tokens(text, "luminous-extended")
    print(f"'{text[:30]}...': {token_count} tokens")

# Detokenization - convert tokens back to text
token_ids = [1234, 5678, 9012, 3456]  # Example token IDs
detok_request = DetokenizationRequest(token_ids=token_ids)
detok_response = client.detokenize(detok_request, model="luminous-extended")

print(f"Token IDs: {token_ids}")
print(f"Detokenized text: '{detok_response.result}'")

# Round-trip testing (tokenize then detokenize)
def test_round_trip(text: str, model: str):
    """Test tokenization -> detokenization round trip."""
    # Tokenize
    tok_request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        print("No token IDs returned")
        return

    # Detokenize
    detok_request = DetokenizationRequest(token_ids=tok_response.token_ids)
    detok_response = client.detokenize(detok_request, model=model)

    print(f"Original: '{text}'")
    print(f"Round-trip: '{detok_response.result}'")
    print(f"Match: {text == detok_response.result}")
    print()

test_round_trip("Hello world!", "luminous-extended")
test_round_trip("Python programming", "luminous-extended")

# Advanced: Build prompts with token-level control
def build_token_controlled_prompt(text: str, model: str, emphasis_tokens: list[int]):
    """Build prompt with token-level attention control."""
    # First tokenize to get token IDs
    tok_request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)

    if not tok_response.token_ids:
        return None

    # Create token controls for specified positions
    from aleph_alpha_client import TokenControl
    controls = [
        TokenControl(pos=pos, factor=2.0)
        for pos in emphasis_tokens
        if pos < len(tok_response.token_ids)
    ]

    # Build tokens object with controls
    tokens = Tokens(
        tokens=tok_response.token_ids,
        controls=controls
    )

    return Prompt([tokens])

# Emphasize tokens at positions 2 and 4
controlled_prompt = build_token_controlled_prompt(
    "Machine learning is fascinating technology",
    "luminous-extended",
    emphasis_tokens=[2, 4]
)

if controlled_prompt:
    print("Created prompt with token-level attention control")

# Multi-language tokenization comparison
multilingual_texts = {
    "English": "Hello, how are you?",
    "German": "Hallo, wie geht es dir?",
    "French": "Bonjour, comment allez-vous?",
    "Spanish": "Hola, ¿cómo estás?",
    "Japanese": "こんにちは、元気ですか?"
}

print("Multi-language tokenization comparison:")
for language, text in multilingual_texts.items():
    token_count = count_tokens(text, "luminous-extended")
    print(f"{language:10}: {token_count:2d} tokens - '{text}'")

# Direct tokenizer usage (if available)
try:
    tokenizer = client.tokenizer("luminous-extended")
    print(f"Got tokenizer: {tokenizer}")
    # Use tokenizer directly for offline processing
except Exception as e:
    print(f"Direct tokenizer access not available: {e}")

# Special token analysis
special_texts = [
    "<start>",         # Special tokens
    "[MASK]",          # Mask tokens
    "\n\n\n",          # Whitespace
    "word word",       # Repeated words
    "123456",          # Numbers
    "user@email.com"   # Email
]

print("\nSpecial token analysis:")
for text in special_texts:
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model="luminous-extended")

    token_count = len(response.token_ids) if response.token_ids else 0
    tokens_str = str(response.tokens) if response.tokens else "None"

    print(f"'{text:15}' -> {token_count:2d} tokens: {tokens_str}")
```
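
The same requests also work with the asynchronous client; a minimal sketch, assuming `AsyncClient` can be used as an async context manager and mirrors the synchronous `tokenize` API shown above:

```python
import asyncio

from aleph_alpha_client import AsyncClient, TokenizationRequest

async def main() -> None:
    # Assumption: AsyncClient mirrors the synchronous Client API shown above.
    async with AsyncClient(token="your-api-token") as client:
        request = TokenizationRequest(prompt="Hello world!", tokens=True, token_ids=True)
        response = await client.tokenize(request, model="luminous-extended")
        print(response.tokens, response.token_ids)

asyncio.run(main())
```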