Python client to interact with Aleph Alpha API endpoints
Quality: Pending — does it follow best practices?
Impact: Pending — no eval scenarios have been run.
Convert between text and tokens, with support for different tokenization strategies and detokenization. Provides low-level access to model tokenization for debugging, analysis, and advanced prompt construction.
Convert text strings to token representations with flexible output options.
class TokenizationRequest:
    """
    Request for text tokenization.

    Attributes:
    - prompt: Text string to tokenize
    - tokens: Return text representation of tokens
    - token_ids: Return numeric token IDs
    """

    prompt: str
    tokens: bool
    token_ids: bool

    def __init__(self, prompt: str, tokens: bool = False, token_ids: bool = False) -> None:
        # Defaults mirror the documented flags: nothing is returned unless
        # explicitly requested. NOTE(review): confirm server-side defaults match.
        self.prompt = prompt
        self.tokens = tokens
        self.token_ids = token_ids

    def to_json(self) -> Mapping[str, Any]:
        """Serialize the request to a JSON-compatible mapping."""
        return {
            "prompt": self.prompt,
            "tokens": self.tokens,
            "token_ids": self.token_ids,
        }
class TokenizationResponse:
    """
    Response from a tokenization request.

    Attributes:
    - tokens: Text tokens (if requested)
    - token_ids: Numeric token IDs (if requested)
    """

    tokens: Optional[Sequence[str]] = None
    token_ids: Optional[Sequence[int]] = None

    def __init__(
        self,
        tokens: Optional[Sequence[str]] = None,
        token_ids: Optional[Sequence[int]] = None,
    ) -> None:
        self.tokens = tokens
        self.token_ids = token_ids

    @staticmethod
    def from_json(json: Dict[str, Any]) -> "TokenizationResponse":
        """Create a response from JSON data; absent fields stay None."""
        return TokenizationResponse(
            tokens=json.get("tokens"),
            token_ids=json.get("token_ids"),
        )
def tokenize(
    self,
    request: TokenizationRequest,
    model: str
) -> TokenizationResponse:
    """
    Tokenize text using the model-specific tokenizer.

    Parameters:
    - request: Tokenization configuration
    - model: Model name for tokenizer selection

    Returns:
        TokenizationResponse with tokens and/or token IDs
    """

# Convert token IDs back to readable text with proper spacing and formatting.
class DetokenizationRequest:
    """
    Request for token detokenization.

    Attributes:
    - token_ids: Sequence of token IDs to convert back to text
    """

    token_ids: Sequence[int]

    def __init__(self, token_ids: Sequence[int]) -> None:
        self.token_ids = token_ids

    def to_json(self) -> Mapping[str, Any]:
        """Serialize the request to a JSON-compatible mapping."""
        # Coerce to list so tuples and other sequences serialize uniformly.
        return {"token_ids": list(self.token_ids)}
class DetokenizationResponse:
    """
    Response from a detokenization request.

    Attributes:
    - result: Reconstructed text from token IDs
    """

    result: str

    def __init__(self, result: str) -> None:
        self.result = result

    @staticmethod
    def from_json(json: Dict[str, Any]) -> "DetokenizationResponse":
        """Create a response from JSON data.

        Raises KeyError if the payload lacks a "result" field.
        """
        return DetokenizationResponse(result=json["result"])
def detokenize(
    self,
    request: DetokenizationRequest,
    model: str
) -> DetokenizationResponse:
    """
    Convert token IDs back to text.

    Parameters:
    - request: Detokenization configuration with token IDs
    - model: Model name for tokenizer selection

    Returns:
        DetokenizationResponse with reconstructed text
    """

# Direct access to model tokenizers for advanced use cases and offline processing.
def tokenizer(self, model: str) -> Tokenizer:
    """
    Fetch the tokenizer associated with *model* for local, direct use.

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """
async def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model (async).

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """

# Comprehensive tokenization examples for debugging, analysis, and advanced
# prompt construction:
# Example walkthrough: tokenize a string and inspect the result.
from aleph_alpha_client import (
    Client, TokenizationRequest, DetokenizationRequest,
    Tokens, Prompt
)

# NOTE(review): placeholder credential — replace with a real API token before running.
client = Client(token="your-api-token")

# Basic tokenization - get both tokens and IDs
text = "Hello world! How are you today?"
request = TokenizationRequest(
    prompt=text,
    tokens=True, # Get text tokens
    token_ids=True # Get numeric IDs
)
response = client.tokenize(request, model="luminous-extended")
print(f"Original text: {text}")
print(f"Tokens: {response.tokens}")
print(f"Token IDs: {response.token_ids}")
# token_ids may be None when not requested, so guard before calling len().
print(f"Number of tokens: {len(response.token_ids) if response.token_ids else 0}")
# Analyze tokenization patterns
def analyze_tokenization(text: str, model: str):
    """Analyze how text gets tokenized.

    Prints each token alongside its numeric ID and returns the raw
    TokenizationResponse for further inspection.
    """
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model=model)
    print(f"\nText: '{text}'")
    # Plain string: the original used an f-string with no placeholders (F541).
    print("Tokenization analysis:")
    if response.tokens and response.token_ids:
        for token, token_id in zip(response.tokens, response.token_ids):
            print(f"  '{token}' -> {token_id}")
    return response
# Test different text patterns
analyze_tokenization("machine learning", "luminous-extended")
analyze_tokenization("MachineLearning", "luminous-extended")   # CamelCase vs. spaced words
analyze_tokenization("machine_learning", "luminous-extended")  # snake_case variant
analyze_tokenization("🤖 AI robot", "luminous-extended")       # emoji / non-ASCII input
# Token counting for cost estimation
def count_tokens(text: str, model: str) -> int:
    """Return the number of tokens *text* produces under *model*'s tokenizer.

    Useful for estimating API costs; requests only the numeric IDs.
    """
    req = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    ids = client.tokenize(req, model=model).token_ids
    return len(ids) if ids else 0
# Compare token counts across increasingly long inputs.
texts = [
    "Short text",
    "This is a longer text that will have more tokens than the short one above.",
    "Very long text with multiple sentences. Each sentence adds tokens. More sentences mean more tokens and higher costs for API calls."
]
for text in texts:
    token_count = count_tokens(text, "luminous-extended")
    print(f"'{text[:30]}...': {token_count} tokens")

# Detokenization - convert tokens back to text
# NOTE(review): these IDs are illustrative; real IDs depend on the model's vocabulary.
token_ids = [1234, 5678, 9012, 3456] # Example token IDs
detok_request = DetokenizationRequest(token_ids=token_ids)
detok_response = client.detokenize(detok_request, model="luminous-extended")
print(f"Token IDs: {token_ids}")
print(f"Detokenized text: '{detok_response.result}'")
# Round-trip testing (tokenize then detokenize)
def test_round_trip(text: str, model: str):
    """Test tokenization -> detokenization round trip.

    Prints the original and reconstructed text plus whether they match
    exactly; returns None.
    """
    # Tokenize
    tok_request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)
    if not tok_response.token_ids:
        print("No token IDs returned")
        return
    # Detokenize
    detok_request = DetokenizationRequest(token_ids=tok_response.token_ids)
    detok_response = client.detokenize(detok_request, model=model)
    print(f"Original: '{text}'")
    print(f"Round-trip: '{detok_response.result}'")
    # NOTE(review): round-trips are not guaranteed byte-exact for every tokenizer.
    print(f"Match: {text == detok_response.result}")
    print()

test_round_trip("Hello world!", "luminous-extended")
test_round_trip("Python programming", "luminous-extended")
# Advanced: Build prompts with token-level control
def build_token_controlled_prompt(text: str, model: str, emphasis_tokens: list[int]):
    """Build a prompt whose listed token positions get boosted attention.

    Returns a Prompt wrapping the tokenized text, or None when tokenization
    yields no token IDs. Positions beyond the token count are ignored.
    """
    # Tokenize first so we know which positions exist.
    req = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    resp = client.tokenize(req, model=model)
    ids = resp.token_ids
    if not ids:
        return None
    from aleph_alpha_client import TokenControl
    # Attach a 2x attention factor to each requested in-range position.
    limit = len(ids)
    token_controls = []
    for position in emphasis_tokens:
        if position < limit:
            token_controls.append(TokenControl(pos=position, factor=2.0))
    controlled = Tokens(
        tokens=ids,
        controls=token_controls
    )
    return Prompt([controlled])
# Emphasize tokens at positions 2 and 4
controlled_prompt = build_token_controlled_prompt(
    "Machine learning is fascinating technology",
    "luminous-extended",
    emphasis_tokens=[2, 4]
)
# build_token_controlled_prompt returns None when no token IDs came back.
if controlled_prompt:
    print("Created prompt with token-level attention control")

# Multi-language tokenization comparison
multilingual_texts = {
    "English": "Hello, how are you?",
    "German": "Hallo, wie geht es dir?",
    "French": "Bonjour, comment allez-vous?",
    "Spanish": "Hola, ¿cómo estás?",
    "Japanese": "こんにちは、元気ですか?"
}
print("Multi-language tokenization comparison:")
for language, text in multilingual_texts.items():
    token_count = count_tokens(text, "luminous-extended")
    print(f"{language:10}: {token_count:2d} tokens - '{text}'")
# Direct tokenizer usage (if available)
try:
    tokenizer = client.tokenizer("luminous-extended")
    print(f"Got tokenizer: {tokenizer}")
    # Use tokenizer directly for offline processing
except Exception as e:
    # Best-effort probe: report and continue rather than abort the examples.
    print(f"Direct tokenizer access not available: {e}")

# Special token analysis
special_texts = [
    "<start>", # Special tokens
    "[MASK]", # Mask tokens
    "\n\n\n", # Whitespace
    "word word", # Repeated words
    "123456", # Numbers
    "user@email.com" # Email
]
print("\nSpecial token analysis:")
for text in special_texts:
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model="luminous-extended")
    token_count = len(response.token_ids) if response.token_ids else 0
    tokens_str = str(response.tokens) if response.tokens else "None"
    print(f"'{text:15}' -> {token_count:2d} tokens: {tokens_str}")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-aleph-alpha-client