Python client to interact with Aleph Alpha API endpoints
Quality: Pending — does it follow best practices?
Impact: Pending — no eval scenarios have been run.
Convert between text and tokens, with support for different tokenization strategies and detokenization. Provides low-level access to model tokenization for debugging, analysis, and advanced prompt construction.
Convert text strings to token representations with flexible output options.
class TokenizationRequest:
    """
    Request for text tokenization.

    Attributes:
    - prompt: Text string to tokenize
    - tokens: Return text representation of tokens
    - token_ids: Return numeric token IDs
    """

    prompt: str
    tokens: bool
    token_ids: bool

    def __init__(self, prompt: str, tokens: bool = False, token_ids: bool = False) -> None:
        # Defaults mirror the documented flags: nothing is returned unless
        # explicitly requested. NOTE(review): confirm server-side defaults match.
        self.prompt = prompt
        self.tokens = tokens
        self.token_ids = token_ids

    def to_json(self) -> Mapping[str, Any]:
        """Serialize the request to a JSON-compatible mapping."""
        return {
            "prompt": self.prompt,
            "tokens": self.tokens,
            "token_ids": self.token_ids,
        }
class TokenizationResponse:
    """
    Response from a tokenization request.

    Attributes:
    - tokens: Text tokens (if requested)
    - token_ids: Numeric token IDs (if requested)
    """

    tokens: Optional[Sequence[str]] = None
    token_ids: Optional[Sequence[int]] = None

    def __init__(
        self,
        tokens: Optional[Sequence[str]] = None,
        token_ids: Optional[Sequence[int]] = None,
    ) -> None:
        self.tokens = tokens
        self.token_ids = token_ids

    @staticmethod
    def from_json(json: Dict[str, Any]) -> "TokenizationResponse":
        """Create a response from JSON data; absent fields stay None."""
        return TokenizationResponse(
            tokens=json.get("tokens"),
            token_ids=json.get("token_ids"),
        )
def tokenize(
    self,
    request: TokenizationRequest,
    model: str
) -> TokenizationResponse:
    """
    Tokenize text using the model-specific tokenizer.

    Parameters:
    - request: Tokenization configuration
    - model: Model name for tokenizer selection

    Returns:
        TokenizationResponse with tokens and/or token IDs
    """

# Convert token IDs back to readable text with proper spacing and formatting.
class DetokenizationRequest:
    """
    Request for token detokenization.

    Attributes:
    - token_ids: Sequence of token IDs to convert back to text
    """

    token_ids: Sequence[int]

    def __init__(self, token_ids: Sequence[int]) -> None:
        self.token_ids = token_ids

    def to_json(self) -> Mapping[str, Any]:
        """Serialize the request to a JSON-compatible mapping."""
        # Coerce to list so tuples and other sequences serialize uniformly.
        return {"token_ids": list(self.token_ids)}
class DetokenizationResponse:
    """
    Response from a detokenization request.

    Attributes:
    - result: Reconstructed text from token IDs
    """

    result: str

    def __init__(self, result: str) -> None:
        self.result = result

    @staticmethod
    def from_json(json: Dict[str, Any]) -> "DetokenizationResponse":
        """Create a response from JSON data.

        Raises KeyError if the payload lacks a "result" field.
        """
        return DetokenizationResponse(result=json["result"])
def detokenize(
    self,
    request: DetokenizationRequest,
    model: str
) -> DetokenizationResponse:
    """
    Convert token IDs back to text.

    Parameters:
    - request: Detokenization configuration with token IDs
    - model: Model name for tokenizer selection

    Returns:
        DetokenizationResponse with reconstructed text
    """

# Direct access to model tokenizers for advanced use cases and offline processing.
def tokenizer(self, model: str) -> Tokenizer:
    """
    Fetch the tokenizer associated with *model* for local, direct use.

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """
async def tokenizer(self, model: str) -> Tokenizer:
    """
    Get tokenizer instance for specified model (async).

    Parameters:
    - model: Model name

    Returns:
        Tokenizer object for direct use
    """

# Comprehensive tokenization examples for debugging, analysis, and advanced
# prompt construction:
# Example walkthrough: tokenize a string and inspect the result.
from aleph_alpha_client import (
    Client, TokenizationRequest, DetokenizationRequest,
    Tokens, Prompt
)

# NOTE(review): placeholder credential — replace with a real API token before running.
client = Client(token="your-api-token")

# Basic tokenization - get both tokens and IDs
text = "Hello world! How are you today?"
request = TokenizationRequest(
    prompt=text,
    tokens=True, # Get text tokens
    token_ids=True # Get numeric IDs
)
response = client.tokenize(request, model="luminous-extended")
print(f"Original text: {text}")
print(f"Tokens: {response.tokens}")
print(f"Token IDs: {response.token_ids}")
# token_ids may be None when not requested, so guard before calling len().
print(f"Number of tokens: {len(response.token_ids) if response.token_ids else 0}")
# Analyze tokenization patterns
def analyze_tokenization(text: str, model: str):
    """Analyze how text gets tokenized.

    Prints each token alongside its numeric ID and returns the raw
    TokenizationResponse for further inspection.
    """
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model=model)
    print(f"\nText: '{text}'")
    # Plain string: the original used an f-string with no placeholders (F541).
    print("Tokenization analysis:")
    if response.tokens and response.token_ids:
        for token, token_id in zip(response.tokens, response.token_ids):
            print(f"  '{token}' -> {token_id}")
    return response
# Test different text patterns
analyze_tokenization("machine learning", "luminous-extended")
analyze_tokenization("MachineLearning", "luminous-extended")   # CamelCase vs. spaced words
analyze_tokenization("machine_learning", "luminous-extended")  # snake_case variant
analyze_tokenization("🤖 AI robot", "luminous-extended")       # emoji / non-ASCII input
# Token counting for cost estimation
def count_tokens(text: str, model: str) -> int:
    """Return the number of tokens *text* produces under *model*'s tokenizer.

    Useful for estimating API costs; requests only the numeric IDs.
    """
    req = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    ids = client.tokenize(req, model=model).token_ids
    return len(ids) if ids else 0
# Compare token counts across increasingly long inputs.
texts = [
    "Short text",
    "This is a longer text that will have more tokens than the short one above.",
    "Very long text with multiple sentences. Each sentence adds tokens. More sentences mean more tokens and higher costs for API calls."
]
for text in texts:
    token_count = count_tokens(text, "luminous-extended")
    print(f"'{text[:30]}...': {token_count} tokens")

# Detokenization - convert tokens back to text
# NOTE(review): these IDs are illustrative; real IDs depend on the model's vocabulary.
token_ids = [1234, 5678, 9012, 3456] # Example token IDs
detok_request = DetokenizationRequest(token_ids=token_ids)
detok_response = client.detokenize(detok_request, model="luminous-extended")
print(f"Token IDs: {token_ids}")
print(f"Detokenized text: '{detok_response.result}'")
# Round-trip testing (tokenize then detokenize)
def test_round_trip(text: str, model: str):
    """Test tokenization -> detokenization round trip.

    Prints the original and reconstructed text plus whether they match
    exactly; returns None.
    """
    # Tokenize
    tok_request = TokenizationRequest(prompt=text, tokens=False, token_ids=True)
    tok_response = client.tokenize(tok_request, model=model)
    if not tok_response.token_ids:
        print("No token IDs returned")
        return
    # Detokenize
    detok_request = DetokenizationRequest(token_ids=tok_response.token_ids)
    detok_response = client.detokenize(detok_request, model=model)
    print(f"Original: '{text}'")
    print(f"Round-trip: '{detok_response.result}'")
    # NOTE(review): round-trips are not guaranteed byte-exact for every tokenizer.
    print(f"Match: {text == detok_response.result}")
    print()

test_round_trip("Hello world!", "luminous-extended")
test_round_trip("Python programming", "luminous-extended")
# Advanced: Build prompts with token-level control
def build_token_controlled_prompt(text: str, model: str, emphasis_tokens: list[int]):
    """Build a prompt whose listed token positions get boosted attention.

    Returns a Prompt wrapping the tokenized text, or None when tokenization
    yields no token IDs. Positions beyond the token count are ignored.
    """
    # Tokenize first so we know which positions exist.
    req = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    resp = client.tokenize(req, model=model)
    ids = resp.token_ids
    if not ids:
        return None
    from aleph_alpha_client import TokenControl
    # Attach a 2x attention factor to each requested in-range position.
    limit = len(ids)
    token_controls = []
    for position in emphasis_tokens:
        if position < limit:
            token_controls.append(TokenControl(pos=position, factor=2.0))
    controlled = Tokens(
        tokens=ids,
        controls=token_controls
    )
    return Prompt([controlled])
# Emphasize tokens at positions 2 and 4
controlled_prompt = build_token_controlled_prompt(
    "Machine learning is fascinating technology",
    "luminous-extended",
    emphasis_tokens=[2, 4]
)
# build_token_controlled_prompt returns None when no token IDs came back.
if controlled_prompt:
    print("Created prompt with token-level attention control")

# Multi-language tokenization comparison
multilingual_texts = {
    "English": "Hello, how are you?",
    "German": "Hallo, wie geht es dir?",
    "French": "Bonjour, comment allez-vous?",
    "Spanish": "Hola, ¿cómo estás?",
    "Japanese": "こんにちは、元気ですか?"
}
print("Multi-language tokenization comparison:")
for language, text in multilingual_texts.items():
    token_count = count_tokens(text, "luminous-extended")
    print(f"{language:10}: {token_count:2d} tokens - '{text}'")
# Direct tokenizer usage (if available)
try:
    tokenizer = client.tokenizer("luminous-extended")
    print(f"Got tokenizer: {tokenizer}")
    # Use tokenizer directly for offline processing
except Exception as e:
    # Best-effort probe: report and continue rather than abort the examples.
    print(f"Direct tokenizer access not available: {e}")

# Special token analysis
special_texts = [
    "<start>", # Special tokens
    "[MASK]", # Mask tokens
    "\n\n\n", # Whitespace
    "word word", # Repeated words
    "123456", # Numbers
    "user@email.com" # Email
]
print("\nSpecial token analysis:")
for text in special_texts:
    request = TokenizationRequest(prompt=text, tokens=True, token_ids=True)
    response = client.tokenize(request, model="luminous-extended")
    token_count = len(response.token_ids) if response.token_ids else 0
    tokens_str = str(response.tokens) if response.tokens else "None"
    print(f"'{text:15}' -> {token_count:2d} tokens: {tokens_str}")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-aleph-alpha-client