These modules provide advanced text processing capabilities, including tokenization, sentence boundary detection, format preservation during translation, and byte pair encoding (BPE) support. Together they form the foundation for high-quality neural machine translation with proper text segmentation and preserved formatting.
Abstract tokenization interface and concrete implementations for neural machine translation models, supporting both SentencePiece and Byte Pair Encoding (BPE) tokenization strategies.
class Tokenizer:
    """Abstract base class for text tokenization."""

    def encode(self, sentence: str) -> List[str]:
        """
        Encode sentence into tokens.

        Args:
            sentence (str): Input sentence to tokenize

        Returns:
            List[str]: List of token strings
        """

    def decode(self, tokens: List[str]) -> str:
        """
        Decode tokens back into sentence.

        Args:
            tokens (List[str]): List of token strings

        Returns:
            str: Reconstructed sentence
        """

Implementation using Google's SentencePiece library for subword tokenization.
class SentencePieceTokenizer(Tokenizer):
    def __init__(self, model_file: Path):
        """
        Initialize SentencePiece tokenizer.

        Args:
            model_file (Path): Path to SentencePiece model file
        """

    def lazy_processor(self) -> spm.SentencePieceProcessor:
        """
        Get lazy-loaded SentencePiece processor.

        Returns:
            spm.SentencePieceProcessor: SentencePiece processor instance
        """

    def encode(self, sentence: str) -> List[str]:
        """Encode sentence using SentencePiece model."""

    def decode(self, tokens: List[str]) -> str:
        """Decode tokens using SentencePiece model."""

Implementation using Byte Pair Encoding for subword tokenization.
class BPETokenizer(Tokenizer):
    def __init__(self, model_file: Path, from_code: str, to_code: str):
        """
        Initialize BPE tokenizer.

        Args:
            model_file (Path): Path to BPE model file
            from_code (str): Source language code
            to_code (str): Target language code
        """

    def lazy_load(self):
        """Lazy load BPE tokenizer components."""

    def encode(self, sentence: str) -> List[str]:
        """Encode sentence using BPE tokenization."""

    def decode(self, tokens: List[str]) -> str:
        """Decode BPE tokens back to sentence."""

Intelligent sentence segmentation for improved translation quality, supporting both neural model-based and language model-based boundary detection.
def get_sbd_package() -> Package | None:
    """
    Get sentence boundary detection package if available.

    Returns:
        Package | None: SBD package or None if not installed
    """

def generate_fewshot_sbd_prompt(input_text: str, sentence_guess_length: int = 150) -> str:
    """
    Generate few-shot prompt for sentence boundary detection.

    Args:
        input_text (str): Text to analyze for sentence boundaries
        sentence_guess_length (int): Estimated sentence length for segmentation

    Returns:
        str: Formatted prompt for language model
    """

def parse_fewshot_response(response_text: str) -> str | None:
    """
    Parse language model response for sentence boundary detection.

    Args:
        response_text (str): Raw response from language model

    Returns:
        str | None: Parsed sentence boundary information or None if parsing fails
    """

def process_seq2seq_sbd(input_text: str, sbd_translated_guess: str) -> int:
    """
    Process sequence-to-sequence sentence boundary detection.

    Args:
        input_text (str): Original input text
        sbd_translated_guess (str): Translated boundary guess

    Returns:
        int: Detected sentence boundary position
    """

def detect_sentence(
    input_text: str,
    sbd_translation,
    sentence_guess_length: int = 150
) -> int:
    """
    Detect sentence boundaries in input text.

    Args:
        input_text (str): Text to analyze
        sbd_translation: Translation object for boundary detection
        sentence_guess_length (int): Estimated sentence length

    Returns:
        int: Position of sentence boundary
    """

DETECT_SENTENCE_BOUNDARIES_TOKEN: str  # Token for boundary detection
SENTENCE_BOUNDARY_TOKEN: str  # Token marking sentence boundaries
FEWSHOT_BOUNDARY_TOKEN: str  # Token for few-shot boundary detection

Tag-based system for preserving text formatting during translation, enabling translation of structured content while maintaining original formatting.
class ITag:
    """Abstract interface for tag tree structures."""

    translateable: bool  # Whether this tag should be translated

    def text(self) -> str:
        """
        Get combined text content of all children.

        Returns:
            str: Combined text content
        """

class Tag(ITag):
    def __init__(self, children: ITag | str, translateable: bool = True):
        """
        Initialize tag with children.

        Args:
            children (ITag | str): Child tags or text content
            translateable (bool): Whether content should be translated
        """

    children: ITag | str  # Child content (tags or text)

    def text(self) -> str:
        """Get combined text from all children."""

def depth(tag: ITag | str) -> int:
"""
Calculate depth of tag tree structure.
Args:
tag (ITag | str): Tag tree or text
Returns:
int: Maximum depth of tag tree
"""
def translate_preserve_formatting(underlying_translation: ITranslation, input_text: str) -> str:
    """
    Translate text while preserving original formatting.

    Args:
        underlying_translation (ITranslation): Translation to use
        input_text (str): Text with formatting to translate

    Returns:
        str: Translated text with preserved formatting
    """

def inject_tags_inference(underlying_translation: ITranslation, tag: ITag) -> ITag | None:
    """
    Translate with tag injection for format preservation.

    Args:
        underlying_translation (ITranslation): Translation to use
        tag (ITag): Tag structure to translate

    Returns:
        ITag | None: Translated tag structure or None if failed
    """

def translate_tags(underlying_translation: ITranslation, tag: ITag | str) -> ITag | str:
    """
    Translate tag tree structure.

    Args:
        underlying_translation (ITranslation): Translation to use
        tag (ITag | str): Tag tree or text to translate

    Returns:
        ITag | str: Translated tag tree or text
    """

Comprehensive BPE implementation for advanced subword tokenization, supporting custom vocabularies and glossaries.
class BPE:
    def __init__(
        self,
        codes,
        merges: int = -1,
        separator: str = '@@',
        vocab=None,
        glossaries=None
    ):
        """
        Initialize BPE encoder.

        Args:
            codes: BPE merge codes
            merges (int): Number of merge operations (-1 for all)
            separator (str): Subword separator token
            vocab: Vocabulary for lookup
            glossaries: Protected terms that shouldn't be split
        """

    def process_line(self, line):
        """
        Process single line of text with BPE.

        Args:
            line: Input line to process

        Returns:
            Processed line with BPE applied
        """

    def segment(self, sentence):
        """
        Segment sentence using BPE.

        Args:
            sentence: Input sentence

        Returns:
            BPE-segmented sentence
        """
    def segment_tokens(self, tokens):
        """
        Segment list of tokens using BPE.

        Args:
            tokens: List of tokens to segment

        Returns:
            List of BPE-segmented tokens
        """

def create_parser(subparsers=None):
    """
    Create argument parser for BPE command-line interface.

    Args:
        subparsers: Optional subparser group

    Returns:
        Argument parser for BPE operations
    """

def get_pairs(word):
    """
    Get all symbol pairs in a word.

    Args:
        word: Input word to analyze

    Returns:
        Set of symbol pairs found in word
    """
def encode(
    orig,
    bpe_codes,
    bpe_codes_reverse,
    vocab,
    separator,
    version,
    cache,
    glossaries=None
):
    """
    Encode word using Byte Pair Encoding.

    Args:
        orig: Original word to encode
        bpe_codes: BPE merge codes
        bpe_codes_reverse: Reverse BPE codes lookup
        vocab: Vocabulary for encoding
        separator: Subword separator
        version: BPE version
        cache: Encoding cache
        glossaries: Protected glossary terms

    Returns:
        BPE-encoded word
    """
def recursive_split(segment, bpe_codes, vocab, separator, final=False):
    """
    Recursively split segments using BPE codes.

    Args:
        segment: Text segment to split
        bpe_codes: BPE merge codes
        vocab: Vocabulary lookup
        separator: Subword separator
        final (bool): Whether this is final splitting pass

    Returns:
        Recursively split segments
    """

def check_vocab_and_split(orig, bpe_codes, vocab, separator):
    """
    Check vocabulary and split word if needed.

    Args:
        orig: Original word
        bpe_codes: BPE codes
        vocab: Vocabulary
        separator: Separator token

    Returns:
        Vocabulary-checked and split word
    """
def read_vocabulary(vocab_file, threshold):
    """
    Read vocabulary from file with frequency threshold.

    Args:
        vocab_file: Path to vocabulary file
        threshold: Minimum frequency threshold

    Returns:
        Vocabulary dictionary with frequencies
    """
def isolate_glossary(word, glossary):
    """
    Isolate glossary terms from word for protection during BPE.

    Args:
        word: Input word
        glossary: Glossary terms to protect

    Returns:
        Word with protected glossary terms
    """

Usage example: tokenizing and detokenizing text with the SentencePiece and BPE tokenizers.

from argostranslate.tokenizer import SentencePieceTokenizer, BPETokenizer
from pathlib import Path

# SentencePiece tokenization
sp_model = Path("/path/to/model.spm")
if sp_model.exists():
    tokenizer = SentencePieceTokenizer(sp_model)

    # Tokenize text
    tokens = tokenizer.encode("Hello, how are you?")
    print(f"Tokens: {tokens}")

    # Reconstruct text
    reconstructed = tokenizer.decode(tokens)
    print(f"Reconstructed: {reconstructed}")

# BPE tokenization
bpe_model = Path("/path/to/bpe.model")
if bpe_model.exists():
    bpe_tokenizer = BPETokenizer(bpe_model, "en", "es")
    bpe_tokenizer.lazy_load()
    tokens = bpe_tokenizer.encode("Machine learning")
    print(f"BPE tokens: {tokens}")

Usage example: detecting sentence boundaries in long input text.

from argostranslate.sbd import (
    get_sbd_package,
    detect_sentence,
    generate_fewshot_sbd_prompt,
    parse_fewshot_response
)

# Get SBD package if available
sbd_package = get_sbd_package()
if sbd_package:
    # Use with a translation object for boundary detection
    # (sbd_translation is assumed to be a translation built from the SBD package)
    long_text = "This is sentence one. This is sentence two. This continues..."

    # Detect sentence boundary
    boundary = detect_sentence(long_text, sbd_translation, sentence_guess_length=100)
    first_sentence = long_text[:boundary]
    remaining_text = long_text[boundary:]
    print(f"First sentence: {first_sentence}")
    print(f"Remaining: {remaining_text}")

# Generate few-shot prompt for a language model
prompt = generate_fewshot_sbd_prompt("Hello world. How are you today?", 150)
print(f"SBD Prompt: {prompt}")

Usage example: translating formatted text while preserving its markup.

from argostranslate.tags import (
    Tag,
    translate_preserve_formatting,
    translate_tags,
    depth
)
from argostranslate import translate

# Text with formatting to translate
formatted_text = "<b>Hello</b> <i>world</i>!"

# Use format preservation during translation
translation = translate.get_translation_from_codes("en", "es")
if translation:
    preserved_translation = translate_preserve_formatting(
        translation,
        formatted_text
    )
    print(f"Preserved formatting: {preserved_translation}")

# Work with tag trees directly
tag_tree = Tag("Hello world", translateable=True)
print(f"Tag depth: {depth(tag_tree)}")
print(f"Tag text: {tag_tree.text()}")

# Translate tag structure
if translation:
    translated_tags = translate_tags(translation, tag_tree)
    print(f"Translated tags: {translated_tags}")

Usage example: applying BPE segmentation with custom settings.

from argostranslate.apply_bpe import BPE, encode, get_pairs, read_vocabulary
from pathlib import Path

# Initialize BPE with custom settings
# (bpe_codes and vocabulary are placeholders for previously loaded BPE codes and vocabulary)
bpe = BPE(
    codes=bpe_codes,
    merges=10000,
    separator='@@',
    vocab=vocabulary,
    glossaries=['proper_nouns', 'technical_terms']
)

# Process text with BPE
text = "Neural machine translation"
segmented = bpe.segment(text)
print(f"BPE segmented: {segmented}")

# Work with tokens
tokens = ["machine", "learning", "algorithms"]
segmented_tokens = bpe.segment_tokens(tokens)
print(f"Segmented tokens: {segmented_tokens}")

# Advanced BPE operations
word_pairs = get_pairs("translation")
print(f"Symbol pairs: {word_pairs}")

# Read vocabulary with threshold
vocab_file = Path("/path/to/vocab.txt")
if vocab_file.exists():
    vocabulary = read_vocabulary(vocab_file, threshold=50)
    print(f"Vocabulary size: {len(vocabulary)}")

Usage example: a full translation pipeline with format preservation.

from argostranslate import translate
from argostranslate.tokenizer import SentencePieceTokenizer
from argostranslate.sbd import detect_sentence
from argostranslate.tags import translate_preserve_formatting

def advanced_translate(text: str, from_code: str, to_code: str) -> str:
    """Advanced translation with full text processing pipeline."""
    # Get translation
    translation = translate.get_translation_from_codes(from_code, to_code)
    if not translation:
        return text

    # Preserve formatting during translation
    result = translate_preserve_formatting(translation, text)
    return result

# Use advanced translation
formatted_text = "<p>This is <strong>important</strong> text.</p>"
result = advanced_translate(formatted_text, "en", "fr")
print(f"Advanced translation: {result}")

Usage example: implementing a custom tokenizer on top of the abstract Tokenizer interface.

from argostranslate.tokenizer import Tokenizer
from typing import List

class CustomTokenizer(Tokenizer):
    """Custom tokenizer implementation."""

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        # Initialize custom tokenization logic

    def encode(self, sentence: str) -> List[str]:
        """Custom encoding logic."""
        # Implement custom tokenization
        return sentence.split()  # Simple word-level tokenization

    def decode(self, tokens: List[str]) -> str:
        """Custom decoding logic."""
        return " ".join(tokens)

# Use custom tokenizer
custom_tokenizer = CustomTokenizer(vocab_size=5000)
tokens = custom_tokenizer.encode("Custom tokenization example")
reconstructed = custom_tokenizer.decode(tokens)
print(f"Custom tokens: {tokens}")
print(f"Reconstructed: {reconstructed}")

The text processing modules integrate seamlessly with the main translation pipeline: they are used automatically by translation packages when appropriate, and can also be accessed directly for custom translation workflows and advanced integration scenarios.
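As a rough illustration of such a direct workflow, the sketch below chops a long document into sentences with detect_sentence and translates each piece with format preservation. It assumes an installed sentence boundary detection package and takes hypothetical translation and sbd_translation objects as arguments, mirroring the earlier examples; it is a sketch of one possible pipeline, not a prescribed API.

from argostranslate.sbd import detect_sentence
from argostranslate.tags import translate_preserve_formatting

def translate_document(text: str, translation, sbd_translation) -> str:
    """Translate a long document sentence by sentence (illustrative sketch)."""
    translated_parts = []
    remaining = text
    while remaining.strip():
        # Find the end of the next sentence in the remaining text
        boundary = detect_sentence(remaining, sbd_translation)
        if boundary <= 0:
            boundary = len(remaining)  # fall back to translating the rest at once
        sentence, remaining = remaining[:boundary], remaining[boundary:]
        translated_parts.append(translate_preserve_formatting(translation, sentence))
    return "".join(translated_parts)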