The Real First Universal Charset Detector providing modern, fast, and reliable character encoding detection as an alternative to chardet.
—
Quality: Pending — it has not yet been assessed whether this package follows best practices.
Impact: Pending — no eval scenarios have been run.
Structured containers for charset detection results that provide detailed information about detected encodings, confidence metrics, language identification, and text decoding capabilities. These classes enable comprehensive analysis and manipulation of detection outcomes.
Container for a single charset detection result with comprehensive encoding information and decoded text access.
class CharsetMatch:
"""
Represents a single charset detection result.
Provides detailed information about the detected encoding including
confidence metrics, language detection, and decoded text access.
"""
# Core properties
encoding: str # Detected encoding name (IANA standard)
language: str # Detected language or "Unknown"
chaos: float # Mess ratio (0.0 = perfect, 1.0 = complete chaos)
coherence: float # Language coherence ratio (0.0-1.0)
# Percentage representations
percent_chaos: float # Chaos as percentage (0-100)
percent_coherence: float # Coherence as percentage (0-100)
# Additional properties
encoding_aliases: list[str] # Alternative names for encoding
languages: list[str] # All detected languages
bom: bool # Whether BOM/signature was detected
byte_order_mark: bool # Alias for bom
raw: bytes # Original input bytes
alphabets: list[str] # Detected Unicode ranges
could_be_from_charset: list[str] # Alternative possible encodings
fingerprint: str # SHA256 hash of normalized output
submatch: list[CharsetMatch] # Alternative matches with same result
has_submatch: bool # Whether alternative matches exist
multi_byte_usage: float # Ratio of multi-byte character usage
def __str__(self) -> str:
"""
Decode bytes to string using detected encoding.
Returns:
str: Decoded text content
Raises:
UnicodeDecodeError: If decoding fails (rare with validated matches)
"""
def output(self, encoding: str = "utf_8") -> bytes:
"""
Re-encode content to target encoding.
Parameters:
- encoding: Target encoding name (default: "utf_8")
Returns:
bytes: Content encoded in target encoding
"""
def add_submatch(self, other: CharsetMatch) -> None:
"""
Add alternative match with same decoded result.
Parameters:
- other: Alternative CharsetMatch with same fingerprint but different encoding
Raises:
ValueError: If other is not a CharsetMatch instance or equals self
Note: Reduces memory usage by linking similar results
"""

Usage Example:
import charset_normalizer
raw_data = b'\xc4\x8cesk\xc3\xbd text' # Czech text in UTF-8
results = charset_normalizer.from_bytes(raw_data)
match = results.best()
if match:
# Basic information
print(f"Encoding: {match.encoding}") # utf_8
print(f"Language: {match.language}") # Czech
print(f"Confidence: {100 - match.percent_chaos:.1f}%")
# Decoded text
text = str(match)
print(f"Text: {text}")
# Alternative encodings
print(f"Could also be: {match.could_be_from_charset}")
# Unicode analysis
print(f"Alphabets: {match.alphabets}") # ['Latin Extended-A', 'Basic Latin']
# Re-encode to different format
windows_bytes = match.output('windows-1252')
print(f"Windows-1252: {windows_bytes}")

Ordered collection of charset detection results sorted by confidence, supporting iteration and intelligent selection of best matches.
class CharsetMatches:
"""
Container for multiple CharsetMatch results.
Maintains results sorted by confidence (best first) and provides
convenient access methods for result selection and analysis.
"""
def __init__(self, results: list[CharsetMatch] | None = None):
"""
Initialize with optional list of results.
Parameters:
- results: Initial list of CharsetMatch objects
"""
def best(self) -> CharsetMatch | None:
"""
Get the highest confidence match.
Returns:
CharsetMatch | None: Best match or None if no results
"""
def first(self) -> CharsetMatch | None:
"""
Alias for best() method (for backward compatibility).
Returns:
CharsetMatch | None: Best match or None if no results
"""
def append(self, item: CharsetMatch) -> None:
"""
Add new match maintaining sort order.
Parameters:
- item: CharsetMatch to add
Raises:
ValueError: If item is not a CharsetMatch instance
"""
def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Access match by index or encoding name.
Parameters:
- item: Index (int) or encoding name (str)
Returns:
CharsetMatch: Matching result
Raises:
KeyError: If index/encoding not found
IndexError: If index out of range
"""
def __len__(self) -> int:
"""Get number of results."""
def __bool__(self) -> bool:
"""Check if any results exist."""
def __iter__(self) -> Iterator[CharsetMatch]:
"""Iterate over all results in confidence order."""

Usage Example:
import charset_normalizer
raw_data = b'Ambiguous \xe9ncoding' # Could be multiple encodings
results = charset_normalizer.from_bytes(raw_data)
# Check if any results found
if results:
print(f"Found {len(results)} possible encodings")
# Get best match
best = results.best()
print(f"Best: {best.encoding} ({100-best.percent_chaos:.1f}% confidence)")
# Access by index
if len(results) > 1:
second_best = results[1]
print(f"Second: {second_best.encoding}")
# Access by encoding name
try:
utf8_match = results['utf_8']
print(f"UTF-8 chaos: {utf8_match.percent_chaos:.1f}%")
except KeyError:
print("UTF-8 not detected")
# Iterate all results
for i, match in enumerate(results):
print(f"{i+1}. {match.encoding}: {100-match.percent_chaos:.1f}% confidence")
# Compare different decodings
try:
decoded = str(match)
print(f" Text: {decoded}")
except UnicodeDecodeError:
print("  Decoding failed")

Specialized result container for command-line interface operations with JSON serialization support.
class CliDetectionResult:
"""
CLI-specific detection result container.
Structured for command-line output and JSON serialization,
containing all relevant detection information in a flat format.
"""
# Properties
path: str # Input file path
unicode_path: str | None # Unicode-normalized path
encoding: str | None # Detected encoding
encoding_aliases: list[str] # Alternative encoding names
alternative_encodings: list[str] # Other possible encodings
language: str # Detected language
alphabets: list[str] # Unicode ranges found
has_sig_or_bom: bool # BOM/signature present
chaos: float # Mess ratio (0.0-1.0)
coherence: float # Coherence ratio (0.0-1.0)
is_preferred: bool # Whether this is the preferred result
def to_json(self) -> str:
"""
Serialize result to JSON string.
Returns:
str: JSON representation with proper formatting
"""
@property
def __dict__(self) -> dict[str, Any]:
"""
Get result as dictionary for serialization.
Returns:
dict: All properties as key-value pairs
"""

Usage Example:
# Note: CliDetectionResult is typically created internally by CLI operations
# This shows the structure for understanding the API
# Hypothetical CLI result creation (normally done by CLI functions)
cli_result = CliDetectionResult(
path='document.txt',
encoding='utf_8',
encoding_aliases=['utf-8', 'u8'],
alternative_encodings=['ascii'],
language='English',
alphabets=['Basic Latin'],
has_sig_or_bom=False,
chaos=0.02,
coherence=0.85,
unicode_path=None,
is_preferred=True
)
# JSON serialization
json_output = cli_result.to_json()
print(json_output)
# Dictionary access
result_dict = cli_result.__dict__
print(f"Encoding: {result_dict['encoding']}")
print(f"Confidence: {(1.0 - result_dict['chaos']) * 100:.1f}%")

# Evaluate detection confidence
match = results.best()
if match:
confidence = 100 - match.percent_chaos
if confidence >= 95:
print("Very high confidence")
elif confidence >= 85:
print("High confidence")
elif confidence >= 70:
print("Moderate confidence")
else:
print("Low confidence - manual verification recommended")

# Compare multiple encoding possibilities
if len(results) > 1:
print("Multiple encoding candidates:")
for match in results[:3]: # Top 3 candidates
confidence = 100 - match.percent_chaos
print(f"- {match.encoding}: {confidence:.1f}% confidence")
print(f" Language: {match.language}")
print(f" Preview: {str(match)[:50]}...")

# Select encoding based on expected language
expected_language = "French"
for match in results:
if match.language == expected_language:
print(f"Found {expected_language} text in {match.encoding}")
selected_text = str(match)
break
else:
# Fall back to best overall match
selected_text = str(results.best())

Install with Tessl CLI
npx tessl i tessl/pypi-charset-normalizer