The Real First Universal Charset Detector providing modern, fast, and reliable character encoding detection as an alternative to chardet.
Installation via tessl: npx @tessl/cli install tessl/pypi-charset-normalizer@3.4.0

The Real First Universal Charset Detector. A modern, fast, and reliable character encoding detection library that serves as an open-source alternative to chardet. It uses advanced heuristics to detect character encodings from raw bytes by testing multiple encoding tables, measuring noise levels, and selecting the best match through comprehensive analysis including language detection and coherence scoring.
Installation: pip install charset-normalizer

Basic import: import charset_normalizer

Standard imports for charset detection:
from charset_normalizer import from_bytes, from_fp, from_path, is_binary
from charset_normalizer import CharsetMatch, CharsetMatches

Legacy compatibility:
from charset_normalizer import detect  # chardet compatibility

Type annotations (for advanced usage):
from typing import BinaryIO, Iterator
from os import PathLike
import logging

import charset_normalizer
# Example: detect encoding from raw bytes.
# (Indentation restored — the extracted snippet had lost it, making it invalid Python.)
with open('unknown_file.txt', 'rb') as f:
    raw_data = f.read()

results = charset_normalizer.from_bytes(raw_data)
best_guess = results.best()
if best_guess:
    print(f"Detected encoding: {best_guess.encoding}")
    print(f"Confidence (chaos): {best_guess.percent_chaos}%")
    print(f"Language: {best_guess.language}")
    # Get the decoded text
    decoded_text = str(best_guess)
    print(decoded_text)

# Detect directly from file path
results = charset_normalizer.from_path('unknown_file.txt')
best_guess = results.best()
if best_guess:
    print(f"File encoding: {best_guess.encoding}")

# Check if content is binary
is_text = not charset_normalizer.is_binary('data_file.bin')
print(f"Is text file: {is_text}")

Charset Normalizer uses a multi-step detection process:
This architecture enables highly accurate charset detection across 99+ supported encodings while maintaining performance and reliability.
Primary charset detection methods for bytes, file pointers, and file paths. Includes binary content detection to distinguish text from non-text data.
# Detect the character encoding of a raw bytes payload.
def from_bytes(sequences, **kwargs) -> CharsetMatches: ...
# Detect the encoding of content read from an open binary file pointer.
def from_fp(fp, **kwargs) -> CharsetMatches: ...
# Detect the encoding of the file located at the given filesystem path.
def from_path(path, **kwargs) -> CharsetMatches: ...
def is_binary(fp_or_path_or_payload, **kwargs) -> bool: ...

Structured containers for charset detection results, providing detailed information about detected encodings, confidence levels, language detection, and text decoding capabilities.
class CharsetMatch:
    """A single charset detection result.

    Body indentation restored — the extracted snippet had the attributes at
    column 0, which is invalid Python inside a class.
    """
    encoding: str  # Name of the detected character encoding
    language: str  # Detected language (see the usage example's best_guess.language)
    chaos: float  # Noise measurement; exposed as a percentage via percent_chaos in the example
    coherence: float  # Coherence score used in language/coherence analysis
    # str(match) yields the decoded text, per the usage example (decoded_text = str(best_guess)).
    def __str__(self) -> str: ...
class CharsetMatches:
    """Container of CharsetMatch results returned by the from_* functions.

    Body indentation restored — the extracted snippet had the method at
    column 0, which is invalid Python inside a class.
    """
    # Return the most likely match, or None when nothing usable was detected
    # (the usage examples guard with `if best_guess:` accordingly).
    def best(self) -> CharsetMatch | None: ...
    def __getitem__(self, item) -> CharsetMatch: ...

Chardet-compatible detection function for easy migration from chardet to charset-normalizer while maintaining backward compatibility.
def detect(byte_str, should_rename_legacy=False, **kwargs) -> dict: ...

Command-line interface and programmatic CLI functions for charset detection, file processing, and interactive operations.
from charset_normalizer.cli import cli_detect, query_yes_no
# Programmatic entry point for the command-line interface: runs charset
# detection over the given file paths; the keyword flags mirror CLI options
# (normalize/replace/force presumably control file rewriting — TODO confirm
# against the charset-normalizer CLI documentation).
def cli_detect(
paths: list[str],
alternatives: bool = False,
normalize: bool = False,
minimal: bool = False,
replace: bool = False,
force: bool = False,
threshold: float = 0.2,
verbose: bool = False
) -> None: ...
def query_yes_no(question: str, default: str = "yes") -> bool: ...

Logger configuration and version information utilities.
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Configure a logger with a custom handler, level, and format.

    Body indentation restored — the extracted snippet had the docstring at
    column 0, which is invalid Python inside a function.

    Parameters:
    - name: Logger name (default: "charset_normalizer")
    - level: Logging level (default: logging.INFO)
    - format_string: Log message format (default: includes timestamp, level, message)

    Returns:
    None

    Note: Sets up a StreamHandler with the specified configuration.
    """
__version__: str # Package version string
VERSION: list[str]  # Version components as list

# Type aliases for language coherence data
# A single language guess paired with its coherence score.
CoherenceMatch = tuple[str, float]  # (language_name, coherence_score)
# All language guesses produced for a payload.
CoherenceMatches = list[CoherenceMatch]  # List of language matches
# Type aliases for detection results (legacy compatibility)
from typing import TypedDict
class ResultDict(TypedDict):
    """Legacy detection result type for chardet compatibility.

    Body indentation restored — the extracted snippet had the keys at
    column 0, which is invalid Python inside a class.
    """
    encoding: str | None  # Detected encoding name or None
    language: str  # Detected language or empty string
    confidence: float | None  # Confidence score (0.0-1.0) or None
# Import types for function signatures
from typing import BinaryIO, Iterator, Any
from os import PathLike
import logging