The Real First Universal Charset Detector providing modern, fast, and reliable character encoding detection as an alternative to chardet.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Primary charset detection methods that analyze raw bytes, file pointers, or file paths to determine character encoding. These functions form the core of charset-normalizer's detection capabilities and support extensive customization through parameters.
Detects character encoding from raw bytes or bytearray sequences using advanced heuristic analysis.
def from_bytes(
sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Detect charset from raw bytes sequence.
Parameters:
- sequences: Raw bytes or bytearray to analyze
- steps: Number of analysis steps (default: 5)
- chunk_size: Size of data chunks for analysis (default: 512)
- threshold: Mess ratio threshold for encoding rejection (default: 0.2)
- cp_isolation: List of encodings to test exclusively
- cp_exclusion: List of encodings to exclude from testing
- preemptive_behaviour: Enable BOM/signature priority detection (default: True)
- explain: Enable detailed logging for debugging (default: False)
- language_threshold: Minimum coherence for language detection (default: 0.1)
- enable_fallback: Enable fallback to common encodings (default: True)
Returns:
CharsetMatches: Ordered collection of detection results
Raises:
TypeError: If sequences is not bytes or bytearray
"""Usage Example:
import charset_normalizer
# Basic detection
raw_data = b'\xe4\xb8\xad\xe6\x96\x87' # Chinese text in UTF-8
results = charset_normalizer.from_bytes(raw_data)
best_match = results.best()
print(f"Encoding: {best_match.encoding}") # utf_8
print(f"Language: {best_match.language}") # Chinese
# Advanced detection with custom parameters
results = charset_normalizer.from_bytes(
raw_data,
steps=10, # More thorough analysis
threshold=0.1, # Stricter mess threshold
cp_isolation=['utf_8', 'gb2312', 'big5'], # Test only Chinese encodings
explain=True # Enable debug logging
)

Detects character encoding from an open file pointer without closing it.
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Detect charset from file pointer.
Parameters:
- fp: Open binary file pointer
- Other parameters: Same as from_bytes
Returns:
CharsetMatches: Ordered collection of detection results
Note: Does not close the file pointer
"""Usage Example:
import charset_normalizer
with open('document.txt', 'rb') as fp:
results = charset_normalizer.from_fp(fp)
best_match = results.best()
if best_match:
print(f"File encoding: {best_match.encoding}")
# File pointer remains open for further operations

Detects character encoding by opening and reading a file from its path.
def from_path(
path: str | bytes | PathLike,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Detect charset from file path.
Parameters:
- path: Path to file (string, bytes, or PathLike object)
- Other parameters: Same as from_bytes
Returns:
CharsetMatches: Ordered collection of detection results
Raises:
IOError: If file cannot be opened or read
"""Usage Example:
import charset_normalizer
from pathlib import Path
# Using string path
results = charset_normalizer.from_path('data/sample.txt')
# Using Path object
file_path = Path('documents/report.csv')
results = charset_normalizer.from_path(file_path)
# With custom settings for CSV files
results = charset_normalizer.from_path(
'data.csv',
cp_isolation=['utf_8', 'iso-8859-1', 'windows-1252'], # Common for CSV
threshold=0.15 # Slightly stricter for structured data
)

Determines whether input data represents binary (non-text) content.
def is_binary(
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if input is binary (non-text) content.
Parameters:
- fp_or_path_or_payload: File path, file pointer, or raw bytes
- Other parameters: Same as from_bytes (enable_fallback defaults to False)
Returns:
bool: True if content appears to be binary, False if text
Note: Uses stricter criteria than text detection to avoid false positives
"""Usage Example:
import charset_normalizer
# Check if file is binary
if charset_normalizer.is_binary('image.jpg'):
print("Binary file detected")
else:
print("Text file detected")
# Check raw bytes
data = b'\x89PNG\r\n\x1a\n' # PNG file header
if charset_normalizer.is_binary(data):
print("Binary data")
# Check with file pointer
with open('document.pdf', 'rb') as fp:
if charset_normalizer.is_binary(fp):
print("Binary document")Install with Tessl CLI
npx tessl i tessl/pypi-charset-normalizer