or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli-interface.md core-detection.md detection-results.md index.md legacy-compatibility.md

docs/detection-results.md

0

# Detection Results

1

2

Structured containers for charset detection results that provide detailed information about detected encodings, confidence metrics, language identification, and text decoding capabilities. These classes enable comprehensive analysis and manipulation of detection outcomes.

3

4

## Capabilities

5

6

### Individual Match Results

7

8

Container for a single charset detection result with comprehensive encoding information and decoded text access.

9

10

```python { .api }

11

class CharsetMatch:

12

"""

13

Represents a single charset detection result.

14

15

Provides detailed information about the detected encoding including

16

confidence metrics, language detection, and decoded text access.

17

"""

18

19

# Core properties

20

encoding: str # Detected encoding name (IANA standard)

21

language: str # Detected language or "Unknown"

22

chaos: float # Mess ratio (0.0 = perfect, 1.0 = complete chaos)

23

coherence: float # Language coherence ratio (0.0-1.0)

24

25

# Percentage representations

26

percent_chaos: float # Chaos as percentage (0-100)

27

percent_coherence: float # Coherence as percentage (0-100)

28

29

# Additional properties

30

encoding_aliases: list[str] # Alternative names for encoding

31

languages: list[str] # All detected languages

32

bom: bool # Whether BOM/signature was detected

33

byte_order_mark: bool # Alias for bom

34

raw: bytes # Original input bytes

35

alphabets: list[str] # Detected Unicode ranges

36

could_be_from_charset: list[str] # Alternative possible encodings

37

fingerprint: str # SHA256 hash of normalized output

38

submatch: list[CharsetMatch] # Alternative matches with same result

39

has_submatch: bool # Whether alternative matches exist

40

multi_byte_usage: float # Ratio of multi-byte character usage

41

42

def __str__(self) -> str:

43

"""

44

Decode bytes to string using detected encoding.

45

46

Returns:

47

str: Decoded text content

48

49

Raises:

50

UnicodeDecodeError: If decoding fails (rare with validated matches)

51

"""

52

53

def output(self, encoding: str = "utf_8") -> bytes:

54

"""

55

Re-encode content to target encoding.

56

57

Parameters:

58

- encoding: Target encoding name (default: "utf_8")

59

60

Returns:

61

bytes: Content encoded in target encoding

62

"""

63

64

def add_submatch(self, other: CharsetMatch) -> None:

65

"""

66

Add alternative match with same decoded result.

67

68

Parameters:

69

- other: Alternative CharsetMatch with same fingerprint but different encoding

70

71

Raises:

72

ValueError: If other is not a CharsetMatch instance or equals self

73

74

Note: Reduces memory usage by linking similar results

75

"""

76

```

77

78

**Usage Example:**

79

80

```python

81

import charset_normalizer

82

83

raw_data = b'\xc4\x8cesk\xc3\xbd text' # Czech text in UTF-8

84

results = charset_normalizer.from_bytes(raw_data)

85

match = results.best()

86

87

if match:

88

# Basic information

89

print(f"Encoding: {match.encoding}") # utf_8

90

print(f"Language: {match.language}") # Czech

91

print(f"Confidence: {100 - match.percent_chaos:.1f}%")

92

93

# Decoded text

94

text = str(match)

95

print(f"Text: {text}")

96

97

# Alternative encodings

98

print(f"Could also be: {match.could_be_from_charset}")

99

100

# Unicode analysis

101

print(f"Alphabets: {match.alphabets}") # ['Latin Extended-A', 'Basic Latin']

102

103

# Re-encode to different format

104

windows_bytes = match.output('windows-1252')

105

print(f"Windows-1252: {windows_bytes}")

106

```

107

108

### Multiple Match Container

109

110

Ordered collection of charset detection results sorted by confidence, supporting iteration and intelligent selection of best matches.

111

112

```python { .api }

113

class CharsetMatches:

114

"""

115

Container for multiple CharsetMatch results.

116

117

Maintains results sorted by confidence (best first) and provides

118

convenient access methods for result selection and analysis.

119

"""

120

121

def __init__(self, results: list[CharsetMatch] | None = None):

122

"""

123

Initialize with optional list of results.

124

125

Parameters:

126

- results: Initial list of CharsetMatch objects

127

"""

128

129

def best(self) -> CharsetMatch | None:

130

"""

131

Get the highest confidence match.

132

133

Returns:

134

CharsetMatch | None: Best match or None if no results

135

"""

136

137

def first(self) -> CharsetMatch | None:

138

"""

139

Alias for best() method (for backward compatibility).

140

141

Returns:

142

CharsetMatch | None: Best match or None if no results

143

"""

144

145

def append(self, item: CharsetMatch) -> None:

146

"""

147

Add new match maintaining sort order.

148

149

Parameters:

150

- item: CharsetMatch to add

151

152

Raises:

153

ValueError: If item is not a CharsetMatch instance

154

"""

155

156

def __getitem__(self, item: int | str) -> CharsetMatch:

157

"""

158

Access match by index or encoding name.

159

160

Parameters:

161

- item: Index (int) or encoding name (str)

162

163

Returns:

164

CharsetMatch: Matching result

165

166

Raises:

167

KeyError: If encoding name not found

168

IndexError: If index out of range

169

"""

170

171

def __len__(self) -> int:

172

"""Get number of results."""

173

174

def __bool__(self) -> bool:

175

"""Check if any results exist."""

176

177

def __iter__(self) -> Iterator[CharsetMatch]:

178

"""Iterate over all results in confidence order."""

179

```

180

181

**Usage Example:**

182

183

```python

184

import charset_normalizer

185

186

raw_data = b'Ambiguous \xe9ncoding' # Could be multiple encodings

187

results = charset_normalizer.from_bytes(raw_data)

188

189

# Check if any results found

190

if results:

191

print(f"Found {len(results)} possible encodings")

192

193

# Get best match

194

best = results.best()

195

print(f"Best: {best.encoding} ({100-best.percent_chaos:.1f}% confidence)")

196

197

# Access by index

198

if len(results) > 1:

199

second_best = results[1]

200

print(f"Second: {second_best.encoding}")

201

202

# Access by encoding name

203

try:

204

utf8_match = results['utf_8']

205

print(f"UTF-8 chaos: {utf8_match.percent_chaos:.1f}%")

206

except KeyError:

207

print("UTF-8 not detected")

208

209

# Iterate all results

210

for i, match in enumerate(results):

211

print(f"{i+1}. {match.encoding}: {100-match.percent_chaos:.1f}% confidence")

212

213

# Compare different decodings

214

try:

215

decoded = str(match)

216

print(f" Text: {decoded}")

217

except UnicodeDecodeError:

218

print(f" Decoding failed")

219

```

220

221

### CLI Result Container

222

223

Specialized result container for command-line interface operations with JSON serialization support.

224

225

```python { .api }

226

class CliDetectionResult:

227

"""

228

CLI-specific detection result container.

229

230

Structured for command-line output and JSON serialization,

231

containing all relevant detection information in a flat format.

232

"""

233

234

# Properties

235

path: str # Input file path

236

unicode_path: str | None # Path of the normalized (Unicode) output file, or None if none was written

237

encoding: str | None # Detected encoding

238

encoding_aliases: list[str] # Alternative encoding names

239

alternative_encodings: list[str] # Other possible encodings

240

language: str # Detected language

241

alphabets: list[str] # Unicode ranges found

242

has_sig_or_bom: bool # BOM/signature present

243

chaos: float # Mess ratio (0.0-1.0)

244

coherence: float # Coherence ratio (0.0-1.0)

245

is_preferred: bool # Whether this is the preferred result

246

247

def to_json(self) -> str:

248

"""

249

Serialize result to JSON string.

250

251

Returns:

252

str: JSON representation with proper formatting

253

"""

254

255

@property

256

def __dict__(self) -> dict[str, Any]:

257

"""

258

Get result as dictionary for serialization.

259

260

Returns:

261

dict: All properties as key-value pairs

262

"""

263

```

264

265

**Usage Example:**

266

267

```python

268

# Note: CliDetectionResult is typically created internally by CLI operations

269

# This shows the structure for understanding the API

270

271

# Hypothetical CLI result creation (normally done by CLI functions)

272

cli_result = CliDetectionResult(

273

path='document.txt',

274

encoding='utf_8',

275

encoding_aliases=['utf-8', 'u8'],

276

alternative_encodings=['ascii'],

277

language='English',

278

alphabets=['Basic Latin'],

279

has_sig_or_bom=False,

280

chaos=0.02,

281

coherence=0.85,

282

unicode_path=None,

283

is_preferred=True

284

)

285

286

# JSON serialization

287

json_output = cli_result.to_json()

288

print(json_output)

289

290

# Dictionary access

291

result_dict = cli_result.__dict__

292

print(f"Encoding: {result_dict['encoding']}")

293

print(f"Confidence: {(1.0 - result_dict['chaos']) * 100:.1f}%")

294

```

295

296

## Result Analysis Patterns

297

298

### Confidence Assessment

299

300

```python

301

# Evaluate detection confidence

302

match = results.best()

303

if match:

304

confidence = 100 - match.percent_chaos

305

306

if confidence >= 95:

307

print("Very high confidence")

308

elif confidence >= 85:

309

print("High confidence")

310

elif confidence >= 70:

311

print("Moderate confidence")

312

else:

313

print("Low confidence - manual verification recommended")

314

```

315

316

### Multi-Encoding Comparison

317

318

```python

319

# Compare multiple encoding possibilities

320

if len(results) > 1:

321

print("Multiple encoding candidates:")

322

for match in list(results)[:3]: # Top 3 candidates (CharsetMatches accepts only int/str keys, not slices)

323

confidence = 100 - match.percent_chaos

324

print(f"- {match.encoding}: {confidence:.1f}% confidence")

325

print(f" Language: {match.language}")

326

print(f" Preview: {str(match)[:50]}...")

327

```

328

329

### Language-Aware Selection

330

331

```python

332

# Select encoding based on expected language

333

expected_language = "French"

334

for match in results:

335

if match.language == expected_language:

336

print(f"Found {expected_language} text in {match.encoding}")

337

selected_text = str(match)

338

break

339

else:

340

# Fall back to best overall match

341

selected_text = str(results.best())

342

```