<!-- Navigation chrome from the docs-viewer page (login/version links, file listing) removed during cleanup; document content begins below. -->

# Core Analysis Engine

The core analysis functionality centers around the `AnalyzerEngine` class, which orchestrates PII detection across all recognizers and provides the primary interface for analyzing text.

## Capabilities

### AnalyzerEngine

Main orchestrator class that coordinates PII entity detection using registered recognizers, NLP processing, and optional context enhancement.

```python { .api }
class AnalyzerEngine:
    """
    Central PII detection engine that orchestrates all analysis operations.

    Args:
        registry: RecognizerRegistry containing entity recognizers
        nlp_engine: NLP preprocessing engine (spaCy, Stanza, Transformers)
        app_tracer: Application tracing for monitoring (optional)
        log_decision_process: Enable detailed decision logging
        default_score_threshold: Minimum confidence score for results (0.0-1.0)
        supported_languages: List of supported language codes
        context_aware_enhancer: Context enhancement processor (optional)
    """
    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
        context_aware_enhancer: Optional[ContextAwareEnhancer] = None
    ): ...

    def analyze(
        self,
        text: str,
        language: str,
        entities: Optional[List[str]] = None,
        correlation_id: Optional[str] = None,
        score_threshold: Optional[float] = None,
        return_decision_process: Optional[bool] = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
        context: Optional[List[str]] = None,
        allow_list: Optional[List[str]] = None,
        allow_list_match: Optional[str] = "exact",
        regex_flags: Optional[int] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None
    ) -> List[RecognizerResult]:
        """
        Analyze text to detect PII entities.

        Args:
            text: Input text to analyze
            language: Language code (e.g., "en", "es", "fr")
            entities: Specific entity types to detect (None = all supported)
            correlation_id: Unique identifier for request tracking
            score_threshold: Minimum confidence score (overrides default)
            return_decision_process: Include analysis explanations in results
            ad_hoc_recognizers: Additional custom recognizers for this request
            context: Keywords that help improve detection accuracy
            allow_list: Values to exclude from detection results
            allow_list_match: Allow list matching strategy ("exact" or "fuzzy")
            regex_flags: Custom regex compilation flags
            nlp_artifacts: Pre-computed NLP processing results (performance optimization)

        Returns:
            List of RecognizerResult objects containing detected PII entities
        """

    def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
        """
        Get all loaded recognizers for specified language.

        Args:
            language: Language code (None = all languages)

        Returns:
            List of EntityRecognizer instances
        """

    def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
        """
        Get all supported entity types for specified language.

        Args:
            language: Language code (None = all languages)

        Returns:
            List of entity type strings (e.g., ["PERSON", "PHONE_NUMBER"])
        """
```

### AnalyzerRequest

Data container class that encapsulates all parameters for an analysis request, useful for serialization and API integration.

```python { .api }
class AnalyzerRequest:
    """
    Request data container for analyzer operations.

    Args:
        req_data: Dictionary containing all request parameters
    """
    def __init__(self, req_data: Dict): ...

    # Properties extracted from req_data dictionary
    text: str  # Text to analyze
    language: str  # Language code
    entities: Optional[List[str]]  # Entity types to detect
    correlation_id: Optional[str]  # Request tracking identifier
    score_threshold: Optional[float]  # Minimum confidence score
    return_decision_process: Optional[bool]  # Include analysis explanations
    ad_hoc_recognizers: Optional[List[EntityRecognizer]]  # Custom recognizers
    context: Optional[List[str]]  # Context enhancement keywords
    allow_list: Optional[List[str]]  # Values to exclude from detection
    allow_list_match: Optional[str]  # Allow list matching strategy
    regex_flags: Optional[int]  # Regex compilation flags
```

### Result Processing

Core classes for handling and processing analysis results.

```python { .api }
class RecognizerResult:
    """
    Represents a detected PII entity with location and confidence information.

    Args:
        entity_type: Type of detected entity (e.g., "PERSON", "PHONE_NUMBER")
        start: Start character position in text
        end: End character position in text
        score: Confidence score (0.0 to 1.0)
        analysis_explanation: Detailed explanation of detection process
        recognition_metadata: Additional recognizer-specific data
    """
    def __init__(
        self,
        entity_type: str,
        start: int,
        end: int,
        score: float,
        analysis_explanation: AnalysisExplanation = None,
        recognition_metadata: Dict = None
    ): ...

    def intersects(self, other: RecognizerResult) -> int:
        """
        Check if this result intersects with another result.

        Returns:
            Number of overlapping characters (0 = no intersection)
        """

    def contained_in(self, other: RecognizerResult) -> bool:
        """Check if this result is entirely contained within another result."""

    def contains(self, other: RecognizerResult) -> bool:
        """Check if this result entirely contains another result."""

    def equal_indices(self, other: RecognizerResult) -> bool:
        """Check if start and end positions match another result."""

    def has_conflict(self, other: RecognizerResult) -> bool:
        """Check if this result conflicts with another result."""

    def to_dict(self) -> Dict:
        """Serialize result to dictionary format."""

    @classmethod
    def from_json(cls, data: Dict) -> RecognizerResult:
        """Create RecognizerResult from JSON/dictionary data."""

    def append_analysis_explanation_text(self, text: str) -> None:
        """Add explanatory text to the analysis explanation."""

class AnalysisExplanation:
    """
    Detailed explanation of why a PII entity was detected.

    Args:
        recognizer: Name of recognizer that made the detection
        original_score: Initial confidence score before enhancements
        pattern_name: Name of matching pattern (for pattern-based recognizers)
        pattern: Actual regex pattern that matched (for pattern-based recognizers)
        validation_result: Result of additional validation logic
        textual_explanation: Human-readable explanation of detection
        regex_flags: Regex compilation flags used
    """
    def __init__(
        self,
        recognizer: str,
        original_score: float,
        pattern_name: str = None,
        pattern: str = None,
        validation_result: float = None,
        textual_explanation: str = None,
        regex_flags: int = None
    ): ...

    def set_improved_score(self, score: float) -> None:
        """Update the confidence score and calculate improvement difference."""

    def set_supportive_context_word(self, word: str) -> None:
        """Set context word that helped increase confidence score."""

    def append_textual_explanation_line(self, text: str) -> None:
        """Append new line to textual explanation."""

    def to_dict(self) -> Dict:
        """Serialize explanation to dictionary format."""
```

## Usage Examples

### Basic Analysis

```python
from presidio_analyzer import AnalyzerEngine

# Initialize with default settings
analyzer = AnalyzerEngine()

# Analyze text
text = "Contact John Smith at john.smith@email.com or call 555-123-4567"
results = analyzer.analyze(text=text, language="en")

# Process results
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}' (score: {result.score:.2f})")
```

### Advanced Analysis with Context

```python
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

# Provide context to improve accuracy
text = "Please update my profile with new phone: 555-0199"
context = ["phone", "contact", "profile"]

results = analyzer.analyze(
    text=text,
    language="en",
    context=context,
    score_threshold=0.5,
    return_decision_process=True
)

# Examine detailed results
for result in results:
    print(f"Entity: {result.entity_type}")
    print(f"Score: {result.score}")
    if result.analysis_explanation:
        print(f"Explanation: {result.analysis_explanation.textual_explanation}")
```

### Selective Entity Detection

```python
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

# Only detect specific entity types
text = "My SSN is 123-45-6789 and email is user@domain.com"
results = analyzer.analyze(
    text=text,
    language="en",
    entities=["US_SSN", "EMAIL_ADDRESS"]  # Only detect these types
)

print(f"Found {len(results)} entities of requested types")
```

### Using Allow Lists

```python
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

# Exclude known safe values
text = "Contact support at support@company.com or use test@example.com for testing"
allow_list = ["support@company.com"]

results = analyzer.analyze(
    text=text,
    language="en",
    allow_list=allow_list,
    allow_list_match="exact"
)

# Only test@example.com should be detected
for result in results:
    detected_email = text[result.start:result.end]
    print(f"Detected: {detected_email}")  # Should only show test@example.com
```

### Request Object Pattern

```python
from presidio_analyzer import AnalyzerEngine, AnalyzerRequest

analyzer = AnalyzerEngine()

# Create structured request
request_data = {
    "text": "Call me at 555-1234 or email john@company.com",
    "language": "en",
    "entities": ["PHONE_NUMBER", "EMAIL_ADDRESS"],
    "score_threshold": 0.6,
    "return_decision_process": True
}

request = AnalyzerRequest(request_data)
results = analyzer.analyze(
    text=request.text,
    language=request.language,
    entities=request.entities,
    score_threshold=request.score_threshold,
    return_decision_process=request.return_decision_process
)

print(f"Processed request {request.correlation_id}")
```

### Performance Optimization with Pre-computed NLP

```python
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Initialize with specific NLP engine
nlp_engine = SpacyNlpEngine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

text = "Contact John Doe at john.doe@email.com"

# Pre-compute NLP artifacts for reuse
nlp_artifacts = nlp_engine.process_text(text, "en")

# Use pre-computed artifacts (faster for repeated analysis)
results = analyzer.analyze(
    text=text,
    language="en",
    nlp_artifacts=nlp_artifacts
)
```

## Error Handling

```python
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

try:
    results = analyzer.analyze(
        text="Sample text",
        language="unsupported_lang"  # Invalid language
    )
except ValueError as e:
    print(f"Invalid parameter: {e}")

try:
    results = analyzer.analyze(
        text="Sample text",
        language="en",
        score_threshold=1.5  # Invalid threshold (must be 0.0-1.0)
    )
except ValueError as e:
    print(f"Invalid score threshold: {e}")
```