A Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums.
—
The core analysis functionality centers around the AnalyzerEngine class, which orchestrates PII detection across all recognizers and provides the primary interface for analyzing text.
Main orchestrator class that coordinates PII entity detection using registered recognizers, NLP processing, and optional context enhancement.
class AnalyzerEngine:
"""
Central PII detection engine that orchestrates all analysis operations.
Args:
registry: RecognizerRegistry containing entity recognizers
nlp_engine: NLP preprocessing engine (spaCy, Stanza, Transformers)
app_tracer: Application tracing for monitoring (optional)
log_decision_process: Enable detailed decision logging
default_score_threshold: Minimum confidence score for results (0.0-1.0)
supported_languages: List of supported language codes
context_aware_enhancer: Context enhancement processor (optional)
"""
def __init__(
self,
registry: RecognizerRegistry = None,
nlp_engine: NlpEngine = None,
app_tracer: AppTracer = None,
log_decision_process: bool = False,
default_score_threshold: float = 0,
supported_languages: List[str] = None,
context_aware_enhancer: Optional[ContextAwareEnhancer] = None
): ...
def analyze(
self,
text: str,
language: str,
entities: Optional[List[str]] = None,
correlation_id: Optional[str] = None,
score_threshold: Optional[float] = None,
return_decision_process: Optional[bool] = False,
ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
context: Optional[List[str]] = None,
allow_list: Optional[List[str]] = None,
allow_list_match: Optional[str] = "exact",
regex_flags: Optional[int] = None,
nlp_artifacts: Optional[NlpArtifacts] = None
) -> List[RecognizerResult]:
"""
Analyze text to detect PII entities.
Args:
text: Input text to analyze
language: Language code (e.g., "en", "es", "fr")
entities: Specific entity types to detect (None = all supported)
correlation_id: Unique identifier for request tracking
score_threshold: Minimum confidence score (overrides default)
return_decision_process: Include analysis explanations in results
ad_hoc_recognizers: Additional custom recognizers for this request
context: Keywords that help improve detection accuracy
allow_list: Values to exclude from detection results
allow_list_match: Allow list matching strategy ("exact" or "fuzzy")
regex_flags: Custom regex compilation flags
nlp_artifacts: Pre-computed NLP processing results (performance optimization)
Returns:
List of RecognizerResult objects containing detected PII entities
"""
def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
"""
Get all loaded recognizers for specified language.
Args:
language: Language code (None = all languages)
Returns:
List of EntityRecognizer instances
"""
def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
"""
Get all supported entity types for specified language.
Args:
language: Language code (None = all languages)
Returns:
List of entity type strings (e.g., ["PERSON", "PHONE_NUMBER"])
"""Data container class that encapsulates all parameters for an analysis request, useful for serialization and API integration.
class AnalyzerRequest:
"""
Request data container for analyzer operations.
Args:
req_data: Dictionary containing all request parameters
"""
def __init__(self, req_data: Dict): ...
# Properties extracted from req_data dictionary
text: str # Text to analyze
language: str # Language code
entities: Optional[List[str]] # Entity types to detect
correlation_id: Optional[str] # Request tracking identifier
score_threshold: Optional[float] # Minimum confidence score
return_decision_process: Optional[bool] # Include analysis explanations
ad_hoc_recognizers: Optional[List[EntityRecognizer]] # Custom recognizers
context: Optional[List[str]] # Context enhancement keywords
allow_list: Optional[List[str]] # Values to exclude from detection
allow_list_match: Optional[str] # Allow list matching strategy
regex_flags: Optional[int] # Regex compilation flagsCore classes for handling and processing analysis results.
class RecognizerResult:
    """
    Represents a detected PII entity with location and confidence information.

    Args:
        entity_type: Type of detected entity (e.g., "PERSON", "PHONE_NUMBER")
        start: Start character position in text
        end: End character position in text
        score: Confidence score (0.0 to 1.0)
        analysis_explanation: Detailed explanation of detection process
        recognition_metadata: Additional recognizer-specific data
    """

    def __init__(
        self,
        entity_type: str,
        start: int,
        end: int,
        score: float,
        analysis_explanation: Optional["AnalysisExplanation"] = None,
        recognition_metadata: Optional[Dict] = None,
    ): ...

    def intersects(self, other: "RecognizerResult") -> int:
        """
        Check if this result intersects with another result.

        Returns:
            Number of overlapping characters (0 = no intersection)
        """

    def contained_in(self, other: "RecognizerResult") -> bool:
        """Check if this result is entirely contained within another result."""

    def contains(self, other: "RecognizerResult") -> bool:
        """Check if this result entirely contains another result."""

    def equal_indices(self, other: "RecognizerResult") -> bool:
        """Check if start and end positions match another result."""

    def has_conflict(self, other: "RecognizerResult") -> bool:
        """Check if this result conflicts with another result."""

    def to_dict(self) -> Dict:
        """Serialize result to dictionary format."""

    @classmethod
    def from_json(cls, data: Dict) -> "RecognizerResult":
        """Create RecognizerResult from JSON/dictionary data."""

    def append_analysis_explanation_text(self, text: str) -> None:
        """Add explanatory text to the analysis explanation."""
class AnalysisExplanation:
    """
    Detailed explanation of why a PII entity was detected.

    Args:
        recognizer: Name of recognizer that made the detection
        original_score: Initial confidence score before enhancements
        pattern_name: Name of matching pattern (for pattern-based recognizers)
        pattern: Actual regex pattern that matched (for pattern-based recognizers)
        validation_result: Result of additional validation logic
        textual_explanation: Human-readable explanation of detection
        regex_flags: Regex compilation flags used
    """

    def __init__(
        self,
        recognizer: str,
        original_score: float,
        pattern_name: Optional[str] = None,
        pattern: Optional[str] = None,
        validation_result: Optional[float] = None,
        textual_explanation: Optional[str] = None,
        regex_flags: Optional[int] = None,
    ): ...

    def set_improved_score(self, score: float) -> None:
        """Update the confidence score and calculate improvement difference."""

    def set_supportive_context_word(self, word: str) -> None:
        """Set context word that helped increase confidence score."""

    def append_textual_explanation_line(self, text: str) -> None:
        """Append new line to textual explanation."""

    def to_dict(self) -> Dict:
        """Serialize explanation to dictionary format."""

from presidio_analyzer import AnalyzerEngine
# Initialize with default settings
analyzer = AnalyzerEngine()

# Analyze text
text = "Contact John Smith at john.smith@email.com or call 555-123-4567"
results = analyzer.analyze(text=text, language="en")

# Process results: each result carries start/end offsets into the original text
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}' (score: {result.score:.2f})")

from presidio_analyzer import AnalyzerEngine
analyzer = AnalyzerEngine()

# Provide context to improve accuracy
text = "Please update my profile with new phone: 555-0199"
context = ["phone", "contact", "profile"]

results = analyzer.analyze(
    text=text,
    language="en",
    context=context,
    score_threshold=0.5,
    return_decision_process=True,
)

# Examine detailed results; analysis_explanation is populated because
# return_decision_process=True was passed above
for result in results:
    print(f"Entity: {result.entity_type}")
    print(f"Score: {result.score}")
    if result.analysis_explanation:
        print(f"Explanation: {result.analysis_explanation.textual_explanation}")

from presidio_analyzer import AnalyzerEngine
analyzer = AnalyzerEngine()

# Only detect specific entity types
text = "My SSN is 123-45-6789 and email is user@domain.com"
results = analyzer.analyze(
    text=text,
    language="en",
    entities=["US_SSN", "EMAIL_ADDRESS"],  # Only detect these types
)
print(f"Found {len(results)} entities of requested types")

from presidio_analyzer import AnalyzerEngine
analyzer = AnalyzerEngine()

# Exclude known safe values from detection
text = "Contact support at support@company.com or use test@example.com for testing"
allow_list = ["support@company.com"]

results = analyzer.analyze(
    text=text,
    language="en",
    allow_list=allow_list,
    allow_list_match="exact",
)

# Only test@example.com should be detected
for result in results:
    detected_email = text[result.start:result.end]
    print(f"Detected: {detected_email}")  # Should only show test@example.com

from presidio_analyzer import AnalyzerEngine, AnalyzerRequest
analyzer = AnalyzerEngine()

# Create a structured request from a plain dictionary
request_data = {
    "text": "Call me at 555-1234 or email john@company.com",
    "language": "en",
    "entities": ["PHONE_NUMBER", "EMAIL_ADDRESS"],
    "score_threshold": 0.6,
    "return_decision_process": True,
}
request = AnalyzerRequest(request_data)

# Pass the parsed request fields through to the analyzer
results = analyzer.analyze(
    text=request.text,
    language=request.language,
    entities=request.entities,
    score_threshold=request.score_threshold,
    return_decision_process=request.return_decision_process,
)
print(f"Processed request {request.correlation_id}")

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Initialize with a specific NLP engine
nlp_engine = SpacyNlpEngine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

text = "Contact John Doe at john.doe@email.com"

# Pre-compute NLP artifacts for reuse
nlp_artifacts = nlp_engine.process_text(text, "en")

# Use pre-computed artifacts (faster for repeated analysis of the same text)
results = analyzer.analyze(
    text=text,
    language="en",
    nlp_artifacts=nlp_artifacts,
)

from presidio_analyzer import AnalyzerEngine
analyzer = AnalyzerEngine()
try:
results = analyzer.analyze(
text="Sample text",
language="unsupported_lang" # Invalid language
)
except ValueError as e:
print(f"Invalid parameter: {e}")
try:
results = analyzer.analyze(
text="Sample text",
language="en",
score_threshold=1.5 # Invalid threshold (must be 0.0-1.0)
)
except ValueError as e:
print(f"Invalid score threshold: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-presidio-analyzer