Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums
npx @tessl/cli install tessl/pypi-presidio-analyzer@2.2.0

Presidio Analyzer is a Python-based service for detecting PII (Personally Identifiable Information) entities in unstructured text. It provides a pluggable and customizable framework using Named Entity Recognition, regular expressions, rule-based logic, and checksums to identify over 50 types of PII entities across multiple languages.
pip install presidio-analyzer

Basic import:

from presidio_analyzer import AnalyzerEngine

For comprehensive imports:
# Comprehensive imports covering the main public API surface.
from presidio_analyzer import (
    AnalyzerEngine,
    BatchAnalyzerEngine,
    RecognizerResult,
    PatternRecognizer,
    Pattern,
    AnalyzerEngineProvider,
)

from presidio_analyzer import AnalyzerEngine
# Initialize analyzer
analyzer = AnalyzerEngine()
# Analyze text for PII
text = "My name is John Doe and my phone number is 555-123-4567"
results = analyzer.analyze(text=text, language="en")
# Process results
for result in results:
print(f"Entity: {result.entity_type}")
print(f"Text: {text[result.start:result.end]}")
print(f"Score: {result.score}")
print(f"Location: {result.start}-{result.end}")Presidio Analyzer follows a modular architecture:
This design allows for flexible deployment options from Python scripts to Docker containers and Kubernetes orchestration, while maintaining high extensibility for custom recognizers and detection logic.
Central PII detection functionality including the main AnalyzerEngine class, request handling, and result processing. Provides the primary interface for detecting PII entities in text.
class AnalyzerEngine:
def __init__(
self,
registry: RecognizerRegistry = None,
nlp_engine: NlpEngine = None,
app_tracer: AppTracer = None,
log_decision_process: bool = False,
default_score_threshold: float = 0,
supported_languages: List[str] = None,
context_aware_enhancer: Optional[ContextAwareEnhancer] = None
): ...
def analyze(
self,
text: str,
language: str,
entities: Optional[List[str]] = None,
correlation_id: Optional[str] = None,
score_threshold: Optional[float] = None,
return_decision_process: Optional[bool] = False,
ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
context: Optional[List[str]] = None,
allow_list: Optional[List[str]] = None,
allow_list_match: Optional[str] = "exact",
regex_flags: Optional[int] = None,
nlp_artifacts: Optional[NlpArtifacts] = None
) -> List[RecognizerResult]: ...High-performance analysis of large datasets including iterables, dictionaries, and structured data with multiprocessing support and configurable batch sizes.
class BatchAnalyzerEngine:
def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): ...
def analyze_iterator(
self,
texts: Iterable[Union[str, bool, float, int]],
language: str,
batch_size: int = 1,
n_process: int = 1,
**kwargs
) -> List[List[RecognizerResult]]: ...
def analyze_dict(
self,
input_dict: Dict[str, Union[Any, Iterable[Any]]],
language: str,
keys_to_skip: Optional[List[str]] = None,
batch_size: int = 1,
n_process: int = 1,
**kwargs
) -> Iterator[DictAnalyzerResult]: ...Framework for creating custom PII recognizers including abstract base classes, pattern-based recognizers, and remote service integration capabilities.
class EntityRecognizer:
    """Abstract base class all PII recognizers derive from."""

    def __init__(
        self,
        supported_entities: List[str],
        name: str = None,
        supported_language: str = "en",
        version: str = "0.0.1",
        context: Optional[List[str]] = None,
    ): ...

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts,
    ) -> List[RecognizerResult]: ...
class PatternRecognizer(LocalRecognizer):
def __init__(
self,
supported_entity: str,
name: str = None,
supported_language: str = "en",
patterns: List[Pattern] = None,
deny_list: List[str] = None,
context: List[str] = None,
deny_list_score: float = 1.0,
global_regex_flags: Optional[int] = None,
version: str = "0.0.1"
): ...Comprehensive collection of over 50 built-in recognizers for common PII types including generic entities (emails, phone numbers, credit cards) and country-specific identifiers (SSNs, passport numbers, tax IDs).
# Generic recognizers
class CreditCardRecognizer(PatternRecognizer): ...
class EmailRecognizer(PatternRecognizer): ...
class PhoneRecognizer(PatternRecognizer): ...
class IpRecognizer(PatternRecognizer): ...
# US-specific recognizers
class UsSsnRecognizer(PatternRecognizer): ...
class UsLicenseRecognizer(PatternRecognizer): ...
class UsPassportRecognizer(PatternRecognizer): ...
# International recognizers
class IbanRecognizer(PatternRecognizer): ...
class AuMedicareRecognizer(PatternRecognizer): ...
class UkNinoRecognizer(PatternRecognizer): ...Advanced context-aware enhancement that improves detection accuracy by analyzing surrounding text using lemmatization and contextual similarity scoring.
class ContextAwareEnhancer:
def __init__(
self,
context_similarity_factor: float,
min_score_with_context_similarity: float,
context_prefix_count: int,
context_suffix_count: int
): ...
def enhance_using_context(
self,
text: str,
raw_results: List[RecognizerResult],
nlp_artifacts: NlpArtifacts,
recognizers: List[EntityRecognizer],
context: Optional[List[str]] = None
) -> List[RecognizerResult]: ...
class LemmaContextAwareEnhancer(ContextAwareEnhancer):
def __init__(
self,
context_similarity_factor: float = 0.35,
min_score_with_context_similarity: float = 0.4,
context_prefix_count: int = 5,
context_suffix_count: int = 0
): ...Flexible configuration system supporting YAML-based setup, multiple NLP engines (spaCy, Stanza, Transformers), and customizable recognizer registries.
class AnalyzerEngineProvider:
def __init__(
self,
analyzer_engine_conf_file: Optional[Union[Path, str]] = None,
nlp_engine_conf_file: Optional[Union[Path, str]] = None,
recognizer_registry_conf_file: Optional[Union[Path, str]] = None
): ...
def create_engine(self) -> AnalyzerEngine: ...
class RecognizerRegistry:
def __init__(
self,
recognizers: Optional[Iterable[EntityRecognizer]] = None,
global_regex_flags: Optional[int] = None,
supported_languages: Optional[List[str]] = None
): ...
def load_predefined_recognizers(
self,
languages: Optional[List[str]] = None,
nlp_engine: NlpEngine = None
) -> None: ...class RecognizerResult:
def __init__(
self,
entity_type: str,
start: int,
end: int,
score: float,
analysis_explanation: AnalysisExplanation = None,
recognition_metadata: Dict = None
): ...
# Properties
entity_type: str # Type of detected entity (e.g., "PERSON", "PHONE_NUMBER")
start: int # Start position in text
end: int # End position in text
score: float # Confidence score (0.0 to 1.0)
analysis_explanation: AnalysisExplanation # Detailed detection explanation
recognition_metadata: Dict # Additional recognizer-specific metadata
class DictAnalyzerResult:
key: str # Dictionary key that was analyzed
value: Union[str, List[str], dict] # Original value
recognizer_results: Union[
List[RecognizerResult],
List[List[RecognizerResult]],
Iterator[DictAnalyzerResult]
] # Detection results
class AnalysisExplanation:
def __init__(
self,
recognizer: str,
original_score: float,
pattern_name: str = None,
pattern: str = None,
validation_result: float = None,
textual_explanation: str = None,
regex_flags: int = None
): ...
# Properties
recognizer: str # Name of recognizer that made detection
original_score: float # Initial confidence score
score: float # Final confidence score (after enhancements)
pattern_name: str # Name of matching pattern (if applicable)
textual_explanation: str # Human-readable explanationclass Pattern:
def __init__(self, name: str, regex: str, score: float): ...
# Properties
name: str # Descriptive name for the pattern
regex: str # Regular expression pattern
score: float # Confidence score when pattern matches
class AnalyzerRequest:
def __init__(self, req_data: Dict): ...
# Properties extracted from req_data
text: str # Text to analyze
language: str # Language code (e.g., "en")
entities: Optional[List[str]] # Entity types to detect
correlation_id: Optional[str] # Request tracking ID
score_threshold: Optional[float] # Minimum confidence score
return_decision_process: Optional[bool] # Include analysis explanations
ad_hoc_recognizers: Optional[List[EntityRecognizer]] # Custom recognizers
context: Optional[List[str]] # Context keywords for enhancement
allow_list: Optional[List[str]] # Values to exclude from detection
allow_list_match: Optional[str] # Match strategy ("exact" or "fuzzy")
regex_flags: Optional[int] # Regex compilation flagsPresidio Analyzer uses standard Python exceptions. Common error scenarios:
Supported languages: English (en), Hebrew (he), Spanish (es), German (de), French (fr), Italian (it), Portuguese (pt), Chinese (zh), Japanese (ja), Hindi (hi), Arabic (ar).
Language-specific recognizers are automatically loaded based on the language parameter in analyze() calls. Some recognizers support multiple languages while others are region-specific.