Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums
—
Presidio Analyzer provides flexible configuration through YAML files, supporting multiple NLP engines, customizable recognizer registries, and various deployment scenarios.
Utility class for creating AnalyzerEngine instances from YAML configuration files.
class AnalyzerEngineProvider:
"""
Factory class for creating configured AnalyzerEngine instances.
Args:
analyzer_engine_conf_file: Path to analyzer configuration YAML file
nlp_engine_conf_file: Path to NLP engine configuration YAML file
recognizer_registry_conf_file: Path to recognizer registry configuration YAML file
"""
def __init__(
self,
analyzer_engine_conf_file: Optional[Union[Path, str]] = None,
nlp_engine_conf_file: Optional[Union[Path, str]] = None,
recognizer_registry_conf_file: Optional[Union[Path, str]] = None
): ...
def create_engine(self) -> AnalyzerEngine:
"""
Create and configure AnalyzerEngine from configuration files.
Returns:
Fully configured AnalyzerEngine instance
"""
def get_configuration(self, conf_file: Optional[Union[Path, str]]) -> Union[Dict[str, Any]]:
"""
Load configuration from YAML file.
Args:
conf_file: Path to configuration file
Returns:
Dictionary containing configuration data
"""
# Properties
configuration: Dict[str, Any] # Loaded configuration data
nlp_engine_conf_file: Optional[str] # Path to NLP engine configuration
recognizer_registry_conf_file: Optional[str] # Path to recognizer registry configurationRegistry that manages and organizes entity recognizers for the analyzer.
class RecognizerRegistry:
"""
Registry for managing entity recognizers.
Args:
recognizers: Initial collection of recognizers to register
global_regex_flags: Default regex compilation flags for pattern recognizers
supported_languages: List of supported language codes
"""
def __init__(
self,
recognizers: Optional[Iterable[EntityRecognizer]] = None,
global_regex_flags: Optional[int] = None, # Default: re.DOTALL | re.MULTILINE | re.IGNORECASE
supported_languages: Optional[List[str]] = None
): ...
def load_predefined_recognizers(
self,
languages: Optional[List[str]] = None,
nlp_engine: NlpEngine = None
) -> None:
"""
Load built-in recognizers into the registry.
Args:
languages: Language codes for recognizers to load (None = all supported)
nlp_engine: NLP engine instance for NLP-based recognizers
"""
def add_nlp_recognizer(self, nlp_engine: NlpEngine) -> None:
"""
Add NLP-based recognizer (spaCy, Stanza, Transformers) to registry.
Args:
nlp_engine: Configured NLP engine instance
"""
# Properties
recognizers: List[EntityRecognizer] # List of registered recognizers
global_regex_flags: Optional[int] # Default regex flags
supported_languages: Optional[List[str]] # Supported language codesFactory for creating configured NLP engine instances.
class NlpEngineProvider:
    """
    Factory class for creating NLP engine instances from configuration.

    Args:
        nlp_configuration: Dictionary containing NLP engine configuration.
    """

    def __init__(self, nlp_configuration: Optional[Dict] = None): ...

    def create_engine(self) -> NlpEngine:
        """
        Create NLP engine instance based on configuration.

        Returns:
            Configured NLP engine (spaCy, Stanza, or Transformers).
        """

    # The three helpers below originally declared `nlp_ta_prefix_list: List[str] = None`,
    # an implicit Optional; the annotation is made explicit per PEP 484.
    @staticmethod
    def create_nlp_engine_with_spacy(
        model_name: str,
        nlp_ta_prefix_list: Optional[List[str]] = None,
    ) -> SpacyNlpEngine:
        """Create spaCy-based NLP engine with specified model."""

    @staticmethod
    def create_nlp_engine_with_stanza(
        model_name: str,
        nlp_ta_prefix_list: Optional[List[str]] = None,
    ) -> StanzaNlpEngine:
        """Create Stanza-based NLP engine with specified model."""

    @staticmethod
    def create_nlp_engine_with_transformers(
        model_name: str,
        nlp_ta_prefix_list: Optional[List[str]] = None,
    ) -> TransformersNlpEngine:
        """Create Transformers-based NLP engine with specified model."""

# default_analyzer.yaml
nlp_engine_name: spacy
models:
- lang_code: en
model_name: en_core_web_lg
- lang_code: es
model_name: es_core_news_md
# Context enhancement settings
context_aware_enhancer:
enable: true
context_similarity_factor: 0.35
min_score_with_context_similarity: 0.4
context_prefix_count: 5
context_suffix_count: 0
# Default score threshold
default_score_threshold: 0.0
# Supported languages
supported_languages:
- en
- es
- fr
- de
- it

# spacy.yaml
nlp_engine_name: spacy
models:
- lang_code: en
model_name: en_core_web_lg
- lang_code: es
model_name: es_core_news_md
- lang_code: fr
model_name: fr_core_news_md
- lang_code: de
model_name: de_core_news_md
- lang_code: it
model_name: it_core_news_md

# stanza.yaml
nlp_engine_name: stanza
models:
- lang_code: en
model_name: en
- lang_code: es
model_name: es
- lang_code: fr
model_name: fr
- lang_code: de
model_name: de
- lang_code: it
model_name: it

# transformers.yaml
nlp_engine_name: transformers
models:
- lang_code: en
model_name: dslim/bert-base-NER
- lang_code: es
model_name: mrm8488/bert-spanish-cased-finetuned-ner

# default_recognizers.yaml
recognizers:
- name: "CreditCardRecognizer"
supported_language: "en"
supported_entities: ["CREDIT_CARD"]
patterns:
- name: "credit_card_visa"
regex: "4[0-9]{12}(?:[0-9]{3})?"
score: 0.9
- name: "credit_card_mastercard"
regex: "5[1-5][0-9]{14}"
score: 0.9
context: ["credit", "card", "payment"]
- name: "PhoneRecognizer"
supported_language: "en"
supported_entities: ["PHONE_NUMBER"]
patterns:
- name: "us_phone"
regex: "\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b"
score: 0.7
context: ["phone", "call", "number", "contact"]from presidio_analyzer import AnalyzerEngineProvider
# Create analyzer from default configuration
provider = AnalyzerEngineProvider()
analyzer = provider.create_engine()
# Use the configured analyzer
text = "Contact John at john@email.com or call 555-123-4567"
results = analyzer.analyze(text=text, language="en")
print(f"Found {len(results)} PII entities using default configuration")from presidio_analyzer import AnalyzerEngineProvider
# Create analyzer with custom configuration files
provider = AnalyzerEngineProvider(
analyzer_engine_conf_file="config/custom_analyzer.yaml",
nlp_engine_conf_file="config/custom_nlp.yaml",
recognizer_registry_conf_file="config/custom_recognizers.yaml"
)
analyzer = provider.create_engine()
# Test with custom configuration
text = "Custom entity detection test"
results = analyzer.analyze(text=text, language="en")from presidio_analyzer import (
AnalyzerEngine, RecognizerRegistry, LemmaContextAwareEnhancer
)
from presidio_analyzer.nlp_engine import SpacyNlpEngine
# Configure NLP engine
nlp_engine = SpacyNlpEngine(models={"en": "en_core_web_lg"})
# Configure recognizer registry
registry = RecognizerRegistry(supported_languages=["en"])
registry.load_predefined_recognizers(languages=["en"], nlp_engine=nlp_engine)
# Configure context enhancement
enhancer = LemmaContextAwareEnhancer(
context_similarity_factor=0.4,
min_score_with_context_similarity=0.3
)
# Create analyzer with custom configuration
analyzer = AnalyzerEngine(
registry=registry,
nlp_engine=nlp_engine,
context_aware_enhancer=enhancer,
default_score_threshold=0.5,
supported_languages=["en"]
)

from presidio_analyzer import AnalyzerEngineProvider
import yaml
# Create multi-language configuration
multilingual_config = {
'nlp_engine_name': 'spacy',
'models': [
{'lang_code': 'en', 'model_name': 'en_core_web_lg'},
{'lang_code': 'es', 'model_name': 'es_core_news_md'},
{'lang_code': 'fr', 'model_name': 'fr_core_news_md'},
{'lang_code': 'de', 'model_name': 'de_core_news_md'}
],
'supported_languages': ['en', 'es', 'fr', 'de'],
'default_score_threshold': 0.6
}
# Save configuration to file
with open('multilingual_config.yaml', 'w') as f:
yaml.dump(multilingual_config, f)
# Create analyzer from configuration
provider = AnalyzerEngineProvider(
analyzer_engine_conf_file='multilingual_config.yaml'
)
analyzer = provider.create_engine()
# Test with different languages
texts = {
'en': "Contact John Smith at john@email.com",
'es': "Contacta con Juan en juan@email.com",
'fr': "Contactez Jean à jean@email.com",
'de': "Kontaktieren Sie Johann unter johann@email.com"
}
for language, text in texts.items():
results = analyzer.analyze(text=text, language=language)
print(f"{language}: Found {len(results)} entities")from presidio_analyzer import AnalyzerEngineProvider
import os
from pathlib import Path
def create_analyzer_from_environment():
    """Create analyzer using environment-specific configuration."""
    # Resolve each configuration path from its environment variable,
    # falling back to conventional names under PRESIDIO_CONFIG_DIR.
    base_dir = os.getenv('PRESIDIO_CONFIG_DIR', 'config')
    analyzer_config = os.getenv(
        'PRESIDIO_ANALYZER_CONFIG', f'{base_dir}/analyzer.yaml'
    )
    nlp_config = os.getenv(
        'PRESIDIO_NLP_CONFIG', f'{base_dir}/nlp.yaml'
    )
    recognizer_config = os.getenv(
        'PRESIDIO_RECOGNIZER_CONFIG', f'{base_dir}/recognizers.yaml'
    )
    # Warn (but do not fail) for any missing configuration file.
    for candidate in (analyzer_config, nlp_config, recognizer_config):
        if not Path(candidate).exists():
            print(f"Warning: Configuration file not found: {candidate}")
    # Build the engine from the resolved configuration files.
    return AnalyzerEngineProvider(
        analyzer_engine_conf_file=analyzer_config,
        nlp_engine_conf_file=nlp_config,
        recognizer_registry_conf_file=recognizer_config,
    ).create_engine()
# Usage with environment variables
# export PRESIDIO_CONFIG_DIR=/etc/presidio
# export PRESIDIO_ANALYZER_CONFIG=/etc/presidio/production_analyzer.yaml
analyzer = create_analyzer_from_environment()

from presidio_analyzer import AnalyzerEngineProvider
import yaml
import os
def create_docker_optimized_analyzer():
    """Create analyzer optimized for Docker deployment."""
    # Small model and a single language keep container image and RAM modest.
    container_config = {
        'nlp_engine_name': 'spacy',
        'models': [
            {'lang_code': 'en', 'model_name': 'en_core_web_sm'}  # Smaller model for containers
        ],
        'supported_languages': ['en'],
        'default_score_threshold': 0.5,
        'context_aware_enhancer': {
            'enable': True,
            'context_similarity_factor': 0.35,
            'min_score_with_context_similarity': 0.4,
        },
    }
    # Persist configuration to the container filesystem, then build the engine.
    config_path = '/tmp/docker_analyzer_config.yaml'
    with open(config_path, 'w') as f:
        yaml.dump(container_config, f)
    return AnalyzerEngineProvider(
        analyzer_engine_conf_file=config_path
    ).create_engine()
# Docker deployment usage
analyzer = create_docker_optimized_analyzer()

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import SpacyNlpEngine
def create_high_performance_analyzer():
    """Create analyzer optimized for high-throughput scenarios."""
    # Lightweight spaCy model keeps per-document latency low.
    nlp_engine = SpacyNlpEngine(
        models={"en": "en_core_web_sm"},  # Smaller, faster model
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        },
    )
    registry = RecognizerRegistry(supported_languages=["en"])
    registry.load_predefined_recognizers(
        languages=["en"], nlp_engine=nlp_engine
    )
    # Keep only high-confidence, fast recognizers.
    keep = {
        "EmailRecognizer",
        "PhoneRecognizer",
        "CreditCardRecognizer",
        "UsSsnRecognizer",
    }
    registry.recognizers = [
        rec for rec in registry.recognizers if rec.name in keep
    ]
    # Context enhancement off and threshold raised: speed and precision first.
    return AnalyzerEngine(
        registry=registry,
        nlp_engine=nlp_engine,
        context_aware_enhancer=None,  # Disable for performance
        default_score_threshold=0.7,  # Higher threshold for precision
    )
# High-performance deployment
analyzer = create_high_performance_analyzer()

from presidio_analyzer import (
AnalyzerEngineProvider, PatternRecognizer, Pattern, RecognizerRegistry
)
import yaml
def create_custom_recognizer_config():
    """Create configuration with custom recognizers."""
    # Recognizer for internal employee identifiers (EMP-NNNNN style).
    employee_id_recognizer = {
        'name': 'CustomEmployeeIdRecognizer',
        'supported_language': 'en',
        'supported_entities': ['EMPLOYEE_ID'],
        'patterns': [
            {'name': 'emp_id_pattern_1', 'regex': r'\bEMP-\d{5}\b', 'score': 0.9},
            {
                'name': 'emp_id_pattern_2',
                'regex': r'\b[Ee]mployee\s*[Ii][Dd]\s*:?\s*(\d{5})\b',
                'score': 0.8,
            },
        ],
        'context': ['employee', 'staff', 'worker', 'personnel'],
    }
    # Recognizer for catalog product codes (PRD-XX-NNNN style).
    product_code_recognizer = {
        'name': 'CustomProductCodeRecognizer',
        'supported_language': 'en',
        'supported_entities': ['PRODUCT_CODE'],
        'patterns': [
            {'name': 'product_code_pattern', 'regex': r'\bPRD-[A-Z]{2}-\d{4}\b', 'score': 0.9},
        ],
        'context': ['product', 'item', 'catalog', 'inventory'],
    }
    # Persist the recognizer definitions so the provider can load them.
    with open('custom_recognizers.yaml', 'w') as f:
        yaml.dump(
            {'recognizers': [employee_id_recognizer, product_code_recognizer]}, f
        )
    # Build an analyzer whose registry is driven by the custom YAML file.
    provider = AnalyzerEngineProvider(
        recognizer_registry_conf_file='custom_recognizers.yaml'
    )
    return provider.create_engine()
# Usage with custom recognizers
analyzer = create_custom_recognizer_config()
test_text = "Employee ID: 12345 ordered product PRD-AB-1234"
results = analyzer.analyze(text=test_text, language="en")
for result in results:
detected_text = test_text[result.start:result.end]
print(f"Found {result.entity_type}: '{detected_text}'")

from presidio_analyzer import AnalyzerEngineProvider
import yaml
from pathlib import Path
def validate_configuration(config_file: str) -> bool:
    """Validate analyzer configuration file.

    Checks that the file exists, parses as YAML, declares the required
    top-level fields, names a known NLP engine, and defines at least one
    model with both 'lang_code' and 'model_name'. As a final smoke test,
    instantiates an AnalyzerEngine from the file.

    Args:
        config_file: Path to the analyzer configuration YAML file.

    Returns:
        True if the configuration is valid, False otherwise.
    """
    try:
        # Check if file exists
        if not Path(config_file).exists():
            print(f"Error: Configuration file not found: {config_file}")
            return False
        # Load and validate YAML syntax
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        # An empty file parses to None, and a scalar/list root is unusable;
        # fail with a clear message instead of a TypeError on `in config`.
        if not isinstance(config, dict):
            print("Error: Configuration root must be a YAML mapping")
            return False
        # Validate required fields
        required_fields = ['nlp_engine_name', 'models', 'supported_languages']
        for field in required_fields:
            if field not in config:
                print(f"Error: Missing required field: {field}")
                return False
        # Validate NLP engine name
        valid_engines = ['spacy', 'stanza', 'transformers']
        if config['nlp_engine_name'] not in valid_engines:
            print(f"Error: Invalid NLP engine: {config['nlp_engine_name']}")
            return False
        # Validate models configuration
        if not isinstance(config['models'], list) or not config['models']:
            print("Error: Models must be a non-empty list")
            return False
        for model in config['models']:
            # Each entry must be a mapping carrying both required keys.
            if (
                not isinstance(model, dict)
                or 'lang_code' not in model
                or 'model_name' not in model
            ):
                print("Error: Each model must have 'lang_code' and 'model_name'")
                return False
        # Smoke test: building the engine exercises the full configuration.
        provider = AnalyzerEngineProvider(analyzer_engine_conf_file=config_file)
        provider.create_engine()  # result discarded; successful creation is the check
        print(f"Configuration validation successful: {config_file}")
        return True
    except yaml.YAMLError as e:
        print(f"YAML syntax error: {e}")
        return False
    except Exception as e:
        print(f"Configuration error: {e}")
        return False
# Validate configuration before deployment
config_file = "config/analyzer.yaml"
if validate_configuration(config_file):
provider = AnalyzerEngineProvider(analyzer_engine_conf_file=config_file)
analyzer = provider.create_engine()
print("Analyzer created successfully")
else:
print("Configuration validation failed")en_core_web_sm) for faster processingInstall with Tessl CLI
npx tessl i tessl/pypi-presidio-analyzer