Presidio Analyzer: a Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums.

The entity recognizer framework provides the foundation for creating custom PII detection logic. It includes abstract base classes, pattern-based recognizers, and integration capabilities for remote services.

EntityRecognizer is the abstract base class that defines the interface for all PII entity recognizers in Presidio Analyzer.
class EntityRecognizer:
    """
    Abstract base class for all PII entity recognizers.

    Args:
        supported_entities: List of entity types this recognizer can detect
        name: Unique identifier for the recognizer (auto-generated if None)
        supported_language: Primary language code supported (default: "en")
        version: Version string for the recognizer
        context: Optional context keywords that improve detection accuracy
    """

    def __init__(
        self,
        supported_entities: List[str],
        name: Optional[str] = None,
        supported_language: str = "en",
        version: str = "0.0.1",
        context: Optional[List[str]] = None
    ): ...

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Abstract method: Analyze text to detect PII entities.

        Args:
            text: Input text to analyze
            entities: List of entity types to look for
            nlp_artifacts: Pre-processed NLP data (tokens, lemmas, etc.)

        Returns:
            List of RecognizerResult objects for detected entities
        """

    def load(self) -> None:
        """Abstract method: Initialize recognizer resources (models, patterns, etc.)"""

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List["EntityRecognizer"],
        context: Optional[List[str]] = None
    ) -> List[RecognizerResult]:
        """
        Enhance detection results using contextual information.
        Can be overridden by subclasses for custom enhancement logic.

        Args:
            text: Original input text
            raw_results: Initial detection results
            nlp_artifacts: NLP processing artifacts
            recognizers: All available recognizers
            context: Context keywords for enhancement

        Returns:
            Enhanced list of RecognizerResult objects
        """

    def get_supported_entities(self) -> List[str]:
        """Get list of entity types this recognizer supports."""

    def get_supported_language(self) -> str:
        """Get primary supported language code."""

    def get_version(self) -> str:
        """Get recognizer version string."""

    def to_dict(self) -> Dict:
        """Serialize recognizer configuration to dictionary."""

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "EntityRecognizer":
        """Create recognizer instance from dictionary configuration."""

    @staticmethod
    def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]:
        """Remove duplicate results based on entity type and position."""

    @staticmethod
    def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Clean input text using replacement patterns."""

    # Properties
    supported_entities: List[str]   # Entity types this recognizer detects
    name: str                       # Unique recognizer identifier
    supported_language: str         # Primary language code
    version: str                    # Version string
    is_loaded: bool                 # Whether recognizer resources are loaded
    context: Optional[List[str]]    # Context keywords for enhancement
    id: str                         # Unique instance identifier

    # Constants
    MIN_SCORE = 0    # Minimum confidence score
    MAX_SCORE = 1.0  # Maximum confidence score

# Abstract class for recognizers that run in the same process as the AnalyzerEngine.
class LocalRecognizer(EntityRecognizer):
    """
    Abstract base class for recognizers that execute locally within the analyzer process.

    Inherits all methods and properties from EntityRecognizer.
    """

    pass

# Concrete implementation for pattern-based PII detection using regular expressions and deny lists.
class PatternRecognizer(LocalRecognizer):
    """
    PII entity recognizer using regular expressions and deny lists.

    Args:
        supported_entity: Single entity type this recognizer detects
        name: Unique identifier for the recognizer
        supported_language: Language code (default: "en")
        patterns: List of Pattern objects containing regex rules
        deny_list: List of strings that should always be detected
        context: Context keywords that improve detection accuracy
        deny_list_score: Confidence score for deny list matches (default: 1.0)
        global_regex_flags: Default regex compilation flags
        version: Version string
    """

    def __init__(
        self,
        supported_entity: str,
        name: Optional[str] = None,
        supported_language: str = "en",
        patterns: Optional[List[Pattern]] = None,
        deny_list: Optional[List[str]] = None,
        context: Optional[List[str]] = None,
        deny_list_score: float = 1.0,
        global_regex_flags: Optional[int] = None,  # Default: re.DOTALL | re.MULTILINE | re.IGNORECASE
        version: str = "0.0.1"
    ): ...

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: Optional[NlpArtifacts] = None,
        regex_flags: Optional[int] = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using configured patterns and deny lists.

        Args:
            text: Input text to analyze
            entities: Entity types to detect (must include supported_entity)
            nlp_artifacts: Pre-processed NLP data (optional for pattern matching)
            regex_flags: Override default regex compilation flags

        Returns:
            List of RecognizerResult objects for pattern matches
        """

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate pattern match using custom logic (override in subclasses).

        Args:
            pattern_text: Matched text from pattern

        Returns:
            True if valid, False if invalid, None if no validation performed
        """

    def invalidate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Check if pattern match should be invalidated (override in subclasses).

        Args:
            pattern_text: Matched text from pattern

        Returns:
            True if should be invalidated, False if valid, None if no check performed
        """

    @staticmethod
    def build_regex_explanation(
        recognizer_name: str,
        pattern_name: str,
        pattern: str,
        original_score: float,
        validation_result: Optional[bool] = None
    ) -> AnalysisExplanation:
        """Build detailed explanation for regex-based detection."""

    def to_dict(self) -> Dict:
        """Serialize pattern recognizer configuration to dictionary."""

    @classmethod
    def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
        """Create PatternRecognizer from dictionary configuration."""

    # Properties
    patterns: List[Pattern]            # List of regex Pattern objects
    deny_list: List[str]               # List of strings that indicate PII
    context: Optional[List[str]]       # Context keywords for enhancement
    deny_list_score: float             # Confidence score for deny list matches
    global_regex_flags: Optional[int]  # Default regex compilation flags

# Abstract class for recognizers that call external services or run in separate processes.
class RemoteRecognizer(EntityRecognizer):
    """
    Abstract base class for recognizers that call external services.

    Args:
        supported_entities: List of entity types this recognizer can detect
        name: Unique identifier for the recognizer
        supported_language: Language code
        version: Version string
        context: Optional context keywords
    """

    def __init__(
        self,
        supported_entities: List[str],
        name: Optional[str],
        supported_language: str,
        version: str,
        context: Optional[List[str]] = None
    ): ...

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Abstract method: Call external service for PII detection.
        Must be implemented by concrete subclasses.
        """

    def get_supported_entities(self) -> List[str]:
        """Abstract method: Get supported entities from external service."""

# Represents a regular expression pattern used by PatternRecognizer.
class Pattern:
    """
    Regular expression pattern for PII detection.

    Args:
        name: Descriptive name for the pattern
        regex: Regular expression string
        score: Confidence score when pattern matches (0.0-1.0)
    """

    def __init__(self, name: str, regex: str, score: float): ...

    def to_dict(self) -> Dict:
        """Serialize pattern to dictionary format."""

    @classmethod
    def from_dict(cls, pattern_dict: Dict) -> "Pattern":
        """Create Pattern from dictionary data."""

    # Properties
    name: str                        # Descriptive pattern name
    regex: str                       # Regular expression string
    score: float                     # Confidence score for matches
    compiled_regex: re.Pattern       # Compiled regex object
    compiled_with_flags: re.Pattern  # Compiled regex with flags

from presidio_analyzer import PatternRecognizer, Pattern
# --- Custom-entity example: detect employee IDs with regex patterns ---

# Two pattern variants: a bare "EMP-#####" token and a labelled numeric id.
employee_id_patterns = [
    Pattern(name="employee_id_format_1", regex=r"\bEMP-\d{5}\b", score=0.9),
    Pattern(
        name="employee_id_format_2",
        regex=r"\b[Ee]mployee\s*[Ii][Dd]\s*:?\s*(\d{5})\b",
        score=0.8,
    ),
]

# Wire the patterns into a recognizer for the EMPLOYEE_ID entity type.
employee_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    name="EmployeeIdRecognizer",
    patterns=employee_id_patterns,
    context=["employee", "staff", "worker", "personnel"],
)

# Run the recognizer over a sample sentence.
from presidio_analyzer.nlp_engine import SpacyNlpEngine

nlp_engine = SpacyNlpEngine()
nlp_engine.load()

text = "Contact employee ID: 12345 or use EMP-98765"
nlp_artifacts = nlp_engine.process_text(text, "en")

results = employee_recognizer.analyze(
    text=text, entities=["EMPLOYEE_ID"], nlp_artifacts=nlp_artifacts
)
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}' (score: {result.score})")

from presidio_analyzer import PatternRecognizer
# --- Deny-list example: flag sensitive document markings ---

# Terms that should always be reported, regardless of surrounding context.
sensitive_terms_recognizer = PatternRecognizer(
    supported_entity="SENSITIVE_TERM",
    name="SensitiveTermsRecognizer",
    deny_list=[
        "confidential",
        "classified",
        "internal use only",
        "proprietary",
    ],
    deny_list_score=0.95,
)

# Deny-list matching is plain string search, so NLP artifacts are unnecessary.
text = "This document is marked as confidential and internal use only"
results = sensitive_terms_recognizer.analyze(
    text=text,
    entities=["SENSITIVE_TERM"],
    nlp_artifacts=None,
)
print(f"Found {len(results)} sensitive terms")

from presidio_analyzer import PatternRecognizer, Pattern
import re
class CustomCreditCardRecognizer(PatternRecognizer):
    """Custom credit card recognizer with Luhn algorithm validation.

    A broad 16-digit regex finds candidates with modest confidence; the
    Luhn checksum then confirms or rejects each match.
    """

    def __init__(self):
        patterns = [
            Pattern(
                name="credit_card_generic",
                regex=r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
                score=0.6,  # Lower initial score, validation will increase
            )
        ]
        super().__init__(
            supported_entity="CREDIT_CARD",
            name="CustomCreditCardRecognizer",
            patterns=patterns,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """Validate a candidate card number using the Luhn algorithm.

        Args:
            pattern_text: Raw text matched by the regex; may contain
                dash or space separators.

        Returns:
            True if the 16 digits pass the Luhn check, False otherwise.
        """
        # Strip separator characters so only the digits remain.
        card_digits = re.sub(r'[-\s]', '', pattern_text)
        if not card_digits.isdigit() or len(card_digits) != 16:
            return False
        return self._luhn_check(card_digits)

    @staticmethod
    def _luhn_check(card_digits: str) -> bool:
        """Return True if ``card_digits`` satisfies the Luhn checksum."""
        total = 0
        # Walk from the rightmost digit; double every second digit and
        # subtract 9 when the doubled value exceeds 9 (equivalent to
        # summing the decimal digits of the doubled value).
        for position, ch in enumerate(reversed(card_digits)):
            value = int(ch)
            if position % 2 == 1:
                value *= 2
                if value > 9:
                    value -= 9
            total += value
        return total % 10 == 0
# --- Exercise the custom recognizer on sample card numbers ---
recognizer = CustomCreditCardRecognizer()

# One Luhn-valid and one Luhn-invalid 16-digit number.
text = "Valid: 4532015112830366, Invalid: 1234567890123456"
results = recognizer.analyze(text=text, entities=["CREDIT_CARD"], nlp_artifacts=None)

for result in results:
    card_num = text[result.start:result.end]
    print(f"Credit card: {card_num}, Score: {result.score}")

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
# --- Register a custom recognizer alongside the built-in ones ---
custom_recognizer = PatternRecognizer(
    supported_entity="PRODUCT_CODE",
    name="ProductCodeRecognizer",
    patterns=[
        Pattern(
            name="product_code_pattern",
            regex=r"\bPRD-[A-Z]{2}-\d{4}\b",
            score=0.9,
        )
    ],
)

registry = RecognizerRegistry()
# Load the default recognizers first, then add the custom one through the
# public add_recognizer API rather than mutating the internal
# ``registry.recognizers`` list directly.
registry.load_predefined_recognizers(languages=["en"])
registry.add_recognizer(custom_recognizer)

# Create analyzer backed by the combined registry.
analyzer = AnalyzerEngine(registry=registry)

# Test analysis: both the custom entity and built-in ones (e.g. EMAIL) hit.
text = "Order product PRD-AB-1234 and contact john@email.com"
results = analyzer.analyze(text=text, language="en")
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}'")

from presidio_analyzer import RemoteRecognizer, RecognizerResult
import requests
class APIBasedRecognizer(RemoteRecognizer):
    """Example remote recognizer that calls an external PII-detection API.

    Args:
        api_endpoint: Base URL of the remote service.
        api_key: Bearer token used to authenticate every request.
        timeout: Per-request timeout in seconds. Without a timeout,
            ``requests`` waits indefinitely on an unresponsive service.
    """

    def __init__(self, api_endpoint: str, api_key: str, timeout: float = 10.0):
        super().__init__(
            supported_entities=["CUSTOM_PII"],
            name="APIBasedRecognizer",
            supported_language="en",
            version="1.0.0"
        )
        self.api_endpoint = api_endpoint
        self.api_key = api_key
        self.timeout = timeout

    def _auth_headers(self) -> dict:
        """Build the Authorization header shared by every request."""
        return {"Authorization": f"Bearer {self.api_key}"}

    def load(self) -> None:
        """Initialize connection to remote service (fail fast on bad config).

        Raises:
            ConnectionError: If the service health check does not return 200.
        """
        response = requests.get(
            f"{self.api_endpoint}/health",
            headers=self._auth_headers(),
            timeout=self.timeout,
        )
        if response.status_code != 200:
            raise ConnectionError("Cannot connect to remote PII service")

    def analyze(self, text: str, entities: List[str], nlp_artifacts) -> List[RecognizerResult]:
        """Call the remote API and convert its detections to RecognizerResults.

        Returns an empty list when CUSTOM_PII was not requested or the
        service does not answer with HTTP 200 (best-effort behavior).
        """
        if "CUSTOM_PII" not in entities:
            return []
        payload = {"text": text, "entities": entities}
        response = requests.post(
            f"{self.api_endpoint}/analyze",
            json=payload,
            headers=self._auth_headers(),
            timeout=self.timeout,
        )
        results = []
        if response.status_code == 200:
            api_results = response.json()
            for detection in api_results.get("detections", []):
                results.append(
                    RecognizerResult(
                        entity_type=detection["entity_type"],
                        start=detection["start"],
                        end=detection["end"],
                        score=detection["score"],
                    )
                )
        return results

    def get_supported_entities(self) -> List[str]:
        """Query supported entities from the service; fall back to the local list."""
        response = requests.get(
            f"{self.api_endpoint}/entities",
            headers=self._auth_headers(),
            timeout=self.timeout,
        )
        if response.status_code == 200:
            return response.json().get("entities", [])
        return self.supported_entities
# Usage (assuming you have an API endpoint)
# remote_recognizer = APIBasedRecognizer(
# api_endpoint="https://api.example.com/pii",
# api_key="your-api-key"
# )
# remote_recognizer.load()

from presidio_analyzer import PatternRecognizer, Pattern
import yaml
def create_recognizer_from_config(config_file: str) -> PatternRecognizer:
    """Build a PatternRecognizer from a YAML configuration file.

    Args:
        config_file: Path to a YAML file describing the recognizer
            (entity type, name, patterns, deny list, context, language).

    Returns:
        A PatternRecognizer configured according to the file contents.
    """
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    # Translate each pattern mapping into a Pattern object.
    patterns = [
        Pattern(
            name=entry['name'],
            regex=entry['regex'],
            score=entry['score'],
        )
        for entry in config.get('patterns', [])
    ]

    return PatternRecognizer(
        supported_entity=config['entity_type'],
        name=config['name'],
        patterns=patterns,
        deny_list=config.get('deny_list', []),
        context=config.get('context', []),
        supported_language=config.get('language', 'en'),
    )
# Example YAML configuration file (recognizer_config.yaml) consumed by
# create_recognizer_from_config. Kept as a bare module-level string for
# documentation only; regex backslashes are doubled for YAML escaping.
"""
name: "CustomBankAccountRecognizer"
entity_type: "BANK_ACCOUNT"
language: "en"
patterns:
  - name: "routing_account_pattern"
    regex: "\\b\\d{9}[-\\s]\\d{10,12}\\b"
    score: 0.8
  - name: "account_number_pattern"
    regex: "Account\\s*:?\\s*(\\d{10,12})"
    score: 0.7
deny_list:
  - "0000000000"
  - "1111111111"
context:
  - "account"
  - "banking"
  - "routing"
"""
# Create recognizer from configuration
# recognizer = create_recognizer_from_config("recognizer_config.yaml")
# Tip: anchor regex patterns with word boundaries (\b) to avoid partial matches.

Install with Tessl CLI:
npx tessl i tessl/pypi-presidio-analyzer