Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums
—
Context-aware enhancement improves PII detection accuracy by analyzing surrounding text and using contextual clues to boost confidence scores for likely PII entities.
Abstract base class for implementing context-aware enhancement logic.
class ContextAwareEnhancer:
    """
    Abstract base class for context-aware enhancement implementations.

    This is a signature-only reference stub: method bodies are intentionally
    omitted.

    Args:
        context_similarity_factor: Weight factor for context similarity (0.0-1.0)
        min_score_with_context_similarity: Minimum score required for context enhancement
        context_prefix_count: Number of words to analyze before detected entity
        context_suffix_count: Number of words to analyze after detected entity
    """

    def __init__(
        self,
        context_similarity_factor: float,
        min_score_with_context_similarity: float,
        context_prefix_count: int,
        context_suffix_count: int
    ): ...

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None
    ) -> List[RecognizerResult]:
        """
        Abstract method: Enhance detection results using contextual information.

        Args:
            text: Original input text
            raw_results: Initial detection results from recognizers
            nlp_artifacts: NLP processing artifacts (tokens, lemmas, etc.)
            recognizers: List of all available recognizers
            context: Optional context keywords for enhancement

        Returns:
            Enhanced list of RecognizerResult objects with improved scores
        """

    # Properties
    context_similarity_factor: float  # Weight for context similarity scoring
    min_score_with_context_similarity: float  # Minimum score threshold for enhancement
    context_prefix_count: int  # Words to analyze before entity
    context_suffix_count: int  # Words to analyze after entity

    # Constants
    MIN_SCORE = 0  # Minimum confidence score
    MAX_SCORE = 1.0  # Maximum confidence score


# Concrete implementation that uses lemmatization for context-aware enhancement.
class LemmaContextAwareEnhancer(ContextAwareEnhancer):
    """
    Context-aware enhancer using lemma-based similarity analysis.

    This is a signature-only reference stub: method bodies are intentionally
    omitted.

    Args:
        context_similarity_factor: Weight factor for similarity scoring (default: 0.35)
        min_score_with_context_similarity: Minimum score for enhancement (default: 0.4)
        context_prefix_count: Words to analyze before entity (default: 5)
        context_suffix_count: Words to analyze after entity (default: 0)
    """

    def __init__(
        self,
        context_similarity_factor: float = 0.35,
        min_score_with_context_similarity: float = 0.4,
        context_prefix_count: int = 5,
        context_suffix_count: int = 0
    ): ...

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None
    ) -> List[RecognizerResult]:
        """
        Enhance results using lemma-based context analysis.

        Compares lemmatized forms of surrounding words with recognizer context
        keywords to identify supporting contextual evidence.

        Args:
            text: Original input text
            raw_results: Initial detection results
            nlp_artifacts: NLP processing results with lemmas
            recognizers: Available recognizers with context keywords
            context: Additional context keywords for this analysis

        Returns:
            Enhanced RecognizerResult list with boosted confidence scores
        """

    @staticmethod
    def _find_supportive_word_in_context(
        context_list: List[str],
        recognizer_context_list: List[str]
    ) -> str:
        """
        Find context words that support PII detection.

        Args:
            context_list: Surrounding words from text
            recognizer_context_list: Context keywords from recognizer

        Returns:
            First matching supportive word or empty string
        """

    def _extract_surrounding_words(
        self,
        nlp_artifacts: NlpArtifacts,
        word: str,
        start: int
    ) -> List[str]:
        """
        Extract surrounding words from NLP artifacts.

        Args:
            nlp_artifacts: NLP processing results
            word: Target word/entity
            start: Start position of entity in text

        Returns:
            List of surrounding word lemmas
        """


from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
# Create context enhancer with custom settings
enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.45,  # Stronger context influence
    min_score_with_context_similarity=0.3,  # Lower threshold for enhancement
    context_prefix_count=3,  # Look at 3 words before
    context_suffix_count=2  # Look at 2 words after
)

# Initialize analyzer with context enhancement
analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)

# Analyze text with context benefit
text = "Please update my phone number to 555-0199 in the system"
# The analyzer only attaches analysis_explanation to results when the decision
# process is requested; without this flag the context report below never prints.
results = analyzer.analyze(text=text, language="en", return_decision_process=True)

for result in results:
    detected_text = text[result.start:result.end]
    print(f"Entity: {result.entity_type}")
    print(f"Text: '{detected_text}'")
    print(f"Score: {result.score:.3f}")
    explanation = result.analysis_explanation
    # supportive_context_word is only set when a context word backed the match,
    # so it (not textual_explanation) tells us whether a boost happened.
    if explanation and explanation.supportive_context_word:
        print(
            f"Context boost: {explanation.score_context_improvement:+.3f} "
            f"(supporting word: '{explanation.supportive_context_word}')"
        )

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
# Setup context-aware analyzer
enhancer = LemmaContextAwareEnhancer()
analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)

# Text with ambiguous numbers
text = "My new contact is 555-0123 and my employee ID is 98765"

# Provide context to help distinguish phone numbers from other numbers
context_keywords = [
    "contact", "phone", "call", "number", "telephone", "mobile", "cell"
]

results = analyzer.analyze(
    text=text,
    language="en",
    context=context_keywords,
    # Needed so each result keeps its analysis_explanation for the report below
    return_decision_process=True
)

# Context should help boost phone number confidence
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}' (score: {result.score:.3f})")
    explanation = result.analysis_explanation
    # Report an enhancement only when a context word actually supported the match
    if explanation and explanation.supportive_context_word:
        print(
            f" Enhancement: {explanation.score_context_improvement:+.3f} "
            f"via context word '{explanation.supportive_context_word}'"
        )

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
# Create analyzer without context enhancement.
# AnalyzerEngine() builds a default LemmaContextAwareEnhancer when none is passed,
# so a bare AnalyzerEngine() is NOT a context-free baseline. Using a zero boost
# and a zero score floor should leave scores unchanged even when a context word
# matches.
analyzer_basic = AnalyzerEngine(
    context_aware_enhancer=LemmaContextAwareEnhancer(
        context_similarity_factor=0.0,
        min_score_with_context_similarity=0.0
    )
)

# Create analyzer with context enhancement
enhancer = LemmaContextAwareEnhancer()
analyzer_enhanced = AnalyzerEngine(context_aware_enhancer=enhancer)

# Test text with contextual clues
text = "The patient's medical record shows phone: 555-0199"

# Analyze without context enhancement
basic_results = analyzer_basic.analyze(text=text, language="en")

# Analyze with context enhancement; request the decision process so that
# analysis_explanation is present on the results printed below
enhanced_results = analyzer_enhanced.analyze(
    text=text,
    language="en",
    return_decision_process=True
)

print("Without context enhancement:")
for result in basic_results:
    if result.entity_type == "PHONE_NUMBER":
        print(f" Phone score: {result.score:.3f}")

print("\nWith context enhancement:")
for result in enhanced_results:
    if result.entity_type == "PHONE_NUMBER":
        print(f" Phone score: {result.score:.3f}")
        if result.analysis_explanation:
            print(f" Explanation: {result.analysis_explanation.textual_explanation}")

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer, PatternRecognizer, Pattern
# Create custom recognizer with context keywords
employee_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    name="EmployeeRecognizer",
    patterns=[Pattern("emp_id", r"\b\d{5}\b", 0.6)],
    context=["employee", "staff", "worker", "personnel", "emp"]
)

# Setup context-aware analysis
enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.4,
    min_score_with_context_similarity=0.3
)

# Create analyzer with custom recognizer and context enhancement
from presidio_analyzer import RecognizerRegistry

registry = RecognizerRegistry()
registry.recognizers.append(employee_recognizer)
registry.load_predefined_recognizers(languages=["en"])
analyzer = AnalyzerEngine(
    registry=registry,
    context_aware_enhancer=enhancer
)

# Test text with multiple contextual entities
text = """
HR Records:
- Employee John Smith (ID: 12345)
- Contact phone: 555-0199
- SSN for tax purposes: 123-45-6789
"""

# Request the decision process: analysis_explanation is otherwise removed from
# the results and the "enhanced" flag below could never be True.
results = analyzer.analyze(text=text, language="en", return_decision_process=True)

# Show how context affects different entity types
entity_scores = {}
for result in results:
    detected_text = text[result.start:result.end]
    explanation = result.analysis_explanation
    entity_scores.setdefault(result.entity_type, []).append({
        'text': detected_text,
        'score': result.score,
        # A supportive context word is what marks a context-driven boost;
        # textual_explanation is also filled in by ordinary pattern matches.
        'enhanced': bool(explanation and explanation.supportive_context_word)
    })

for entity_type, detections in entity_scores.items():
    print(f"\n{entity_type}:")
    for detection in detections:
        enhancement_marker = " (enhanced)" if detection['enhanced'] else ""
        print(f" '{detection['text']}': {detection['score']:.3f}{enhancement_marker}")

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
def test_context_parameters(text, context_params):
    """Test different context enhancement parameters.

    Args:
        text: Input text to analyze with every configuration.
        context_params: Mapping of configuration name to the keyword arguments
            passed to ``LemmaContextAwareEnhancer``.

    Returns:
        Dict mapping each configuration name to a list of dicts with the keys
        ``'entity_type'``, ``'score'`` and ``'enhanced'`` (one per detection).
    """
    results_comparison = {}
    for name, params in context_params.items():
        enhancer = LemmaContextAwareEnhancer(**params)
        analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)
        # analysis_explanation is only kept on results when the decision
        # process is requested, so ask for it explicitly.
        results = analyzer.analyze(
            text=text,
            language="en",
            return_decision_process=True
        )
        results_comparison[name] = [
            {
                'entity_type': result.entity_type,
                'score': result.score,
                # Enhanced means a context word supported the detection
                'enhanced': bool(result.analysis_explanation and
                                 result.analysis_explanation.supportive_context_word)
            }
            for result in results
        ]
    return results_comparison
# Sentence analyzed under every configuration
text = "Customer service representative phone number is 555-0123"

# Enhancer keyword arguments per named configuration, from most to least strict
context_configs = {
    'conservative': dict(
        context_similarity_factor=0.2,
        min_score_with_context_similarity=0.6,
        context_prefix_count=3,
        context_suffix_count=0,
    ),
    'balanced': dict(
        context_similarity_factor=0.35,
        min_score_with_context_similarity=0.4,
        context_prefix_count=5,
        context_suffix_count=0,
    ),
    'aggressive': dict(
        context_similarity_factor=0.5,
        min_score_with_context_similarity=0.2,
        context_prefix_count=7,
        context_suffix_count=3,
    ),
}

# Run every configuration and print one section per configuration
comparison = test_context_parameters(text, context_configs)
for config_name, detections in comparison.items():
    print(f"\n{config_name.upper()} configuration:")
    for detection in detections:
        if detection['enhanced']:
            suffix = " (enhanced)"
        else:
            suffix = ""
        print(f" {detection['entity_type']}: {detection['score']:.3f}{suffix}")

from presidio_analyzer import (
    AnalyzerEngine, LemmaContextAwareEnhancer, PatternRecognizer,
    Pattern, RecognizerRegistry
)
# Create domain-specific recognizer with context
medical_id_recognizer = PatternRecognizer(
    supported_entity="MEDICAL_ID",
    name="MedicalIdRecognizer",
    patterns=[Pattern("medical_id", r"\bMED-\d{6}\b", 0.7)],
    context=["medical", "patient", "healthcare", "diagnosis", "treatment", "hospital"]
)
patient_id_recognizer = PatternRecognizer(
    supported_entity="PATIENT_ID",
    name="PatientIdRecognizer",
    patterns=[Pattern("patient_id", r"\bPT-\d{5}\b", 0.6)],
    context=["patient", "admission", "discharge", "medical", "record"]
)

# Setup context-aware enhancement
enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.4,
    min_score_with_context_similarity=0.3,
    context_prefix_count=6,  # Look at more words for medical context
    context_suffix_count=2
)

# Create analyzer with medical recognizers
registry = RecognizerRegistry()
registry.recognizers.extend([medical_id_recognizer, patient_id_recognizer])
registry.load_predefined_recognizers(languages=["en"])
analyzer = AnalyzerEngine(
    registry=registry,
    context_aware_enhancer=enhancer
)

# Medical text with contextual clues
medical_text = """
Patient medical record shows:
- Patient ID: PT-12345 for admission
- Medical diagnosis code: MED-987654
- Contact phone: 555-0199
- Healthcare provider: Dr. Smith
"""

# Request the decision process so analysis_explanation survives on the results
results = analyzer.analyze(
    text=medical_text,
    language="en",
    return_decision_process=True
)

# Show context enhancement effects
for result in results:
    detected_text = medical_text[result.start:result.end]
    print(f"\nEntity: {result.entity_type}")
    print(f"Text: '{detected_text}'")
    print(f"Score: {result.score:.3f}")
    explanation = result.analysis_explanation
    # Only detections backed by a context word count as context-enhanced
    if explanation and explanation.supportive_context_word:
        print(
            f"Context enhancement: {explanation.score_context_improvement:+.3f} "
            f"via context word '{explanation.supportive_context_word}'"
        )

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
# Enable detailed decision process logging
enhancer = LemmaContextAwareEnhancer()
analyzer = AnalyzerEngine(
    context_aware_enhancer=enhancer,
    log_decision_process=True  # Enable detailed logging
)

text = "Update customer phone number 555-0123 in the database"
results = analyzer.analyze(
    text=text,
    language="en",
    return_decision_process=True  # Include decision details in results
)

for result in results:
    detected_text = text[result.start:result.end]
    print(f"\nDetected: {result.entity_type} - '{detected_text}'")
    print(f"Final Score: {result.score:.3f}")
    if result.analysis_explanation:
        exp = result.analysis_explanation
        print(f"Original Score: {exp.original_score:.3f}")
        if exp.score != exp.original_score:
            score_change = exp.score - exp.original_score
            # The "+" format flag emits the real sign, so a drop prints as
            # "-0.100" instead of the malformed "+-0.100".
            print(f"Score Change: {score_change:+.3f}")
        if exp.textual_explanation:
            print(f"Explanation: {exp.textual_explanation}")

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer
import time
def benchmark_context_enhancement(texts, with_context=True):
    """Benchmark context enhancement performance.

    Analyzer construction happens before the timer starts, so only the
    ``analyze`` calls are measured.

    Args:
        texts: Sequence of input strings to analyze.
        with_context: When True, run with a default ``LemmaContextAwareEnhancer``;
            when False, run with an enhancer that returns results untouched.

    Returns:
        Elapsed time in seconds spent analyzing all texts.
    """
    if with_context:
        enhancer = LemmaContextAwareEnhancer()
        analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)
        label = "with context enhancement"
    else:
        # AnalyzerEngine() creates a default LemmaContextAwareEnhancer when none
        # is supplied, so a real baseline needs an enhancer that does no work.
        class _PassThroughEnhancer(LemmaContextAwareEnhancer):
            def enhance_using_context(
                self, text, raw_results, nlp_artifacts, recognizers, context=None
            ):
                return raw_results

        analyzer = AnalyzerEngine(context_aware_enhancer=_PassThroughEnhancer())
        label = "without context enhancement"

    # perf_counter is monotonic and high resolution, unlike wall-clock time.time
    start_time = time.perf_counter()
    total_results = 0
    for text in texts:
        results = analyzer.analyze(text=text, language="en")
        total_results += len(results)
    end_time = time.perf_counter()

    processing_time = end_time - start_time
    # Guard the rate against a zero elapsed time (empty input or coarse timer)
    rate = len(texts) / processing_time if processing_time > 0 else float("inf")
    print(f"Processing {len(texts)} texts {label}:")
    print(f" Time: {processing_time:.3f} seconds")
    print(f" Results: {total_results}")
    print(f" Rate: {rate:.1f} texts/second")
    return processing_time
# Five sample sentences repeated 20 times: 100 texts total
test_texts = 20 * [
    "Customer phone number is 555-0123",
    "Employee ID 12345 needs update",
    "Medical record MED-98765 for patient",
    "Contact email john@company.com for support",
    "SSN 123-45-6789 for tax purposes",
]

# Time the baseline first, then the context-enhanced run
time_without = benchmark_context_enhancement(test_texts, with_context=False)
time_with = benchmark_context_enhancement(test_texts, with_context=True)

# Extra time of the enhanced run, as a percentage of the baseline
relative_delta = (time_with - time_without) / time_without
overhead = relative_delta * 100
print(f"\nContext enhancement overhead: {overhead:.1f}%")

# Install with Tessl CLI
npx tessl i tessl/pypi-presidio-analyzer