tessl/pypi-presidio-analyzer

Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums

—

Pending

Overview

Eval results

Files

Context Enhancement

Name: tessl/pypi-presidio-analyzer
Author: tessl

Context-aware enhancement improves PII detection accuracy by analyzing surrounding text and using contextual clues to boost confidence scores for likely PII entities.

Capabilities

ContextAwareEnhancer Base Class

Abstract base class for implementing context-aware enhancement logic.

class ContextAwareEnhancer:
    """
    Abstract base class for context-aware enhancement implementations.

    Subclasses provide ``enhance_using_context`` to adjust the scores of
    already-detected entities using the words around them. This listing is an
    interface summary: method bodies are intentionally omitted.

    Args:
        context_similarity_factor: Weight factor for context similarity (0.0-1.0)
        min_score_with_context_similarity: Minimum score used together with
            context enhancement.
            NOTE(review): upstream appears to apply this as a floor for the
            boosted score, not as an eligibility threshold -- confirm.
        context_prefix_count: Number of words to analyze before detected entity
        context_suffix_count: Number of words to analyze after detected entity
    """
    def __init__(
        self,
        context_similarity_factor: float,
        min_score_with_context_similarity: float,
        context_prefix_count: int,
        context_suffix_count: int
    ): ...

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None
    ) -> List[RecognizerResult]:
        """
        Abstract method: enhance detection results using contextual information.

        Concrete enhancers (for example ``LemmaContextAwareEnhancer``) supply
        the actual logic.

        Args:
            text: Original input text
            raw_results: Initial detection results from recognizers
            nlp_artifacts: NLP processing artifacts (tokens, lemmas, etc.)
            recognizers: List of all available recognizers
            context: Optional context keywords for enhancement; defaults to None

        Returns:
            Enhanced list of RecognizerResult objects with improved scores
        """

    # Properties (same names as the constructor arguments)
    context_similarity_factor: float         # Weight for context similarity scoring
    min_score_with_context_similarity: float # Minimum score used with context enhancement
    context_prefix_count: int                # Words to analyze before entity
    context_suffix_count: int                # Words to analyze after entity

    # Constants: bounds of the confidence-score range
    MIN_SCORE = 0      # Minimum confidence score
    MAX_SCORE = 1.0    # Maximum confidence score

LemmaContextAwareEnhancer

Concrete implementation that uses lemmatization for context-aware enhancement.

class LemmaContextAwareEnhancer(ContextAwareEnhancer):
    """
    Context-aware enhancer using lemma-based similarity analysis.

    Every argument has a default, so ``LemmaContextAwareEnhancer()`` is a
    valid, ready-to-use configuration.

    Args:
        context_similarity_factor: Weight factor for similarity scoring (default: 0.35)
        min_score_with_context_similarity: Minimum score used with context
            enhancement (default: 0.4).
            NOTE(review): upstream appears to apply this as a floor for the
            boosted score, not as an eligibility threshold -- confirm.
        context_prefix_count: Words to analyze before entity (default: 5)
        context_suffix_count: Words to analyze after entity (default: 0,
            i.e. by default only preceding words are considered)
    """
    def __init__(
        self,
        context_similarity_factor: float = 0.35,
        min_score_with_context_similarity: float = 0.4,
        context_prefix_count: int = 5,
        context_suffix_count: int = 0
    ): ...

    def enhance_using_context(
        self,
        text: str,
        raw_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        recognizers: List[EntityRecognizer],
        context: Optional[List[str]] = None
    ) -> List[RecognizerResult]:
        """
        Enhance results using lemma-based context analysis.

        Compares lemmatized forms of surrounding words with recognizer context
        keywords to identify supporting contextual evidence.

        Args:
            text: Original input text
            raw_results: Initial detection results
            nlp_artifacts: NLP processing results with lemmas
            recognizers: Available recognizers with context keywords
            context: Additional context keywords for this analysis; defaults
                to None

        Returns:
            Enhanced RecognizerResult list with boosted confidence scores
        """

    @staticmethod
    def _find_supportive_word_in_context(
        context_list: List[str],
        recognizer_context_list: List[str]
    ) -> str:
        """
        Find a context word that supports PII detection.

        Private helper; listed for reference rather than as public API.

        Args:
            context_list: Surrounding words from text
            recognizer_context_list: Context keywords from recognizer

        Returns:
            First matching supportive word, or an empty string when nothing
            matches
        """

    def _extract_surrounding_words(
        self,
        nlp_artifacts: NlpArtifacts,
        word: str,
        start: int
    ) -> List[str]:
        """
        Extract surrounding words from NLP artifacts.

        Private helper; listed for reference rather than as public API.

        Args:
            nlp_artifacts: NLP processing results
            word: Target word/entity
            start: Start position of entity in text

        Returns:
            List of surrounding word lemmas
        """

Usage Examples

Basic Context Enhancement Setup

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer

# Create context enhancer with custom settings
enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.45,  # Stronger context influence (default: 0.35)
    min_score_with_context_similarity=0.3,  # Lower minimum score for context-boosted results (default: 0.4)
    context_prefix_count=3,  # Look at 3 words before
    context_suffix_count=2,  # Look at 2 words after
)

# Initialize analyzer with context enhancement
analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)

# Analyze text with context benefit
text = "Please update my phone number to 555-0199 in the system"

# return_decision_process=True is required here: without it the analyzer
# strips analysis_explanation from every result (it is None), so the
# explanation output below would never be printed.
results = analyzer.analyze(
    text=text,
    language="en",
    return_decision_process=True,
)

for result in results:
    detected_text = text[result.start:result.end]
    print(f"Entity: {result.entity_type}")
    print(f"Text: '{detected_text}'")
    print(f"Score: {result.score:.3f}")

    explanation = result.analysis_explanation
    if explanation:
        # textual_explanation describes how the entity was recognized ...
        print(f"Explanation: {explanation.textual_explanation}")
        # ... while the context boost itself is recorded separately.
        if explanation.supportive_context_word:
            print(
                f"Context boost: +{explanation.score_context_improvement:.3f} "
                f"(supportive word: '{explanation.supportive_context_word}')"
            )

Providing Explicit Context Keywords

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer

# Setup context-aware analyzer
enhancer = LemmaContextAwareEnhancer()
analyzer = AnalyzerEngine(context_aware_enhancer=enhancer)

# Text with ambiguous numbers
text = "My new contact is 555-0123 and my employee ID is 98765"

# Provide context to help distinguish phone numbers from other numbers
context_keywords = [
    "contact", "phone", "call", "number", "telephone", "mobile", "cell"
]

# return_decision_process=True keeps analysis_explanation on the results;
# without it the attribute is None and the "Enhancement" line never prints.
results = analyzer.analyze(
    text=text,
    language="en",
    context=context_keywords,
    return_decision_process=True,
)

# Context should help boost phone number confidence
for result in results:
    detected_text = text[result.start:result.end]
    print(f"Found {result.entity_type}: '{detected_text}' (score: {result.score:.3f})")

    explanation = result.analysis_explanation
    if explanation and explanation.textual_explanation:
        print(f"  Enhancement: {explanation.textual_explanation}")

Comparing with and without Context Enhancement

from presidio_analyzer import AnalyzerEngine, LemmaContextAwareEnhancer

# Create analyzer without context enhancement.
# AnalyzerEngine() falls back to a default LemmaContextAwareEnhancer when no
# enhancer is passed, so a plain AnalyzerEngine() is NOT a baseline. Use an
# enhancer that adds nothing to the score and imposes no minimum instead.
neutral_enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.0,
    min_score_with_context_similarity=0.0,
)
analyzer_basic = AnalyzerEngine(context_aware_enhancer=neutral_enhancer)

# Create analyzer with context enhancement
enhancer = LemmaContextAwareEnhancer()
analyzer_enhanced = AnalyzerEngine(context_aware_enhancer=enhancer)

# Test text with contextual clues
text = "The patient's medical record shows phone: 555-0199"

# Analyze without context enhancement
basic_results = analyzer_basic.analyze(text=text, language="en")

# Analyze with context enhancement.
# return_decision_process=True keeps analysis_explanation on the results;
# without it the attribute is None and no explanation is printed.
enhanced_results = analyzer_enhanced.analyze(
    text=text,
    language="en",
    return_decision_process=True,
)

print("Without context enhancement:")
for result in basic_results:
    if result.entity_type == "PHONE_NUMBER":
        print(f"  Phone score: {result.score:.3f}")

print("\nWith context enhancement:")
for result in enhanced_results:
    if result.entity_type == "PHONE_NUMBER":
        print(f"  Phone score: {result.score:.3f}")
        if result.analysis_explanation:
            print(f"  Explanation: {result.analysis_explanation.textual_explanation}")