CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-presidio-analyzer

Python-based service for detecting PII entities in text using Named Entity Recognition, regular expressions, rule-based logic, and checksums

Pending
Overview
Eval results
Files

docs/core-analysis.md

Core Analysis Engine

The core analysis functionality centers around the AnalyzerEngine class, which orchestrates PII detection across all recognizers and provides the primary interface for analyzing text.

Capabilities

AnalyzerEngine

Main orchestrator class that coordinates PII entity detection using registered recognizers, NLP processing, and optional context enhancement.

class AnalyzerEngine:
    """
    Central PII detection engine that orchestrates all analysis operations.

    Args:
        registry: RecognizerRegistry containing entity recognizers
        nlp_engine: NLP preprocessing engine (spaCy, Stanza, Transformers)
        app_tracer: Application tracing for monitoring (optional)
        log_decision_process: Enable detailed decision logging
        default_score_threshold: Minimum confidence score for results (0.0-1.0)
        supported_languages: List of supported language codes
        context_aware_enhancer: Context enhancement processor (optional)
    """
    def __init__(
        self,
        # Parameters defaulting to None are annotated Optional explicitly
        # (implicit Optional is deprecated by PEP 484 type checkers).
        registry: Optional[RecognizerRegistry] = None,
        nlp_engine: Optional[NlpEngine] = None,
        app_tracer: Optional[AppTracer] = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: Optional[List[str]] = None,
        context_aware_enhancer: Optional[ContextAwareEnhancer] = None
    ): ...

    def analyze(
        self,
        text: str,
        language: str,
        entities: Optional[List[str]] = None,
        correlation_id: Optional[str] = None,
        score_threshold: Optional[float] = None,
        return_decision_process: Optional[bool] = False,
        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
        context: Optional[List[str]] = None,
        allow_list: Optional[List[str]] = None,
        allow_list_match: Optional[str] = "exact",
        regex_flags: Optional[int] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None
    ) -> List[RecognizerResult]:
        """
        Analyze text to detect PII entities.

        Args:
            text: Input text to analyze
            language: Language code (e.g., "en", "es", "fr")
            entities: Specific entity types to detect (None = all supported)
            correlation_id: Unique identifier for request tracking
            score_threshold: Minimum confidence score (overrides default)
            return_decision_process: Include analysis explanations in results
            ad_hoc_recognizers: Additional custom recognizers for this request
            context: Keywords that help improve detection accuracy
            allow_list: Values to exclude from detection results
            allow_list_match: Allow list matching strategy ("exact" or "regex")
            regex_flags: Custom regex compilation flags
            nlp_artifacts: Pre-computed NLP processing results (performance optimization)

        Returns:
            List of RecognizerResult objects containing detected PII entities
        """

    def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
        """
        Get all loaded recognizers for specified language.

        Args:
            language: Language code (None = all languages)

        Returns:
            List of EntityRecognizer instances
        """

    def get_supported_entities(self, language: Optional[str] = None) -> List[str]:
        """
        Get all supported entity types for specified language.

        Args:
            language: Language code (None = all languages)

        Returns:
            List of entity type strings (e.g., ["PERSON", "PHONE_NUMBER"])
        """

AnalyzerRequest

Data container class that encapsulates all parameters for an analysis request, useful for serialization and API integration.

class AnalyzerRequest:
    """
    Request data container for analyzer operations.

    Args:
        req_data: Dictionary containing all request parameters
    """
    def __init__(self, req_data: Dict): ...

    # Properties extracted from req_data dictionary.
    # NOTE(review): keys absent from req_data presumably yield None for the
    # corresponding property — confirm against the implementation before
    # relying on it (e.g. correlation_id when not supplied).
    text: str                               # Text to analyze
    language: str                          # Language code  
    entities: Optional[List[str]]          # Entity types to detect
    correlation_id: Optional[str]          # Request tracking identifier
    score_threshold: Optional[float]       # Minimum confidence score
    return_decision_process: Optional[bool] # Include analysis explanations
    ad_hoc_recognizers: Optional[List[EntityRecognizer]]  # Custom recognizers
    context: Optional[List[str]]           # Context enhancement keywords
    allow_list: Optional[List[str]]        # Values to exclude from detection
    allow_list_match: Optional[str]        # Allow list matching strategy
    regex_flags: Optional[int]             # Regex compilation flags

Result Processing

Core classes for handling and processing analysis results.

class RecognizerResult:
    """
    Represents a detected PII entity with location and confidence information.

    Args:
        entity_type: Type of detected entity (e.g., "PERSON", "PHONE_NUMBER")
        start: Start character position in text
        end: End character position in text
        score: Confidence score (0.0 to 1.0)
        analysis_explanation: Detailed explanation of detection process
        recognition_metadata: Additional recognizer-specific data
    """
    def __init__(
        self,
        entity_type: str,
        start: int,
        end: int,
        score: float,
        # None-defaulted parameters annotated Optional explicitly (PEP 484).
        analysis_explanation: Optional[AnalysisExplanation] = None,
        recognition_metadata: Optional[Dict] = None
    ): ...

    def intersects(self, other: RecognizerResult) -> int:
        """
        Check if this result intersects with another result.

        Returns:
            Number of overlapping characters (0 = no intersection)
        """

    def contained_in(self, other: RecognizerResult) -> bool:
        """Check if this result is entirely contained within another result."""

    def contains(self, other: RecognizerResult) -> bool:
        """Check if this result entirely contains another result."""

    def equal_indices(self, other: RecognizerResult) -> bool:
        """Check if start and end positions match another result."""

    def has_conflict(self, other: RecognizerResult) -> bool:
        """Check if this result conflicts with another result."""

    def to_dict(self) -> Dict:
        """Serialize result to dictionary format."""

    @classmethod
    def from_json(cls, data: Dict) -> RecognizerResult:
        """Create RecognizerResult from JSON/dictionary data."""

    def append_analysis_explanation_text(self, text: str) -> None:
        """Add explanatory text to the analysis explanation."""

class AnalysisExplanation:
    """
    Detailed explanation of why a PII entity was detected.

    Args:
        recognizer: Name of recognizer that made the detection
        original_score: Initial confidence score before enhancements
        pattern_name: Name of matching pattern (for pattern-based recognizers)
        pattern: Actual regex pattern that matched (for pattern-based recognizers)
        validation_result: Result of additional validation logic
        textual_explanation: Human-readable explanation of detection
        regex_flags: Regex compilation flags used
    """
    def __init__(
        self,
        recognizer: str,
        original_score: float,
        # None-defaulted parameters annotated Optional explicitly (PEP 484).
        pattern_name: Optional[str] = None,
        pattern: Optional[str] = None,
        validation_result: Optional[float] = None,
        textual_explanation: Optional[str] = None,
        regex_flags: Optional[int] = None
    ): ...

    def set_improved_score(self, score: float) -> None:
        """Update the confidence score and calculate improvement difference."""

    def set_supportive_context_word(self, word: str) -> None:
        """Set context word that helped increase confidence score."""

    def append_textual_explanation_line(self, text: str) -> None:
        """Append new line to textual explanation."""

    def to_dict(self) -> Dict:
        """Serialize explanation to dictionary format."""

Usage Examples

Basic Analysis

from presidio_analyzer import AnalyzerEngine

# Build an engine with all default recognizers and NLP settings.
engine = AnalyzerEngine()

# Run PII detection over a sample sentence.
sample = "Contact John Smith at john.smith@email.com or call 555-123-4567"
findings = engine.analyze(text=sample, language="en")

# Report each finding: the matched text span plus its confidence score.
for finding in findings:
    detected_text = sample[finding.start:finding.end]
    print(f"Found {finding.entity_type}: '{detected_text}' (score: {finding.score:.2f})")

Advanced Analysis with Context

from presidio_analyzer import AnalyzerEngine

engine = AnalyzerEngine()

# Context words steer the detectors toward the right entity types.
message = "Please update my profile with new phone: 555-0199"
context_words = ["phone", "contact", "profile"]

detections = engine.analyze(
    text=message,
    language="en",
    context=context_words,
    score_threshold=0.5,
    return_decision_process=True,
)

# Walk the detections and surface the decision-process explanation, if any.
for detection in detections:
    print(f"Entity: {detection.entity_type}")
    print(f"Score: {detection.score}")
    explanation = detection.analysis_explanation
    if explanation:
        print(f"Explanation: {explanation.textual_explanation}")

Selective Entity Detection

from presidio_analyzer import AnalyzerEngine

engine = AnalyzerEngine()

# Restrict detection to an explicit subset of entity types.
sample = "My SSN is 123-45-6789 and email is user@domain.com"
wanted_types = ["US_SSN", "EMAIL_ADDRESS"]

matches = engine.analyze(text=sample, language="en", entities=wanted_types)

print(f"Found {len(matches)} entities of requested types")

Using Allow Lists

from presidio_analyzer import AnalyzerEngine

engine = AnalyzerEngine()

# Known-safe values are suppressed from the results via the allow list.
content = "Contact support at support@company.com or use test@example.com for testing"
safe_values = ["support@company.com"]

hits = engine.analyze(
    text=content,
    language="en",
    allow_list=safe_values,
    allow_list_match="exact",
)

# Only test@example.com should be detected
for hit in hits:
    detected_email = content[hit.start:hit.end]
    print(f"Detected: {detected_email}")  # Should only show test@example.com

Request Object Pattern

from presidio_analyzer import AnalyzerEngine, AnalyzerRequest

analyzer = AnalyzerEngine()

# Create structured request.
# Fix: the original example printed request.correlation_id without ever
# supplying one, so the summary line showed a missing value. Include the
# id in the request data and forward it to analyze() for tracing.
request_data = {
    "text": "Call me at 555-1234 or email john@company.com",
    "language": "en",
    "entities": ["PHONE_NUMBER", "EMAIL_ADDRESS"],
    "correlation_id": "req-0001",  # unique id used for request tracking
    "score_threshold": 0.6,
    "return_decision_process": True
}

request = AnalyzerRequest(request_data)
results = analyzer.analyze(
    text=request.text,
    language=request.language,
    entities=request.entities,
    correlation_id=request.correlation_id,
    score_threshold=request.score_threshold,
    return_decision_process=request.return_decision_process
)

print(f"Processed request {request.correlation_id}")

Performance Optimization with Pre-computed NLP

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Initialize with specific NLP engine
# NOTE(review): recent presidio-analyzer releases may require calling
# nlp_engine.load() (or using NlpEngineProvider) before process_text works —
# confirm against the installed version.
nlp_engine = SpacyNlpEngine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

text = "Contact John Doe at john.doe@email.com"

# Pre-compute NLP artifacts for reuse: repeated analyze() calls on the same
# text can then skip the relatively expensive NLP pipeline.
nlp_artifacts = nlp_engine.process_text(text, "en")

# Use pre-computed artifacts (faster for repeated analysis)
results = analyzer.analyze(
    text=text,
    language="en",
    nlp_artifacts=nlp_artifacts
)

Error Handling

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()

# An unsupported language code is rejected by the engine with ValueError.
try:
    results = analyzer.analyze(
        text="Sample text",
        language="unsupported_lang"  # Invalid language
    )
except ValueError as e:
    print(f"Invalid parameter: {e}")

# NOTE(review): whether an out-of-range score_threshold actually raises
# ValueError is version-dependent — confirm against the installed
# presidio-analyzer release before relying on this behavior.
try:
    results = analyzer.analyze(
        text="Sample text", 
        language="en",
        score_threshold=1.5  # Invalid threshold (must be 0.0-1.0)
    )
except ValueError as e:
    print(f"Invalid score threshold: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-presidio-analyzer

docs

batch-processing.md

configuration.md

context-enhancement.md

core-analysis.md

entity-recognizers.md

index.md

predefined-recognizers.md

tile.json