Presidio Anonymizer package - replaces analyzed text with desired values.
—
The BatchAnonymizerEngine provides efficient anonymization for lists and dictionaries, enabling bulk processing of multiple texts or structured data formats.
Create a batch processor with an optional custom AnonymizerEngine.
def __init__(self, anonymizer_engine: Optional[AnonymizerEngine] = None):
"""
Initialize BatchAnonymizerEngine.
Parameters:
- anonymizer_engine (Optional[AnonymizerEngine]): Custom anonymizer instance,
defaults to new AnonymizerEngine()
"""

Usage Example:
from presidio_anonymizer import BatchAnonymizerEngine, AnonymizerEngine
# Use default engine
batch_engine = BatchAnonymizerEngine()
# Use custom engine with added operators
custom_engine = AnonymizerEngine()
custom_engine.add_anonymizer(MyCustomOperator)
batch_engine = BatchAnonymizerEngine(anonymizer_engine=custom_engine)

Anonymize a list of texts with corresponding analyzer results.
def anonymize_list(
self,
texts: List[Optional[Union[str, bool, int, float]]],
recognizer_results_list: List[List[RecognizerResult]],
**kwargs
) -> List[Union[str, Any]]:
"""
Anonymize a list of strings.
Parameters:
- texts (List[Optional[Union[str, bool, int, float]]]): List of texts to anonymize.
Non-string types (bool, int, float) are converted to string; other types pass through unchanged
- recognizer_results_list (List[List[RecognizerResult]]): List of analyzer results for each text
- **kwargs: Additional arguments passed to AnonymizerEngine.anonymize()
Returns:
List[Union[str, Any]]: List of anonymized texts, with non-anonymizable items unchanged
"""

Usage Examples:
from presidio_anonymizer import BatchAnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
batch_engine = BatchAnonymizerEngine()
# Anonymize multiple texts
texts = [
"John Doe lives in New York",
"Contact Sarah at sarah@email.com",
"Call Mike at 555-1234",
42, # Non-string type
None # None value
]
analyzer_results = [
[RecognizerResult("PERSON", 0, 8, 0.9), RecognizerResult("LOCATION", 18, 26, 0.8)],
[RecognizerResult("PERSON", 8, 13, 0.9), RecognizerResult("EMAIL_ADDRESS", 17, 32, 0.9)],
[RecognizerResult("PERSON", 5, 9, 0.9), RecognizerResult("PHONE_NUMBER", 13, 21, 0.8)],
[], # No analyzer results for number
[] # No analyzer results for None
]
operators = {
"PERSON": OperatorConfig("replace", {"new_value": "[PERSON]"}),
"EMAIL_ADDRESS": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": False}),
"PHONE_NUMBER": OperatorConfig("redact"),
"LOCATION": OperatorConfig("replace", {"new_value": "[LOCATION]"})
}
result = batch_engine.anonymize_list(
texts=texts,
recognizer_results_list=analyzer_results,
operators=operators
)
print(result)
# ['[PERSON] lives in [LOCATION]', 'Contact [PERSON] at *****@email.com', 'Call [PERSON] at ', '42', None]

Anonymize values in nested dictionaries and structured data.
def anonymize_dict(
self,
analyzer_results: Iterable[DictRecognizerResult],
**kwargs
) -> Dict[str, str]:
"""
Anonymize values in a dictionary.
Parameters:
- analyzer_results (Iterable[DictRecognizerResult]): Iterable of DictRecognizerResult
objects containing the analyzer results for the dictionary's values
- **kwargs: Additional arguments passed to AnonymizerEngine.anonymize()
Returns:
Dict[str, str]: Dictionary with anonymized values
"""

Usage Example:
from presidio_anonymizer.entities import DictRecognizerResult
# Example dictionary data
data_dict = {
"user_info": {
"name": "John Doe",
"email": "john@example.com"
},
"contacts": ["Alice Johnson", "Bob Smith"],
"phone": "555-1234",
"age": 30
}
# DictRecognizerResult contains analyzer results for structured data
# This would typically come from presidio-analyzer's analyze_dict method
dict_analyzer_results = [
DictRecognizerResult(
key="user_info",
value={"name": "John Doe", "email": "john@example.com"},
recognizer_results=[
# Nested analyzer results for the dictionary value
]
),
# Additional results for other keys...
]
operators = {
"PERSON": OperatorConfig("replace", {"new_value": "[PERSON]"}),
"EMAIL_ADDRESS": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": False})
}
anonymized_dict = batch_engine.anonymize_dict(
analyzer_results=dict_analyzer_results,
operators=operators
)

The batch engine handles different data types appropriately:
# Process rows of tabular data
rows = [
["John Doe", "john@email.com", "555-1234"],
["Jane Smith", "jane@email.com", "555-5678"]
]
# Flatten for processing
texts = [item for row in rows for item in row]
# Process with appropriate analyzer results...

# Use same operators across all batch operations
standard_operators = {
"PERSON": OperatorConfig("replace", {"new_value": "[PERSON]"}),
"EMAIL_ADDRESS": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": False})
}
# Apply to lists
list_result = batch_engine.anonymize_list(texts, analyzer_results, operators=standard_operators)
# Apply to dictionaries
dict_result = batch_engine.anonymize_dict(dict_analyzer_results, operators=standard_operators)

Install with Tessl CLI
npx tessl i tessl/pypi-presidio-anonymizer