Python library for parsing, formatting, storing and validating international phone numbers with reduced memory footprint
—
Advanced pattern matching to find and extract phone numbers from text, with configurable leniency levels and comprehensive match information. This capability enables extraction of phone numbers from unstructured text like documents, emails, and web pages.
Iterator class that finds phone number matches in text with various leniency options.
class PhoneNumberMatcher:
    """
    Iterator for finding phone numbers in text.

    Scans through text and yields PhoneNumberMatch objects for
    each phone number found, with configurable leniency levels.
    """

    # The leniency default is spelled out as Leniency.VALID so the signature
    # agrees with the documented default instead of advertising None.
    # NOTE(review): confirm the max_tries default against the installed
    # library release.
    def __init__(self, text: str, region: str,
                 leniency: Leniency = Leniency.VALID,
                 max_tries: int = 65536):
        """
        Initialize matcher for finding phone numbers in text.

        Parameters:
        - text: Text to search for phone numbers
        - region: Two-letter region code for parsing context
        - leniency: Matching strictness level (defaults to Leniency.VALID)
        - max_tries: Maximum number of matching attempts to prevent infinite loops
        """

    def __iter__(self):
        """Return iterator interface."""

    def __next__(self):
        """Get next phone number match."""


# Represents a phone number found in text with position and metadata information.
class PhoneNumberMatch:
    """
    Represents a phone number match found in text.

    Contains the matched phone number, its position in the text,
    and the raw text that was matched.

    The four values below are plain instance attributes. Read them as
    ``match.start`` / ``match.number``; calling them (``match.start()``)
    raises TypeError.
    """

    # Start position of the match in the original text (zero-based index).
    start: int

    # End position of the match in the original text (zero-based, exclusive).
    end: int

    # PhoneNumber object representing the matched number.
    number: PhoneNumber

    # Original text substring that contained the phone number.
    raw_string: str


# Control how strict the matching algorithm should be when finding phone numbers.
class Leniency:
    """
    Leniency levels for phone number matching.

    Selects how strict the matcher is when deciding whether a
    candidate in the text is reported as a phone number.
    """

    # Match numbers that are possible (basic length checks).
    POSSIBLE = 0

    # Match only valid phone numbers (default level).
    VALID = 1

    # Match only numbers with correct punctuation grouping.
    STRICT_GROUPING = 2

    # Match only numbers with exact formatting patterns.
    EXACT_GROUPING = 3


import phonenumbers
# Text containing various phone numbers
text = """
Contact us at 650-253-2222 or call our international line at +44 20 8366 1177.
You can also reach support at (800) 555-1234 or send a fax to 650.253.2223.
Our office number is 1-650-253-2222 extension 1234.
"""

print("Phone numbers found in text:")
for match in phonenumbers.PhoneNumberMatcher(text, "US"):
    # start, end, number and raw_string are attributes of the match object,
    # so they are read directly; calling them would raise TypeError.
    number = match.number
    formatted = phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
    print(f" Position {match.start}-{match.end}: '{match.raw_string}' -> {formatted}")

import phonenumbers
from phonenumbers import Leniency

text = "Call me at 555-1234 or 1-800-FLOWERS today!"

# Each leniency level paired with the label printed for it.
leniency_levels = [
    (Leniency.POSSIBLE, "POSSIBLE"),
    (Leniency.VALID, "VALID"),
    (Leniency.STRICT_GROUPING, "STRICT_GROUPING"),
    (Leniency.EXACT_GROUPING, "EXACT_GROUPING")
]

for leniency, name in leniency_levels:
    print(f"\n{name} leniency:")
    matches = list(phonenumbers.PhoneNumberMatcher(text, "US", leniency))
    print(f" Found {len(matches)} matches")
    for match in matches:
        # number and raw_string are attributes of the match, not methods.
        formatted = phonenumbers.format_number(
            match.number,
            phonenumbers.PhoneNumberFormat.INTERNATIONAL
        )
        print(f" '{match.raw_string}' -> {formatted}")

import phonenumbers
import re
class PhoneNumberExtractor:
    """Extract and normalize phone numbers from documents."""

    def __init__(self, default_region="US", leniency=None):
        """
        Configure the extractor.

        Parameters:
        - default_region: Region code used when a call does not pass its own
        - leniency: Matching strictness; None selects phonenumbers.Leniency.VALID
        """
        self.default_region = default_region
        # Resolved through the phonenumbers module at call time, so the class
        # does not depend on a bare `Leniency` name having been imported.
        self.leniency = phonenumbers.Leniency.VALID if leniency is None else leniency

    def extract_from_text(self, text, region=None):
        """
        Extract all phone numbers from text.

        Parameters:
        - text: Text to scan
        - region: Region code for this call (falls back to default_region)

        Returns:
        List of dicts, one per match, holding the raw text, the start/end
        positions, the parsed number, three formatted variants, validity,
        number type and region.
        """
        search_region = region or self.default_region
        number_format = phonenumbers.PhoneNumberFormat
        matches = []
        for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency):
            # The match exposes its data as attributes, not methods.
            number = match.number
            matches.append({
                'raw_text': match.raw_string,
                'start_pos': match.start,
                'end_pos': match.end,
                'parsed_number': number,
                'formatted': {
                    'e164': phonenumbers.format_number(number, number_format.E164),
                    'international': phonenumbers.format_number(number, number_format.INTERNATIONAL),
                    'national': phonenumbers.format_number(number, number_format.NATIONAL)
                },
                'is_valid': phonenumbers.is_valid_number(number),
                'number_type': phonenumbers.number_type(number),
                'region': phonenumbers.region_code_for_number(number)
            })
        return matches

    def extract_unique_numbers(self, text, region=None):
        """
        Extract unique phone numbers, removing duplicates.

        Matches are keyed by their E.164 form. When the same number occurs
        more than once, the occurrence with the longest raw text is kept.

        Returns:
        List of match dicts (same shape as extract_from_text), ordered by
        first appearance of each number.
        """
        unique_numbers = {}
        for match in self.extract_from_text(text, region):
            e164 = match['formatted']['e164']
            existing = unique_numbers.get(e164)
            # Keep the match with more surrounding formatting/context.
            if existing is None or len(match['raw_text']) > len(existing['raw_text']):
                unique_numbers[e164] = match
        return list(unique_numbers.values())

    def anonymize_text(self, text, replacement="[PHONE]", region=None):
        """
        Replace phone numbers in text with anonymized placeholders.

        Parameters:
        - text: Text to anonymize
        - replacement: Placeholder written in place of every match
        - region: Region code for this call (falls back to default_region)

        Returns:
        Copy of text with every matched span replaced.
        """
        search_region = region or self.default_region
        spans = sorted(
            (match.start, match.end)
            for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency)
        )
        # Single forward pass: copy the text between matches and emit the
        # placeholder for each match, then join once at the end.
        pieces = []
        cursor = 0
        for start, end in spans:
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)
# Example usage
sample_document = """
Please contact our sales team at 1-800-555-SALE (1-800-555-7253) or
our technical support at +1 (650) 253-2222. International customers
can reach us at +44 20 8366 1177 or +33 1 42 68 53 00.
For urgent matters, call our emergency line: 911
For billing questions: 650.253.2223 ext. 100
"""
extractor = PhoneNumberExtractor("US")

# Every match, in document order, with its position and metadata.
print("=== Phone Number Extraction ===")
matches = extractor.extract_from_text(sample_document)
for i, match in enumerate(matches):
    print(f"{i+1}. '{match['raw_text']}' (pos {match['start_pos']}-{match['end_pos']})")
    print(f" -> {match['formatted']['international']}")
    print(f" -> Type: {match['number_type']}, Region: {match['region']}")
    print()

# One entry per distinct number.
print("=== Unique Numbers ===")
unique = extractor.extract_unique_numbers(sample_document)
for match in unique:
    print(f"- {match['formatted']['international']} ({match['region']})")

# The same document with every number replaced by the placeholder.
print("=== Anonymized Text ===")
anonymized = extractor.anonymize_text(sample_document)
print(anonymized)

import phonenumbers
import re
class ContactExtractor:
    """Extract structured contact information from text."""

    def __init__(self, default_region="US"):
        """
        Parameters:
        - default_region: Region code used to interpret phone numbers
        """
        self.default_region = default_region
        # Inside a character class `|` is a literal pipe, not alternation,
        # so the TLD class is written as plain [A-Za-z].
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    @staticmethod
    def _type_label(phone_type):
        """Return a readable label (e.g. "Fixed Line") for a PhoneNumberType value."""
        # PhoneNumberType constants are upper-case class attributes; look the
        # value up by name instead of printing the bare number.
        for name, value in vars(phonenumbers.PhoneNumberType).items():
            if name.isupper() and value == phone_type:
                return name.replace('_', ' ').title()
        return str(phone_type)

    def extract_contacts(self, text):
        """
        Extract phone numbers, emails, and other contact info.

        Returns:
        Dict with three lists:
        - 'phone_numbers': dicts with 'raw', 'formatted', 'type', 'position'
        - 'emails': dicts with 'email', 'position'
        - 'text_segments': stripped, non-empty text found between the
          phone/email spans
        """
        phone_numbers = []
        for match in phonenumbers.PhoneNumberMatcher(text, self.default_region):
            # Phone matches expose start/end/number/raw_string as attributes.
            number = match.number
            phone_numbers.append({
                'raw': match.raw_string,
                'formatted': phonenumbers.format_number(
                    number,
                    phonenumbers.PhoneNumberFormat.INTERNATIONAL
                ),
                'type': phonenumbers.number_type(number),
                'position': (match.start, match.end)
            })

        # Regex match objects, by contrast, use start()/end() methods.
        emails = [
            {'email': match.group(), 'position': (match.start(), match.end())}
            for match in self.email_pattern.finditer(text)
        ]

        # Walk all contact spans in text order and collect what lies between.
        spans = sorted(item['position'] for item in phone_numbers + emails)
        text_segments = []
        last_end = 0
        for start, end in spans:
            if start > last_end:
                segment = text[last_end:start].strip()
                if segment:
                    text_segments.append(segment)
            # max() keeps the cursor from moving backwards when a span is
            # contained in, or overlaps, an earlier one.
            last_end = max(last_end, end)

        # Final segment after the last contact span.
        tail = text[last_end:].strip()
        if tail:
            text_segments.append(tail)

        return {
            'phone_numbers': phone_numbers,
            'emails': emails,
            'text_segments': text_segments
        }

    def format_contact_card(self, text):
        """
        Format extracted contact information as a structured card.

        Returns:
        Newline-joined string with one line per phone number type, then an
        "Email:" line and a "Notes:" line when those are present.
        """
        contacts = self.extract_contacts(text)
        card = []

        # Group formatted phone numbers by type, keeping first-seen order.
        phones_by_type = {}
        for phone in contacts['phone_numbers']:
            phones_by_type.setdefault(phone['type'], []).append(phone['formatted'])

        for phone_type, numbers in phones_by_type.items():
            card.append(f"{self._type_label(phone_type)}: {', '.join(numbers)}")

        if contacts['emails']:
            emails = [email['email'] for email in contacts['emails']]
            card.append(f"Email: {', '.join(emails)}")

        if contacts['text_segments']:
            card.append(f"Notes: {' | '.join(contacts['text_segments'])}")

        return '\n'.join(card)
# Example usage
business_card_text = """
John Smith - Sales Manager
Acme Corporation
Phone: (650) 253-2222
Mobile: 650.555.1234
Email: john.smith@acme.com
Alternative: jsmith@gmail.com
Call anytime between 9 AM - 5 PM PST
Emergency contact: +1-800-555-HELP
"""
extractor = ContactExtractor("US")

# Raw extraction result, listed section by section.
print("=== Contact Extraction ===")
contacts = extractor.extract_contacts(business_card_text)

print(f"Phone numbers found: {len(contacts['phone_numbers'])}")
for phone in contacts['phone_numbers']:
    print(f" - {phone['raw']} -> {phone['formatted']} ({phone['type']})")

print(f"\nEmails found: {len(contacts['emails'])}")
for email in contacts['emails']:
    print(f" - {email['email']}")

print(f"\nText segments: {len(contacts['text_segments'])}")
for segment in contacts['text_segments']:
    print(f" - {segment}")

# The same information rendered as a contact card.
print("\n=== Formatted Contact Card ===")
card = extractor.format_contact_card(business_card_text)
print(card)

import phonenumbers
from concurrent.futures import ThreadPoolExecutor
import json
class BulkPhoneExtractor:
    """Process multiple documents for phone number extraction."""

    def __init__(self, default_region="US", max_workers=4):
        """
        Parameters:
        - default_region: Region code used when a document does not carry one
        - max_workers: Size of the thread pool used by process_documents
        """
        self.default_region = default_region
        self.max_workers = max_workers

    def process_document(self, doc_id, text, region=None):
        """
        Process a single document.

        Parameters:
        - doc_id: Identifier copied into the result
        - text: Document text to scan
        - region: Region code for this document (falls back to default_region)

        Returns:
        Dict with 'doc_id', 'phone_numbers' (one JSON-friendly dict per
        match) and 'stats' ('total_matches', 'valid_numbers',
        'unique_numbers'; the unique count covers valid numbers only).
        """
        search_region = region or self.default_region
        number_format = phonenumbers.PhoneNumberFormat
        phone_numbers = []
        valid_count = 0
        seen_numbers = set()

        for match in phonenumbers.PhoneNumberMatcher(text, search_region):
            # The match exposes its data as attributes, not methods.
            number = match.number
            e164 = phonenumbers.format_number(number, number_format.E164)
            is_valid = phonenumbers.is_valid_number(number)
            phone_numbers.append({
                'raw_text': match.raw_string,
                'e164': e164,
                'international': phonenumbers.format_number(number, number_format.INTERNATIONAL),
                'is_valid': is_valid,
                'type': str(phonenumbers.number_type(number)),
                'region': phonenumbers.region_code_for_number(number),
                'position': [match.start, match.end]
            })
            if is_valid:
                valid_count += 1
                seen_numbers.add(e164)

        return {
            'doc_id': doc_id,
            'phone_numbers': phone_numbers,
            'stats': {
                'total_matches': len(phone_numbers),
                'valid_numbers': valid_count,
                'unique_numbers': len(seen_numbers)
            }
        }

    def process_documents(self, documents):
        """
        Process multiple documents in parallel.

        Parameters:
        - documents: Iterable of (doc_id, text, region) tuples

        Returns:
        List of process_document results in submission order. A document
        whose processing raises is reported on stdout and left out.
        """
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_document, doc_id, text, region)
                for doc_id, text, region in documents
            ]
            results = []
            for future in futures:
                # Best effort: one failing document must not abort the batch.
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"Error processing document: {e}")
        return results

    def generate_summary_report(self, results):
        """
        Generate summary statistics across all documents.

        Parameters:
        - results: List of dicts as returned by process_document

        Returns:
        Dict with 'summary' (document/match/valid/unique counts and the
        average matches per document), 'regions' and 'types' (occurrence
        counts of valid numbers per region and per type).
        """
        total_docs = len(results)
        total_matches = sum(r['stats']['total_matches'] for r in results)
        total_valid = sum(r['stats']['valid_numbers'] for r in results)

        # Only valid numbers feed the global unique set and the breakdowns.
        all_numbers = set()
        regions = {}
        types = {}
        for result in results:
            for phone in result['phone_numbers']:
                if not phone['is_valid']:
                    continue
                all_numbers.add(phone['e164'])
                region = phone['region']
                regions[region] = regions.get(region, 0) + 1
                phone_type = phone['type']
                types[phone_type] = types.get(phone_type, 0) + 1

        return {
            'summary': {
                'total_documents': total_docs,
                'total_matches': total_matches,
                'valid_numbers': total_valid,
                'unique_numbers_global': len(all_numbers),
                # Guard against division by zero for an empty batch.
                'average_matches_per_doc': total_matches / total_docs if total_docs > 0 else 0
            },
            'regions': regions,
            'types': types
        }
# Example usage
# Sample documents to process: (doc_id, text, region)
documents = [
    ("doc1", "Call us at 650-253-2222 or +44 20 8366 1177", "US"),
    ("doc2", "Support: 1-800-555-1234, International: +33 1 42 68 53 00", "US"),
    ("doc3", "Office: (555) 123-4567, Mobile: 555.987.6543", "US"),
    ("doc4", "Invalid phone: 123-456, Valid: +1-650-253-2222", "US"),
]
extractor = BulkPhoneExtractor("US", max_workers=2)

# Per-document statistics plus a short preview of the matches.
print("=== Bulk Processing Results ===")
results = extractor.process_documents(documents)
for result in results:
    print(f"\nDocument {result['doc_id']}:")
    print(f" Total matches: {result['stats']['total_matches']}")
    print(f" Valid numbers: {result['stats']['valid_numbers']}")
    print(f" Unique numbers: {result['stats']['unique_numbers']}")
    for phone in result['phone_numbers'][:3]:  # Show first 3
        status = "✓" if phone['is_valid'] else "✗"
        print(f" {status} {phone['raw_text']} -> {phone['international']}")

# Aggregate statistics over the whole batch.
print("\n=== Summary Report ===")
summary = extractor.generate_summary_report(results)
print(json.dumps(summary, indent=2))
Install with Tessl CLI
npx tessl i tessl/pypi-phonenumberslite