tessl/pypi-phonenumberslite

Python library for parsing, formatting, storing, and validating international phone numbers with a reduced memory footprint

—

Pending

Overview

Eval results

Files

Phone Number Matching

Name: tessl/pypi-phonenumberslite
Author: tessl

Advanced pattern matching to find and extract phone numbers from text, with configurable leniency levels and comprehensive match information. This capability enables extraction of phone numbers from unstructured text like documents, emails, and web pages.

Capabilities

PhoneNumberMatcher Class

Iterator class that finds phone number matches in text with various leniency options.

class PhoneNumberMatcher:
    """
    Iterator for finding phone numbers in text.

    Scans through text and yields PhoneNumberMatch objects for
    each phone number found, with configurable leniency levels.
    """

    def __init__(self, text: str, region: str, leniency: Leniency = Leniency.VALID,
                 max_tries: int = 65535):
        """
        Initialize matcher for finding phone numbers in text.

        Parameters:
        - text: Text to search for phone numbers
        - region: Two-letter region code assumed for numbers that are not
          written in international format
        - leniency: Matching strictness level (defaults to Leniency.VALID)
        - max_tries: Maximum number of invalid candidates to try before
          giving up on the text; guards against input with many false
          positives
        """

    def __iter__(self):
        """Return an iterator over the PhoneNumberMatch objects found in the text."""

    def __next__(self):
        """
        Get next phone number match.

        Returns:
        The next PhoneNumberMatch; StopIteration is raised when no
        further match is available
        """

PhoneNumberMatch Class

Represents a phone number found in text with position and metadata information.

class PhoneNumberMatch:
    """
    Represents a phone number match found in text.

    Contains the matched phone number, its position in the text,
    and the raw text that was matched.

    All four values are plain instance attributes, so read them
    without call parentheses (``match.start``, not ``match.start()``).
    """

    # Zero-based index of the match start position in the original text.
    start: int

    # Zero-based index of the match end position (exclusive).
    end: int

    # PhoneNumber object representing the matched number.
    number: "PhoneNumber"

    # Original text substring that contained the phone number.
    raw_string: str

Leniency Levels

Control how strict the matching algorithm should be when finding phone numbers.

class Leniency:
    """
    Strictness settings for phone number matching.

    Pass one of these constants to the matcher to decide which
    candidate numbers in the text are reported as matches.
    """

    # Match numbers that are possible (basic length checks).
    POSSIBLE = 0

    # Match only valid phone numbers (default level).
    VALID = 1

    # Match only numbers with correct punctuation grouping.
    STRICT_GROUPING = 2

    # Match only numbers with exact formatting patterns.
    EXACT_GROUPING = 3

Usage Examples

Basic Phone Number Extraction

import phonenumbers

# Text containing various phone numbers
text = """
Contact us at 650-253-2222 or call our international line at +44 20 8366 1177.
You can also reach support at (800) 555-1234 or send a fax to 650.253.2223.
Our office number is 1-650-253-2222 extension 1234.
"""

print("Phone numbers found in text:")
for match in phonenumbers.PhoneNumberMatcher(text, "US"):
    # PhoneNumberMatch exposes number, start, end and raw_string as plain
    # attributes; calling them (match.number()) raises TypeError.
    number = match.number
    formatted = phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
    print(f"  Position {match.start}-{match.end}: '{match.raw_string}' -> {formatted}")

Leniency Level Comparison

import phonenumbers
from phonenumbers import Leniency

text = "Call me at 555-1234 or 1-800-FLOWERS today!"

# Pair each leniency constant with the label used in the printed report.
leniency_levels = [
    (Leniency.POSSIBLE, "POSSIBLE"),
    (Leniency.VALID, "VALID"),
    (Leniency.STRICT_GROUPING, "STRICT_GROUPING"),
    (Leniency.EXACT_GROUPING, "EXACT_GROUPING")
]

for leniency, name in leniency_levels:
    print(f"\n{name} leniency:")
    # Materialize the matcher so the matches can be counted before printing.
    matches = list(phonenumbers.PhoneNumberMatcher(text, "US", leniency))
    print(f"  Found {len(matches)} matches")

    for match in matches:
        # number and raw_string are attributes of PhoneNumberMatch, not methods.
        formatted = phonenumbers.format_number(
            match.number,
            phonenumbers.PhoneNumberFormat.INTERNATIONAL
        )
        print(f"    '{match.raw_string}' -> {formatted}")

Document Processing Pipeline

import phonenumbers
import re

class PhoneNumberExtractor:
    """Extract and normalize phone numbers from documents."""

    # Leniency is reached through the phonenumbers module because this snippet
    # only does `import phonenumbers`; a bare `Leniency` default would raise
    # NameError as soon as the class is defined.
    def __init__(self, default_region="US", leniency=phonenumbers.Leniency.VALID):
        """
        Configure the extractor.

        Parameters:
        - default_region: Region code used when a call does not supply one
        - leniency: Leniency level handed to every PhoneNumberMatcher
        """
        self.default_region = default_region
        self.leniency = leniency

    def extract_from_text(self, text, region=None):
        """
        Extract all phone numbers from text.

        Parameters:
        - text: Text to scan
        - region: Region code for this call; falls back to default_region

        Returns:
        List of dicts, one per match, with the raw text, start/end
        positions, the parsed number, E164/international/national
        renderings, validity flag, number type and region code
        """
        search_region = region or self.default_region
        formats = phonenumbers.PhoneNumberFormat
        matches = []

        for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency):
            # PhoneNumberMatch fields are attributes, not methods.
            number = match.number

            matches.append({
                'raw_text': match.raw_string,
                'start_pos': match.start,
                'end_pos': match.end,
                'parsed_number': number,
                'formatted': {
                    'e164': phonenumbers.format_number(number, formats.E164),
                    'international': phonenumbers.format_number(number, formats.INTERNATIONAL),
                    'national': phonenumbers.format_number(number, formats.NATIONAL)
                },
                'is_valid': phonenumbers.is_valid_number(number),
                'number_type': phonenumbers.number_type(number),
                'region': phonenumbers.region_code_for_number(number)
            })

        return matches

    def extract_unique_numbers(self, text, region=None):
        """
        Extract unique phone numbers, removing duplicates.

        Matches are keyed by their E164 rendering. When the same number
        appears more than once, the occurrence with the longest raw text
        is kept; on a tie the earliest occurrence wins.

        Returns:
        List of match dicts (same shape as extract_from_text), ordered
        by first appearance of each number
        """
        unique_numbers = {}

        for match in self.extract_from_text(text, region):
            e164 = match['formatted']['e164']
            existing = unique_numbers.get(e164)
            # Keep the match with more surrounding formatting/context.
            if existing is None or len(match['raw_text']) > len(existing['raw_text']):
                unique_numbers[e164] = match

        return list(unique_numbers.values())

    def anonymize_text(self, text, replacement="[PHONE]", region=None):
        """
        Replace phone numbers in text with anonymized placeholders.

        Parameters:
        - text: Text to anonymize
        - replacement: String substituted for every matched number
        - region: Region code for this call; falls back to default_region

        Returns:
        Copy of text with each matched span replaced
        """
        search_region = region or self.default_region

        # Collect the (start, end) span of every match.
        spans = [
            (match.start, match.end)
            for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency)
        ]

        # Replace from the end of the text backwards so that earlier spans
        # keep their original offsets while later ones change length.
        anonymized_text = text
        for start, end in sorted(spans, reverse=True):
            anonymized_text = anonymized_text[:start] + replacement + anonymized_text[end:]

        return anonymized_text

# Example usage
# Build an extractor whose default region is "US"; leniency keeps its default.
extractor = PhoneNumberExtractor("US")

# Sample text that the three extractor methods below are run against.
sample_document = """
Please contact our sales team at 1-800-555-SALE (1-800-555-7253) or 
our technical support at +1 (650) 253-2222. International customers 
can reach us at +44 20 8366 1177 or +33 1 42 68 53 00.

For urgent matters, call our emergency line: 911
For billing questions: 650.253.2223 ext. 100
"""

# 1) Every match, numbered from 1, with its position and metadata.
print("=== Phone Number Extraction ===")
matches = extractor.extract_from_text(sample_document)
for i, match in enumerate(matches):
    # Each match here is a dict built by extract_from_text, not a PhoneNumberMatch.
    print(f"{i+1}. '{match['raw_text']}' (pos {match['start_pos']}-{match['end_pos']})")
    print(f"   -> {match['formatted']['international']}")
    print(f"   -> Type: {match['number_type']}, Region: {match['region']}")
    print()

# 2) The same matches de-duplicated by their E164 form.
print("=== Unique Numbers ===")
unique = extractor.extract_unique_numbers(sample_document)
for match in unique:
    print(f"- {match['formatted']['international']} ({match['region']})")

# 3) The document with every matched number replaced by the default "[PHONE]" placeholder.
print("=== Anonymized Text ===")
anonymized = extractor.anonymize_text(sample_document)
print(anonymized)

Contact Information Extraction

import phonenumbers
import re

class ContactExtractor:
    """Extract structured contact information from text."""

    def __init__(self, default_region="US"):
        """
        Configure the extractor.

        Parameters:
        - default_region: Region code passed to PhoneNumberMatcher
        """
        self.default_region = default_region
        # The top-level-domain class is [A-Za-z]. Inside a character class a
        # '|' is a literal pipe, not alternation, so '[A-Z|a-z]' would accept
        # addresses such as 'user@host.c|m'.
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    def extract_contacts(self, text):
        """
        Extract phone numbers, emails, and other contact info.

        Parameters:
        - text: Text to scan

        Returns:
        Dict with three lists: 'phone_numbers' (raw text, international
        format, number type, (start, end) position), 'emails' (address
        and (start, end) position) and 'text_segments' (the stripped,
        non-empty text between and around those spans)
        """
        contacts = {
            'phone_numbers': [],
            'emails': [],
            'text_segments': []
        }

        # Extract phone numbers. PhoneNumberMatch exposes number, raw_string,
        # start and end as attributes, so they are read without parentheses.
        for match in phonenumbers.PhoneNumberMatcher(text, self.default_region):
            number = match.number
            contacts['phone_numbers'].append({
                'raw': match.raw_string,
                'formatted': phonenumbers.format_number(
                    number,
                    phonenumbers.PhoneNumberFormat.INTERNATIONAL
                ),
                'type': phonenumbers.number_type(number),
                'position': (match.start, match.end)
            })

        # Extract email addresses. These are re.Match objects, where
        # group(), start() and end() really are methods.
        for match in self.email_pattern.finditer(text):
            contacts['emails'].append({
                'email': match.group(),
                'position': (match.start(), match.end())
            })

        # Merge every phone and email span into one position-ordered list.
        all_positions = sorted(
            [phone['position'] for phone in contacts['phone_numbers']]
            + [email['position'] for email in contacts['emails']]
        )

        # Walk the spans and keep the text that lies between them.
        last_end = 0
        for start, end in all_positions:
            if start > last_end:
                segment = text[last_end:start].strip()
                if segment:
                    contacts['text_segments'].append(segment)
            # max() keeps the cursor from moving backwards if two spans overlap.
            last_end = max(last_end, end)

        # Final segment after the last span
        if last_end < len(text):
            segment = text[last_end:].strip()
            if segment:
                contacts['text_segments'].append(segment)

        return contacts

    def format_contact_card(self, text):
        """
        Format extracted contact information as a structured card.

        Parameters:
        - text: Text to scan

        Returns:
        Newline-joined string with one line per phone number type, then
        an 'Email:' line and a 'Notes:' line when those are non-empty
        """
        contacts = self.extract_contacts(text)

        card = []

        # Group formatted phone numbers by their number type.
        phones_by_type = {}
        for phone in contacts['phone_numbers']:
            phones_by_type.setdefault(phone['type'], []).append(phone['formatted'])

        # Format phone numbers
        for phone_type, numbers in phones_by_type.items():
            # NOTE(review): this assumes str(phone_type) yields a
            # 'PhoneNumberType.X' style name; if number_type() returns a plain
            # int, the label will be the numeric code - confirm.
            type_name = str(phone_type).replace('PhoneNumberType.', '').title()
            card.append(f"{type_name}: {', '.join(numbers)}")

        # Add emails
        if contacts['emails']:
            emails = [email['email'] for email in contacts['emails']]
            card.append(f"Email: {', '.join(emails)}")

        # Add other text
        if contacts['text_segments']:
            card.append(f"Notes: {' | '.join(contacts['text_segments'])}")

        return '\n'.join(card)

# Example usage
# Build a contact extractor whose default region is "US".
extractor = ContactExtractor("US")

# Sample business-card text that both extractor methods below are run against.
business_card_text = """
John Smith - Sales Manager
Acme Corporation
Phone: (650) 253-2222
Mobile: 650.555.1234
Email: john.smith@acme.com
Alternative: jsmith@gmail.com

Call anytime between 9 AM - 5 PM PST
Emergency contact: +1-800-555-HELP
"""

# 1) Raw extraction: the dict of phone numbers, emails and leftover text segments.
print("=== Contact Extraction ===")
contacts = extractor.extract_contacts(business_card_text)

print(f"Phone numbers found: {len(contacts['phone_numbers'])}")
for phone in contacts['phone_numbers']:
    # Each phone is a dict built by extract_contacts.
    print(f"  - {phone['raw']} -> {phone['formatted']} ({phone['type']})")

print(f"\nEmails found: {len(contacts['emails'])}")
for email in contacts['emails']:
    print(f"  - {email['email']}")

print(f"\nText segments: {len(contacts['text_segments'])}")
for segment in contacts['text_segments']:
    print(f"  - {segment}")

# 2) The same information rendered as a newline-joined contact card.
print("\n=== Formatted Contact Card ===")
card = extractor.format_contact_card(business_card_text)
print(card)

Bulk Text Processing

import phonenumbers
from concurrent.futures import ThreadPoolExecutor
import json

class BulkPhoneExtractor:
    """Process multiple documents for phone number extraction."""

    def __init__(self, default_region="US", max_workers=4):
        """
        Configure the bulk extractor.

        Parameters:
        - default_region: Region code used when a document does not supply one
        - max_workers: Thread pool size used by process_documents
        """
        self.default_region = default_region
        self.max_workers = max_workers

    def process_document(self, doc_id, text, region=None):
        """
        Process a single document.

        Parameters:
        - doc_id: Identifier echoed back in the result
        - text: Document text to scan
        - region: Region code for this document; falls back to default_region

        Returns:
        Dict with 'doc_id', a 'phone_numbers' list (one JSON-friendly
        dict per match) and 'stats' holding total_matches,
        valid_numbers and unique_numbers (distinct E164 values)
        """
        search_region = region or self.default_region

        result = {
            'doc_id': doc_id,
            'phone_numbers': [],
            'stats': {
                'total_matches': 0,
                'valid_numbers': 0,
                'unique_numbers': 0
            }
        }

        seen_numbers = set()

        for match in phonenumbers.PhoneNumberMatcher(text, search_region):
            # PhoneNumberMatch exposes number, raw_string, start and end as
            # attributes; calling them would raise TypeError.
            number = match.number
            e164 = phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.E164)

            is_valid = phonenumbers.is_valid_number(number)

            result['phone_numbers'].append({
                'raw_text': match.raw_string,
                'e164': e164,
                'international': phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.INTERNATIONAL),
                'is_valid': is_valid,
                # Stored as a string so the record can be dumped with json.
                'type': str(phonenumbers.number_type(number)),
                'region': phonenumbers.region_code_for_number(number),
                'position': [match.start, match.end]
            })

            result['stats']['total_matches'] += 1
            if is_valid:
                result['stats']['valid_numbers'] += 1

            seen_numbers.add(e164)

        result['stats']['unique_numbers'] = len(seen_numbers)
        return result

    def process_documents(self, documents):
        """
        Process multiple documents in parallel.

        Parameters:
        - documents: Iterable of (doc_id, text, region) tuples

        Returns:
        List of process_document results in submission order; a
        document whose processing raised is reported on stdout and
        left out of the list
        """
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_document, doc_id, text, region)
                for doc_id, text, region in documents
            ]

            results = []
            for future in futures:
                # Best-effort: one failing document must not abort the batch.
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"Error processing document: {e}")

            return results

    def generate_summary_report(self, results):
        """
        Generate summary statistics across all documents.

        Parameters:
        - results: List of dicts as returned by process_document

        Returns:
        Dict with a 'summary' section (document count, total and valid
        match counts, globally unique valid numbers, average matches
        per document) plus 'regions' and 'types' occurrence counts,
        both taken over valid numbers only
        """
        total_docs = len(results)
        total_matches = sum(r['stats']['total_matches'] for r in results)
        total_valid = sum(r['stats']['valid_numbers'] for r in results)

        # Collect all unique numbers across documents
        all_numbers = set()
        regions = {}
        types = {}

        for result in results:
            for phone in result['phone_numbers']:
                # Invalid numbers do not contribute to any breakdown.
                if not phone['is_valid']:
                    continue

                all_numbers.add(phone['e164'])

                region = phone['region']
                regions[region] = regions.get(region, 0) + 1

                phone_type = phone['type']
                types[phone_type] = types.get(phone_type, 0) + 1

        return {
            'summary': {
                'total_documents': total_docs,
                'total_matches': total_matches,
                'valid_numbers': total_valid,
                'unique_numbers_global': len(all_numbers),
                # Guard against division by zero for an empty result list.
                'average_matches_per_doc': total_matches / total_docs if total_docs > 0 else 0
            },
            'regions': regions,
            'types': types
        }

# Example usage
# Build a bulk extractor with "US" as the default region and a two-thread pool.
extractor = BulkPhoneExtractor("US", max_workers=2)

# Sample documents to process
# Each entry is a (doc_id, text, region) tuple, the shape process_documents unpacks.
documents = [
    ("doc1", "Call us at 650-253-2222 or +44 20 8366 1177", "US"),
    ("doc2", "Support: 1-800-555-1234, International: +33 1 42 68 53 00", "US"),
    ("doc3", "Office: (555) 123-4567, Mobile: 555.987.6543", "US"),
    ("doc4", "Invalid phone: 123-456, Valid: +1-650-253-2222", "US"),
]

# 1) Per-document results: the stats block plus a short sample of the matches.
print("=== Bulk Processing Results ===")
results = extractor.process_documents(documents)

for result in results:
    print(f"\nDocument {result['doc_id']}:")
    print(f"  Total matches: {result['stats']['total_matches']}")
    print(f"  Valid numbers: {result['stats']['valid_numbers']}")
    print(f"  Unique numbers: {result['stats']['unique_numbers']}")
    
    for phone in result['phone_numbers'][:3]:  # Show first 3
        # Mark each number with a tick or a cross according to its validity flag.
        status = "✓" if phone['is_valid'] else "✗"
        print(f"    {status} {phone['raw_text']} -> {phone['international']}")

# 2) Aggregate statistics across all documents, printed as indented JSON.
print("\n=== Summary Report ===")
summary = extractor.generate_summary_report(results)
print(json.dumps(summary, indent=2))

Install with Tessl CLI