# Phone Number Matching

Advanced pattern matching to find and extract phone numbers from text, with configurable leniency levels and comprehensive match information. This capability enables extraction of phone numbers from unstructured text like documents, emails, and web pages.

## Capabilities

### PhoneNumberMatcher Class

Iterator class that finds phone number matches in text with various leniency options.

```python { .api }
class PhoneNumberMatcher:
    """
    Iterator for finding phone numbers in text.

    Scans through text and yields PhoneNumberMatch objects for
    each phone number found, with configurable leniency levels.
    """

    def __init__(self, text: str, region: str, leniency: "Leniency" = None,
                 max_tries: int = 65536) -> None:
        """
        Initialize matcher for finding phone numbers in text.

        Parameters:
        - text: Text to search for phone numbers
        - region: Two-letter region code for parsing context
        - leniency: Matching strictness level (defaults to Leniency.VALID)
        - max_tries: Maximum number of matching attempts to prevent infinite loops
        """

    def __iter__(self):
        """Return iterator interface."""

    def __next__(self) -> "PhoneNumberMatch":
        """Get next phone number match."""
```

### PhoneNumberMatch Class

Represents a phone number found in text with position and metadata information.

```python { .api }
class PhoneNumberMatch:
    """
    Represents a phone number match found in text.

    Contains the matched phone number, its position in the text,
    and the raw text that was matched.

    In the Python phonenumbers package these values are plain instance
    attributes, so they are read without call parentheses
    (``match.start``, not ``match.start()``).
    """

    # Zero-based index of the match start position in the original text.
    start: int

    # Zero-based index of the match end position (exclusive).
    end: int

    # PhoneNumber object representing the matched number.
    number: "PhoneNumber"

    # Original text substring that contained the phone number.
    raw_string: str
```

### Leniency Levels

Control how strict the matching algorithm should be when finding phone numbers.

```python { .api }
class Leniency:
    """
    Leniency levels for phone number matching.

    Controls how strict the matcher is when identifying
    potential phone numbers in text. The levels are plain integer
    constants, listed here from least to most strict.
    """

    POSSIBLE = 0
    """Match numbers that are possible (basic length checks)."""

    VALID = 1
    """Match only valid phone numbers (default level)."""

    STRICT_GROUPING = 2
    """Match only numbers with correct punctuation grouping."""

    EXACT_GROUPING = 3
    """Match only numbers with exact formatting patterns."""
108
```
109
110
## Usage Examples
111
112
### Basic Phone Number Extraction
113
114
```python
115
import phonenumbers

# Text containing various phone numbers
text = """
Contact us at 650-253-2222 or call our international line at +44 20 8366 1177.
You can also reach support at (800) 555-1234 or send a fax to 650.253.2223.
Our office number is 1-650-253-2222 extension 1234.
"""

print("Phone numbers found in text:")
for match in phonenumbers.PhoneNumberMatcher(text, "US"):
    # start, end, number and raw_string are attributes of the match object,
    # so they are read without call parentheses
    number = match.number
    formatted = phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
    print(f" Position {match.start}-{match.end}: '{match.raw_string}' -> {formatted}")
129
```
130
131
### Leniency Level Comparison
132
133
```python
134
import phonenumbers
from phonenumbers import Leniency

text = "Call me at 555-1234 or 1-800-FLOWERS today!"

# Each level is paired with the label printed for it
leniency_levels = [
    (Leniency.POSSIBLE, "POSSIBLE"),
    (Leniency.VALID, "VALID"),
    (Leniency.STRICT_GROUPING, "STRICT_GROUPING"),
    (Leniency.EXACT_GROUPING, "EXACT_GROUPING")
]

for leniency, name in leniency_levels:
    print(f"\n{name} leniency:")
    matches = list(phonenumbers.PhoneNumberMatcher(text, "US", leniency))
    print(f" Found {len(matches)} matches")

    for match in matches:
        # number and raw_string are attributes of the match, not methods
        formatted = phonenumbers.format_number(
            match.number,
            phonenumbers.PhoneNumberFormat.INTERNATIONAL
        )
        print(f" '{match.raw_string}' -> {formatted}")
157
```
158
159
### Document Processing Pipeline
160
161
```python
162
import phonenumbers
163
import re
164
165
class PhoneNumberExtractor:
    """Extract and normalize phone numbers from documents."""

    def __init__(self, default_region="US", leniency=None):
        """
        Create an extractor.

        Parameters:
        - default_region: Region code used when a call does not pass its own
        - leniency: Matching strictness level; None selects
          phonenumbers.Leniency.VALID
        """
        self.default_region = default_region
        # The default is resolved here rather than in the signature: only the
        # `phonenumbers` module is imported, so a bare `Leniency.VALID`
        # default would raise NameError when the class is defined.
        # `is None` (not `or`) keeps an explicit Leniency.POSSIBLE (0) intact.
        if leniency is None:
            leniency = phonenumbers.Leniency.VALID
        self.leniency = leniency

    def extract_from_text(self, text, region=None):
        """
        Extract all phone numbers from text.

        Parameters:
        - text: Text to scan
        - region: Region code for this call (falls back to default_region)

        Returns:
            List with one dict per match: raw text, start/end positions,
            the parsed number, its E164 / international / national
            renderings, validity flag, number type and region.
        """
        search_region = region or self.default_region
        formats = phonenumbers.PhoneNumberFormat
        matches = []

        for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency):
            # Match data is exposed as attributes (no call parentheses)
            number = match.number

            matches.append({
                'raw_text': match.raw_string,
                'start_pos': match.start,
                'end_pos': match.end,
                'parsed_number': number,
                'formatted': {
                    'e164': phonenumbers.format_number(number, formats.E164),
                    'international': phonenumbers.format_number(number, formats.INTERNATIONAL),
                    'national': phonenumbers.format_number(number, formats.NATIONAL)
                },
                'is_valid': phonenumbers.is_valid_number(number),
                'number_type': phonenumbers.number_type(number),
                'region': phonenumbers.region_code_for_number(number)
            })

        return matches

    def extract_unique_numbers(self, text, region=None):
        """
        Extract unique phone numbers, removing duplicates.

        Matches are keyed by their E164 form. When the same number occurs
        more than once, the occurrence with the longest raw text is kept.

        Returns:
            List of match dicts (same shape as extract_from_text), one per
            distinct E164 number, in order of first appearance.
        """
        unique_numbers = {}

        for match in self.extract_from_text(text, region):
            e164 = match['formatted']['e164']
            existing = unique_numbers.get(e164)
            # Keep the match with more surrounding formatting/context
            if existing is None or len(match['raw_text']) > len(existing['raw_text']):
                unique_numbers[e164] = match

        return list(unique_numbers.values())

    def anonymize_text(self, text, replacement="[PHONE]", region=None):
        """
        Replace phone numbers in text with anonymized placeholders.

        Parameters:
        - text: Text to anonymize
        - replacement: Placeholder substituted for every matched number
        - region: Region code for this call (falls back to default_region)

        Returns:
            Copy of text with each matched span replaced.
        """
        search_region = region or self.default_region

        spans = [
            (match.start, match.end)
            for match in phonenumbers.PhoneNumberMatcher(text, search_region, self.leniency)
        ]

        # Replace from the end backwards so earlier offsets stay valid even
        # though the replacement differs in length from the matched text
        anonymized_text = text
        for start, end in sorted(spans, reverse=True):
            anonymized_text = anonymized_text[:start] + replacement + anonymized_text[end:]

        return anonymized_text

# Example usage
extractor = PhoneNumberExtractor("US")

sample_document = """
Please contact our sales team at 1-800-555-SALE (1-800-555-7253) or
our technical support at +1 (650) 253-2222. International customers
can reach us at +44 20 8366 1177 or +33 1 42 68 53 00.

For urgent matters, call our emergency line: 911
For billing questions: 650.253.2223 ext. 100
"""

# Every match, with its position and formatted variants
print("=== Phone Number Extraction ===")
matches = extractor.extract_from_text(sample_document)
for i, match in enumerate(matches):
    print(f"{i+1}. '{match['raw_text']}' (pos {match['start_pos']}-{match['end_pos']})")
    print(f" -> {match['formatted']['international']}")
    print(f" -> Type: {match['number_type']}, Region: {match['region']}")
    print()

# One entry per distinct E164 number
print("=== Unique Numbers ===")
unique = extractor.extract_unique_numbers(sample_document)
for match in unique:
    print(f"- {match['formatted']['international']} ({match['region']})")

# The same document with every matched number replaced by a placeholder
print("=== Anonymized Text ===")
anonymized = extractor.anonymize_text(sample_document)
print(anonymized)
```

### Contact Information Extraction

```python
import phonenumbers
import re

class ContactExtractor:
    """Extract structured contact information from text."""

    def __init__(self, default_region="US"):
        """
        Create an extractor.

        Parameters:
        - default_region: Region code passed to PhoneNumberMatcher
        """
        self.default_region = default_region
        # Inside a character class "|" is a literal pipe, not alternation,
        # so the top-level-domain class is written as [A-Za-z]
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    def extract_contacts(self, text):
        """
        Extract phone numbers, emails, and other contact info.

        Returns:
            Dict with three lists:
            - 'phone_numbers': dicts with 'raw', 'formatted', 'type', 'position'
            - 'emails': dicts with 'email', 'position'
            - 'text_segments': stripped text found between/around the above
        """
        contacts = {
            'phone_numbers': [],
            'emails': [],
            'text_segments': []
        }

        # Extract phone numbers (match data is exposed as attributes)
        for match in phonenumbers.PhoneNumberMatcher(text, self.default_region):
            contacts['phone_numbers'].append({
                'raw': match.raw_string,
                'formatted': phonenumbers.format_number(
                    match.number,
                    phonenumbers.PhoneNumberFormat.INTERNATIONAL
                ),
                'type': phonenumbers.number_type(match.number),
                'position': (match.start, match.end)
            })

        # Extract email addresses (re.Match objects: start()/end() are methods)
        for match in self.email_pattern.finditer(text):
            contacts['emails'].append({
                'email': match.group(),
                'position': (match.start(), match.end())
            })

        # Extract text segments between contact info
        all_positions = sorted(
            [phone['position'] for phone in contacts['phone_numbers']]
            + [email['position'] for email in contacts['emails']]
        )
        contacts['text_segments'] = self._text_segments(text, all_positions)

        return contacts

    @staticmethod
    def _text_segments(text, spans):
        """Return the non-empty, stripped pieces of text outside the sorted (start, end) spans."""
        segments = []
        last_end = 0

        for start, end in spans:
            if start > last_end:
                segment = text[last_end:start].strip()
                if segment:
                    segments.append(segment)
            # max() stops the cursor moving backwards when two spans overlap,
            # which would otherwise re-emit text that was already consumed
            last_end = max(last_end, end)

        # Final segment
        if last_end < len(text):
            segment = text[last_end:].strip()
            if segment:
                segments.append(segment)

        return segments

    def format_contact_card(self, text):
        """
        Format extracted contact information as a structured card.

        Returns:
            Newline-joined string: one line per phone number type, then an
            "Email:" line and a "Notes:" line when those are present.
        """
        contacts = self.extract_contacts(text)

        card = []

        # Group phone numbers by type
        phones_by_type = {}
        for phone in contacts['phone_numbers']:
            phones_by_type.setdefault(phone['type'], []).append(phone['formatted'])

        # Format phone numbers
        for phone_type, numbers in phones_by_type.items():
            # NOTE(review): number_type() appears to return a plain integer code
            # in the Python port, in which case this label is that digit rather
            # than a readable name - confirm against the installed version
            type_name = str(phone_type).replace('PhoneNumberType.', '').title()
            card.append(f"{type_name}: {', '.join(numbers)}")

        # Add emails
        if contacts['emails']:
            emails = [email['email'] for email in contacts['emails']]
            card.append(f"Email: {', '.join(emails)}")

        # Add other text
        if contacts['text_segments']:
            card.append(f"Notes: {' | '.join(contacts['text_segments'])}")

        return '\n'.join(card)

# Example usage
extractor = ContactExtractor("US")

business_card_text = """
John Smith - Sales Manager
Acme Corporation
Phone: (650) 253-2222
Mobile: 650.555.1234
Email: john.smith@acme.com
Alternative: jsmith@gmail.com

Call anytime between 9 AM - 5 PM PST
Emergency contact: +1-800-555-HELP
"""

print("=== Contact Extraction ===")
contacts = extractor.extract_contacts(business_card_text)

# Raw extraction results, one section per kind of contact data
print(f"Phone numbers found: {len(contacts['phone_numbers'])}")
for phone in contacts['phone_numbers']:
    print(f" - {phone['raw']} -> {phone['formatted']} ({phone['type']})")

print(f"\nEmails found: {len(contacts['emails'])}")
for email in contacts['emails']:
    print(f" - {email['email']}")

print(f"\nText segments: {len(contacts['text_segments'])}")
for segment in contacts['text_segments']:
    print(f" - {segment}")

# The same data rendered as a compact card
print("\n=== Formatted Contact Card ===")
card = extractor.format_contact_card(business_card_text)
print(card)
```

### Bulk Text Processing

```python
import phonenumbers
from concurrent.futures import ThreadPoolExecutor
import json

class BulkPhoneExtractor:
    """Process multiple documents for phone number extraction."""

    def __init__(self, default_region="US", max_workers=4):
        """
        Create a bulk extractor.

        Parameters:
        - default_region: Region code used when a document supplies none
        - max_workers: Thread count handed to ThreadPoolExecutor
        """
        self.default_region = default_region
        self.max_workers = max_workers

    def process_document(self, doc_id, text, region=None):
        """
        Process a single document.

        Parameters:
        - doc_id: Identifier echoed back in the result
        - text: Document text to scan
        - region: Region code for this document (falls back to default_region)

        Returns:
            Dict with 'doc_id', a 'phone_numbers' list (one JSON-friendly
            dict per match) and 'stats' holding total_matches,
            valid_numbers and unique_numbers.
        """
        search_region = region or self.default_region
        formats = phonenumbers.PhoneNumberFormat

        result = {
            'doc_id': doc_id,
            'phone_numbers': [],
            'stats': {
                'total_matches': 0,
                'valid_numbers': 0,
                'unique_numbers': 0
            }
        }

        seen_numbers = set()

        for match in phonenumbers.PhoneNumberMatcher(text, search_region):
            # Match data is exposed as attributes (no call parentheses)
            number = match.number
            e164 = phonenumbers.format_number(number, formats.E164)

            is_valid = phonenumbers.is_valid_number(number)

            result['phone_numbers'].append({
                'raw_text': match.raw_string,
                'e164': e164,
                'international': phonenumbers.format_number(number, formats.INTERNATIONAL),
                'is_valid': is_valid,
                'type': str(phonenumbers.number_type(number)),
                'region': phonenumbers.region_code_for_number(number),
                'position': [match.start, match.end]
            })

            result['stats']['total_matches'] += 1
            if is_valid:
                result['stats']['valid_numbers'] += 1

            seen_numbers.add(e164)

        result['stats']['unique_numbers'] = len(seen_numbers)
        return result

    def process_documents(self, documents):
        """
        Process multiple documents in parallel.

        Parameters:
        - documents: Iterable of (doc_id, text, region) tuples

        Returns:
            List of process_document results in submission order. A document
            whose processing raises is reported on stdout and left out.
        """
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Keep the doc_id next to its future so a failure can name the document
            submitted = [
                (doc_id, executor.submit(self.process_document, doc_id, text, region))
                for doc_id, text, region in documents
            ]

            results = []
            for doc_id, future in submitted:
                try:
                    results.append(future.result())
                except Exception as e:
                    # Best-effort batch: report the failure and carry on
                    print(f"Error processing document {doc_id}: {e}")

            return results

    def generate_summary_report(self, results):
        """
        Generate summary statistics across all documents.

        Parameters:
        - results: List of dicts as returned by process_document

        Returns:
            Dict with a 'summary' section (document/match/valid counts,
            globally unique valid numbers, average matches per document)
            plus 'regions' and 'types' counters over valid numbers only.
        """
        total_docs = len(results)
        total_matches = sum(r['stats']['total_matches'] for r in results)
        total_valid = sum(r['stats']['valid_numbers'] for r in results)

        # Collect all unique numbers across documents
        all_numbers = set()
        regions = {}
        types = {}

        for result in results:
            for phone in result['phone_numbers']:
                if not phone['is_valid']:
                    continue

                all_numbers.add(phone['e164'])

                region = phone['region']
                regions[region] = regions.get(region, 0) + 1

                phone_type = phone['type']
                types[phone_type] = types.get(phone_type, 0) + 1

        return {
            'summary': {
                'total_documents': total_docs,
                'total_matches': total_matches,
                'valid_numbers': total_valid,
                'unique_numbers_global': len(all_numbers),
                # Guard against division by zero for an empty batch
                'average_matches_per_doc': total_matches / total_docs if total_docs > 0 else 0
            },
            'regions': regions,
            'types': types
        }

# Example usage
extractor = BulkPhoneExtractor("US", max_workers=2)

# Sample documents to process: (doc_id, text, region)
documents = [
    ("doc1", "Call us at 650-253-2222 or +44 20 8366 1177", "US"),
    ("doc2", "Support: 1-800-555-1234, International: +33 1 42 68 53 00", "US"),
    ("doc3", "Office: (555) 123-4567, Mobile: 555.987.6543", "US"),
    ("doc4", "Invalid phone: 123-456, Valid: +1-650-253-2222", "US"),
]

print("=== Bulk Processing Results ===")
results = extractor.process_documents(documents)

for result in results:
    print(f"\nDocument {result['doc_id']}:")
    print(f" Total matches: {result['stats']['total_matches']}")
    print(f" Valid numbers: {result['stats']['valid_numbers']}")
    print(f" Unique numbers: {result['stats']['unique_numbers']}")

    for phone in result['phone_numbers'][:3]:  # Show first 3
        status = "✓" if phone['is_valid'] else "✗"
        print(f" {status} {phone['raw_text']} -> {phone['international']}")

# Aggregate statistics over every processed document
print("\n=== Summary Report ===")
summary = extractor.generate_summary_report(results)
print(json.dumps(summary, indent=2))
```