0
# Text Processing
1
2
Phone number discovery and extraction from unstructured text, plus as-you-type formatting for user interfaces. These tools enable applications to find phone numbers in documents, messages, and user input while providing real-time formatting feedback.
3
4
## Capabilities
5
6
### Phone Number Discovery in Text
7
8
Extract and identify phone numbers from unstructured text using configurable matching algorithms with different levels of strictness.
9
10
```python { .api }
11
class PhoneNumberMatcher:
12
"""Finds phone numbers in text strings."""
13
14
def __init__(self, text: str, region: str | None, leniency: int = Leniency.VALID, max_tries: int = 65535):
15
"""
16
Initialize matcher for finding phone numbers in text.
17
18
Parameters:
19
- text: Text to search for phone numbers
20
- region: Default region for parsing numbers without country codes
21
- leniency: Matching strictness level (Leniency enum value)
22
- max_tries: Maximum number of invalid candidate numbers to try before giving up on the text; it does not limit how many matches are returned (default: 65535)
23
"""
24
25
def has_next(self) -> bool:
26
"""Check if more matches exist."""
27
28
def next(self) -> PhoneNumberMatch:
29
"""Get next phone number match."""
30
31
def __iter__(self):
32
"""Iterator support for for-loops."""
33
34
class PhoneNumberMatch:
35
"""Represents a phone number found in text."""
36
37
start: int # Start position in text
38
end: int # End position in text
39
raw_string: str # Raw matched string from text
40
number: PhoneNumber # Parsed phone number object
41
42
def __init__(self, start: int, raw_string: str, numobj: PhoneNumber):
43
"""
44
Initialize a phone number match.
45
46
Parameters:
47
- start: Start position in original text
48
- raw_string: Raw string that was matched
49
- numobj: Parsed PhoneNumber object
50
"""
51
```
52
53
**Leniency Levels:**
54
55
```python { .api }
56
class Leniency:
57
"""Leniency levels for phone number matching in text."""
58
POSSIBLE = 0 # Most permissive, matches possible numbers
59
VALID = 1 # Matches valid numbers only (default)
60
STRICT_GROUPING = 2 # Valid + proper digit grouping
61
EXACT_GROUPING = 3 # Most restrictive, exact formatting match
62
```
63
64
**Usage Examples:**
65
66
```python
67
# Basic phone number extraction
68
text = "Call me at 510-748-8230 if it's before 9:30, or on 703-4800500 after 10am."
69
70
# Find all phone numbers with default settings
71
matches = list(phonenumbers.PhoneNumberMatcher(text, "US"))
72
73
for match in matches:
74
print(f"Found: {match.raw_string}")
75
print(f"Position: {match.start}-{match.end}")
76
print(f"Formatted: {phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)}")
77
print()
78
79
# Output:
80
# Found: 510-748-8230
81
# Position: 11-23
82
# Formatted: +15107488230
83
#
84
# Found: 703-4800500
85
# Position: 51-62
86
# Formatted: +17034800500
87
88
# Using different leniency levels
89
strict_matcher = phonenumbers.PhoneNumberMatcher(text, "US", phonenumbers.Leniency.EXACT_GROUPING)
90
lenient_matcher = phonenumbers.PhoneNumberMatcher(text, "US", phonenumbers.Leniency.POSSIBLE)
91
92
strict_matches = list(strict_matcher)
93
lenient_matches = list(lenient_matcher)
94
95
print(f"Strict matching found: {len(strict_matches)} numbers")
96
print(f"Lenient matching found: {len(lenient_matches)} numbers")
97
```
98
99
### International Phone Number Detection
100
101
Extract phone numbers from international text with various country contexts.
102
103
**Usage Examples:**
104
105
```python
106
# International text with mixed formats
107
international_text = """
108
Contact our offices:
109
US Office: +1 (650) 253-2222
110
UK Office: +44 20 8366 1177
111
Local UK: 020 8366 1177
112
France: +33 1 42 68 53 00
113
Germany: 030 12345678
114
"""
115
116
# Parse with no default region (requires country codes)
117
international_matches = list(phonenumbers.PhoneNumberMatcher(international_text, None))
118
119
for match in international_matches:
120
region = phonenumbers.region_code_for_number(match.number)
121
formatted = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
122
print(f"{match.raw_string} -> {formatted} ({region})")
123
124
# Parse with specific region context
125
uk_context_matches = list(phonenumbers.PhoneNumberMatcher(international_text, "GB"))
126
127
# This will also match "020 8366 1177" as a valid UK number
128
for match in uk_context_matches:
129
if match.raw_string == "020 8366 1177":
130
print(f"Local UK number detected: {match.raw_string}")
131
132
# Multi-region extraction with context switching
133
def extract_numbers_by_region(text, regions):
    """Scan the same text once per region and collect the hits for each.

    Parameters:
    - text: Text to search for phone numbers
    - regions: Iterable of region codes; each is used in turn as the default region

    Returns:
    Dict mapping every region code to the list of matches found with that default region
    """
    return {
        code: list(phonenumbers.PhoneNumberMatcher(text, code))
        for code in regions
    }
143
144
regions_to_try = ["US", "GB", "DE", "FR"]
145
regional_results = extract_numbers_by_region(international_text, regions_to_try)
146
147
for region, matches in regional_results.items():
148
print(f"\nWith {region} context: {len(matches)} matches")
149
for match in matches:
150
print(f" {match.raw_string}")
151
```
152
153
### Text Processing with Match Filtering
154
155
Advanced filtering and processing of found phone numbers.
156
157
**Usage Examples:**
158
159
```python
160
# Complex text with various number formats
161
complex_text = """
162
Customer service: 1-800-555-0123 (toll-free)
163
Emergency: 911
164
International: +44 20 7946 0958
165
Fax: (555) 123-4567 ext. 789
166
Invalid: 123-45 (too short)
167
Another: +1.555.987.6543
168
Website: Call 555-HELP (555-4357) for assistance
169
"""
170
171
# Extract with different leniency levels
172
def compare_leniency_levels(text, region):
    """Print the matches found in the text at each of the four leniency levels.

    Parameters:
    - text: Text to search for phone numbers
    - region: Default region for numbers written without a country code
    """
    levels = phonenumbers.Leniency
    labelled_levels = (
        ("Possible", levels.POSSIBLE),
        ("Valid", levels.VALID),
        ("Strict Grouping", levels.STRICT_GROUPING),
        ("Exact Grouping", levels.EXACT_GROUPING),
    )

    for label, level in labelled_levels:
        found = list(phonenumbers.PhoneNumberMatcher(text, region, level))
        print(f"\n{label} ({len(found)} matches):")
        for hit in found:
            print(f" '{hit.raw_string}' at position {hit.start}-{hit.end}")
188
189
compare_leniency_levels(complex_text, "US")
190
191
# Filter matches by criteria
192
def filter_matches(text, region, filter_func):
    """Return the matches in the text for which filter_func returns a truthy value.

    Parameters:
    - text: Text to search for phone numbers
    - region: Default region for numbers written without a country code
    - filter_func: Callable taking a match and returning whether to keep it
    """
    kept = []
    for candidate in phonenumbers.PhoneNumberMatcher(text, region):
        if filter_func(candidate):
            kept.append(candidate)
    return kept
196
197
# Filter for toll-free numbers only
198
def is_toll_free(match):
    """Return True when the matched number's type is TOLL_FREE."""
    return phonenumbers.number_type(match.number) == phonenumbers.PhoneNumberType.TOLL_FREE
201
202
toll_free_matches = filter_matches(complex_text, "US", is_toll_free)
203
print(f"\nToll-free numbers found: {len(toll_free_matches)}")
204
205
# Filter for specific regions
206
def is_uk_number(match):
    """Return True when the matched number's region code is "GB"."""
    return phonenumbers.region_code_for_number(match.number) == "GB"
209
210
uk_matches = filter_matches(complex_text, None, is_uk_number)
211
print(f"UK numbers found: {len(uk_matches)}")
212
213
# Extract and categorize numbers
214
def categorize_numbers(text, region):
    """Categorize phone numbers found in text by number type.

    Parameters:
    - text: Text to search for phone numbers
    - region: Default region for numbers written without a country code (may be None)

    Returns:
    Dict with the keys "Mobile", "Fixed Line", "Toll Free", "Emergency" and
    "Other", each mapping to a list of the matches placed in that category
    """
    number_types = phonenumbers.PhoneNumberType
    labels_by_type = {
        number_types.MOBILE: "Mobile",
        number_types.FIXED_LINE: "Fixed Line",
        number_types.TOLL_FREE: "Toll Free",
    }
    categories = {
        "Mobile": [],
        "Fixed Line": [],
        "Toll Free": [],
        "Emergency": [],
        "Other": [],
    }

    for match in phonenumbers.PhoneNumberMatcher(text, region):
        label = labels_by_type.get(phonenumbers.number_type(match.number))

        if label is None:
            # The emergency lookup compares against digits as dialled inside a
            # region, so hand it the national number. E.164 digits carry the
            # country code in front and would never compare equal.
            national_digits = phonenumbers.national_significant_number(match.number)
            # Prefer the region the number actually belongs to; fall back to
            # the caller's default. Either one may be None.
            number_region = phonenumbers.region_code_for_number(match.number) or region
            is_emergency = (
                number_region is not None
                and phonenumbers.is_emergency_number(national_digits, number_region)
            )
            label = "Emergency" if is_emergency else "Other"

        categories[label].append(match)

    return categories
243
244
categorized = categorize_numbers(complex_text, "US")
245
for category, matches in categorized.items():
246
if matches:
247
print(f"\n{category} ({len(matches)}):")
248
for match in matches:
249
print(f" {match.raw_string}")
250
```
251
252
### As-You-Type Formatting
253
254
Real-time phone number formatting for user input interfaces, providing immediate feedback as users type.
255
256
```python { .api }
257
class AsYouTypeFormatter:
258
"""Formats phone numbers as digits are entered."""
259
260
def __init__(self, region_code: str):
261
"""
262
Initialize formatter for specified region.
263
264
Parameters:
265
- region_code: Region code for formatting context (e.g., "US", "GB")
266
"""
267
268
def input_digit(self, next_char: str, remember_position: bool = False) -> str:
269
"""
270
Add next digit and get formatted result.
271
272
Parameters:
273
- next_char: Next character entered by user
274
- remember_position: Whether to remember cursor position for get_remembered_position()
275
276
Returns:
277
Formatted phone number string with current input
278
"""
279
280
def clear(self):
281
"""Clear all input and reset formatter."""
282
283
def get_remembered_position(self) -> int:
284
"""Get position of remembered cursor location."""
285
```
286
287
**Usage Examples:**
288
289
```python
290
# Basic as-you-type formatting
291
formatter = phonenumbers.AsYouTypeFormatter("US")
292
293
# Simulate user typing digits one by one
294
digits = "6502532222"
295
print("User input -> Formatted output")
296
print("-" * 30)
297
298
for digit in digits:
299
result = formatter.input_digit(digit)
300
print(f"'{digit}' -> '{result}'")
301
302
# Output:
303
# '6' -> '6'
304
# '5' -> '65'
305
# '0' -> '650'
306
# '2' -> '650 2'
307
# '5' -> '650 25'
308
# '3' -> '650 253'
309
# '2' -> '650-2532'
310
# '2' -> '(650) 253-22'
311
# '2' -> '(650) 253-222'
312
# '2' -> '(650) 253-2222'
313
314
# International number formatting
315
international_formatter = phonenumbers.AsYouTypeFormatter("US")
316
international_digits = "+442083661177"
317
318
print("\nInternational formatting:")
319
for char in international_digits:
320
result = international_formatter.input_digit(char)
321
print(f"'{char}' -> '{result}'")
322
323
# Position remembering for cursor tracking
324
formatter_with_cursor = phonenumbers.AsYouTypeFormatter("US")
325
digits_with_cursor = "6502532222"
326
327
# Remember position after 7th digit
328
for i, digit in enumerate(digits_with_cursor):
329
remember = (i == 6) # Remember position after "6502532"
330
result = formatter_with_cursor.input_digit(digit, remember)
331
332
if remember:
333
remembered_pos = formatter_with_cursor.get_remembered_position()
334
print(f"Cursor position remembered: {remembered_pos} in '{result}'")
335
336
# Clear and restart
337
formatter.clear()
338
new_result = formatter.input_digit("4")
339
print(f"After clear: '{new_result}'") # "4"
340
```
341
342
### Real-World Text Processing Applications
343
344
Practical examples for common use cases.
345
346
**Usage Examples:**
347
348
```python
349
# Email/document phone number extraction
350
def extract_contact_info(document_text, default_region="US"):
    """Build one summary record per phone number found in a document.

    Parameters:
    - document_text: Text to search for phone numbers
    - default_region: Default region for numbers written without a country code

    Returns:
    List of dicts with the keys "raw_text", "position", "formatted_national",
    "formatted_international", "region", "type" and "is_valid"
    """
    formats = phonenumbers.PhoneNumberFormat

    def describe(hit):
        # Summarise a single match as a plain dict.
        parsed = hit.number
        return {
            "raw_text": hit.raw_string,
            "position": f"{hit.start}-{hit.end}",
            "formatted_national": phonenumbers.format_number(parsed, formats.NATIONAL),
            "formatted_international": phonenumbers.format_number(parsed, formats.INTERNATIONAL),
            "region": phonenumbers.region_code_for_number(parsed),
            "type": phonenumbers.number_type(parsed),
            "is_valid": phonenumbers.is_valid_number(parsed),
        }

    return [
        describe(hit)
        for hit in phonenumbers.PhoneNumberMatcher(document_text, default_region)
    ]
372
373
# Example document
374
business_card = """
375
John Smith, CEO
376
Acme Corporation
377
Phone: (555) 123-4567
378
Mobile: 555.987.6543
379
International: +1-555-246-8101
380
Fax: (555) 123-4568
381
"""
382
383
contacts = extract_contact_info(business_card)
384
for contact in contacts:
385
print(f"Found: {contact['raw_text']} -> {contact['formatted_national']}")
386
387
# Live input field formatting simulation
388
class PhoneInputField:
    """Simulates a phone input field with real-time formatting.

    The field records every accepted character so that a backspace can be
    applied by resetting the formatter and replaying the remaining input.
    """

    # Non-digit characters the field accepts, one at a time.
    _ACCEPTED_SYMBOLS = frozenset("+()-. ")

    def __init__(self, region_code="US"):
        """
        Create an empty field.

        Parameters:
        - region_code: Region code used for formatting context (e.g., "US", "GB")
        """
        self.formatter = phonenumbers.AsYouTypeFormatter(region_code)
        self.value = ""
        self._typed = []  # accepted characters, in the order they were entered

    def on_key_press(self, key):
        """
        Handle user key press.

        Parameters:
        - key: A single character, or the string "BACKSPACE"

        Returns:
        The text the field currently displays. Keys that are not accepted
        leave the display unchanged.
        """
        if key == "BACKSPACE":
            if self._typed:
                self._typed.pop()
            self._replay()
        elif len(key) == 1 and (key.isdigit() or key in self._ACCEPTED_SYMBOLS):
            # The length check rejects "" and multi-character strings such as
            # "()", which a plain substring test against "+()-. " would accept.
            self._typed.append(key)
            self.value = self.formatter.input_digit(key)
        return self.value

    def clear(self):
        """Clear the field."""
        self.formatter.clear()
        self.value = ""
        self._typed.clear()

    def _replay(self):
        """Reset the formatter and feed it the recorded characters again."""
        self.formatter.clear()
        self.value = ""
        for char in self._typed:
            self.value = self.formatter.input_digit(char)
410
411
# Simulate user input
412
phone_field = PhoneInputField("US")
413
user_input = "6502532222"
414
415
print("Phone input field simulation:")
416
for char in user_input:
417
display_value = phone_field.on_key_press(char)
418
print(f"User typed '{char}' -> Display: '{display_value}'")
419
420
# Multi-format phone number search
421
def find_phone_variations(text, phone_to_find, region):
    """Return every spelling of one phone number that occurs in the text.

    Parameters:
    - text: Text to search
    - phone_to_find: The number to look for, in any format that can be parsed
    - region: Default region for numbers written without a country code

    Returns:
    List of raw matched strings whose E.164 form equals that of phone_to_find;
    an empty list when a NumberParseException is raised
    """
    e164 = phonenumbers.PhoneNumberFormat.E164
    try:
        wanted = phonenumbers.format_number(
            phonenumbers.parse(phone_to_find, region), e164
        )
        candidates = phonenumbers.PhoneNumberMatcher(
            text, region, phonenumbers.Leniency.POSSIBLE
        )
        return [
            hit.raw_string
            for hit in candidates
            if phonenumbers.format_number(hit.number, e164) == wanted
        ]
    except phonenumbers.NumberParseException:
        return []
438
439
# Find all ways a number appears in text
440
text_with_variations = """
441
Contact us at (650) 253-2222 or 650-253-2222.
442
International callers: +1 650 253 2222
443
Text: 6502532222
444
"""
445
446
variations = find_phone_variations(text_with_variations, "6502532222", "US")
447
print(f"\nVariations found: {variations}")
448
# Output: ['(650) 253-2222', '650-253-2222', '+1 650 253 2222', '6502532222']
449
```
450
451
### Advanced Text Processing Features
452
453
Sophisticated text processing capabilities for complex scenarios.
454
455
**Usage Examples:**
456
457
```python
458
# Batch processing with performance optimization
459
def batch_extract_numbers(documents, default_region="US", max_matches_per_doc=100):
    """Extract phone numbers from multiple documents.

    Parameters:
    - documents: Dict mapping document id to document text
    - default_region: Default region for numbers written without a country code
    - max_matches_per_doc: Upper bound on matches returned per document
      (None for no limit)

    Returns:
    Dict mapping each document id to a list of match dicts with the keys
    "text", "start", "end", "e164" and "region", or to {"error": message}
    when processing that document raised an exception
    """
    from itertools import islice

    results = {}

    for doc_id, text in documents.items():
        try:
            matcher = phonenumbers.PhoneNumberMatcher(
                text, default_region, phonenumbers.Leniency.VALID
            )
            # The matcher's max_tries argument bounds rejected candidates, not
            # returned matches, so the per-document cap is applied here.
            results[doc_id] = [
                {
                    "text": match.raw_string,
                    "start": match.start,
                    "end": match.end,
                    "e164": phonenumbers.format_number(
                        match.number, phonenumbers.PhoneNumberFormat.E164
                    ),
                    "region": phonenumbers.region_code_for_number(match.number),
                }
                for match in islice(matcher, max_matches_per_doc)
            ]
        except Exception as e:
            # Deliberate best-effort: one bad document must not abort the
            # batch, so the failure is recorded against its id.
            results[doc_id] = {"error": str(e)}

    return results
485
486
# Example batch processing
487
sample_documents = {
488
"email_1": "Please call me at (555) 123-4567 or email john@example.com",
489
"email_2": "UK office: +44 20 7946 0958, US office: 1-800-555-0199",
490
"invoice": "Questions? Contact us at 650.253.2222"
491
}
492
493
batch_results = batch_extract_numbers(sample_documents)
494
for doc_id, matches in batch_results.items():
495
if isinstance(matches, list):
496
print(f"\n{doc_id}: {len(matches)} phone numbers")
497
for match in matches:
498
print(f" {match['text']} -> {match['e164']}")
499
500
# Smart region detection
501
def smart_extract_with_region_detection(text, candidate_regions=None):
    """Extract numbers using whichever candidate region yields the most matches.

    Parameters:
    - text: Text to search for phone numbers
    - candidate_regions: Region codes to try; defaults to
      ["US", "GB", "CA", "AU", "DE", "FR"]

    Returns:
    Tuple (matches, best_region). Ties go to the earliest region in
    candidate_regions; an empty candidate list falls back to "US".
    """
    if candidate_regions is None:
        candidate_regions = ["US", "GB", "CA", "AU", "DE", "FR"]

    # Keep the matches from the scoring pass so the winning region does not
    # need a second scan of the same text.
    matches_by_region = {}
    for region in candidate_regions:
        matcher = phonenumbers.PhoneNumberMatcher(text, region, phonenumbers.Leniency.VALID)
        matches_by_region[region] = list(matcher)

    if not matches_by_region:
        # No candidates were supplied: scan once with the "US" fallback.
        return list(phonenumbers.PhoneNumberMatcher(text, "US")), "US"

    best_region = max(matches_by_region, key=lambda code: len(matches_by_region[code]))
    return matches_by_region[best_region], best_region
520
521
mixed_text = """
522
Call our London office at 020 7946 0958 or
523
our New York office at (212) 555-0123.
524
Emergency UK: 999, Emergency US: 911
525
"""
526
527
matches, detected_region = smart_extract_with_region_detection(mixed_text)
528
print(f"\nDetected best region: {detected_region}")
529
print(f"Found {len(matches)} matches")
530
```