Parse US addresses using conditional random fields
npx @tessl/cli install tessl/pypi-usaddress@0.5.0

A Python library for parsing unstructured United States address strings into address components using advanced NLP methods and conditional random fields. It makes educated guesses in identifying address components, even in challenging cases where rule-based parsers typically fail.
pip install usaddress

import usaddress
# Example address string
addr = '123 Main St. Suite 100 Chicago, IL'
# Parse method: split address into components and label each
# Returns list of (token, label) tuples
parsed = usaddress.parse(addr)
# Output: [('123', 'AddressNumber'), ('Main', 'StreetName'), ('St.', 'StreetNamePostType'),
# ('Suite', 'OccupancyType'), ('100', 'OccupancyIdentifier'),
# ('Chicago,', 'PlaceName'), ('IL', 'StateName')]
# Tag method: merge consecutive components and return address type
# Returns (dict, address_type) tuple
tagged, address_type = usaddress.tag(addr)
# Output: ({'AddressNumber': '123', 'StreetName': 'Main',
# 'StreetNamePostType': 'St.', 'OccupancyType': 'Suite',
# 'OccupancyIdentifier': '100', 'PlaceName': 'Chicago',
# 'StateName': 'IL'}, 'Street Address')

Core functionality for parsing US addresses into labeled components using probabilistic models.
def parse(address_string: str) -> list[tuple[str, str]]:
    """
    Split an address string into components, and label each component.

    Args:
        address_string (str): The address to parse

    Returns:
        list[tuple[str, str]]: List of (token, label) pairs, one per token,
        in the order the tokens appear in the input string
    """
def tag(address_string: str, tag_mapping=None) -> tuple[dict[str, str], str]:
    """
    Parse and merge consecutive components & strip commas.

    Also return an address type ('Street Address', 'Intersection', 'PO Box',
    or 'Ambiguous').

    Because this method returns a dict with labels as keys, it will throw a
    RepeatedLabelError when multiple areas of an address have the same label.

    Args:
        address_string (str): The address to parse
        tag_mapping (dict, optional): Optional mapping to remap labels to a
            custom format

    Returns:
        tuple[dict[str, str], str]: (tagged_address_dict, address_type)

    Raises:
        RepeatedLabelError: If multiple areas of the address have the same
            label and cannot be merged into a single dict entry.
    """

# Basic parsing - get individual tokens with labels
tokens = usaddress.parse("1600 Pennsylvania Avenue NW Washington DC 20500")
for token, label in tokens:
    print(f"{token}: {label}")

# Advanced tagging - get consolidated address components
address, addr_type = usaddress.tag("1600 Pennsylvania Avenue NW Washington DC 20500")
print(f"Address type: {addr_type}")
print(f"Street number: {address.get('AddressNumber', 'N/A')}")
print(f"Street name: {address.get('StreetName', 'N/A')}")

# Custom label mapping
mapping = {'StreetName': 'Street', 'AddressNumber': 'Number'}
address, addr_type = usaddress.tag("123 Main St", tag_mapping=mapping)
print(address)  # Uses custom labels

Low-level tokenization functionality for splitting addresses into unlabeled tokens.
def tokenize(address_string: str) -> list[str]:
    """
    Split each component of an address into a list of unlabeled tokens.

    Args:
        address_string (str): The address to tokenize

    Returns:
        list[str]: The tokenized address components
    """

# Tokenize without labeling
tokens = usaddress.tokenize("123 Main St. Apt 4B")
print(tokens)  # ['123', 'Main', 'St.', 'Apt', '4B']

Functions for extracting machine learning features from address tokens.
def tokenFeatures(token: str) -> Feature:
    """
    Return a Feature dict with attributes that describe a token.

    Args:
        token (str): The token to analyze

    Returns:
        Feature: Dict with attributes describing the token
        (abbrev, digits, word, trailing.zeros, length, endsinpunc,
        directional, street_name, has.vowels)
    """
def tokens2features(address: list[str]) -> list[Feature]:
    """
    Turn every token into a Feature dict, and return a list of each token as
    a Feature.

    Each attribute in a Feature describes the corresponding token; entries
    also carry context from neighboring tokens (e.g. 'previous' / 'next'
    sub-features, as shown in the usage examples).

    Args:
        address (list[str]): The address as a list of tokens

    Returns:
        list[Feature]: A list of all tokens with feature details and context
    """

# Extract features for a single token
features = usaddress.tokenFeatures("123")
print(features['digits'])  # 'all_digits'
print(features['length'])  # 'd:3'

# Extract features for all tokens with context
tokens = ["123", "Main", "St."]
features_list = usaddress.tokens2features(tokens)
print(features_list[0]['next']['word'])  # 'main'
print(features_list[1]['previous']['digits'])  # 'all_digits'

Helper functions for analyzing token characteristics.
def digits(token: str) -> typing.Literal["all_digits", "some_digits", "no_digits"]:
    """
    Identify whether the token string is all digits, has some digits, or has
    no digits.

    Args:
        token (str): The token to parse

    Returns:
        str: Label denoting digit presence ('all_digits', 'some_digits',
        'no_digits')
    """
def trailingZeros(token: str) -> str:
    """
    Return any trailing zeros found at the end of a token.

    If none are found, then return an empty string.

    Args:
        token (str): The token to search for zeros

    Returns:
        str: The trailing zeros found, if any. Otherwise, an empty string.
    """

# Analyze digit content
print(usaddress.digits("123"))  # 'all_digits'
print(usaddress.digits("12th"))  # 'some_digits'
print(usaddress.digits("Main"))  # 'no_digits'

# Find trailing zeros
print(usaddress.trailingZeros("1200"))  # '00'
print(usaddress.trailingZeros("123"))  # ''

Constants defining the complete set of address component labels used by the parser.
LABELS: list[str]

The complete list of 25 address component labels based on the United States Thoroughfare, Landmark, and Postal Address Data Standard:
Built-in reference data for address parsing and feature extraction.

DIRECTIONS: set[str]
STREET_NAMES: set[str]
PARENT_LABEL: str
GROUP_LABEL: str

# Check if token is a direction
if token.lower() in usaddress.DIRECTIONS:
    print("This is a directional")

# Check if token is a street type
if token.lower() in usaddress.STREET_NAMES:
    print("This is a street type")

# Access all available labels
print(f"Total labels: {len(usaddress.LABELS)}")
for label in usaddress.LABELS:
    print(label)

Feature = dict[str, typing.Union[str, bool, "Feature"]]
class RepeatedLabelError(probableparsing.RepeatedLabelError):
    """
    Exception raised when tag() encounters repeated labels that cannot be
    merged.

    Attributes:
        REPO_URL (str): "https://github.com/datamade/usaddress/issues/new"
        DOCS_URL (str): "https://usaddress.readthedocs.io/"
    """

The tag() function can raise a RepeatedLabelError when multiple areas of an address have the same label and cannot be concatenated. This typically indicates either:
try:
    address, addr_type = usaddress.tag("123 Main St 456 Oak Ave")
except usaddress.RepeatedLabelError as e:
    print(f"Ambiguous address: {e}")
    # Fall back to parse() for detailed token analysis
    tokens = usaddress.parse("123 Main St 456 Oak Ave")

The tag() function returns one of four address types: