Data cleaning, validation, and transformation utilities for improving GTFS data quality. This module includes functions for cleaning IDs, times, route names, and comprehensive feed cleaning operations.
def clean_column_names(df):
"""
Strip whitespace from DataFrame column names.
Parameters:
- df (DataFrame): DataFrame to clean
Returns:
- DataFrame: DataFrame with cleaned column names
"""
def clean_ids(feed):
"""
Clean whitespace from ID fields in all feed tables.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Feed with cleaned ID fields
"""
def extend_id(feed, id_col, extension, *, prefix=True):
"""
Add prefix or suffix to ID column across relevant tables.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
- id_col (str): ID column name to extend
- extension (str): Prefix or suffix to add
- prefix (bool): If True add as prefix, else as suffix
Returns:
- Feed: Feed with extended IDs
"""
def clean_route_short_names(feed):
"""
Clean and disambiguate route short names.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Feed with cleaned route short names
"""
def clean_times(feed):
"""
Convert H:MM:SS time format to HH:MM:SS format.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Feed with standardized time formats
"""
def build_aggregate_routes_dict(routes, by='route_short_name', route_id_prefix='route_'):
"""
Build dictionary for aggregating routes by specified field.
Parameters:
- routes (DataFrame): Routes DataFrame
- by (str): Field to aggregate by
- route_id_prefix (str): Prefix for new route IDs
Returns:
- dict: Mapping from old route IDs to new aggregated route IDs
"""
def aggregate_routes(feed, by='route_short_name', route_id_prefix='route_'):
"""
Aggregate routes by specified field (e.g., route_short_name).
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
- by (str): Field to aggregate by
- route_id_prefix (str): Prefix for new route IDs
Returns:
- Feed: Feed with aggregated routes
"""
def build_aggregate_stops_dict(stops, by='stop_code', stop_id_prefix='stop_'):
"""
Build dictionary for aggregating stops by specified field.
Parameters:
- stops (DataFrame): Stops DataFrame
- by (str): Field to aggregate by
- stop_id_prefix (str): Prefix for new stop IDs
Returns:
- dict: Mapping from old stop IDs to new aggregated stop IDs
"""
def aggregate_stops(feed, by='stop_code', stop_id_prefix='stop_'):
"""
Aggregate stops by specified field (e.g., stop_code).
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
- by (str): Field to aggregate by
- stop_id_prefix (str): Prefix for new stop IDs
Returns:
- Feed: Feed with aggregated stops
"""
def drop_zombies(feed):
"""
Drop unused agencies, stops, trips, shapes, routes, and services.
Removes entities that are defined but not referenced by other tables,
ensuring feed consistency and reducing file size.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Feed with unused entities removed
"""
def drop_invalid_columns(feed):
"""
Drop non-GTFS columns from all feed tables.
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Feed with only valid GTFS columns
"""
def clean(feed):
"""
Apply comprehensive cleaning operations to the feed.
Performs multiple cleaning operations including:
- ID cleaning
- Time format standardization
- Column name cleaning
- Removal of unused entities
- Route short name cleaning
Parameters:
- feed (Feed): GTFS feed object (modified in-place)
Returns:
- Feed: Comprehensively cleaned feed
"""
def datestr_to_date(x, format_str='%Y%m%d', *, inverse=False):
"""
Convert between date strings and datetime.date objects.
Parameters:
- x: Date string or datetime.date object
- format_str (str): Date format string
- inverse (bool): If True, convert date to string
Returns:
- datetime.date or str: Converted date
"""
def timestr_to_seconds(x, *, inverse=False, mod24=False):
"""
Convert HH:MM:SS time strings to seconds past midnight.
Parameters:
- x: Time string or seconds value
- inverse (bool): If True, convert seconds to time string
- mod24 (bool): Apply modulo 24 to hours
Returns:
- int or str: Converted time value
"""
def timestr_mod24(timestr):
"""
Return time string with hours modulo 24.
Parameters:
- timestr (str): Time string in HH:MM:SS format
Returns:
- str: Time string with normalized hours
"""
def weekday_to_str(weekday, *, inverse=False):
"""
Convert weekday number to/from string representation.
Parameters:
- weekday: Weekday number (0-6) or string
- inverse (bool): If True, convert string to number
Returns:
- str or int: Converted weekday
"""
def get_segment_length(linestring, p, q=None):
"""
Get distance along linestring between projected points.
Parameters:
- linestring: Shapely LineString geometry
- p: Point to project onto linestring
- q: Optional second point
Returns:
- float: Distance along linestring
"""
def get_convert_dist(dist_units_in, dist_units_out):
"""
Get distance conversion function between units.
Parameters:
- dist_units_in (str): Input distance units
- dist_units_out (str): Output distance units
Returns:
- function: Conversion function
"""
def is_metric(dist_units):
"""
Check if distance units are metric (m/km).
Parameters:
- dist_units (str): Distance units to check
Returns:
- bool: True if metric units
"""
def get_max_runs(x):
"""
Get start and end indices of runs of maximum values.
Parameters:
- x: Array-like sequence
Returns:
- list: List of (start, end) index tuples
"""
def get_peak_indices(times, counts):
"""
Get indices of longest peak period in time series.
Parameters:
- times: Time values
- counts: Count values
Returns:
- tuple: (start_index, end_index) of peak period
"""
def almost_equal(f, g):
"""
Check DataFrame equality after canonical sorting.
Parameters:
- f (DataFrame): First DataFrame
- g (DataFrame): Second DataFrame
Returns:
- bool: True if DataFrames are equal after sorting
"""
def is_not_null(df, col_name):
"""
Check if DataFrame column has non-NaN values.
Parameters:
- df (DataFrame): DataFrame to check
- col_name (str): Column name to check
Returns:
- bool: True if column has non-null values
"""
def longest_subsequence(seq, mode='strictly', order='increasing', key=None, *, index=False):
"""
Find the longest increasing or decreasing subsequence, as selected by `order`.
Parameters:
- seq: Input sequence
- mode (str): 'strictly' or 'non-strictly' increasing
- order (str): 'increasing' or 'decreasing'
- key: Function to extract comparison key
- index (bool): Return indices instead of values
Returns:
- list: Longest subsequence or indices
"""
def get_active_trips_df(trip_times):
"""
Count active trips at any given time.
Parameters:
- trip_times: Trip time data
Returns:
- DataFrame: Active trip counts by time
"""
def combine_time_series(time_series_dict, kind, *, split_directions=False):
"""
Combine time series with hierarchical columns.
Parameters:
- time_series_dict (dict): Dictionary of time series
- kind (str): Type of combination
- split_directions (bool): Split by direction
Returns:
- DataFrame: Combined time series
"""
def downsample(time_series, freq):
"""
Downsample time series to given frequency.
Parameters:
- time_series (DataFrame): Time series to downsample
- freq (str): Target frequency
Returns:
- DataFrame: Downsampled time series
"""
def unstack_time_series(time_series):
"""
Unstack time series to long format.
Parameters:
- time_series (DataFrame): Time series to unstack
Returns:
- DataFrame: Unstacked time series
"""
def restack_time_series(unstacked_time_series):
"""
Restack unstacked time series.
Parameters:
- unstacked_time_series (DataFrame): Unstacked time series
Returns:
- DataFrame: Restacked time series
"""
def make_html(d):
"""
Convert dictionary to HTML table.
Parameters:
- d (dict): Dictionary to convert
Returns:
- str: HTML table string
"""
def drop_feature_ids(collection):
"""
Remove 'id' attributes from GeoJSON features.
Parameters:
- collection (dict): GeoJSON FeatureCollection
Returns:
- dict: FeatureCollection without feature IDs
"""
def make_ids(n, prefix='id_'):
"""
Generate sequential ID strings.
Parameters:
- n (int): Number of IDs to generate
- prefix (str): Prefix for generated IDs
Returns:
- list: List of generated ID strings
"""

import gtfs_kit as gk
# Load a potentially messy GTFS feed
feed = gk.read_feed('messy_gtfs.zip', dist_units='km')
# Apply comprehensive cleaning
cleaned_feed = gk.clean(feed)
# Or apply specific cleaning operations
gk.clean_ids(feed)
gk.clean_times(feed)
gk.clean_route_short_names(feed)
gk.drop_zombies(feed)

# Aggregate routes by short name to consolidate variants
feed_agg = feed.copy()
gk.aggregate_routes(feed_agg, by='route_short_name')
# Aggregate stops by stop code to merge nearby stops
gk.aggregate_stops(feed_agg, by='stop_code')

# Check for unused entities before cleaning
print(f"Routes before: {len(feed.routes)}")
print(f"Stops before: {len(feed.stops)}")
# Remove unused entities
gk.drop_zombies(feed)
print(f"Routes after: {len(feed.routes)}")
print(f"Stops after: {len(feed.stops)}")
# Ensure only valid GTFS columns remain
gk.drop_invalid_columns(feed)

# Date/time conversions
date_obj = gk.datestr_to_date('20230101')
date_str = gk.datestr_to_date(date_obj, inverse=True)
seconds = gk.timestr_to_seconds('14:30:00')
time_str = gk.timestr_to_seconds(seconds, inverse=True)
# Distance conversions
convert_km_to_mi = gk.get_convert_dist('km', 'mi')
miles = convert_km_to_mi(10) # Convert 10 km to miles

The data processing capabilities ensure GTFS feeds are clean, consistent, and optimized for analysis and visualization workflows.