Data Processing

Data cleaning, validation, and transformation utilities for improving GTFS data quality. This module includes functions for cleaning IDs, times, and route names, as well as comprehensive feed-cleaning operations.

Data Cleaning

ID and Name Cleaning

def clean_column_names(df):
    """
    Strip whitespace from DataFrame column names.
    
    Parameters:
    - df (DataFrame): DataFrame to clean
    
    Returns:
    - DataFrame: DataFrame with cleaned column names
    """

def clean_ids(feed):
    """
    Clean whitespace from ID fields in all feed tables.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Feed with cleaned ID fields
    """

def extend_id(feed, id_col, extension, *, prefix=True):
    """
    Add prefix or suffix to ID column across relevant tables.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    - id_col (str): ID column name to extend
    - extension (str): Prefix or suffix to add
    - prefix (bool): If True add as prefix, else as suffix
    
    Returns:
    - Feed: Feed with extended IDs
    """

def clean_route_short_names(feed):
    """
    Clean and disambiguate route short names.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Feed with cleaned route short names
    """

Time Cleaning

def clean_times(feed):
    """
    Convert H:MM:SS time format to HH:MM:SS format.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Feed with standardized time formats
    """

Data Aggregation

Route Aggregation

def build_aggregate_routes_dict(routes, by='route_short_name', route_id_prefix='route_'):
    """
    Build dictionary for aggregating routes by specified field.
    
    Parameters:
    - routes (DataFrame): Routes DataFrame
    - by (str): Field to aggregate by
    - route_id_prefix (str): Prefix for new route IDs
    
    Returns:
    - dict: Mapping from old route IDs to new aggregated route IDs
    """

def aggregate_routes(feed, by='route_short_name', route_id_prefix='route_'):
    """
    Aggregate routes by specified field (e.g., route_short_name).
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    - by (str): Field to aggregate by
    - route_id_prefix (str): Prefix for new route IDs
    
    Returns:
    - Feed: Feed with aggregated routes
    """

Stop Aggregation

def build_aggregate_stops_dict(stops, by='stop_code', stop_id_prefix='stop_'):
    """
    Build dictionary for aggregating stops by specified field.
    
    Parameters:
    - stops (DataFrame): Stops DataFrame
    - by (str): Field to aggregate by
    - stop_id_prefix (str): Prefix for new stop IDs
    
    Returns:
    - dict: Mapping from old stop IDs to new aggregated stop IDs
    """

def aggregate_stops(feed, by='stop_code', stop_id_prefix='stop_'):
    """
    Aggregate stops by specified field (e.g., stop_code).
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    - by (str): Field to aggregate by
    - stop_id_prefix (str): Prefix for new stop IDs
    
    Returns:
    - Feed: Feed with aggregated stops
    """

Comprehensive Cleaning

Feed Validation and Cleanup

def drop_zombies(feed):
    """
    Drop unused agencies, stops, trips, shapes, routes, and services.
    
    Removes entities that are defined but not referenced by other tables,
    ensuring feed consistency and reducing file size.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Feed with unused entities removed
    """

def drop_invalid_columns(feed):
    """
    Drop non-GTFS columns from all feed tables.
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Feed with only valid GTFS columns
    """

def clean(feed):
    """
    Apply comprehensive cleaning operations to the feed.
    
    Performs multiple cleaning operations including:
    - ID cleaning
    - Time format standardization
    - Column name cleaning
    - Removal of unused entities
    - Route short name cleaning
    
    Parameters:
    - feed (Feed): GTFS feed object (modified in-place)
    
    Returns:
    - Feed: Comprehensively cleaned feed
    """

Helper Functions

Date and Time Utilities

def datestr_to_date(x, format_str='%Y%m%d', *, inverse=False):
    """
    Convert between date strings and datetime.date objects.
    
    Parameters:
    - x: Date string or datetime.date object
    - format_str (str): Date format string
    - inverse (bool): If True, convert date to string
    
    Returns:
    - datetime.date or str: Converted date
    """

def timestr_to_seconds(x, *, inverse=False, mod24=False):
    """
    Convert HH:MM:SS time strings to seconds past midnight.
    
    Parameters:
    - x: Time string or seconds value
    - inverse (bool): If True, convert seconds to time string
    - mod24 (bool): Apply modulo 24 to hours
    
    Returns:
    - int or str: Converted time value
    """

def timestr_mod24(timestr):
    """
    Return time string with hours modulo 24.
    
    Parameters:
    - timestr (str): Time string in HH:MM:SS format
    
    Returns:
    - str: Time string with normalized hours
    """

def weekday_to_str(weekday, *, inverse=False):
    """
    Convert weekday number to/from string representation.
    
    Parameters:
    - weekday: Weekday number (0-6) or string
    - inverse (bool): If True, convert string to number
    
    Returns:
    - str or int: Converted weekday
    """

Geometric Utilities

def get_segment_length(linestring, p, q=None):
    """
    Get distance along linestring between projected points.
    
    Parameters:
    - linestring: Shapely LineString geometry
    - p: Point to project onto linestring
    - q: Optional second point
    
    Returns:
    - float: Distance along linestring
    """

def get_convert_dist(dist_units_in, dist_units_out):
    """
    Get distance conversion function between units.
    
    Parameters:
    - dist_units_in (str): Input distance units
    - dist_units_out (str): Output distance units
    
    Returns:
    - function: Conversion function
    """

def is_metric(dist_units):
    """
    Check if distance units are metric (m/km).
    
    Parameters:
    - dist_units (str): Distance units to check
    
    Returns:
    - bool: True if metric units
    """

Data Analysis Utilities

def get_max_runs(x):
    """
    Get start and end indices of runs of maximum values.
    
    Parameters:
    - x: Array-like sequence
    
    Returns:
    - list: List of (start, end) index tuples
    """

def get_peak_indices(times, counts):
    """
    Get indices of longest peak period in time series.
    
    Parameters:
    - times: Time values
    - counts: Count values
    
    Returns:
    - tuple: (start_index, end_index) of peak period
    """

def almost_equal(f, g):
    """
    Check DataFrame equality after canonical sorting.
    
    Parameters:
    - f (DataFrame): First DataFrame
    - g (DataFrame): Second DataFrame
    
    Returns:
    - bool: True if DataFrames are equal after sorting
    """

def is_not_null(df, col_name):
    """
    Check if DataFrame column has non-NaN values.
    
    Parameters:
    - df (DataFrame): DataFrame to check
    - col_name (str): Column name to check
    
    Returns:
    - bool: True if column has non-null values
    """

def longest_subsequence(seq, mode='strictly', order='increasing', key=None, *, index=False):
    """
    Find the longest increasing or decreasing subsequence of a sequence.
    
    Parameters:
    - seq: Input sequence
    - mode (str): 'strictly' or 'non-strictly' increasing
    - order (str): 'increasing' or 'decreasing'
    - key: Function to extract comparison key
    - index (bool): Return indices instead of values
    
    Returns:
    - list: Longest subsequence or indices
    """

Time Series Utilities

def get_active_trips_df(trip_times):
    """
    Count active trips at any given time.
    
    Parameters:
    - trip_times (DataFrame): Trip start and end time data
    
    Returns:
    - DataFrame: Active trip counts by time
    """

def combine_time_series(time_series_dict, kind, *, split_directions=False):
    """
    Combine time series with hierarchical columns.
    
    Parameters:
    - time_series_dict (dict): Dictionary of time series
    - kind (str): Type of time series being combined, e.g. 'route' or 'stop'
    - split_directions (bool): Split by direction
    
    Returns:
    - DataFrame: Combined time series
    """

def downsample(time_series, freq):
    """
    Downsample time series to given frequency.
    
    Parameters:
    - time_series (DataFrame): Time series to downsample
    - freq (str): Target frequency as a Pandas frequency string, e.g. '1H'
    
    Returns:
    - DataFrame: Downsampled time series
    """

def unstack_time_series(time_series):
    """
    Unstack time series to long format.
    
    Parameters:
    - time_series (DataFrame): Time series to unstack
    
    Returns:
    - DataFrame: Unstacked time series
    """

def restack_time_series(unstacked_time_series):
    """
    Restack unstacked time series.
    
    Parameters:
    - unstacked_time_series (DataFrame): Unstacked time series
    
    Returns:
    - DataFrame: Restacked time series
    """

Utility Functions

def make_html(d):
    """
    Convert dictionary to HTML table.
    
    Parameters:
    - d (dict): Dictionary to convert
    
    Returns:
    - str: HTML table string
    """

def drop_feature_ids(collection):
    """
    Remove 'id' attributes from GeoJSON features.
    
    Parameters:
    - collection (dict): GeoJSON FeatureCollection
    
    Returns:
    - dict: FeatureCollection without feature IDs
    """

def make_ids(n, prefix='id_'):
    """
    Generate sequential ID strings.
    
    Parameters:
    - n (int): Number of IDs to generate
    - prefix (str): Prefix for generated IDs
    
    Returns:
    - list: List of generated ID strings
    """

Usage Examples

Basic Data Cleaning

import gtfs_kit as gk

# Load a potentially messy GTFS feed
feed = gk.read_feed('messy_gtfs.zip', dist_units='km')

# Apply comprehensive cleaning
cleaned_feed = gk.clean(feed)

# Or apply specific cleaning operations, reassigning the returned feed
feed = gk.clean_ids(feed)
feed = gk.clean_times(feed)
feed = gk.clean_route_short_names(feed)
feed = gk.drop_zombies(feed)

Route and Stop Aggregation

# Aggregate routes by short name to consolidate variants
feed_agg = feed.copy()
feed_agg = gk.aggregate_routes(feed_agg, by='route_short_name')

# Aggregate stops by stop code to merge stops that share a code
feed_agg = gk.aggregate_stops(feed_agg, by='stop_code')

Data Validation and Quality Improvement

# Record entity counts before cleaning
print(f"Routes before: {len(feed.routes)}")
print(f"Stops before: {len(feed.stops)}")

# Remove unused entities, reassigning the returned feed
feed = gk.drop_zombies(feed)

print(f"Routes after: {len(feed.routes)}")
print(f"Stops after: {len(feed.stops)}")

# Keep only valid GTFS columns
feed = gk.drop_invalid_columns(feed)

Helper Function Usage

# Date/time conversions
date_obj = gk.datestr_to_date('20230101')
date_str = gk.datestr_to_date(date_obj, inverse=True)

seconds = gk.timestr_to_seconds('14:30:00')
time_str = gk.timestr_to_seconds(seconds, inverse=True)

# Distance conversions
convert_km_to_mi = gk.get_convert_dist('km', 'mi')
miles = convert_km_to_mi(10)  # Convert 10 km to miles

Together, these data processing utilities keep GTFS feeds clean and consistent and prepare them for analysis and visualization workflows.