# Data Cleaning

Data cleaning, transformation, and feed modification functions.

## Comprehensive Cleaning Functions

{ .api }
```python
def clean(feed: Feed) -> Feed:
    """
    Apply comprehensive cleaning pipeline to feed including ID cleaning, time formatting,
    route name disambiguation, and removal of unused data.

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with all standard cleaning operations applied:
        - clean_ids(): Strip whitespace from ID columns
        - clean_times(): Standardize time format to HH:MM:SS
        - clean_route_short_names(): Disambiguate route names
        - drop_zombies(): Remove unused stops, trips, shapes, routes, services
    """

def drop_zombies(feed: Feed) -> Feed:
    """
    Remove unused/unreferenced entities (stops, trips, shapes, routes, services).

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with only referenced entities:
        - Removes stops not used in stop_times
        - Removes trips not scheduled in stop_times
        - Removes routes with no trips
        - Removes shapes not referenced by trips
        - Removes services not used by trips
        - Removes agencies not referenced by routes
    """

def drop_invalid_columns(feed: Feed) -> Feed:
    """
    Remove columns that are not part of the GTFS specification.

    Args:
        feed: Feed object to validate

    Returns:
        New Feed object with only GTFS-compliant columns in each table
    """
```
## ID and Basic Cleaning

{ .api }
```python
def clean_ids(feed: Feed) -> Feed:
    """
    Clean all ID columns by stripping leading/trailing whitespace.

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with cleaned ID fields in all tables
    """

def extend_id(feed: Feed, id_col: str, extension: str, *, prefix: bool = True) -> Feed:
    """
    Add prefix or suffix to ID values in specified column across all relevant tables.

    Args:
        feed: Feed object to modify
        id_col: ID column name to extend ("stop_id", "route_id", "trip_id", etc.)
        extension: String to add as prefix or suffix
        prefix: If True, add as prefix; if False, add as suffix

    Returns:
        New Feed object with extended ID values in all referencing tables
    """

def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip whitespace from DataFrame column names.

    Args:
        df: DataFrame with potentially messy column names

    Returns:
        DataFrame with cleaned column names
    """
```
## Time and Format Cleaning

{ .api }
```python
def clean_times(feed: Feed) -> Feed:
    """
    Standardize time format from H:MM:SS to HH:MM:SS throughout feed.

    Args:
        feed: Feed object with potentially inconsistent time formatting

    Returns:
        New Feed object with standardized HH:MM:SS time format in:
        - stop_times (arrival_time, departure_time)
        - frequencies (start_time, end_time)
    """

def clean_route_short_names(feed: Feed) -> Feed:
    """
    Clean and disambiguate route short names by removing whitespace and
    ensuring uniqueness within each agency.

    Args:
        feed: Feed object with potentially duplicate or messy route names

    Returns:
        New Feed object with cleaned, unique route_short_name values
    """
```
## Feed Restriction Functions

{ .api }
```python
def restrict_to_routes(feed: Feed, route_ids: list[str]) -> Feed:
    """
    Restrict feed to only specified routes and related data.

    Args:
        feed: Feed object to restrict
        route_ids: List of route IDs to keep

    Returns:
        New Feed object containing only:
        - Specified routes
        - Trips for those routes
        - Stop times for those trips
        - Stops used by those trips
        - Services used by those trips
        - Shapes used by those trips (if any)
        - Related fare and transfer data
    """

def restrict_to_agencies(feed: Feed, agency_ids: list[str]) -> Feed:
    """
    Restrict feed to only specified agencies and all their related data.

    Args:
        feed: Feed object to restrict
        agency_ids: List of agency IDs to keep

    Returns:
        New Feed object containing only data for specified agencies
    """

def restrict_to_dates(feed: Feed, dates: list[str]) -> Feed:
    """
    Restrict feed to only service operating on specified dates.

    Args:
        feed: Feed object to restrict
        dates: List of date strings (YYYYMMDD) to keep service for

    Returns:
        New Feed object with calendar/calendar_dates filtered to dates,
        and trips/stop_times for only services active on those dates
    """

def restrict_to_area(feed: Feed, area: sg.Polygon) -> Feed:
    """
    Restrict feed to only trips that intersect with a polygonal area.

    Args:
        feed: Feed object to restrict
        area: Shapely Polygon defining the geographic boundary

    Returns:
        New Feed object containing only trips with stops or shapes intersecting the area
    """
```
## Data Aggregation Functions

{ .api }
```python
def aggregate_routes(feed: Feed, by: str, route_id_prefix: str = "route_") -> Feed:
    """
    Aggregate routes by a specified column (e.g., route_short_name, route_type).

    Args:
        feed: Feed object to aggregate
        by: Column name to group routes by for aggregation
        route_id_prefix: Prefix for new aggregated route IDs

    Returns:
        New Feed object with routes aggregated and trip/stop_times updated with new route IDs
    """

def build_aggregate_routes_dict(routes: pd.DataFrame, by: str,
                                route_id_prefix: str) -> dict[str, str]:
    """
    Helper function to build mapping dictionary for route aggregation.

    Args:
        routes: Routes DataFrame
        by: Column to aggregate by
        route_id_prefix: Prefix for new route IDs

    Returns:
        Dictionary mapping old route_id -> new aggregated route_id
    """

def aggregate_stops(feed: Feed, by: str, stop_id_prefix: str = "stop_") -> Feed:
    """
    Aggregate stops by a specified column (e.g., stop_name, parent_station).

    Args:
        feed: Feed object to aggregate
        by: Column name to group stops by for aggregation
        stop_id_prefix: Prefix for new aggregated stop IDs

    Returns:
        New Feed object with stops aggregated and stop_times updated with new stop IDs
    """

def build_aggregate_stops_dict(stops: pd.DataFrame, by: str,
                               stop_id_prefix: str) -> dict[str, str]:
    """
    Helper function to build mapping dictionary for stop aggregation.

    Args:
        stops: Stops DataFrame
        by: Column to aggregate by
        stop_id_prefix: Prefix for new stop IDs

    Returns:
        Dictionary mapping old stop_id -> new aggregated stop_id
    """
```
## Date and Calendar Filtering

{ .api }
```python
def subset_dates(feed: Feed, dates: list[str]) -> list[str]:
    """
    Filter dates to only those within the feed's service period.

    Args:
        feed: Feed object containing calendar data
        dates: List of candidate dates (YYYYMMDD)

    Returns:
        Filtered list of dates that fall within feed's service period
    """
```
## Usage Examples

### Basic Cleaning Workflow

```python
import gtfs_kit as gk

# Load raw feed
feed = gk.read_feed("data/raw_gtfs.zip")
print(f"Raw feed: {len(feed.stops)} stops, {len(feed.trips)} trips")

# Apply comprehensive cleaning
clean_feed = gk.clean(feed)
print(f"Clean feed: {len(clean_feed.stops)} stops, {len(clean_feed.trips)} trips")

# Check what was removed
print(f"Removed {len(feed.stops) - len(clean_feed.stops)} unused stops")
print(f"Removed {len(feed.trips) - len(clean_feed.trips)} unused trips")
```

### Step-by-Step Cleaning

```python
# Start with raw feed
working_feed = feed.copy()

# Clean IDs (remove whitespace)
working_feed = gk.clean_ids(working_feed)
print("Cleaned ID whitespace")

# Standardize time format
working_feed = gk.clean_times(working_feed)
print("Standardized time format")

# Clean route names
working_feed = gk.clean_route_short_names(working_feed)
print("Cleaned route short names")

# Remove unused entities
working_feed = gk.drop_zombies(working_feed)
print("Removed unused entities")

# Remove invalid columns
working_feed = gk.drop_invalid_columns(working_feed)
print("Removed non-GTFS columns")
```
### Feed Restriction Examples

```python
# Restrict to specific routes
bus_routes = feed.routes[feed.routes['route_type'] == 3]['route_id'].tolist()
bus_feed = gk.restrict_to_routes(feed, bus_routes)
print(f"Bus-only feed: {len(bus_feed.routes)} routes")

# Restrict to specific agency
if 'agency_id' in feed.routes.columns:
    agency_ids = feed.agency['agency_id'].head(1).tolist()
    agency_feed = gk.restrict_to_agencies(feed, agency_ids)
    print(f"Single agency feed: {len(agency_feed.routes)} routes")

# Restrict to weekdays only
dates = gk.get_dates(feed)
weekday_dates = [d for d in dates if pd.to_datetime(d, format='%Y%m%d').weekday() < 5]
weekday_feed = gk.restrict_to_dates(feed, weekday_dates)
print(f"Weekday feed covers {len(weekday_dates)} dates")

# Restrict to geographic area
from shapely.geometry import Point
centroid = gk.compute_centroid(feed)
city_center = Point(centroid.x, centroid.y).buffer(0.01)  # ~1km radius
area_feed = gk.restrict_to_area(feed, city_center)
print(f"Area feed: {len(area_feed.trips)} trips in city center")
```

### Data Aggregation Examples

```python
# Aggregate routes by type
route_type_feed = gk.aggregate_routes(feed, by='route_type', route_id_prefix='type_')
print(f"Aggregated to {len(route_type_feed.routes)} route types")

# Aggregate routes by short name (combine duplicate names)
if 'route_short_name' in feed.routes.columns:
    name_feed = gk.aggregate_routes(feed, by='route_short_name', route_id_prefix='name_')
    print(f"Aggregated to {len(name_feed.routes)} unique route names")

# Aggregate stops by name (combine nearby stops with same name)
if 'stop_name' in feed.stops.columns:
    stop_name_feed = gk.aggregate_stops(feed, by='stop_name', stop_id_prefix='name_')
    print(f"Aggregated to {len(stop_name_feed.stops)} unique stop names")

# Custom aggregation using helper functions
routes_dict = gk.build_aggregate_routes_dict(
    feed.routes, by='route_short_name', route_id_prefix='custom_'
)
print(f"Built mapping for {len(routes_dict)} route aggregations")
```
### ID Extension and Modification

```python
# Add prefix to all stop IDs
prefixed_feed = gk.extend_id(feed, 'stop_id', 'agency1_', prefix=True)
sample_stops = prefixed_feed.stops['stop_id'].head(3).tolist()
print(f"Prefixed stop IDs: {sample_stops}")

# Add suffix to route IDs
suffixed_feed = gk.extend_id(feed, 'route_id', '_v2', prefix=False)
sample_routes = suffixed_feed.routes['route_id'].head(3).tolist()
print(f"Suffixed route IDs: {sample_routes}")

# Extend trip IDs
trip_feed = gk.extend_id(feed, 'trip_id', 'modified_', prefix=True)
print("Extended trip IDs")
```

### Column and Format Cleaning

```python
# Clean column names if needed
if hasattr(feed, 'stops'):
    clean_stops = gk.clean_column_names(feed.stops)
    print("Column names:", list(clean_stops.columns)[:5])

# Check time format before/after cleaning
if 'departure_time' in feed.stop_times.columns:
    sample_times_before = feed.stop_times['departure_time'].head(3).tolist()
    time_cleaned_feed = gk.clean_times(feed)
    sample_times_after = time_cleaned_feed.stop_times['departure_time'].head(3).tolist()

    print("Times before:", sample_times_before)
    print("Times after:", sample_times_after)
```
### Quality Improvement Workflow

```python
# Full quality improvement pipeline
def improve_feed_quality(raw_feed):
    """Complete feed cleaning and improvement workflow."""

    # Start with copy to preserve original
    improved_feed = raw_feed.copy()

    # Basic cleaning
    improved_feed = gk.clean_ids(improved_feed)
    improved_feed = gk.clean_times(improved_feed)
    improved_feed = gk.clean_route_short_names(improved_feed)

    # Remove invalid data
    improved_feed = gk.drop_invalid_columns(improved_feed)
    improved_feed = gk.drop_zombies(improved_feed)

    # Quality checks
    print(f"Quality improvement summary:")
    print(f"  Stops: {len(raw_feed.stops)} -> {len(improved_feed.stops)}")
    print(f"  Routes: {len(raw_feed.routes)} -> {len(improved_feed.routes)}")
    print(f"  Trips: {len(raw_feed.trips)} -> {len(improved_feed.trips)}")

    return improved_feed

# Apply improvement workflow
improved_feed = improve_feed_quality(feed)

# Validate the improvements
validation_results = gk.validate(improved_feed)
error_count = len(validation_results[validation_results['type'] == 'error'])
print(f"Validation errors after cleaning: {error_count}")
```
### Date Filtering and Subsetting

```python
# Get all valid dates and filter to recent period
all_dates = gk.get_dates(feed)
print(f"Feed covers {len(all_dates)} dates: {all_dates[0]} to {all_dates[-1]}")

# Filter to specific date range
import datetime
start_date = datetime.date(2024, 3, 1).strftime('%Y%m%d')
end_date = datetime.date(2024, 3, 31).strftime('%Y%m%d')

target_dates = [d for d in all_dates if start_date <= d <= end_date]
filtered_dates = gk.subset_dates(feed, target_dates)

print(f"Target period: {len(target_dates)} dates")
print(f"Available in feed: {len(filtered_dates)} dates")

# Create feed for specific period
period_feed = gk.restrict_to_dates(feed, filtered_dates)
print(f"Period feed: {len(period_feed.trips)} trips for {len(filtered_dates)} dates")
```