# Data Cleaning

Data cleaning, transformation, and feed modification functions.

## Comprehensive Cleaning Functions

{ .api }
```python
def clean(feed: Feed) -> Feed:
    """
    Apply comprehensive cleaning pipeline to feed including ID cleaning, time formatting,
    route name disambiguation, and removal of unused data.

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with all standard cleaning operations applied:
        - clean_ids(): Strip whitespace from ID columns
        - clean_times(): Standardize time format to HH:MM:SS
        - clean_route_short_names(): Disambiguate route names
        - drop_zombies(): Remove unused stops, trips, shapes, routes, services
    """

def drop_zombies(feed: Feed) -> Feed:
    """
    Remove unused/unreferenced entities (stops, trips, shapes, routes, services).

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with only referenced entities:
        - Removes stops not used in stop_times
        - Removes trips not scheduled in stop_times
        - Removes routes with no trips
        - Removes shapes not referenced by trips
        - Removes services not used by trips
        - Removes agencies not referenced by routes
    """

def drop_invalid_columns(feed: Feed) -> Feed:
    """
    Remove columns that are not part of the GTFS specification.

    Args:
        feed: Feed object to validate

    Returns:
        New Feed object with only GTFS-compliant columns in each table
    """
```
## ID and Basic Cleaning

{ .api }
```python
def clean_ids(feed: Feed) -> Feed:
    """
    Clean all ID columns by stripping leading/trailing whitespace.

    Args:
        feed: Feed object to clean

    Returns:
        New Feed object with cleaned ID fields in all tables
    """

def extend_id(feed: Feed, id_col: str, extension: str, *, prefix: bool = True) -> Feed:
    """
    Add prefix or suffix to ID values in specified column across all relevant tables.

    Args:
        feed: Feed object to modify
        id_col: ID column name to extend ("stop_id", "route_id", "trip_id", etc.)
        extension: String to add as prefix or suffix
        prefix: If True, add as prefix; if False, add as suffix

    Returns:
        New Feed object with extended ID values in all referencing tables
    """

def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip whitespace from DataFrame column names.

    Args:
        df: DataFrame with potentially messy column names

    Returns:
        DataFrame with cleaned column names
    """
```
## Time and Format Cleaning

{ .api }
```python
def clean_times(feed: Feed) -> Feed:
    """
    Standardize time format from H:MM:SS to HH:MM:SS throughout feed.

    Args:
        feed: Feed object with potentially inconsistent time formatting

    Returns:
        New Feed object with standardized HH:MM:SS time format in:
        - stop_times (arrival_time, departure_time)
        - frequencies (start_time, end_time)
    """

def clean_route_short_names(feed: Feed) -> Feed:
    """
    Clean and disambiguate route short names by removing whitespace and
    ensuring uniqueness within each agency.

    Args:
        feed: Feed object with potentially duplicate or messy route names

    Returns:
        New Feed object with cleaned, unique route_short_name values
    """
```
## Feed Restriction Functions

{ .api }
```python
def restrict_to_routes(feed: Feed, route_ids: list[str]) -> Feed:
    """
    Restrict feed to only specified routes and related data.

    Args:
        feed: Feed object to restrict
        route_ids: List of route IDs to keep

    Returns:
        New Feed object containing only:
        - Specified routes
        - Trips for those routes
        - Stop times for those trips
        - Stops used by those trips
        - Services used by those trips
        - Shapes used by those trips (if any)
        - Related fare and transfer data
    """

def restrict_to_agencies(feed: Feed, agency_ids: list[str]) -> Feed:
    """
    Restrict feed to only specified agencies and all their related data.

    Args:
        feed: Feed object to restrict
        agency_ids: List of agency IDs to keep

    Returns:
        New Feed object containing only data for specified agencies
    """

def restrict_to_dates(feed: Feed, dates: list[str]) -> Feed:
    """
    Restrict feed to only service operating on specified dates.

    Args:
        feed: Feed object to restrict
        dates: List of date strings (YYYYMMDD) to keep service for

    Returns:
        New Feed object with calendar/calendar_dates filtered to dates,
        and trips/stop_times for only services active on those dates
    """

def restrict_to_area(feed: Feed, area: sg.Polygon) -> Feed:
    """
    Restrict feed to only trips that intersect with a polygonal area.

    Args:
        feed: Feed object to restrict
        area: Shapely Polygon defining the geographic boundary

    Returns:
        New Feed object containing only trips with stops or shapes intersecting the area
    """
```
## Data Aggregation Functions

{ .api }
```python
def aggregate_routes(feed: Feed, by: str, route_id_prefix: str = "route_") -> Feed:
    """
    Aggregate routes by a specified column (e.g., route_short_name, route_type).

    Args:
        feed: Feed object to aggregate
        by: Column name to group routes by for aggregation
        route_id_prefix: Prefix for new aggregated route IDs

    Returns:
        New Feed object with routes aggregated and trip/stop_times updated with new route IDs
    """

def build_aggregate_routes_dict(routes: pd.DataFrame, by: str,
                                route_id_prefix: str) -> dict[str, str]:
    """
    Helper function to build mapping dictionary for route aggregation.

    Args:
        routes: Routes DataFrame
        by: Column to aggregate by
        route_id_prefix: Prefix for new route IDs

    Returns:
        Dictionary mapping old route_id -> new aggregated route_id
    """

def aggregate_stops(feed: Feed, by: str, stop_id_prefix: str = "stop_") -> Feed:
    """
    Aggregate stops by a specified column (e.g., stop_name, parent_station).

    Args:
        feed: Feed object to aggregate
        by: Column name to group stops by for aggregation
        stop_id_prefix: Prefix for new aggregated stop IDs

    Returns:
        New Feed object with stops aggregated and stop_times updated with new stop IDs
    """

def build_aggregate_stops_dict(stops: pd.DataFrame, by: str,
                               stop_id_prefix: str) -> dict[str, str]:
    """
    Helper function to build mapping dictionary for stop aggregation.

    Args:
        stops: Stops DataFrame
        by: Column to aggregate by
        stop_id_prefix: Prefix for new stop IDs

    Returns:
        Dictionary mapping old stop_id -> new aggregated stop_id
    """
```
## Date and Calendar Filtering

{ .api }
```python
def subset_dates(feed: Feed, dates: list[str]) -> list[str]:
    """
    Filter dates to only those within the feed's service period.

    Args:
        feed: Feed object containing calendar data
        dates: List of candidate dates (YYYYMMDD)

    Returns:
        Filtered list of dates that fall within feed's service period
    """
```
## Usage Examples

### Basic Cleaning Workflow

```python
import gtfs_kit as gk

# Load raw feed
feed = gk.read_feed("data/raw_gtfs.zip")
print(f"Raw feed: {len(feed.stops)} stops, {len(feed.trips)} trips")

# Apply comprehensive cleaning
clean_feed = gk.clean(feed)
print(f"Clean feed: {len(clean_feed.stops)} stops, {len(clean_feed.trips)} trips")

# Check what was removed
print(f"Removed {len(feed.stops) - len(clean_feed.stops)} unused stops")
print(f"Removed {len(feed.trips) - len(clean_feed.trips)} unused trips")
```

### Step-by-Step Cleaning

```python
# Start with raw feed
working_feed = feed.copy()

# Clean IDs (remove whitespace)
working_feed = gk.clean_ids(working_feed)
print("Cleaned ID whitespace")

# Standardize time format
working_feed = gk.clean_times(working_feed)
print("Standardized time format")

# Clean route names
working_feed = gk.clean_route_short_names(working_feed)
print("Cleaned route short names")

# Remove unused entities
working_feed = gk.drop_zombies(working_feed)
print("Removed unused entities")

# Remove invalid columns
working_feed = gk.drop_invalid_columns(working_feed)
print("Removed non-GTFS columns")
```
### Feed Restriction Examples

```python
# Restrict to specific routes
bus_routes = feed.routes[feed.routes['route_type'] == 3]['route_id'].tolist()
bus_feed = gk.restrict_to_routes(feed, bus_routes)
print(f"Bus-only feed: {len(bus_feed.routes)} routes")

# Restrict to specific agency
if 'agency_id' in feed.routes.columns:
    agency_ids = feed.agency['agency_id'].head(1).tolist()
    agency_feed = gk.restrict_to_agencies(feed, agency_ids)
    print(f"Single agency feed: {len(agency_feed.routes)} routes")

# Restrict to weekdays only
dates = gk.get_dates(feed)
weekday_dates = [d for d in dates if pd.to_datetime(d, format='%Y%m%d').weekday() < 5]
weekday_feed = gk.restrict_to_dates(feed, weekday_dates)
print(f"Weekday feed covers {len(weekday_dates)} dates")

# Restrict to geographic area
from shapely.geometry import Point
centroid = gk.compute_centroid(feed)
city_center = Point(centroid.x, centroid.y).buffer(0.01)  # ~1km radius
area_feed = gk.restrict_to_area(feed, city_center)
print(f"Area feed: {len(area_feed.trips)} trips in city center")
```

### Data Aggregation Examples

```python
# Aggregate routes by type
route_type_feed = gk.aggregate_routes(feed, by='route_type', route_id_prefix='type_')
print(f"Aggregated to {len(route_type_feed.routes)} route types")

# Aggregate routes by short name (combine duplicate names)
if 'route_short_name' in feed.routes.columns:
    name_feed = gk.aggregate_routes(feed, by='route_short_name', route_id_prefix='name_')
    print(f"Aggregated to {len(name_feed.routes)} unique route names")

# Aggregate stops by name (combine nearby stops with same name)
if 'stop_name' in feed.stops.columns:
    stop_name_feed = gk.aggregate_stops(feed, by='stop_name', stop_id_prefix='name_')
    print(f"Aggregated to {len(stop_name_feed.stops)} unique stop names")

# Custom aggregation using helper functions
routes_dict = gk.build_aggregate_routes_dict(
    feed.routes, by='route_short_name', route_id_prefix='custom_'
)
print(f"Built mapping for {len(routes_dict)} route aggregations")
```
### ID Extension and Modification

```python
# Add prefix to all stop IDs
prefixed_feed = gk.extend_id(feed, 'stop_id', 'agency1_', prefix=True)
sample_stops = prefixed_feed.stops['stop_id'].head(3).tolist()
print(f"Prefixed stop IDs: {sample_stops}")

# Add suffix to route IDs
suffixed_feed = gk.extend_id(feed, 'route_id', '_v2', prefix=False)
sample_routes = suffixed_feed.routes['route_id'].head(3).tolist()
print(f"Suffixed route IDs: {sample_routes}")

# Extend trip IDs
trip_feed = gk.extend_id(feed, 'trip_id', 'modified_', prefix=True)
print("Extended trip IDs")
```

### Column and Format Cleaning

```python
# Clean column names if needed
if hasattr(feed, 'stops'):
    clean_stops = gk.clean_column_names(feed.stops)
    print("Column names:", list(clean_stops.columns)[:5])

# Check time format before/after cleaning
if 'departure_time' in feed.stop_times.columns:
    sample_times_before = feed.stop_times['departure_time'].head(3).tolist()
    time_cleaned_feed = gk.clean_times(feed)
    sample_times_after = time_cleaned_feed.stop_times['departure_time'].head(3).tolist()

    print("Times before:", sample_times_before)
    print("Times after:", sample_times_after)
```
### Quality Improvement Workflow

```python
# Full quality improvement pipeline
def improve_feed_quality(raw_feed):
    """Complete feed cleaning and improvement workflow."""

    # Start with copy to preserve original
    improved_feed = raw_feed.copy()

    # Basic cleaning
    improved_feed = gk.clean_ids(improved_feed)
    improved_feed = gk.clean_times(improved_feed)
    improved_feed = gk.clean_route_short_names(improved_feed)

    # Remove invalid data
    improved_feed = gk.drop_invalid_columns(improved_feed)
    improved_feed = gk.drop_zombies(improved_feed)

    # Quality checks
    print(f"Quality improvement summary:")
    print(f"  Stops: {len(raw_feed.stops)} -> {len(improved_feed.stops)}")
    print(f"  Routes: {len(raw_feed.routes)} -> {len(improved_feed.routes)}")
    print(f"  Trips: {len(raw_feed.trips)} -> {len(improved_feed.trips)}")

    return improved_feed

# Apply improvement workflow
improved_feed = improve_feed_quality(feed)

# Validate the improvements
validation_results = gk.validate(improved_feed)
error_count = len(validation_results[validation_results['type'] == 'error'])
print(f"Validation errors after cleaning: {error_count}")
```
### Date Filtering and Subsetting

```python
# Get all valid dates and filter to recent period
all_dates = gk.get_dates(feed)
print(f"Feed covers {len(all_dates)} dates: {all_dates[0]} to {all_dates[-1]}")

# Filter to specific date range
import datetime
start_date = datetime.date(2024, 3, 1).strftime('%Y%m%d')
end_date = datetime.date(2024, 3, 31).strftime('%Y%m%d')

target_dates = [d for d in all_dates if start_date <= d <= end_date]
filtered_dates = gk.subset_dates(feed, target_dates)

print(f"Target period: {len(target_dates)} dates")
print(f"Available in feed: {len(filtered_dates)} dates")

# Create feed for specific period
period_feed = gk.restrict_to_dates(feed, filtered_dates)
print(f"Period feed: {len(period_feed.trips)} trips for {len(filtered_dates)} dates")
```