0
# Data Analysis
1
2
Functions for statistical analysis, feed summaries, and the computation of transit metrics.
3
4
## Feed Analysis Functions
5
6
{ .api }
7
```python
8
def describe(feed: Feed, sample_date: str) -> pd.DataFrame:
9
"""
10
Generate a comprehensive description of the feed with key metrics and indicators.
11
12
Args:
13
feed: Feed object to describe
14
sample_date: Date string (YYYYMMDD) to use for analysis
15
16
Returns:
17
DataFrame with feed statistics including routes, stops, trips, and service metrics
18
"""
19
20
def summarize(feed: Feed, table: str) -> pd.DataFrame:
21
"""
22
Summarize a GTFS table with column statistics and data quality metrics.
23
24
Args:
25
feed: Feed object containing the table
26
table: Name of table to summarize ("stops", "routes", "trips", etc.)
27
28
Returns:
29
DataFrame with column-wise statistics including null counts, data types, and distributions
30
"""
31
```
32
33
## Feed-Level Statistics
34
35
{ .api }
36
```python
37
def compute_feed_stats(feed: Feed, trip_stats: pd.DataFrame, dates: list[str],
38
*, split_route_types: bool = False) -> pd.DataFrame:
39
"""
40
Compute comprehensive feed-level statistics across multiple dates.
41
42
Args:
43
feed: Feed object to analyze
44
trip_stats: Pre-computed trip statistics DataFrame
45
dates: List of service dates to include in analysis
46
split_route_types: If True, compute separate stats by route type
47
48
Returns:
49
DataFrame with feed-level metrics including total distance, service hours, frequencies
50
"""
51
52
def compute_feed_stats_0(feed: Feed, trip_stats_subset: pd.DataFrame,
53
*, split_route_types: bool = False) -> pd.DataFrame:
54
"""
55
Helper function to compute feed stats for a single date/subset.
56
57
Args:
58
feed: Feed object to analyze
59
trip_stats_subset: Trip statistics for a specific subset of trips
60
split_route_types: If True, split results by route type
61
62
Returns:
63
DataFrame with feed statistics for the subset
64
"""
65
```
66
67
## Trip Analysis
68
69
{ .api }
70
```python
71
def compute_trip_stats(feed: Feed, route_ids: list[str] | None,
72
*, compute_dist_from_shapes: bool = False) -> pd.DataFrame:
73
"""
74
Compute comprehensive statistics for trips including distances, durations, and patterns.
75
76
Args:
77
feed: Feed object containing trip data
78
route_ids: List of route IDs to include, or None for all routes
79
compute_dist_from_shapes: If True, compute distances from shape geometries
80
81
Returns:
82
DataFrame with per-trip statistics including:
83
- trip_id, route_id, direction_id, service_id
84
- start_time, end_time, duration
85
- distance, speed, num_stops
86
- stop_pattern information
87
"""
88
89
def compute_trip_activity(feed: Feed, dates: list[str]) -> pd.DataFrame:
90
"""
91
Determine which trips are active on specified dates.
92
93
Args:
94
feed: Feed object containing trip and calendar data
95
dates: List of dates to check for trip activity
96
97
Returns:
98
DataFrame marking trips as active (1) or inactive (0) for each date
99
"""
100
101
def locate_trips(feed: Feed, date: str, times: list[str]) -> pd.DataFrame:
102
"""
103
Locate trip positions at specified times on a given date.
104
105
Args:
106
feed: Feed object with trip and schedule data
107
date: Date string (YYYYMMDD) for analysis
108
times: List of time strings (HH:MM:SS) to locate trips
109
110
Returns:
111
DataFrame with trip positions, stops, and progress at each time
112
"""
113
114
def name_stop_patterns(feed: Feed) -> pd.DataFrame:
115
"""
116
Add stop pattern names to trips based on their sequence of stops.
117
118
Args:
119
feed: Feed object containing trips and stop_times
120
121
Returns:
122
Modified trips DataFrame with stop_pattern_name column added
123
"""
124
```
125
126
## Route Analysis
127
128
{ .api }
129
```python
130
def compute_route_stats(feed: Feed, trip_stats_subset: pd.DataFrame, dates: list[str],
131
headway_start_time: str, headway_end_time: str,
132
*, split_directions: bool = False) -> pd.DataFrame:
133
"""
134
Compute route-level statistics including service frequency and headways.
135
136
Args:
137
feed: Feed object containing route data
138
trip_stats_subset: Pre-computed trip statistics to aggregate
139
dates: List of service dates to analyze
140
headway_start_time: Start time for headway calculation (HH:MM:SS)
141
headway_end_time: End time for headway calculation (HH:MM:SS)
142
split_directions: If True, compute separate stats by direction
143
144
Returns:
145
DataFrame with route statistics including:
146
- route_id, direction_id (if split_directions=True)
147
- num_trips, num_trip_starts, num_trip_ends
148
- start_time, end_time, max_headway, mean_headway
149
- max_speed, mean_speed, service_distance, service_duration
150
"""
151
152
def compute_route_stats_0(trip_stats_subset: pd.DataFrame, headway_start_time: str,
153
headway_end_time: str, *, split_directions: bool = False) -> pd.DataFrame:
154
"""
155
Helper function to compute route stats from trip stats subset.
156
157
Args:
158
trip_stats_subset: Trip statistics DataFrame subset
159
headway_start_time: Start time for headway calculation (HH:MM:SS)
160
headway_end_time: End time for headway calculation (HH:MM:SS)
161
split_directions: If True, split by direction
162
163
Returns:
164
DataFrame with route-level aggregated statistics
165
"""
166
167
def build_route_timetable(feed: Feed, route_id: str, dates: list[str]) -> pd.DataFrame:
168
"""
169
Build a timetable showing all trips for a specific route.
170
171
Args:
172
feed: Feed object containing schedule data
173
route_id: Route ID to build timetable for
174
dates: List of dates to include in timetable
175
176
Returns:
177
DataFrame with trip times organized by route and date
178
"""
179
```
180
181
## Stop Analysis
182
183
{ .api }
184
```python
185
def compute_stop_stats(feed: Feed, dates: list[str], stop_ids: list[str] | None,
186
headway_start_time: str, headway_end_time: str,
187
*, split_directions: bool = False) -> pd.DataFrame:
188
"""
189
Compute stop-level statistics including service frequency and activity.
190
191
Args:
192
feed: Feed object containing stop and schedule data
193
dates: List of service dates to analyze
194
stop_ids: List of stop IDs to include, or None for all stops
195
headway_start_time: Start time for frequency analysis (HH:MM:SS)
196
headway_end_time: End time for frequency analysis (HH:MM:SS)
197
split_directions: If True, compute separate stats by direction
198
199
Returns:
200
DataFrame with stop statistics including:
201
- stop_id, direction_id (if split_directions=True)
202
- num_routes, num_trips, max_headway, mean_headway
203
- start_time, end_time, peak_num_trips, peak_start_time, peak_end_time
204
"""
205
206
def compute_stop_stats_0(stop_times_subset: pd.DataFrame, trip_subset: pd.DataFrame,
207
headway_start_time: str, headway_end_time: str,
208
*, split_directions: bool = False) -> pd.DataFrame:
209
"""
210
Helper function to compute stop stats from subsets.
211
212
Args:
213
stop_times_subset: Stop times DataFrame subset
214
trip_subset: Trips DataFrame subset
215
headway_start_time: Start time for frequency analysis (HH:MM:SS)
216
headway_end_time: End time for frequency analysis (HH:MM:SS)
217
split_directions: If True, split by direction
218
219
Returns:
220
DataFrame with stop-level statistics
221
"""
222
223
def compute_stop_activity(feed: Feed, dates: list[str]) -> pd.DataFrame:
224
"""
225
Determine which stops are active (have service) on specified dates.
226
227
Args:
228
feed: Feed object containing stop and schedule data
229
dates: List of dates to check for stop activity
230
231
Returns:
232
DataFrame marking stops as active (1) or inactive (0) for each date
233
"""
234
235
def build_stop_timetable(feed: Feed, stop_id: str, dates: list[str]) -> pd.DataFrame:
236
"""
237
Build a timetable showing all arrivals/departures for a specific stop.
238
239
Args:
240
feed: Feed object containing schedule data
241
stop_id: Stop ID to build timetable for
242
dates: List of dates to include in timetable
243
244
Returns:
245
DataFrame with arrival/departure times organized by stop and date
246
"""
247
```
248
249
## Service and Calendar Analysis
250
251
{ .api }
252
```python
253
def get_active_services(feed: Feed, date: str) -> list[str]:
254
"""
255
Get list of service IDs that are active on a specific date.
256
257
Args:
258
feed: Feed object containing calendar data
259
date: Date string (YYYYMMDD) to check
260
261
Returns:
262
List of service_id values active on the specified date
263
"""
264
265
def compute_busiest_date(feed: Feed, dates: list[str]) -> str:
266
"""
267
Find the date with the maximum number of active trips.
268
269
Args:
270
feed: Feed object with trip and calendar data
271
dates: List of candidate dates to compare
272
273
Returns:
274
Date string (YYYYMMDD) with most active trips
275
"""
276
277
def get_dates(feed: Feed, *, as_date_obj: bool = False) -> list[str]:
278
"""
279
Get all valid service dates for the feed based on calendar definitions.
280
281
Args:
282
feed: Feed object containing calendar data
283
as_date_obj: If True, return date objects instead of strings
284
285
Returns:
286
List of valid service dates (YYYYMMDD strings or date objects)
287
"""
288
289
def get_week(feed: Feed, k: int, *, as_date_obj: bool = False) -> list[str]:
290
"""
291
Get the kth Monday-to-Sunday week of service.
292
293
Args:
294
feed: Feed object containing calendar data
295
k: Week number (0-indexed, 0 = first week)
296
as_date_obj: If True, return date objects instead of strings
297
298
Returns:
299
List of 7 dates representing the kth week
300
"""
301
302
def get_first_week(feed: Feed, *, as_date_obj: bool = False) -> list[str]:
303
"""
304
Get the first Monday-to-Sunday week of service.
305
306
Args:
307
feed: Feed object containing calendar data
308
as_date_obj: If True, return date objects instead of strings
309
310
Returns:
311
List of 7 dates representing the first service week
312
"""
313
```
314
315
## Unit Conversion
316
317
{ .api }
318
```python
319
def convert_dist(feed: Feed, new_dist_units: str) -> Feed:
320
"""
321
Convert all distance measurements in feed to new units.
322
323
Args:
324
feed: Feed object with distance data
325
new_dist_units: Target distance units ("km", "m", "mi", "ft")
326
327
Returns:
328
New Feed object with distances converted to specified units
329
"""
330
```
331
332
## Usage Examples
333
334
### Feed Overview and Description
335
336
```python
337
import gtfs_kit as gk
338
339
# Load feed
340
feed = gk.read_feed("data/gtfs.zip")
341
342
# Get comprehensive feed description
343
dates = gk.get_dates(feed)
344
sample_date = dates[0] if dates else "20240315"
345
description = gk.describe(feed, sample_date)
346
347
print("Feed Description:")
348
print(description)
349
350
# Summarize individual tables
351
stops_summary = gk.summarize(feed, "stops")
352
routes_summary = gk.summarize(feed, "routes")
353
354
print(f"\nStops table summary:")
355
print(stops_summary)
356
```
357
358
### Trip Analysis Workflow
359
360
```python
361
# Compute comprehensive trip statistics
362
trip_stats = gk.compute_trip_stats(feed, route_ids=None, compute_dist_from_shapes=True)
363
364
print(f"Analyzed {len(trip_stats)} trips")
365
print("Trip statistics columns:", list(trip_stats.columns))
366
367
# Analyze trip patterns
368
patterns_df = gk.name_stop_patterns(feed)
369
pattern_counts = patterns_df['stop_pattern_name'].value_counts()
370
print(f"Found {len(pattern_counts)} unique stop patterns")
371
372
# Check trip activity across dates
373
dates = gk.get_dates(feed)[:7] # First seven service dates (not necessarily a Monday-to-Sunday week)
374
trip_activity = gk.compute_trip_activity(feed, dates)
375
print(f"Trip activity shape: {trip_activity.shape}")
376
```
377
378
### Route Performance Analysis
379
380
```python
381
# Find busiest service day
382
dates = gk.get_dates(feed)
383
busiest_date = gk.compute_busiest_date(feed, dates)
384
print(f"Busiest service date: {busiest_date}")
385
386
# Compute route statistics
387
trip_stats = gk.compute_trip_stats(feed, route_ids=None)
388
route_stats = gk.compute_route_stats(
389
feed=feed,
390
trip_stats_subset=trip_stats,
391
dates=[busiest_date],
392
headway_start_time="07:00:00",
393
headway_end_time="19:00:00",
394
split_directions=True
395
)
396
397
print("Route Statistics:")
398
print(route_stats[['route_id', 'direction_id', 'mean_headway', 'num_trips']].head())
399
400
# Analyze specific route
401
route_id = route_stats['route_id'].iloc[0]
402
route_timetable = gk.build_route_timetable(feed, route_id, [busiest_date])
403
print(f"\nTimetable for route {route_id}:")
404
print(route_timetable.head())
405
```
406
407
### Feed-Level Analysis
408
409
```python
410
# Compute feed-level statistics
411
feed_stats = gk.compute_feed_stats(
412
feed=feed,
413
trip_stats=trip_stats,
414
dates=[busiest_date],
415
split_route_types=True
416
)
417
418
print("Feed-Level Statistics:")
419
print(feed_stats)
420
421
# Analyze by route type
422
if 'route_type' in feed_stats.columns:
423
by_route_type = feed_stats.groupby('route_type').sum()
424
print("\nStatistics by Route Type:")
425
print(by_route_type)
426
```
427
428
### Stop Analysis
429
430
```python
431
# Compute stop statistics
432
stop_stats = gk.compute_stop_stats(
433
feed=feed,
434
dates=[busiest_date],
435
stop_ids=None, # All stops
436
headway_start_time="06:00:00",
437
headway_end_time="22:00:00",
438
split_directions=False
439
)
440
441
# Find busiest stops
442
busiest_stops = stop_stats.nlargest(10, 'num_trips')
443
print("Busiest Stops:")
444
print(busiest_stops[['stop_id', 'num_trips', 'num_routes', 'mean_headway']])
445
446
# Build timetable for busiest stop
447
busiest_stop_id = busiest_stops['stop_id'].iloc[0]
448
stop_timetable = gk.build_stop_timetable(feed, busiest_stop_id, [busiest_date])
449
print(f"\nTimetable for stop {busiest_stop_id}:")
450
print(stop_timetable.head())
451
```
452
453
### Service Analysis
454
455
```python
456
# Analyze service patterns
457
active_services = gk.get_active_services(feed, busiest_date)
458
print(f"Active services on {busiest_date}: {len(active_services)}")
459
460
# Get service weeks
461
first_week = gk.get_first_week(feed)
462
print(f"First service week: {first_week}")
463
464
# Check stop activity across multiple dates
465
dates_sample = dates[:7]
466
stop_activity = gk.compute_stop_activity(feed, dates_sample)
467
print(f"Stop activity matrix shape: {stop_activity.shape}")
468
469
# Find stops with varying service (active on at least one, but not all, of the sampled dates)
470
varying_stops = stop_activity[stop_activity.sum(axis=1).between(1, len(dates_sample)-1)]
471
print(f"Stops with varying service: {len(varying_stops)}")
472
```