or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-analysis.md, data-cleaning.md, feed-operations.md, geospatial.md, index.md, time-series.md, validation.md

docs/data-analysis.md

0

# Data Analysis

1

2

Statistical analysis, feed summaries, and computational functions for transit metrics.

3

4

## Feed Analysis Functions

5

6

{ .api }

7

```python

8

def describe(feed: Feed, sample_date: str) -> pd.DataFrame:

9

"""

10

Generate comprehensive feed description with key metrics and indicators.

11

12

Args:

13

feed: Feed object to describe

14

sample_date: Date string (YYYYMMDD) to use for analysis

15

16

Returns:

17

DataFrame with feed statistics including routes, stops, trips, and service metrics

18

"""

19

20

def summarize(feed: Feed, table: str) -> pd.DataFrame:

21

"""

22

Summarize a GTFS table with column statistics and data quality metrics.

23

24

Args:

25

feed: Feed object containing the table

26

table: Name of table to summarize ("stops", "routes", "trips", etc.)

27

28

Returns:

29

DataFrame with column-wise statistics including null counts, data types, and distributions

30

"""

31

```

32

33

## Feed-Level Statistics

34

35

{ .api }

36

```python

37

def compute_feed_stats(feed: Feed, trip_stats: pd.DataFrame, dates: list[str],

38

*, split_route_types: bool = False) -> pd.DataFrame:

39

"""

40

Compute comprehensive feed-level statistics across multiple dates.

41

42

Args:

43

feed: Feed object to analyze

44

trip_stats: Pre-computed trip statistics DataFrame

45

dates: List of service dates to include in analysis

46

split_route_types: If True, compute separate stats by route type

47

48

Returns:

49

DataFrame with feed-level metrics including total distance, service hours, frequencies

50

"""

51

52

def compute_feed_stats_0(feed: Feed, trip_stats_subset: pd.DataFrame,

53

*, split_route_types: bool = False) -> pd.DataFrame:

54

"""

55

Helper function to compute feed stats for a single date/subset.

56

57

Args:

58

feed: Feed object to analyze

59

trip_stats_subset: Trip statistics for specific subset

60

split_route_types: If True, split results by route type

61

62

Returns:

63

DataFrame with feed statistics for the subset

64

"""

65

```

66

67

## Trip Analysis

68

69

{ .api }

70

```python

71

def compute_trip_stats(feed: Feed, route_ids: list[str] | None,

72

*, compute_dist_from_shapes: bool = False) -> pd.DataFrame:

73

"""

74

Compute comprehensive statistics for trips including distances, durations, and patterns.

75

76

Args:

77

feed: Feed object containing trip data

78

route_ids: List of route IDs to include, or None for all routes

79

compute_dist_from_shapes: If True, compute distances from shape geometries

80

81

Returns:

82

DataFrame with per-trip statistics including:

83

- trip_id, route_id, direction_id, service_id

84

- start_time, end_time, duration

85

- distance, speed, num_stops

86

- stop_pattern information

87

"""

88

89

def compute_trip_activity(feed: Feed, dates: list[str]) -> pd.DataFrame:

90

"""

91

Determine which trips are active on specified dates.

92

93

Args:

94

feed: Feed object containing trip and calendar data

95

dates: List of dates to check for trip activity

96

97

Returns:

98

DataFrame marking trips as active (1) or inactive (0) for each date

99

"""

100

101

def locate_trips(feed: Feed, date: str, times: list[str]) -> pd.DataFrame:

102

"""

103

Locate trip positions at specified times on a given date.

104

105

Args:

106

feed: Feed object with trip and schedule data

107

date: Date string (YYYYMMDD) for analysis

108

times: List of time strings (HH:MM:SS) to locate trips

109

110

Returns:

111

DataFrame with trip positions, stops, and progress at each time

112

"""

113

114

def name_stop_patterns(feed: Feed) -> pd.DataFrame:

115

"""

116

Add stop pattern names to trips based on their sequence of stops.

117

118

Args:

119

feed: Feed object containing trips and stop_times

120

121

Returns:

122

Modified trips DataFrame with stop_pattern_name column added

123

"""

124

```

125

126

## Route Analysis

127

128

{ .api }

129

```python

130

def compute_route_stats(feed: Feed, trip_stats_subset: pd.DataFrame, dates: list[str],

131

headway_start_time: str, headway_end_time: str,

132

*, split_directions: bool = False) -> pd.DataFrame:

133

"""

134

Compute route-level statistics including service frequency and headways.

135

136

Args:

137

feed: Feed object containing route data

138

trip_stats_subset: Pre-computed trip statistics to aggregate

139

dates: List of service dates to analyze

140

headway_start_time: Start time for headway calculation (HH:MM:SS)

141

headway_end_time: End time for headway calculation (HH:MM:SS)

142

split_directions: If True, compute separate stats by direction

143

144

Returns:

145

DataFrame with route statistics including:

146

- route_id, direction_id (if split_directions=True)

147

- num_trips, num_trip_starts, num_trip_ends

148

- start_time, end_time, max_headway, mean_headway

149

- max_speed, mean_speed, service_distance, service_duration

150

"""

151

152

def compute_route_stats_0(trip_stats_subset: pd.DataFrame, headway_start_time: str,

153

headway_end_time: str, *, split_directions: bool = False) -> pd.DataFrame:

154

"""

155

Helper function to compute route stats from trip stats subset.

156

157

Args:

158

trip_stats_subset: Trip statistics DataFrame subset

159

headway_start_time: Start time for headway calculation

160

headway_end_time: End time for headway calculation

161

split_directions: If True, split by direction

162

163

Returns:

164

DataFrame with route-level aggregated statistics

165

"""

166

167

def build_route_timetable(feed: Feed, route_id: str, dates: list[str]) -> pd.DataFrame:

168

"""

169

Build a timetable showing all trips for a specific route.

170

171

Args:

172

feed: Feed object containing schedule data

173

route_id: Route ID to build timetable for

174

dates: List of dates to include in timetable

175

176

Returns:

177

DataFrame with trip times organized by route and date

178

"""

179

```

180

181

## Stop Analysis

182

183

{ .api }

184

```python

185

def compute_stop_stats(feed: Feed, dates: list[str], stop_ids: list[str] | None,

186

headway_start_time: str, headway_end_time: str,

187

*, split_directions: bool = False) -> pd.DataFrame:

188

"""

189

Compute stop-level statistics including service frequency and activity.

190

191

Args:

192

feed: Feed object containing stop and schedule data

193

dates: List of service dates to analyze

194

stop_ids: List of stop IDs to include, or None for all stops

195

headway_start_time: Start time for frequency analysis (HH:MM:SS)

196

headway_end_time: End time for frequency analysis (HH:MM:SS)

197

split_directions: If True, compute separate stats by direction

198

199

Returns:

200

DataFrame with stop statistics including:

201

- stop_id, direction_id (if split_directions=True)

202

- num_routes, num_trips, max_headway, mean_headway

203

- start_time, end_time, peak_num_trips, peak_start_time, peak_end_time

204

"""

205

206

def compute_stop_stats_0(stop_times_subset: pd.DataFrame, trip_subset: pd.DataFrame,

207

headway_start_time: str, headway_end_time: str,

208

*, split_directions: bool = False) -> pd.DataFrame:

209

"""

210

Helper function to compute stop stats from subsets.

211

212

Args:

213

stop_times_subset: Stop times DataFrame subset

214

trip_subset: Trips DataFrame subset

215

headway_start_time: Start time for analysis

216

headway_end_time: End time for analysis

217

split_directions: If True, split by direction

218

219

Returns:

220

DataFrame with stop-level statistics

221

"""

222

223

def compute_stop_activity(feed: Feed, dates: list[str]) -> pd.DataFrame:

224

"""

225

Determine which stops are active (have service) on specified dates.

226

227

Args:

228

feed: Feed object containing stop and schedule data

229

dates: List of dates to check for stop activity

230

231

Returns:

232

DataFrame marking stops as active (1) or inactive (0) for each date

233

"""

234

235

def build_stop_timetable(feed: Feed, stop_id: str, dates: list[str]) -> pd.DataFrame:

236

"""

237

Build a timetable showing all arrivals/departures for a specific stop.

238

239

Args:

240

feed: Feed object containing schedule data

241

stop_id: Stop ID to build timetable for

242

dates: List of dates to include in timetable

243

244

Returns:

245

DataFrame with arrival/departure times organized by stop and date

246

"""

247

```

248

249

## Service and Calendar Analysis

250

251

{ .api }

252

```python

253

def get_active_services(feed: Feed, date: str) -> list[str]:

254

"""

255

Get list of service IDs that are active on a specific date.

256

257

Args:

258

feed: Feed object containing calendar data

259

date: Date string (YYYYMMDD) to check

260

261

Returns:

262

List of service_id values active on the specified date

263

"""

264

265

def compute_busiest_date(feed: Feed, dates: list[str]) -> str:

266

"""

267

Find the date with the maximum number of active trips.

268

269

Args:

270

feed: Feed object with trip and calendar data

271

dates: List of candidate dates to compare

272

273

Returns:

274

Date string (YYYYMMDD) with most active trips

275

"""

276

277

def get_dates(feed: Feed, *, as_date_obj: bool = False) -> list[str]:

278

"""

279

Get all valid service dates for the feed based on calendar definitions.

280

281

Args:

282

feed: Feed object containing calendar data

283

as_date_obj: If True, return date objects instead of strings

284

285

Returns:

286

List of valid service dates (YYYYMMDD strings or date objects)

287

"""

288

289

def get_week(feed: Feed, k: int, *, as_date_obj: bool = False) -> list[str]:

290

"""

291

Get the kth Monday-to-Sunday week of service.

292

293

Args:

294

feed: Feed object containing calendar data

295

k: Week number (0-indexed, 0 = first week)

296

as_date_obj: If True, return date objects instead of strings

297

298

Returns:

299

List of 7 dates representing the kth week

300

"""

301

302

def get_first_week(feed: Feed, *, as_date_obj: bool = False) -> list[str]:

303

"""

304

Get the first Monday-to-Sunday week of service.

305

306

Args:

307

feed: Feed object containing calendar data

308

as_date_obj: If True, return date objects instead of strings

309

310

Returns:

311

List of 7 dates representing the first service week

312

"""

313

```

314

315

## Unit Conversion

316

317

{ .api }

318

```python

319

def convert_dist(feed: Feed, new_dist_units: str) -> Feed:

320

"""

321

Convert all distance measurements in feed to new units.

322

323

Args:

324

feed: Feed object with distance data

325

new_dist_units: Target distance units ("km", "m", "mi", "ft")

326

327

Returns:

328

New Feed object with distances converted to specified units

329

"""

330

```

331

332

## Usage Examples

333

334

### Feed Overview and Description

335

336

```python

337

import gtfs_kit as gk

338

339

# Load feed

340

feed = gk.read_feed("data/gtfs.zip")

341

342

# Get comprehensive feed description

343

dates = gk.get_dates(feed)

344

sample_date = dates[0] if dates else "20240315"

345

description = gk.describe(feed, sample_date)

346

347

print("Feed Description:")

348

print(description)

349

350

# Summarize individual tables

351

stops_summary = gk.summarize(feed, "stops")

352

routes_summary = gk.summarize(feed, "routes")

353

354

print(f"\nStops table summary:")

355

print(stops_summary)

356

```

357

358

### Trip Analysis Workflow

359

360

```python

361

# Compute comprehensive trip statistics

362

trip_stats = gk.compute_trip_stats(feed, route_ids=None, compute_dist_from_shapes=True)

363

364

print(f"Analyzed {len(trip_stats)} trips")

365

print("Trip statistics columns:", list(trip_stats.columns))

366

367

# Analyze trip patterns

368

patterns_df = gk.name_stop_patterns(feed)

369

pattern_counts = patterns_df['stop_pattern_name'].value_counts()

370

print(f"Found {len(pattern_counts)} unique stop patterns")

371

372

# Check trip activity across dates

373

dates = gk.get_dates(feed)[:7]  # First seven service dates (not necessarily a Monday-to-Sunday week)

374

trip_activity = gk.compute_trip_activity(feed, dates)

375

print(f"Trip activity shape: {trip_activity.shape}")

376

```

377

378

### Route Performance Analysis

379

380

```python

381

# Find busiest service day

382

dates = gk.get_dates(feed)

383

busiest_date = gk.compute_busiest_date(feed, dates)

384

print(f"Busiest service date: {busiest_date}")

385

386

# Compute route statistics

387

trip_stats = gk.compute_trip_stats(feed, route_ids=None)

388

route_stats = gk.compute_route_stats(

389

feed=feed,

390

trip_stats_subset=trip_stats,

391

dates=[busiest_date],

392

headway_start_time="07:00:00",

393

headway_end_time="19:00:00",

394

split_directions=True

395

)

396

397

print("Route Statistics:")

398

print(route_stats[['route_id', 'direction_id', 'mean_headway', 'num_trips']].head())

399

400

# Analyze specific route

401

route_id = route_stats['route_id'].iloc[0]

402

route_timetable = gk.build_route_timetable(feed, route_id, [busiest_date])

403

print(f"\nTimetable for route {route_id}:")

404

print(route_timetable.head())

405

```

406

407

### Feed-Level Analysis

408

409

```python

410

# Compute feed-level statistics

411

feed_stats = gk.compute_feed_stats(

412

feed=feed,

413

trip_stats=trip_stats,

414

dates=[busiest_date],

415

split_route_types=True

416

)

417

418

print("Feed-Level Statistics:")

419

print(feed_stats)

420

421

# Analyze by route type

422

if 'route_type' in feed_stats.columns:

423

by_route_type = feed_stats.groupby('route_type').sum()

424

print("\nStatistics by Route Type:")

425

print(by_route_type)

426

```

427

428

### Stop Analysis

429

430

```python

431

# Compute stop statistics

432

stop_stats = gk.compute_stop_stats(

433

feed=feed,

434

dates=[busiest_date],

435

stop_ids=None, # All stops

436

headway_start_time="06:00:00",

437

headway_end_time="22:00:00",

438

split_directions=False

439

)

440

441

# Find busiest stops

442

busiest_stops = stop_stats.nlargest(10, 'num_trips')

443

print("Busiest Stops:")

444

print(busiest_stops[['stop_id', 'num_trips', 'num_routes', 'mean_headway']])

445

446

# Build timetable for busiest stop

447

busiest_stop_id = busiest_stops['stop_id'].iloc[0]

448

stop_timetable = gk.build_stop_timetable(feed, busiest_stop_id, [busiest_date])

449

print(f"\nTimetable for stop {busiest_stop_id}:")

450

print(stop_timetable.head())

451

```

452

453

### Service Analysis

454

455

```python

456

# Analyze service patterns

457

active_services = gk.get_active_services(feed, busiest_date)

458

print(f"Active services on {busiest_date}: {len(active_services)}")

459

460

# Get service weeks

461

first_week = gk.get_first_week(feed)

462

print(f"First service week: {first_week}")

463

464

# Check stop activity across multiple dates

465

dates_sample = dates[:7]

466

stop_activity = gk.compute_stop_activity(feed, dates_sample)

467

print(f"Stop activity matrix shape: {stop_activity.shape}")

468

469

# Find stops with varying service

470

varying_stops = stop_activity[stop_activity.sum(axis=1).between(1, len(dates_sample)-1)]

471

print(f"Stops with varying service: {len(varying_stops)}")

472

```