or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

dataset-loading.md index.md specialized-datasets.md

docs/specialized-datasets.md

0

# Specialized Datasets

1

2

Enhanced dataset loaders for specific datasets that require custom parsing, date handling, data transformation, or return types beyond standard DataFrames.

3

4

## Capabilities

5

6

### Stocks Dataset with Pivot Support

7

8

Enhanced stocks dataset loader supporting both standard long format and pivoted wide format for time series analysis.

9

10

```python { .api }

11

class Stocks(Dataset):

12

def __call__(self, pivoted: bool = False, use_local: bool = True, **kwargs) -> pd.DataFrame:

13

"""

14

Load stocks dataset with optional pivot transformation.

15

16

Parameters:

17

- pivoted: bool, if True pivot data so each stock is in separate column

18

- use_local: bool, prefer local data when available

19

- **kwargs: additional arguments passed to pandas parser

20

21

Returns:

22

pandas.DataFrame: stocks data in long format (default) or wide format (pivoted)

23

"""

24

```

25

26

**Usage Example:**

27

```python

28

from vega_datasets import data

29

30

# Standard long format

31

stocks_long = data.stocks()

32

print(stocks_long.head(3))

33

# symbol date price

34

# 0 MSFT 2000-01-01 39.81

35

# 1 MSFT 2000-02-01 36.35

36

# 2 MSFT 2000-03-01 43.22

37

38

# Pivoted wide format for time series analysis

39

stocks_wide = data.stocks(pivoted=True)

40

print(stocks_wide.head(3))

41

# symbol AAPL AMZN GOOG IBM MSFT

42

# date

43

# 2000-01-01 25.94 64.56 NaN 100.52 39.81

44

# 2000-02-01 28.66 68.87 NaN 92.11 36.35

45

# 2000-03-01 33.95 67.00 NaN 106.11 43.22

46

```

47

48

### Miserables Network Dataset

49

50

Specialized loader for network graph data returning separate node and link DataFrames.

51

52

```python { .api }

53

class Miserables(Dataset):

54

def __call__(self, use_local: bool = True, **kwargs) -> Tuple[pd.DataFrame, pd.DataFrame]:

55

"""

56

Load Les Misérables character network data.

57

58

Parameters:

59

- use_local: bool, prefer local data when available

60

- **kwargs: additional arguments passed to JSON parser

61

62

Returns:

63

Tuple[pd.DataFrame, pd.DataFrame]: (nodes, links) DataFrames

64

"""

65

```

66

67

**Usage Example:**

68

```python

69

from vega_datasets import data

70

71

# Returns tuple of two DataFrames

72

nodes, links = data.miserables()

73

74

print("Nodes DataFrame:")

75

print(nodes.head())

76

# group name

77

# 0 1 Myriel

78

# 1 1 Napoleon

79

# 2 1 Mlle.Baptistine

80

81

print("Links DataFrame:")

82

print(links.head())

83

# source target value

84

# 0 1 0 1

85

# 1 2 0 8

86

# 2 3 0 10

87

```

88

89

### Geographic TopoJSON Datasets

90

91

Specialized loaders for geographic data that return Python dictionaries containing TopoJSON structures rather than DataFrames.

92

93

```python { .api }

94

class US_10M(Dataset):

95

def __call__(self, use_local: bool = True, **kwargs) -> dict:

96

"""

97

Load US geographic boundaries as TopoJSON.

98

99

Parameters:

100

- use_local: bool, prefer local data when available

101

- **kwargs: additional arguments passed to JSON parser

102

103

Returns:

104

dict: TopoJSON structure with US geographic boundaries

105

"""

106

107

class World_110M(Dataset):

108

def __call__(self, use_local: bool = True, **kwargs) -> dict:

109

"""

110

Load world geographic boundaries as TopoJSON.

111

112

Parameters:

113

- use_local: bool, prefer local data when available

114

- **kwargs: additional arguments passed to JSON parser

115

116

Returns:

117

dict: TopoJSON structure with world geographic boundaries

118

"""

119

```

120

121

**Usage Example:**

122

```python

123

from vega_datasets import data

124

125

# Geographic data as dictionary structures

126

us_geo = data.us_10m()

127

world_geo = data.world_110m()

128

129

print(f"US data type: {type(us_geo)}") # <class 'dict'>

130

print(f"World data type: {type(world_geo)}") # <class 'dict'>

131

132

# TopoJSON structure

133

print("US TopoJSON keys:", list(us_geo.keys()))

134

# ['type', 'arcs', 'objects', 'transform']

135

136

# Use with geographic visualization libraries

137

import altair as alt

138

# These can be used directly with Altair/Vega-Lite geographic visualizations

139

```

140

141

### Date-Parsed Datasets

142

143

Multiple datasets with automatic date/time parsing for time series analysis.

144

145

```python { .api }

146

class Cars(Dataset):

147

"""Cars dataset with Year field converted to datetime."""

148

149

class Climate(Dataset):

150

"""Climate dataset with DATE field parsed as datetime."""

151

152

class Github(Dataset):

153

"""GitHub dataset with time field parsed as datetime."""

154

155

class IowaElectricity(Dataset):

156

"""Iowa electricity dataset with year field parsed as datetime."""

157

158

class LARiots(Dataset):

159

"""LA riots dataset with death_date field parsed as datetime."""

160

161

class SeattleTemps(Dataset):

162

"""Seattle temperatures with date field parsed as datetime."""

163

164

class SeattleWeather(Dataset):

165

"""Seattle weather with date field parsed as datetime."""

166

167

class SFTemps(Dataset):

168

"""San Francisco temperatures with date field parsed as datetime."""

169

170

class Sp500(Dataset):

171

"""S&P 500 dataset with date field parsed as datetime."""

172

173

class UnemploymentAcrossIndustries(Dataset):

174

"""Unemployment dataset with date field converted to datetime."""

175

```

176

177

**Usage Example:**

178

```python

179

from vega_datasets import data

180

181

# Date parsing happens automatically

182

seattle_weather = data.seattle_weather()

183

print(seattle_weather.dtypes)

184

# date datetime64[ns]

185

# precipitation float64

186

# temp_max float64

187

# temp_min float64

188

# wind float64

189

# weather object

190

191

# Ready for time series analysis

192

print(seattle_weather['date'].min()) # 2012-01-01 00:00:00

193

print(seattle_weather['date'].max()) # 2015-12-31 00:00:00

194

```

195

196

### Specialized Data Type Handling

197

198

Dataset with custom data type specifications for proper data handling.

199

200

```python { .api }

201

class ZIPCodes(Dataset):

202

"""ZIP codes dataset with zip_code field as string/object dtype."""

203

```

204

205

**Usage Example:**

206

```python

207

from vega_datasets import data

208

209

# ZIP codes preserved as strings (not converted to integers)

210

zipcodes = data.zipcodes()

211

print(zipcodes.dtypes)

212

# zip_code object # Preserved as string

213

# latitude float64

214

# longitude float64

215

216

print(zipcodes['zip_code'].head())

217

# 0 01001

218

# 1 01002

219

# 2 01003

220

# Preserves leading zeros

221

```

222

223

## Advanced Usage Patterns

224

225

### Working with Network Data

226

227

```python

228

from vega_datasets import data

229

import networkx as nx

230

231

# Load network data

232

nodes, links = data.miserables()

233

234

# Create NetworkX graph

235

G = nx.Graph()

236

237

# Add nodes with attributes

238

for idx, row in nodes.iterrows():

239

G.add_node(idx, **row.to_dict())

240

241

# Add edges

242

for _, row in links.iterrows():

243

G.add_edge(row['source'], row['target'], weight=row['value'])

244

245

print(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges")

246

```

247

248

### Geographic Data Processing

249

250

```python

251

from vega_datasets import data

252

import json

253

254

# Load geographic data

255

us_topo = data.us_10m()

256

world_topo = data.world_110m()

257

258

# Save to files for use with other tools

259

with open('us_boundaries.json', 'w') as f:

260

json.dump(us_topo, f)

261

262

# Extract specific geographic features

263

states = us_topo['objects']['states']

264

counties = us_topo['objects']['counties']

265

print(f"US data contains: {list(us_topo['objects'].keys())}")

266

```

267

268

### Time Series Analysis

269

270

```python

271

from vega_datasets import data

272

import pandas as pd

273

274

# Load time series data (dates auto-parsed)

275

seattle_weather = data.seattle_weather()

276

stocks = data.stocks(pivoted=True) # Wide format for multiple series

277

278

# Time series operations

279

monthly_temps = seattle_weather.groupby(seattle_weather['date'].dt.to_period('M')).agg({

280

'temp_max': 'mean',

281

'temp_min': 'mean',

282

'precipitation': 'sum'

283

})

284

285

# Stock returns analysis

286

stock_returns = stocks.pct_change().dropna()

287

print("Average monthly returns by stock:")

288

print(stock_returns.mean())

289

```

290

291

### Multi-Format Dataset Integration

292

293

```python

294

from vega_datasets import data

295

296

# Combine different dataset formats and types

297

airports_df = data.airports() # CSV -> DataFrame

298

github_df = data.github() # JSON -> DataFrame

299

nodes, links = data.miserables() # JSON -> Tuple[DataFrame, DataFrame]

300

us_geo = data.us_10m() # JSON -> dict

301

302

# Integration example: airports with geographic boundaries

303

import altair as alt

304

305

# Create map visualization combining airports and geographic data

306

airports_map = alt.Chart(alt.InlineData(values=us_geo, format=alt.TopoDataFormat(feature='states', type='topojson'))).mark_geoshape(

307

fill='lightgray',

308

stroke='white'

309

).properties(

310

width=500,

311

height=300

312

) + alt.Chart(airports_df).mark_circle().encode(

313

latitude='latitude:Q',

314

longitude='longitude:Q',

315

size=alt.value(20)

316

)

317

```