A Python package for offline access to Vega datasets
Enhanced dataset loaders for specific datasets that require custom parsing, date handling, data transformation, or return types beyond standard DataFrames.
Enhanced stocks dataset loader supporting both standard long format and pivoted wide format for time series analysis.
class Stocks(Dataset):
def __call__(self, pivoted: bool = False, use_local: bool = True, **kwargs) -> pd.DataFrame:
"""
Load stocks dataset with optional pivot transformation.
Parameters:
- pivoted: bool, if True pivot data so each stock is in separate column
- use_local: bool, prefer local data when available
- **kwargs: additional arguments passed to pandas parser
Returns:
pandas.DataFrame: stocks data in long format (default) or wide format (pivoted)
"""Usage Example:
from vega_datasets import data
# Standard long format
stocks_long = data.stocks()
print(stocks_long.head(3))
# symbol date price
# 0 MSFT 2000-01-01 39.81
# 1 MSFT 2000-02-01 36.35
# 2 MSFT 2000-03-01 43.22
# Pivoted wide format for time series analysis
stocks_wide = data.stocks(pivoted=True)
print(stocks_wide.head(3))
# symbol AAPL AMZN GOOG IBM MSFT
# date
# 2000-01-01 25.94 64.56 NaN 100.52 39.81
# 2000-02-01 28.66 68.87 NaN 92.11 36.35
# 2000-03-01 33.95 67.00 NaN 106.11 43.22

Specialized loader for network graph data returning separate node and link DataFrames.
class Miserables(Dataset):
def __call__(self, use_local: bool = True, **kwargs) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Load Les Misérables character network data.
Parameters:
- use_local: bool, prefer local data when available
- **kwargs: additional arguments passed to JSON parser
Returns:
Tuple[pd.DataFrame, pd.DataFrame]: (nodes, links) DataFrames
"""Usage Example:
from vega_datasets import data
# Returns tuple of two DataFrames
nodes, links = data.miserables()
print("Nodes DataFrame:")
print(nodes.head())
# group name
# 0 1 Myriel
# 1 1 Napoleon
# 2 1 Mlle.Baptistine
print("Links DataFrame:")
print(links.head())
# source target value
# 0 1 0 1
# 1 2 0 8
# 2 3 0 10

Specialized loaders for geographic data that return Python dictionaries containing TopoJSON structures rather than DataFrames.
class US_10M(Dataset):
def __call__(self, use_local: bool = True, **kwargs) -> dict:
"""
Load US geographic boundaries as TopoJSON.
Parameters:
- use_local: bool, prefer local data when available
- **kwargs: additional arguments passed to JSON parser
Returns:
dict: TopoJSON structure with US geographic boundaries
"""
class World_110M(Dataset):
def __call__(self, use_local: bool = True, **kwargs) -> dict:
"""
Load world geographic boundaries as TopoJSON.
Parameters:
- use_local: bool, prefer local data when available
- **kwargs: additional arguments passed to JSON parser
Returns:
dict: TopoJSON structure with world geographic boundaries
"""Usage Example:
from vega_datasets import data
# Geographic data as dictionary structures
us_geo = data.us_10m()
world_geo = data.world_110m()
print(f"US data type: {type(us_geo)}") # <class 'dict'>
print(f"World data type: {type(world_geo)}") # <class 'dict'>
# TopoJSON structure
print("US TopoJSON keys:", list(us_geo.keys()))
# ['type', 'arcs', 'objects', 'transform']
# Use with geographic visualization libraries
import altair as alt
# These can be used directly with Altair/Vega-Lite geographic visualizations

Multiple datasets with automatic date/time parsing for time series analysis.
class Cars(Dataset):
"""Cars dataset with Year field converted to datetime."""
class Climate(Dataset):
"""Climate dataset with DATE field parsed as datetime."""
class Github(Dataset):
"""GitHub dataset with time field parsed as datetime."""
class IowaElectricity(Dataset):
"""Iowa electricity dataset with year field parsed as datetime."""
class LARiots(Dataset):
"""LA riots dataset with death_date field parsed as datetime."""
class SeattleTemps(Dataset):
"""Seattle temperatures with date field parsed as datetime."""
class SeattleWeather(Dataset):
"""Seattle weather with date field parsed as datetime."""
class SFTemps(Dataset):
"""San Francisco temperatures with date field parsed as datetime."""
class Sp500(Dataset):
"""S&P 500 dataset with date field parsed as datetime."""
class UnemploymentAcrossIndustries(Dataset):
"""Unemployment dataset with date field converted to datetime."""Usage Example:
from vega_datasets import data
# Date parsing happens automatically
seattle_weather = data.seattle_weather()
print(seattle_weather.dtypes)
# date datetime64[ns]
# precipitation float64
# temp_max float64
# temp_min float64
# wind float64
# weather object
# Ready for time series analysis
print(seattle_weather['date'].min()) # 2012-01-01 00:00:00
print(seattle_weather['date'].max()) # 2015-12-31 00:00:00

Dataset with custom data type specifications for proper data handling.
class ZIPCodes(Dataset):
"""ZIP codes dataset with zip_code field as string/object dtype."""Usage Example:
from vega_datasets import data
# ZIP codes preserved as strings (not converted to integers)
zipcodes = data.zipcodes()
print(zipcodes.dtypes)
# zip_code object # Preserved as string
# latitude float64
# longitude float64
print(zipcodes['zip_code'].head())
# 0 01001
# 1 01002
# 2 01003
# Preserves leading zeros

from vega_datasets import data
import networkx as nx
# Load network data
nodes, links = data.miserables()
# Create NetworkX graph
G = nx.Graph()
# Add nodes with attributes
for idx, row in nodes.iterrows():
G.add_node(idx, **row.to_dict())
# Add edges
for _, row in links.iterrows():
G.add_edge(row['source'], row['target'], weight=row['value'])
print(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges")from vega_datasets import data
import json
# Load geographic data
us_topo = data.us_10m()
world_topo = data.world_110m()
# Save to files for use with other tools
with open('us_boundaries.json', 'w') as f:
json.dump(us_topo, f)
# Extract specific geographic features
states = us_topo['objects']['states']
counties = us_topo['objects']['counties']
print(f"US data contains: {list(us_topo['objects'].keys())}")from vega_datasets import data
import pandas as pd
# Load time series data (dates auto-parsed)
seattle_weather = data.seattle_weather()
stocks = data.stocks(pivoted=True) # Wide format for multiple series
# Time series operations
monthly_temps = seattle_weather.groupby(seattle_weather['date'].dt.to_period('M')).agg({
'temp_max': 'mean',
'temp_min': 'mean',
'precipitation': 'sum'
})
# Stock returns analysis
stock_returns = stocks.pct_change().dropna()
print("Average daily returns by stock:")
print(stock_returns.mean())

from vega_datasets import data
# Combine different dataset formats and types
airports_df = data.airports() # CSV -> DataFrame
github_df = data.github() # JSON -> DataFrame
nodes, links = data.miserables() # JSON -> Tuple[DataFrame, DataFrame]
us_geo = data.us_10m() # JSON -> dict
# Integration example: airports with geographic boundaries
import altair as alt
# Create map visualization combining airports and geographic data
airports_map = alt.Chart(alt.InlineData(values=us_geo, format=alt.DataFormat(type='topojson', feature='states'))).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
width=500,
height=300
) + alt.Chart(airports_df).mark_circle().encode(
latitude='latitude:Q',
longitude='longitude:Q',
size=alt.value(20)
)

Install with Tessl CLI
npx tessl i tessl/pypi-vega-datasets