A Python package for offline access to Vega datasets
Comprehensive dataset loading capabilities providing access to 70 curated datasets from the Vega visualization ecosystem. Supports both local and remote data sources with automatic format detection and pandas integration.
Main interface for accessing all available datasets with flexible loading options and format support.
class DataLoader:
def __call__(self, name: str, return_raw: bool = False, use_local: bool = True, **kwargs) -> pd.DataFrame:
"""
Load a dataset by name.
Parameters:
- name: str, dataset name (use list_datasets() to see available names)
- return_raw: bool, if True return raw bytes instead of DataFrame
- use_local: bool, if True prefer local data over remote when available
- **kwargs: additional arguments passed to pandas parser (read_csv, read_json)
Returns:
pandas.DataFrame or bytes (if return_raw=True)
"""
def list_datasets(self) -> List[str]:
"""Return list of all available dataset names."""
def __getattr__(self, dataset_name: str):
"""Access datasets as attributes (e.g., data.iris())."""
def __dir__(self) -> List[str]:
"""Support for tab completion and introspection."""Restricted loader for only locally bundled datasets, ensuring offline operation.
class LocalDataLoader:
def __call__(self, name: str, return_raw: bool = False, use_local: bool = True, **kwargs) -> pd.DataFrame:
"""
Load a locally bundled dataset by name.
Parameters:
- name: str, local dataset name (use list_datasets() to see available names)
- return_raw: bool, if True return raw bytes instead of DataFrame
- use_local: bool, ignored (always True for local loader)
- **kwargs: additional arguments passed to pandas parser
Returns:
pandas.DataFrame or bytes (if return_raw=True)
Raises:
ValueError: if dataset is not available locally
"""
def list_datasets(self) -> List[str]:
"""Return list of locally available dataset names."""
def __getattr__(self, dataset_name: str):
"""Access local datasets as attributes."""Individual dataset handler providing metadata and flexible loading options.
class Dataset:
# Class methods
@classmethod
def init(cls, name: str) -> 'Dataset':
"""Return an instance of appropriate Dataset subclass for the given name."""
@classmethod
def list_datasets(cls) -> List[str]:
"""Return list of all available dataset names."""
@classmethod
def list_local_datasets(cls) -> List[str]:
"""Return list of locally available dataset names."""
# Instance methods
def raw(self, use_local: bool = True) -> bytes:
"""
Load raw dataset bytes.
Parameters:
- use_local: bool, if True and dataset is local, load from package
Returns:
bytes: raw dataset content
"""
def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame:
"""
Load and parse dataset.
Parameters:
- use_local: bool, prefer local data when available
- **kwargs: passed to pandas parser (read_csv, read_json, read_csv with sep='\t')
Returns:
pandas.DataFrame: parsed dataset
"""
# Properties
@property
def filepath(self) -> str:
"""Local file path (only valid for local datasets)."""
# Instance attributes
name: str # Dataset name
methodname: str # Method-safe name (hyphens -> underscores)
filename: str # Original filename
url: str # Full remote URL
format: str # File format ('csv', 'json', 'tsv', 'png')
pkg_filename: str # Path within package
is_local: bool # True if bundled locally
description: str # Dataset description
references: List[str]  # Academic references

from vega_datasets import data
# Load classic iris dataset
iris = data.iris()
print(iris.shape) # (150, 5)
print(iris.columns.tolist()) # ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
# Load by string name
cars = data('cars')
print(cars.head())
# Pass pandas arguments
airports = data.airports(usecols=['iata', 'name', 'city', 'state'])

from vega_datasets import data, local_data
# Force remote loading (even for local datasets)
iris_remote = data.iris(use_local=False)
# Local-only loading (fails for remote datasets)
try:
stocks_local = local_data.stocks() # Works - stocks is local
github_local = local_data.github() # Fails - github is remote-only
except ValueError as e:
print(f"Error: {e}")
# Check if dataset is local
print(f"Iris is local: {data.iris.is_local}") # True
print(f"GitHub is local: {data.github.is_local}") # Falsefrom vega_datasets import data
# Get raw bytes instead of DataFrame
raw_data = data.iris.raw()
print(type(raw_data)) # <class 'bytes'>
# Use with custom parsing
import json
raw_json = data.cars.raw()
custom_data = json.loads(raw_json.decode())
# Raw data through call method
raw_csv = data('airports', return_raw=True)

from vega_datasets import data, local_data
# List all datasets
all_datasets = data.list_datasets()
print(f"Total datasets: {len(all_datasets)}") # 70
# List only local datasets
local_datasets = local_data.list_datasets()
print(f"Local datasets: {len(local_datasets)}") # 17
# Check specific dataset availability
print("Local datasets:", local_datasets[:5])
# ['airports', 'anscombe', 'barley', 'burtin', 'cars']
# Use tab completion in interactive environments
# data.<TAB> shows all available datasets

from vega_datasets import data
import pandas as pd
# Load with pandas options
flights = data.flights(
parse_dates=['date'],
dtype={'origin': 'category', 'destination': 'category'}
)
# TSV format handling (automatic)
seattle_temps = data.seattle_temps() # Handles TSV automatically
# JSON with custom options
github_data = data.github(lines=True)  # If supported by dataset format

from vega_datasets import data
# Access dataset metadata
iris_dataset = data.iris # Get Dataset object (don't call yet)
print(f"Name: {iris_dataset.name}")
print(f"Format: {iris_dataset.format}")
print(f"URL: {iris_dataset.url}")
print(f"Local: {iris_dataset.is_local}")
print(f"Description: {iris_dataset.description}")
# Get file path for local datasets
if iris_dataset.is_local:
print(f"Local path: {iris_dataset.filepath}")from vega_datasets import data
from urllib.error import URLError
# Handle invalid dataset names
try:
df = data('invalid-name')
except ValueError as e:
print(f"Dataset error: {e}")
# Handle network issues for remote datasets
try:
df = data.github(use_local=False)
except URLError as e:
print(f"Network error: {e}")
# Fallback to local if available
if data.github.is_local:
    df = data.github(use_local=True)

from vega_datasets.utils import connection_ok
# Check internet connectivity before loading remote datasets
if connection_ok():
github_data = data.github()
print("Loaded remote dataset successfully")
else:
print("No internet connection - using local datasets only")
local_datasets = local_data.list_datasets()
    stocks_data = local_data.stocks()

The package automatically handles multiple data formats:
Format detection is automatic based on dataset metadata, with appropriate pandas parsers used for each format.
Note: PNG format datasets (like 7zip) can only be accessed via the raw() method or with return_raw=True, as the DataFrame parsing will raise a ValueError for unsupported formats.
Install with Tessl CLI
npx tessl i tessl/pypi-vega-datasets