CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-vega-datasets

A Python package for offline access to Vega datasets

Overview
Eval results
Files

docs/dataset-loading.md

Dataset Loading

The package provides comprehensive dataset loading, with access to 70 curated datasets from the Vega visualization ecosystem. It supports both local and remote data sources, selects the parser from each dataset's declared format, and returns pandas DataFrames.

Capabilities

DataLoader Class

Main interface for accessing all available datasets with flexible loading options and format support.

class DataLoader:
    """
    Loader covering every dataset in the catalogue, local or remote.

    The usage examples on this page reach it through the ``data`` object
    (``from vega_datasets import data``).
    """
    def __call__(self, name: str, return_raw: bool = False, use_local: bool = True, **kwargs) -> Union[pd.DataFrame, bytes]:
        """
        Load a dataset by name.
        
        Parameters:
        - name: str, dataset name (use list_datasets() to see available names)
        - return_raw: bool, if True return raw bytes instead of DataFrame
        - use_local: bool, if True prefer local data over remote when available
        - **kwargs: additional arguments passed to pandas parser (read_csv, read_json)
        
        Returns:
        pandas.DataFrame, or bytes when return_raw=True
        
        Raises:
        ValueError: if name does not match a known dataset, or if the dataset's
            format cannot be parsed into a DataFrame (e.g. PNG) and
            return_raw is False
        urllib.error.URLError: may be raised when a remote dataset has to be
            downloaded and the network is unavailable
        """
    
    def list_datasets(self) -> List[str]:
        """Return list of all available dataset names."""
    
    def __getattr__(self, dataset_name: str):
        """
        Access datasets as attributes (e.g., data.iris()).
        
        The attribute itself is the Dataset handler, not the data:
        ``data.iris`` exposes metadata such as ``is_local`` and ``raw()``,
        while calling it (``data.iris()``) loads the DataFrame.
        Attribute names use the method-safe form of the dataset name
        (hyphens replaced by underscores).
        """
    
    def __dir__(self) -> List[str]:
        """Support for tab completion and introspection."""

LocalDataLoader Class

Restricted loader for only locally bundled datasets, ensuring offline operation.

class LocalDataLoader:
    """
    Loader limited to the datasets bundled with the package.

    The usage examples on this page reach it through the ``local_data``
    object (``from vega_datasets import local_data``).
    """
    def __call__(self, name: str, return_raw: bool = False, use_local: bool = True, **kwargs) -> Union[pd.DataFrame, bytes]:
        """
        Load a locally bundled dataset by name.
        
        Parameters:
        - name: str, local dataset name (use list_datasets() to see available names)
        - return_raw: bool, if True return raw bytes instead of DataFrame
        - use_local: bool, ignored (always True for local loader)
        - **kwargs: additional arguments passed to pandas parser
        
        Returns:
        pandas.DataFrame, or bytes when return_raw=True
        
        Raises:
        ValueError: if dataset is not available locally
        """
    
    def list_datasets(self) -> List[str]:
        """Return list of locally available dataset names."""
    
    def __getattr__(self, dataset_name: str):
        """
        Access local datasets as attributes (e.g., local_data.stocks()).
        
        Raises:
        ValueError: if the dataset exists but is remote-only
            (e.g., local_data.github)
        """

Dataset Base Class

Individual dataset handler providing metadata and flexible loading options.

class Dataset:
    """
    Handler for a single dataset: carries its metadata and loads its content.

    Instances are what the loaders return on attribute access
    (e.g. ``data.iris``); calling the instance parses the data, while
    ``raw()`` returns the unparsed bytes.
    """
    # Class methods
    @classmethod
    def init(cls, name: str) -> 'Dataset':
        """Return an instance of appropriate Dataset subclass for the given name."""
    
    @classmethod
    def list_datasets(cls) -> List[str]:
        """Return list of all available dataset names."""
    
    @classmethod
    def list_local_datasets(cls) -> List[str]:
        """Return list of locally available dataset names."""
    
    # Instance methods
    def raw(self, use_local: bool = True) -> bytes:
        """
        Load raw dataset bytes.
        
        This is the way to read datasets whose format has no DataFrame
        parser (e.g. PNG).
        
        Parameters:
        - use_local: bool, if True and dataset is local, load from package
        
        Returns:
        bytes: raw dataset content
        """
    
    def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame:
        """
        Load and parse dataset.
        
        Parameters:
        - use_local: bool, prefer local data when available
        - **kwargs: passed to the pandas parser chosen for the dataset's format
          (read_csv for CSV, read_json for JSON, read_csv with a tab
          separator for TSV)
        
        Returns:
        pandas.DataFrame: parsed dataset
        
        Raises:
        ValueError: if the dataset's format cannot be parsed into a
            DataFrame (e.g. PNG); use raw() instead
        """
    
    # Properties
    @property
    def filepath(self) -> str:
        """Local file path (only valid for local datasets; check is_local first)."""
    
    # Instance attributes
    name: str                    # Dataset name
    methodname: str             # Method-safe name used for attribute access (hyphens -> underscores)
    filename: str               # Original filename
    url: str                    # Full remote URL
    format: str                 # File format ('csv', 'json', 'tsv', 'png')
    pkg_filename: str           # Path within package
    is_local: bool              # True if bundled locally
    description: str            # Dataset description
    references: List[str]       # Academic references

Usage Examples

Basic Dataset Loading

from vega_datasets import data

# Attribute access: each dataset is a callable on `data` that returns a DataFrame
iris_df = data.iris()
iris_shape = iris_df.shape
print(iris_shape)  # (150, 5)
iris_columns = list(iris_df.columns)
print(iris_columns)  # ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']

# String access: call the loader with the dataset name
cars_df = data('cars')
cars_preview = cars_df.head()
print(cars_preview)

# Extra keyword arguments are handed to the pandas parser
airport_columns = ['iata', 'name', 'city', 'state']
airports = data.airports(usecols=airport_columns)

Local vs Remote Loading

from vega_datasets import data, local_data

# use_local=False bypasses the bundled copy and fetches from the remote URL,
# even though iris ships with the package
iris_remote = data.iris(use_local=False)

# local_data only serves bundled datasets and raises ValueError for the rest.
# The two calls share one try block, so the failure on the second line jumps
# straight to the except clause after the first line has already succeeded.
try:
    stocks_local = local_data.stocks()  # Works - stocks is local
    github_local = local_data.github()  # Fails - github is remote-only
except ValueError as e:
    print(f"Error: {e}")

# is_local is read from the Dataset handler (no call, nothing is loaded)
print(f"Iris is local: {data.iris.is_local}")     # True
print(f"GitHub is local: {data.github.is_local}") # False

Raw Data Access

import json

from vega_datasets import data

# raw() skips pandas parsing and returns the file content as bytes
raw_data = data.iris.raw()
raw_type = type(raw_data)
print(raw_type)  # <class 'bytes'>

# Decode the bytes and parse them yourself when custom handling is needed
custom_data = json.loads(data.cars.raw().decode())

# The loader call offers the same bytes via return_raw
raw_csv = data('airports', return_raw=True)

Dataset Discovery

from vega_datasets import data, local_data

# Ask each loader which dataset names it can serve
all_datasets = data.list_datasets()
local_datasets = local_data.list_datasets()

# Size of the full catalogue (local and remote)
print(f"Total datasets: {len(all_datasets)}")  # 70

# Size of the subset bundled with the package
print(f"Local datasets: {len(local_datasets)}")  # 17

# Peek at the first few locally available names
first_local = local_datasets[:5]
print("Local datasets:", first_local)
# ['airports', 'anscombe', 'barley', 'burtin', 'cars']

# Use tab completion in interactive environments
# data.<TAB> shows all available datasets

Advanced Pandas Integration

from vega_datasets import data
import pandas as pd

# Extra keyword arguments are forwarded to the pandas parser used for the
# dataset's format, so they must be valid for that parser
# NOTE(review): confirm the catalogue has a dataset named `flights` (it may
# only ship size-suffixed variants) and that its parser accepts
# `parse_dates` — TODO verify
flights = data.flights(
    parse_dates=['date'],
    dtype={'origin': 'category', 'destination': 'category'}
)

# TSV datasets are read with a tab separator without any extra arguments
# NOTE(review): confirm seattle_temps is actually stored as TSV rather than
# CSV — TODO verify
seattle_temps = data.seattle_temps()  # Handles TSV automatically

# Parser-specific options only work when they match the dataset's format
# NOTE(review): `lines=True` is a read_json option; confirm github is a JSON
# dataset, otherwise this call will be rejected — TODO verify
github_data = data.github(lines=True)  # If supported by dataset format

Metadata Access

from vega_datasets import data

# Attribute access without parentheses yields the Dataset handler;
# nothing is loaded until it is called
iris_dataset = data.iris  # Get Dataset object (don't call yet)
print(f"Name: {iris_dataset.name}")
print(f"Format: {iris_dataset.format}")
print(f"URL: {iris_dataset.url}")
print(f"Local: {iris_dataset.is_local}")
print(f"Description: {iris_dataset.description}")

# filepath is only valid for bundled datasets, so guard it with is_local
if iris_dataset.is_local:
    print(f"Local path: {iris_dataset.filepath}")

Error Handling

from vega_datasets import data
from urllib.error import URLError

# An unknown dataset name is reported as ValueError
try:
    df = data('invalid-name')
except ValueError as e:
    print(f"Dataset error: {e}")

# use_local=False forces a download, so connectivity problems surface here
try:
    df = data.github(use_local=False)
except URLError as e:
    print(f"Network error: {e}")
    # Retry from the bundled copy only when one exists; a remote-only
    # dataset (is_local False) has nothing to fall back to, and df stays unset
    if data.github.is_local:
        df = data.github(use_local=True)

Connection Testing

# `data` and `local_data` are imported here so the snippet runs on its own
from vega_datasets import data, local_data
from vega_datasets.utils import connection_ok

# Check internet connectivity before loading remote datasets
if connection_ok():
    # github is remote-only, so it is only attempted when online
    github_data = data.github()
    print("Loaded remote dataset successfully")
else:
    # Offline: stay with the datasets bundled in the package
    print("No internet connection - using local datasets only")
    local_datasets = local_data.list_datasets()
    stocks_data = local_data.stocks()

Supported File Formats

The package automatically handles multiple data formats:

  • CSV: Comma-separated values (most common)
  • JSON: JavaScript Object Notation (nested data structures)
  • TSV: Tab-separated values (read with a tab delimiter automatically)
  • PNG: Portable Network Graphics (the 7zip dataset; available as raw bytes only)

Format detection is automatic based on dataset metadata, with appropriate pandas parsers used for each format.

Note: PNG datasets (such as 7zip) cannot be parsed into a DataFrame. Access them through the raw() method or with return_raw=True; loading them any other way raises a ValueError for the unsupported format.

Install with Tessl CLI

npx tessl i tessl/pypi-vega-datasets

docs

dataset-loading.md

index.md

specialized-datasets.md

tile.json