Vega-Altair: A declarative statistical visualization library for Python.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Data transformation, loading, and preprocessing functionality, including support for pandas DataFrames, CSV/JSON files, and various data formats. Altair provides a flexible data transformation pipeline that works with multiple data sources.
Registry system for managing different data transformation backends that convert various data formats into Vega-Lite compatible specifications.
class DataTransformerRegistry:
def enable(self, name, **kwargs):
"""Enable a data transformer by name."""
def disable(self):
"""Disable current data transformer."""
def register(self, name, func):
"""Register a new data transformer function."""
def get(self):
"""Get currently active data transformer."""
@property
def active(self):
"""Get name of active data transformer."""
def names(self):
"""Get list of available transformer names."""
# Global data transformers registry
data_transformers = DataTransformerRegistry()
Functions for converting data between different formats compatible with Vega-Lite.
def to_json(data, prefix='altair-data', extension='json', **kwargs):
"""
Convert data to JSON format.
Parameters:
- data: Input data (DataFrame, dict, list)
- prefix: Filename prefix for generated files
- extension: File extension to use
Returns:
dict: Vega-Lite data specification
"""
def to_csv(data, prefix='altair-data', extension='csv', **kwargs):
"""
Convert data to CSV format.
Parameters:
- data: Input data (DataFrame, dict, list)
- prefix: Filename prefix for generated files
- extension: File extension to use
Returns:
dict: Vega-Lite data specification with CSV URL
"""
def to_values(data):
"""
Convert data to inline values format.
Parameters:
- data: Input data (DataFrame, dict, list)
Returns:
dict: Vega-Lite data specification with inline values
"""
Functions for managing large datasets by limiting rows or sampling data.
def limit_rows(max_rows=5000):
"""
Data transformer that limits the number of rows.
Parameters:
- max_rows: Maximum number of rows to include
Returns:
Configured data transformer function
"""
def sample(n=None, frac=None):
"""
Sample random subset of data rows.
Parameters:
- n: Number of rows to sample
- frac: Fraction of rows to sample (0-1)
Returns:
Sampled data
"""
class MaxRowsError(Exception):
"""Exception raised when data exceeds maximum allowed rows."""
def default_data_transformer(data):
"""
Get the default data transformer function.
Parameters:
- data: Input data
Returns:
Transformed data specification
"""
Standalone functions for generating synthetic data sources.
def sequence(start, stop=None, step=None, as_='data'):
"""
Generate sequence of numbers as data source.
Parameters:
- start: Starting value
- stop: Ending value (exclusive)
- step: Step size (default 1)
- as_: Output field name
Returns:
SequenceGenerator: Sequence data specification
"""
def graticule(extent=None, extentMajor=None, extentMinor=None, step=None, stepMajor=None, stepMinor=None, precision=None):
"""
Generate graticule (geographic grid lines) as data source.
Parameters:
- extent: Overall extent [[x0, y0], [x1, y1]]
- extentMajor: Major line extent
- extentMinor: Minor line extent
- step: Overall step size [dx, dy]
- stepMajor: Major line step size
- stepMinor: Minor line step size
- precision: Line precision
Returns:
GraticuleGenerator: Graticule data specification
"""
def sphere():
"""
Generate sphere geometry as data source.
Returns:
SphereGenerator: Sphere data specification
"""
def topo_feature(topology, feature):
"""
Extract feature from TopoJSON topology.
Parameters:
- topology: TopoJSON topology object or URL
- feature: Feature name to extract
Returns:
dict: Data specification for extracted feature
"""
Support for various data input formats and sources.
# Inline data
class InlineData:
def __init__(self, values=None, format=None): ...
# URL-based data
class UrlData:
def __init__(self, url=None, format=None): ...
# Named datasets
class NamedData:
def __init__(self, name=None): ...
# Generated data
class SequenceGenerator:
def __init__(self, start=None, stop=None, step=None, as_=None): ...
class GraticuleGenerator:
def __init__(self, extent=None, extentMajor=None, extentMinor=None, step=None, stepMajor=None, stepMinor=None, precision=None): ...
class SphereGenerator:
def __init__(self): ...
Classes for specifying data parsing and formatting options.
class DataFormat:
def __init__(self, type=None, **kwargs): ...
class CsvDataFormat(DataFormat):
def __init__(self, parse=None, delimiter=None, **kwargs): ...
class JsonDataFormat(DataFormat):
def __init__(self, parse=None, property=None, **kwargs): ...
class TopoDataFormat(DataFormat):
def __init__(self, feature=None, mesh=None, **kwargs): ...
class DsvDataFormat(DataFormat):
def __init__(self, delimiter=None, parse=None, **kwargs): ...
import altair as alt
import pandas as pd
# From pandas DataFrame
df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
chart = alt.Chart(df).mark_point().encode(x='x', y='y')
# From URL
chart = alt.Chart('https://example.com/data.csv').mark_point().encode(x='x', y='y')
# From JSON file
chart = alt.Chart('data.json').mark_point().encode(x='x', y='y')
# Enable JSON data transformer
alt.data_transformers.enable('json')
# Enable data server (for large datasets)
alt.data_transformers.enable('data_server')
# Custom row limit
alt.data_transformers.enable('json', max_rows=10000)
# Check active transformer
print(alt.data_transformers.active)
import pandas as pd
# Large dataset
large_df = pd.DataFrame({
'x': range(100000),
'y': range(100000)
})
# Sample data for visualization
alt.data_transformers.enable('json', max_rows=5000)
chart = alt.Chart(large_df).mark_point().encode(x='x', y='y')
# Or manually sample
sampled_data = large_df.sample(n=1000)
chart = alt.Chart(sampled_data).mark_point().encode(x='x', y='y')
# Sequence data for mathematical functions
sequence_chart = alt.Chart(alt.sequence(1, 100)).mark_line().encode(
x='data:Q',
y=alt.expr('sin(datum.data * PI / 20)').title('sin(x)')
).properties(
title='Sine Wave from Generated Sequence'
)
# Multiple sequences for comparison
comparison_chart = alt.Chart(alt.sequence(0, 10, 0.1)).mark_line().encode(
x='data:Q'
).transform_calculate(
sin_val=alt.expr('sin(datum.data)'),
cos_val=alt.expr('cos(datum.data)')
).transform_fold(
['sin_val', 'cos_val'], as_=['function', 'value']
).encode(
y='value:Q',
color='function:N'
)
# Graticule for geographic maps
world_with_graticule = alt.layer(
alt.Chart(alt.sphere()).mark_geoshape(fill='lightblue'),
alt.Chart(alt.graticule()).mark_geoshape(
stroke='white',
strokeWidth=0.5,
fill=None
)
).resolve_scale(color='independent')
# Custom graticule spacing
custom_graticule = alt.Chart(
alt.graticule(step=[30, 30]) # 30-degree grid
).mark_geoshape(stroke='gray', strokeWidth=1)
# TopoJSON feature extraction
us_states = alt.Chart(
alt.topo_feature('https://vega.github.io/vega-datasets/data/us-10m.json', 'states')
).mark_geoshape().encode(
color=alt.value('steelblue'),
stroke=alt.value('white')
)
# CSV with custom parsing
chart = alt.Chart(
alt.UrlData(
url='data.csv',
format=alt.CsvDataFormat(parse={'date': 'date:%Y-%m-%d'})
)
).mark_line().encode(
x='date:T',
y='value:Q'
)
# JSON with property extraction
chart = alt.Chart(
alt.UrlData(
url='data.json',
format=alt.JsonDataFormat(property='results')
)
).mark_bar().encode(
x='category:N',
y='value:Q'
)
from typing import Union, Dict, Any, Optional, List, Callable
# Data source types
DataSource = Union[
pd.DataFrame,
str, # URL
Dict[str, Any], # Specification
List[Dict[str, Any]], # Inline values
InlineData,
UrlData,
NamedData,
SequenceGenerator,
GraticuleGenerator,
SphereGenerator
]
# Data transformer function type
DataTransformer = Callable[[Any], Dict[str, Any]]
# Parse specification
ParseDict = Dict[str, Union[str, None]]
# Format types
FormatType = Union['json', 'csv', 'tsv', 'dsv', 'topojson']
Install with Tessl CLI
npx tessl i tessl/pypi-altair