An open-source interactive data visualization library for Python
—
Sample datasets for learning and experimentation with plotly visualizations. The data module provides 10+ commonly used datasets in data science, returned as pandas DataFrames (or other backends if configured).
Classic datasets for machine learning and statistical analysis.
def iris():
    """
    Load the Iris flower dataset.

    Contains measurements of iris flowers from three species: setosa,
    versicolor, and virginica. Each sample has four features: sepal length,
    sepal width, petal length, and petal width.

    Returns:
        DataFrame: 150 rows × 6 columns
            - sepal_length: float, sepal length in cm
            - sepal_width: float, sepal width in cm
            - petal_length: float, petal length in cm
            - petal_width: float, petal width in cm
            - species: str, flower species ('setosa', 'versicolor', 'virginica')
            - species_id: int, numeric species identifier (0, 1, 2)
    """
def tips():
    """
    Load restaurant tips dataset.

    Contains information about restaurant bills, tips, and customer
    characteristics. Useful for exploring relationships between categorical
    and continuous variables.

    Returns:
        DataFrame: 244 rows × 7 columns
            - total_bill: float, total bill amount in dollars
            - tip: float, tip amount in dollars
            - sex: str, customer gender ('Male', 'Female')
            - smoker: str, smoking status ('Yes', 'No')
            - day: str, day of week ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
            - time: str, meal time ('Lunch', 'Dinner')
            - size: int, party size (number of people)
    """


# Datasets containing economic indicators and demographic information over time.
def gapminder():
    """
    Load Gapminder world development dataset.

    Contains country-level data on life expectancy, GDP per capita, and
    population from 1952 to 2007. Excellent for demonstrating animated
    visualizations and geographic mapping.

    Returns:
        DataFrame: 1704 rows × 8 columns
            - country: str, country name
            - continent: str, continent name
              ('Africa', 'Americas', 'Asia', 'Europe', 'Oceania')
            - year: int, year (1952, 1957, 1962, 1967, 1972, 1977, 1982,
              1987, 1992, 1997, 2002, 2007)
            - lifeExp: float, life expectancy in years
            - pop: int, population count
            - gdpPercap: float, GDP per capita in US dollars
            - iso_alpha: str, 3-letter ISO country code
            - iso_num: int, numeric ISO country code
    """
def medals_wide():
    """
    Load Olympic medals dataset in wide format.

    Contains medal counts by country for 2018 Winter Olympics, with separate
    columns for each medal type.

    Returns:
        DataFrame: 30 rows × 4 columns
            - nation: str, country name
            - gold: int, number of gold medals
            - silver: int, number of silver medals
            - bronze: int, number of bronze medals
    """
def medals_long():
    """
    Load Olympic medals dataset in long format.

    Same data as medals_wide but in tidy/long format with medal type as a
    variable.

    Returns:
        DataFrame: 90 rows × 3 columns
            - nation: str, country name
            - medal: str, medal type ('gold', 'silver', 'bronze')
            - count: int, number of medals of that type
    """


# Datasets with temporal components for time series analysis and visualization.
def stocks():
    """
    Load stock price dataset.

    Contains daily stock prices for major technology companies (AAPL, GOOGL,
    AMZN, FB, NFLX, MSFT) from 2018-2020. Useful for financial charts and
    time series analysis.

    Returns:
        DataFrame: 1560 rows × 7 columns
            - date: datetime, trading date
            - AAPL: float, Apple stock price
            - GOOGL: float, Google stock price
            - AMZN: float, Amazon stock price
            - FB: float, Facebook stock price
            - NFLX: float, Netflix stock price
            - MSFT: float, Microsoft stock price
    """
def flights():
    """
    Load airline passenger flights dataset.

    Contains monthly passenger counts for different airlines and airports.
    Good for demonstrating time series patterns and seasonal trends.

    Returns:
        DataFrame: 5733 rows × 4 columns
            - year: int, year
            - month: int, month (1-12)
            - passengers: int, number of passengers
            - airline: str, airline identifier
    """


# Datasets containing electoral and political information.
def election():
    """
    Load 2013 Montreal mayoral election results.

    Contains voting results by district with candidate vote shares and
    geographic information for choropleth mapping.

    Returns:
        DataFrame: 58 rows × 15 columns
            - district: int, electoral district number
            - Coderre: float, vote percentage for Denis Coderre
            - Bergeron: float, vote percentage for Richard Bergeron
            - Joly: float, vote percentage for Mélanie Joly
            - total: int, total votes cast
            - winner: str, winning candidate name
            - result: str, result type ('win', 'lose')
            - district_id: int, district identifier for mapping
            - ... additional demographic columns
    """
def election_geojson():
    """
    Load GeoJSON data for Montreal election districts.

    Geographic boundary data corresponding to the election dataset, used for
    creating choropleth maps.

    Returns:
        dict: GeoJSON feature collection with district boundaries
    """


# Datasets from scientific measurements and environmental monitoring.
def wind():
    """
    Load wind measurement dataset.

    Contains wind speed and direction measurements, useful for polar plots,
    wind roses, and meteorological visualizations.

    Returns:
        DataFrame: 128 rows × 4 columns
            - direction: str, wind direction
              ('N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW')
            - strength: str, wind strength category
              ('0-1', '1-2', '2-3', '3-4', '4-4+', '4-5', '5-6', '6+')
            - frequency: float, frequency of occurrence
            - magnitude: float, magnitude value for polar plotting
    """
def carshare():
    """
    Load car sharing usage dataset.

    Contains information about car sharing service usage patterns, including
    temporal and geographic distribution.

    Returns:
        DataFrame: 249 rows × 4 columns
            - centroid_lat: float, latitude of service area centroid
            - centroid_lon: float, longitude of service area centroid
            - car_hours: float, total car usage hours
            - member_birth_year: int, birth year of member
    """


# Datasets designed for statistical analysis and experimental design examples.
def experiment():
    """
    Load A/B testing experiment dataset.

    Contains results from a controlled experiment with treatment and control
    groups, useful for demonstrating statistical analysis and hypothesis
    testing.

    Returns:
        DataFrame: 100 rows × 4 columns
            - experiment_1: int, first experiment result
            - experiment_2: int, second experiment result
            - experiment_3: int, third experiment result
            - group: str, experimental group ('control', 'treatment')
    """


import plotly.express as px
import plotly.data as data

# Usage example: load each sample dataset and draw one representative figure.
# NOTE(review): the column names used below follow the dataset descriptions
# in this document — confirm them against the installed plotly version.

# Load and explore iris dataset
df_iris = data.iris()
print(df_iris.head())
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
df_iris.info()

# Create scatter plot with iris data
fig1 = px.scatter(
    df_iris,
    x="sepal_width",
    y="sepal_length",
    color="species",
    size="petal_length",
    title="Iris Dataset Visualization",
)
fig1.show()

# Load gapminder for animated visualization
df_gap = data.gapminder()
fig2 = px.scatter(
    df_gap,
    x="gdpPercap",
    y="lifeExp",
    animation_frame="year",
    animation_group="country",
    size="pop",
    color="continent",
    hover_name="country",
    log_x=True,
    size_max=55,
    range_x=[100, 100000],
    range_y=[25, 90],
    title="Gapminder Animation",
)
fig2.show()

# Stock price time series
df_stocks = data.stocks()
fig3 = px.line(
    df_stocks,
    x="date",
    y=["AAPL", "GOOGL", "AMZN"],
    title="Tech Stock Prices",
)
fig3.show()

# Tips dataset for statistical analysis
df_tips = data.tips()
fig4 = px.box(
    df_tips,
    x="day",
    y="total_bill",
    color="time",
    title="Restaurant Bills by Day and Time",
)
fig4.show()

# Wind data for polar visualization
df_wind = data.wind()
fig5 = px.bar_polar(
    df_wind,
    r="frequency",
    theta="direction",
    color="strength",
    template="plotly_dark",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    title="Wind Pattern Analysis",
)
fig5.show()

# Election data for choropleth mapping
df_election = data.election()
geojson = data.election_geojson()
fig6 = px.choropleth(
    df_election,
    geojson=geojson,
    locations="district",
    # Rows are matched to GeoJSON features through each feature's
    # properties.district value; the default key ("id") does not line up with
    # the "district" column, which would leave the map empty.
    featureidkey="properties.district",
    color="winner",
    hover_data=["Coderre", "Bergeron", "Joly"],
    title="Montreal Election Results",
)
# Zoom to the districts instead of showing the whole world map.
fig6.update_geos(fitbounds="locations", visible=False)
fig6.show()

# Car sharing geographic analysis
# NOTE: newer plotly releases provide px.scatter_map as the successor to
# px.scatter_mapbox; the Mapbox variant is kept here for compatibility.
df_cars = data.carshare()
fig7 = px.scatter_mapbox(
    df_cars,
    lat="centroid_lat",
    lon="centroid_lon",
    size="car_hours",
    color="member_birth_year",
    hover_data=["car_hours"],
    zoom=10,
    height=600,
    mapbox_style="open-street-map",
    title="Car Sharing Usage Patterns",
)
fig7.show()

# Olympic medals comparison
df_medals = data.medals_long()
fig8 = px.bar(
    df_medals,
    x="nation",
    y="count",
    color="medal",
    title="2018 Winter Olympics Medal Count",
)
fig8.show()

# Flight passenger trends
df_flights = data.flights()
fig9 = px.line(
    df_flights,
    x="month",
    y="passengers",
    color="airline",
    title="Airline Passenger Trends",
)
fig9.show()

# A/B testing results
df_experiment = data.experiment()
fig10 = px.box(
    df_experiment,
    y=["experiment_1", "experiment_2", "experiment_3"],
    color="group",
    title="A/B Testing Results",
)
fig10.show()

# Dataset information summary
# (election_geojson is omitted: it returns a dict, which has no .shape.)
datasets = [
    ('iris', data.iris),
    ('tips', data.tips),
    ('gapminder', data.gapminder),
    ('stocks', data.stocks),
    ('flights', data.flights),
    ('wind', data.wind),
    ('election', data.election),
    ('carshare', data.carshare),
    ('medals_long', data.medals_long),
    ('experiment', data.experiment),
]
for name, func in datasets:
    df = func()
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-plotly