CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-boost-histogram

The Boost::Histogram Python wrapper providing fast histogram implementations with full power and flexibility for scientific computing.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

numpy-integration.mddocs/

NumPy Integration

NumPy-compatible histogram functions that provide a familiar interface while leveraging boost-histogram's performance. They are intended as drop-in replacements for NumPy's histogram functions and additionally accept the keyword-only options `histogram`, `storage`, and `threads`.

Capabilities

1D Histogram Function

Drop-in replacement for numpy.histogram with enhanced performance and features.

def histogram(
    a,
    bins=10,
    range=None,
    weights=None,
    density=False,
    *,
    histogram=None,
    storage=None,
    threads=None
):
    """
    Compute the histogram of a one-dimensional dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or sequence, number of bins or explicit bin edges
    - range: tuple, (min, max) range for bins (ignored if bins is a sequence)
    - weights: array-like, one weight for each value in a
    - density: bool, if True normalize the result to a probability density
    - histogram: keyword-only; Histogram class to use for the return type
      (None returns numpy arrays)
    - storage: keyword-only; storage type (boost_histogram storage class)
    - threads: keyword-only; int, number of threads for parallel processing

    Returns:
    When histogram is None, a tuple of (values, edges) where:
    - values: histogram bin counts/densities
    - edges: bin edge array (length N+1 for N bins)
    When a Histogram class is passed as histogram, the result is returned as
    that type instead of numpy arrays.
    """

2D Histogram Function

Compute 2D histograms with high performance.

def histogram2d(
    x,
    y,
    bins=10,
    range=None,
    weights=None,
    density=False,
    *,
    histogram=None,
    storage=None,
    threads=None
):
    """
    Compute the 2D histogram of two datasets.

    Parameters:
    - x: array-like, x-coordinates of data points
    - y: array-like, y-coordinates of data points
    - bins: int or [int, int] or array-like, number of bins or bin edges for each dimension
    - range: array-like, [[xmin, xmax], [ymin, ymax]] ranges for bins
    - weights: array-like, one weight for each data point
    - density: bool, if True normalize the result to a probability density
    - histogram: keyword-only; Histogram class to use for the return type
      (None returns numpy arrays)
    - storage: keyword-only; storage type (boost_histogram storage class)
    - threads: keyword-only; int, number of threads for parallel processing

    Returns:
    When histogram is None, a tuple of (H, xedges, yedges) where:
    - H: 2D histogram array, shape (nx, ny)
    - xedges: x-axis bin edges (length nx+1)
    - yedges: y-axis bin edges (length ny+1)
    When a Histogram class is passed as histogram, the result is returned as
    that type instead of numpy arrays.
    """

N-Dimensional Histogram Function

General N-dimensional histogram computation.

def histogramdd(
    sample,
    bins=10,
    range=None,
    weights=None,
    density=False,
    *,
    histogram=None,
    storage=None,
    threads=None
):
    """
    Compute an N-dimensional histogram.

    Parameters:
    - sample: array-like, (N, D) array or sequence of D arrays for D-dimensional data
      NOTE(review): confirm against the installed boost-histogram version that a
      single (N, D) array is accepted; presumably a sequence of D arrays (one per
      dimension) is the reliably supported form.
    - bins: int or sequence, number of bins or bin edges for each dimension
    - range: sequence, [(min, max), ...] ranges for each dimension
    - weights: array-like, one weight for each sample point
    - density: bool, if True normalize the result to a probability density
    - histogram: keyword-only; Histogram class to use for the return type
      (None returns numpy arrays)
    - storage: keyword-only; storage type (boost_histogram storage class)
    - threads: keyword-only; int, number of threads for parallel processing

    Returns:
    When histogram is None, a tuple of (H, edges) where:
    - H: N-dimensional histogram array
    - edges: list of edge arrays, one for each dimension
    When a Histogram class is passed as histogram, the result is returned as
    that type instead of numpy arrays.
    """

Usage Examples

Basic 1D Histogram

import boost_histogram.numpy as bhnp
import numpy as np

# Generate sample data: 10,000 draws from a normal distribution (mean 0, std 1)
data = np.random.normal(0, 1, 10000)

# Basic histogram with 50 bins (same call shape as np.histogram)
counts, edges = bhnp.histogram(data, bins=50)

# With explicit range: 50 bins spanning (-3, 3)
counts, edges = bhnp.histogram(data, bins=50, range=(-3, 3))

# With custom bin edges: 41 edges from -4 to 4 define 40 bins
custom_edges = np.linspace(-4, 4, 41)  # 40 bins
counts, edges = bhnp.histogram(data, bins=custom_edges)

# Density histogram (normalized); the first return value holds densities
# instead of counts
density, edges = bhnp.histogram(data, bins=50, density=True)

Weighted Histograms

# Data with weights: 5,000 samples and one weight per sample
# (np and bhnp are the imports from the basic example)
data = np.random.exponential(1, 5000)
weights = np.random.uniform(0.5, 2.0, 5000)

# Weighted histogram: 30 bins over (0, 5)
counts, edges = bhnp.histogram(data, bins=30, weights=weights, range=(0, 5))

# Weighted density: same binning, normalized
density, edges = bhnp.histogram(data, bins=30, weights=weights, 
                               density=True, range=(0, 5))

High-Performance Options

# The storage classes live in the main package, so import it alongside the
# NumPy-style interface (np, bhnp and data come from the earlier examples).
import boost_histogram as bh

# Use specific storage for better performance
counts, edges = bhnp.histogram(
    data,
    bins=100,
    storage=bh.storage.AtomicInt64(),  # Thread-safe integer storage
    threads=4,  # Use 4 threads
)

# For very large datasets
large_data = np.random.random(50_000_000)
counts, edges = bhnp.histogram(
    large_data,
    bins=1000,
    # threads=0 lets boost-histogram pick the number of threads automatically.
    # The default, threads=None, does NOT enable threaded filling.
    threads=0,
)

2D Histograms

# Generate 2D data: y depends linearly on x plus independent noise
x = np.random.normal(0, 1, 10000)
y = 0.5 * x + np.random.normal(0, 0.8, 10000)

# Basic 2D histogram: 50 bins along each axis
H, xedges, yedges = bhnp.histogram2d(x, y, bins=50)

# With explicit ranges and different bin counts
H, xedges, yedges = bhnp.histogram2d(
    x, y,
    bins=[30, 40],  # 30 bins in x, 40 in y
    range=[[-3, 3], [-2, 2]]  # Explicit ranges: x first, then y
)

# Weighted 2D histogram: one weight per (x, y) point
weights = np.random.exponential(1, 10000)
H, xedges, yedges = bhnp.histogram2d(x, y, bins=40, weights=weights)

# 2D density (normalized)
H_density, xedges, yedges = bhnp.histogram2d(x, y, bins=50, density=True)

Multi-dimensional Histograms

# 3D data: three coordinate arrays of equal length
x = np.random.normal(0, 1, 5000)
y = np.random.normal(0, 1, 5000)
z = x + y + np.random.normal(0, 0.5, 5000)

# histogramdd takes one array per dimension, so group the coordinates as a
# sequence of D arrays. Passing an (N, D) array directly would be read as N
# separate dimensions, because the number of dimensions is taken from
# len(sample).
sample = (x, y, z)

# 3D histogram
H, edges = bhnp.histogramdd(sample, bins=20)
print(f"3D histogram shape: {H.shape}")  # (20, 20, 20)

# Different bins per dimension
H, edges = bhnp.histogramdd(sample, bins=[15, 20, 25])

# With ranges
H, edges = bhnp.histogramdd(
    sample,
    bins=15,
    range=[[-2, 2], [-2, 2], [-3, 3]],
)

# Alternative input format (any sequence of arrays, e.g. a list)
H, edges = bhnp.histogramdd([x, y, z], bins=20)

# Data already stored as an (N, D) array: split it into D column arrays first
stacked = np.column_stack([x, y, z])
H, edges = bhnp.histogramdd(tuple(stacked.T), bins=20)

Advanced Examples

import boost_histogram as bh
import boost_histogram.numpy as bhnp
import numpy as np  # needed for np.random and np.allclose below

# Compare with pure boost-histogram
data = np.random.gamma(2, 1, 100000)

# NumPy-style interface: 50 bins over (0, 10)
counts_np, edges_np = bhnp.histogram(data, bins=50, range=(0, 10))

# Equivalent boost-histogram approach: build the axis explicitly, then fill
hist_bh = bh.Histogram(bh.axis.Regular(50, 0, 10))
hist_bh.fill(data)
counts_bh = hist_bh.values()
edges_bh = hist_bh.axes[0].edges

# Results are equivalent (compared with a floating-point tolerance)
assert np.allclose(counts_np, counts_bh)
assert np.allclose(edges_np, edges_bh)

Integration with Scientific Stack

import matplotlib.pyplot as plt
import numpy as np  # needed for np.random and np.diff below
import boost_histogram.numpy as bhnp

# Generate and histogram data
data = np.random.beta(2, 5, 10000)
counts, edges = bhnp.histogram(data, bins=50, density=True)

# Plot with matplotlib: one bar per bin, centered on the bin and as wide as it
centers = (edges[:-1] + edges[1:]) / 2
plt.bar(centers, counts, width=np.diff(edges), alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Beta Distribution Histogram')
plt.show()

# For 2D plotting
x = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], 5000)
H, xedges, yedges = bhnp.histogram2d(x[:, 0], x[:, 1], bins=30)

# Plot 2D histogram. H has shape (nx, ny) while imshow draws rows along the
# vertical axis, so transpose to put x on the horizontal axis.
plt.imshow(H.T, origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar()
plt.xlabel('X')
plt.ylabel('Y')
plt.title('2D Histogram')
plt.show()

Performance Comparison

import time

import numpy as np
import boost_histogram.numpy as bhnp

# Large dataset for performance testing
large_data = np.random.normal(0, 1, 10_000_000)

# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring elapsed intervals. time.time() follows the system wall clock
# and can jump if that clock is adjusted mid-measurement.

# NumPy histogram
start = time.perf_counter()
np_counts, np_edges = np.histogram(large_data, bins=100)
np_time = time.perf_counter() - start

# boost-histogram NumPy interface
start = time.perf_counter()
bh_counts, bh_edges = bhnp.histogram(large_data, bins=100)
bh_time = time.perf_counter() - start

# boost-histogram with parallelism
start = time.perf_counter()
bh_parallel_counts, bh_parallel_edges = bhnp.histogram(
    large_data,
    bins=100,
    threads=4,
)
bh_parallel_time = time.perf_counter() - start

print(f"NumPy time: {np_time:.3f}s")
print(f"boost-histogram time: {bh_time:.3f}s")
print(f"boost-histogram (4 threads) time: {bh_parallel_time:.3f}s")
# The reported speedup compares NumPy against the 4-thread run
print(f"Speedup vs NumPy: {np_time/bh_parallel_time:.1f}x")

Custom Storage Integration

# Use advanced storage with NumPy interface
# (np, bh and bhnp are the imports from the earlier examples)
data = np.random.poisson(3, 50000).astype(float)
weights = np.random.exponential(1, 50000)

# Weighted histogram using Weight storage
# NOTE(review): with a non-default storage such as Weight, `counts` may come
# back as a structured view (value and variance per bin) rather than a plain
# float array — confirm against the installed boost-histogram version.
counts, edges = bhnp.histogram(
    data,
    bins=20,
    range=(0, 15),
    weights=weights,
    storage=bh.storage.Weight()
)

# Build an equivalent boost-histogram object directly (same axis, storage and
# weighted fill) so the per-bin variances can be read alongside the values.
# This is a separate histogram, not the one used internally by the call above.
hist = bh.Histogram(bh.axis.Regular(20, 0, 15), storage=bh.storage.Weight())
hist.fill(data, weight=weights)

values = hist.values()  # Bin values; expected to match the values from bhnp.histogram
variances = hist.variances()  # Additional variance information

print(f"Bin values: {values[:5]}")
print(f"Bin variances: {variances[:5]}")

Install with Tessl CLI

npx tessl i tessl/pypi-boost-histogram

docs

axes.md

histogram-core.md

index.md

indexing-operations.md

numpy-integration.md

storage-accumulators.md

tile.json