tessl/pypi-prov

A library for the W3C Provenance Data Model, supporting PROV-JSON, PROV-XML, and PROV-O (RDF)

—

Pending

Overview

Eval results

Files

Graph Analysis and Visualization

Name: tessl/pypi-prov
Author: tessl

Integration with NetworkX for graph analysis, plus DOT export for graphical rendering. PROV documents can be converted to and from NetworkX graphs for provenance analysis, and exported as DOT graphs for visual representation.

Capabilities

Graph Conversion Functions

Convert between PROV documents and NetworkX graph representations.

def prov_to_graph(prov_document):
    """
    Build a NetworkX MultiDiGraph from a PROV document.

    Args:
        prov_document (ProvDocument): The PROV document to convert.

    Returns:
        networkx.MultiDiGraph: The provenance expressed as a graph.

    Notes:
        - Each node stands for a PROV element (entity, activity, or agent).
        - Each edge stands for a PROV relationship.
        - PROV metadata is kept as node and edge attributes.
        - Because the result is a MultiDiGraph, the same pair of nodes may
          be joined by more than one edge.
    """

def graph_to_prov(g):
    """
    Rebuild a PROV document from a NetworkX MultiDiGraph.

    Args:
        g (networkx.MultiDiGraph): The graph to convert.

    Returns:
        ProvDocument: The PROV document reconstructed from the graph.

    Notes:
        - Node and edge attributes must be properly formatted.
        - The type of a node decides which PROV element it becomes.
        - The type of an edge decides which PROV relationship it becomes.
    """

DOT Visualization

Generate DOT format for graphical rendering with Graphviz.

def prov_to_dot(
    bundle,
    show_nary=True,
    use_labels=False,
    direction="BT",
    show_element_attributes=True,
    show_relation_attributes=True,
):
    """
    Produce a DOT graph of a PROV bundle for visualization.

    Args:
        bundle (ProvBundle): The PROV bundle to visualize.
        show_nary (bool): Draw n-ary relations as nodes. Defaults to True.
        use_labels (bool): Display labels in place of identifiers.
            Defaults to False.
        direction (str): Layout direction, one of "BT" (bottom-to-top),
            "TB" (top-to-bottom), "LR" (left-to-right) or "RL"
            (right-to-left). Defaults to "BT".
        show_element_attributes (bool): Include element attributes.
            Defaults to True.
        show_relation_attributes (bool): Include relation attributes.
            Defaults to True.

    Returns:
        pydot.Dot: A DOT graph object that can be rendered to various formats.

    Notes:
        - Rendering requires pydot and graphviz.
        - Output formats include PNG, SVG, PDF, etc.
    """

Bundle Plotting Method

Convenient plotting method available on ProvBundle objects.

class ProvBundle:
    def plot(
        self,
        filename=None,
        show_nary=True,
        use_labels=False,
        direction="BT",
    ):
        """
        Visualize this bundle.

        Args:
            filename (str, optional): Where to write the output; the format
                is inferred from the file extension.
            show_nary (bool): Draw n-ary relations as nodes.
            use_labels (bool): Display labels in place of identifiers.
            direction (str): Layout direction of the graph.

        Returns:
            A graph object that can be customized further.

        Notes:
            - When a filename is given, the visualization is saved to it.
            - When no filename is given, the graph object is returned for
              interactive use.
            - Supported formats include PNG, SVG, PDF, DOT, etc.
        """

Visualization Style Constants

Predefined styling for DOT graph elements.

# Generic node styling: a dict holding the base style shared by all nodes.
GENERIC_NODE_STYLE: dict
"""Base styling for all nodes."""

# PROV-specific DOT styling: a dict of per-PROV-element-type styles (colors and shapes).
# The "Custom Graph Styling" example imports it from prov.dot and iterates it with .items().
DOT_PROV_STYLE: dict
"""PROV element type specific styling including colors and shapes."""

# Annotation styling: a dict holding the style used for annotation elements.
ANNOTATION_STYLE: dict  
"""Styling for annotation elements."""

Usage Examples

Basic Graph Conversion

from prov.model import ProvDocument
from prov.graph import prov_to_graph, graph_to_prov
import networkx as nx

# Start from an empty PROV document with an example namespace
doc = ProvDocument()
doc.add_namespace('ex', 'http://example.org/')

# Declare one entity, one activity and one agent
entity1 = doc.entity('ex:entity1')
activity1 = doc.activity('ex:activity1')
agent1 = doc.agent('ex:agent1')

# Relate them with a generation and an association record
doc.generation(entity1, activity1)
doc.association(activity1, agent1)

# Turn the document into a NetworkX graph
graph = prov_to_graph(doc)

# Report basic size information
node_total = graph.number_of_nodes()
print(f"Nodes: {node_total}")
edge_total = graph.number_of_edges()
print(f"Edges: {edge_total}")
type_per_node = [graph.nodes[node].get('prov:type') for node in graph.nodes()]
print(f"Node types: {type_per_node}")

# Structural checks provided by NetworkX
acyclic = nx.is_directed_acyclic_graph(graph)
print(f"Is directed acyclic graph: {acyclic}")
weak_component_total = nx.number_weakly_connected_components(graph)
print(f"Weakly connected components: {weak_component_total}")

Advanced Graph Analysis

import networkx as nx

# Convert document to graph for analysis
graph = prov_to_graph(doc)

# Find paths between elements
# NOTE(review): this filter assumes prov_to_graph() stores a 'prov:type'
# attribute on each node — confirm against the prov.graph implementation.
entity_nodes = [n for n in graph.nodes() if graph.nodes[n].get('prov:type') == 'prov:Entity']
if len(entity_nodes) >= 2:
    source, target = entity_nodes[0], entity_nodes[1]
    # all_simple_paths() yields nothing for unconnected nodes instead of
    # raising NetworkXNoPath, so the "no path" case is detected by checking
    # for an empty result rather than by catching an exception.
    paths = list(nx.all_simple_paths(graph, source, target))
    if paths:
        print(f"Paths between entities: {len(paths)}")
    else:
        print("No path found between entities")

# Analyze centrality
centrality = nx.degree_centrality(graph)
if centrality:
    most_central = max(centrality, key=centrality.get)
    print(f"Most central node: {most_central} (centrality: {centrality[most_central]:.3f})")
else:
    # max() raises ValueError on an empty mapping, which happens for an empty graph
    print("Graph has no nodes; skipping centrality analysis")

# Find strongly connected components
scc = list(nx.strongly_connected_components(graph))
print(f"Strongly connected components: {len(scc)}")

DOT Visualization

from prov.dot import prov_to_dot

# Build the DOT graph with the default options
dot_graph = prov_to_dot(doc)

# Render it to several output formats
dot_graph.write_png('provenance.png')
dot_graph.write_svg('provenance.svg')
dot_graph.write_pdf('provenance.pdf')
dot_graph.write_dot('provenance.dot')

# Options for a simplified rendering, collected in one place
simple_view_options = dict(
    show_nary=False,                 # Hide n-ary relations
    use_labels=True,                 # Use labels instead of IDs
    direction="LR",                  # Left-to-right layout
    show_element_attributes=False,   # Hide element attrs
    show_relation_attributes=False,  # Hide relation attrs
)
custom_dot = prov_to_dot(doc, **simple_view_options)

custom_dot.write_png('provenance_simple.png')

Bundle Plotting Method

# Plot straight from the bundle; the output format follows the file extension
doc.plot(filename='visualization.png')                    # Save to PNG
doc.plot(filename='visualization.svg', direction="TB")    # Top-to-bottom layout
doc.plot(filename='visualization.pdf', use_labels=True)   # Use labels

# Without a filename, plot() hands back the graph object for interactive use
graph_obj = doc.plot()
# The returned graph object can then be adjusted and written out manually
graph_obj.set_bgcolor('lightgray')
graph_obj.write_png('custom_viz.png')

Working with Large Documents

# Large documents are easier to read when only a chosen bundle is plotted
large_doc = ProvDocument()
# ... populate with many records ...

# A dedicated bundle collects the subset to be visualized
analysis_bundle = large_doc.bundle('ex:analysis_subset')

# Copy over every record found for each identifier of interest.
# The generator is lazy, so each lookup still happens right before its
# records are added, exactly as in a nested loop.
entities_of_interest = ['ex:dataset1', 'ex:result1', 'ex:report1']
matching_records = (
    record
    for entity_id in entities_of_interest
    for record in large_doc.get_record(entity_id)
)
for record in matching_records:
    analysis_bundle.add_record(record)

# Plot only the subset
analysis_bundle.plot('analysis_subset.png')

Custom Graph Styling

from prov.dot import prov_to_dot, DOT_PROV_STYLE

# List the default style registered for each PROV type
print("Default PROV styling:")
for prov_type in DOT_PROV_STYLE:
    style = DOT_PROV_STYLE[prov_type]
    print(f"  {prov_type}: {style}")

# Start from the default DOT rendering of the document
dot_graph = prov_to_dot(doc)

# Override graph-level attributes on the returned object
dot_graph.set_bgcolor('white')
dot_graph.set_fontsize('12')
dot_graph.set_rankdir('TB')  # Top-to-bottom

# Write out the customized graph
dot_graph.write_svg('custom_styled.svg')

Graph Metrics and Analysis

# Work on the NetworkX representation of the document
graph = prov_to_graph(doc)

# Collect the metrics one by one, in the order they are reported
metrics = {}
metrics['nodes'] = graph.number_of_nodes()
metrics['edges'] = graph.number_of_edges()
metrics['density'] = nx.density(graph)
metrics['is_dag'] = nx.is_directed_acyclic_graph(graph)
metrics['weak_components'] = nx.number_weakly_connected_components(graph)
metrics['strong_components'] = nx.number_strongly_connected_components(graph)

print("Graph Metrics:")
for metric in metrics:
    value = metrics[metric]
    print(f"  {metric}: {value}")

# Tally how many nodes carry each 'prov:type' value ('unknown' when absent)
node_types = {}
for node in graph.nodes():
    prov_type = graph.nodes[node].get('prov:type', 'unknown')
    node_types.setdefault(prov_type, 0)
    node_types[prov_type] += 1

print("\nNode Type Distribution:")
for node_type in node_types:
    count = node_types[node_type]
    print(f"  {node_type}: {count}")

Round-trip Conversion

# Round trip: PROV document -> graph -> PROV document
original_doc = ProvDocument()
# ... create some PROV content ...

# Go through the graph representation and back again
graph = prov_to_graph(original_doc)
reconstructed_doc = graph_to_prov(graph)

# Report how many records each side holds
original_count = len(original_doc.records)
print(f"Original records: {original_count}")
reconstructed_count = len(reconstructed_doc.records)
print(f"Reconstructed records: {reconstructed_count}")

# Report whether the two documents compare equal
documents_match = original_doc == reconstructed_doc
print(f"Documents equal: {documents_match}")

Integration with Jupyter Notebooks

from IPython.display import Image, SVG
import tempfile
import os

def display_prov_graph(bundle, format='svg'):
    """Display PROV graph inline in Jupyter notebook.

    The bundle is plotted to a temporary file, the file is loaded into an
    IPython display object, and the temporary file is always removed
    afterwards, including when plotting fails.

    Args:
        bundle: Object exposing ``plot(filename, use_labels=...)``.
        format (str): File suffix / output format. ``'svg'`` and ``'png'``
            produce a display object; any other value only renders and
            discards the file.

    Returns:
        ``SVG`` for ``'svg'``, ``Image`` for ``'png'``, otherwise ``None``.
    """
    # Only reserve a unique path here. The handle is closed before plotting
    # so that plot() can write to the path without a competing open handle.
    with tempfile.NamedTemporaryFile(suffix=f'.{format}', delete=False) as tmp:
        tmp_path = tmp.name

    try:
        bundle.plot(tmp_path, use_labels=True)

        # IPython display objects built with filename= load the file
        # contents at construction time (Image embeds by default), so the
        # file can be deleted before the object is returned.
        if format == 'svg':
            return SVG(filename=tmp_path)
        if format == 'png':
            return Image(filename=tmp_path)
        return None
    finally:
        # Clean up on every path: success, unsupported format, or error.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            # plot() may already have removed the file; nothing left to do.
            pass

# In Jupyter notebook cell:
# display_prov_graph(doc)

Filtering and Subgraph Analysis

# Create subgraphs based on element types
graph = prov_to_graph(doc)

# NOTE(review): the filters below assume prov_to_graph() stores a
# 'prov:type' attribute on each node — confirm against prov.graph.

# Extract entity-only subgraph
entity_nodes = [n for n in graph.nodes()
                if graph.nodes[n].get('prov:type') == 'prov:Entity']
entity_subgraph = graph.subgraph(entity_nodes)

# Extract activity workflow
activity_nodes = [n for n in graph.nodes()
                  if graph.nodes[n].get('prov:type') == 'prov:Activity']
activity_subgraph = graph.subgraph(activity_nodes)

# Analyze workflows
if activity_subgraph.number_of_nodes() > 0:
    # dag_longest_path_length() is only defined for acyclic graphs and
    # raises on a cycle, so check first instead of letting the example crash.
    if nx.is_directed_acyclic_graph(activity_subgraph):
        workflow_length = nx.dag_longest_path_length(activity_subgraph)
        print(f"Longest workflow path: {workflow_length}")
    else:
        print("Activity subgraph contains a cycle; longest workflow path is undefined")

Install with Tessl CLI