A library for W3C Provenance Data Model supporting PROV-JSON, PROV-XML and PROV-O (RDF)
—
Integration with NetworkX for graph analysis and visualization capabilities, including conversion to/from graph formats and DOT export for graphical rendering. Enables provenance graph analysis and visual representation of PROV documents.
Convert between PROV documents and NetworkX graph representations.
def prov_to_graph(prov_document):
"""
Convert a PROV document to a NetworkX MultiDiGraph.
Args:
prov_document (ProvDocument): PROV document to convert
Returns:
networkx.MultiDiGraph: Graph representation of the provenance
Notes:
- Nodes represent PROV elements (entities, activities, agents)
- Edges represent PROV relationships
- Node and edge attributes preserve PROV metadata
- Multiple edges between same nodes are supported (MultiDiGraph)
"""
def graph_to_prov(g):
"""
Convert a NetworkX MultiDiGraph back to a PROV document.
Args:
g (networkx.MultiDiGraph): Graph to convert
Returns:
ProvDocument: PROV document reconstructed from graph
Notes:
- Requires properly formatted node and edge attributes
- Node types determine PROV element types
- Edge types determine PROV relationship types
"""

Generate DOT format for graphical rendering with Graphviz.
def prov_to_dot(bundle, show_nary=True, use_labels=False, direction="BT",
show_element_attributes=True, show_relation_attributes=True):
"""
Convert a PROV bundle to DOT graph format for visualization.
Args:
bundle (ProvBundle): PROV bundle to visualize
show_nary (bool): Show n-ary relations as nodes (default: True)
use_labels (bool): Use labels instead of identifiers (default: False)
direction (str): Graph direction - "BT", "TB", "LR", "RL" (default: "BT")
show_element_attributes (bool): Show element attributes (default: True)
show_relation_attributes (bool): Show relation attributes (default: True)
Returns:
pydot.Dot: DOT graph object that can be rendered to various formats
Notes:
- Requires pydot and graphviz for rendering
- Supports various output formats: PNG, SVG, PDF, etc.
- Direction: BT=bottom-to-top, TB=top-to-bottom, LR=left-to-right, RL=right-to-left
"""

Convenient plotting method available on ProvBundle objects.
class ProvBundle:
def plot(self, filename=None, show_nary=True, use_labels=False, direction="BT"):
"""
Create a visualization of this bundle.
Args:
filename (str, optional): Output filename (format inferred from extension)
show_nary (bool): Show n-ary relations as nodes
use_labels (bool): Use labels instead of identifiers
direction (str): Graph layout direction
Returns:
Graph object that can be further customized
Notes:
- If filename provided, saves to file
- If no filename, returns graph object for interactive use
- Supports formats: PNG, SVG, PDF, DOT, etc.
"""

Predefined styling for DOT graph elements.
# Generic node styling
GENERIC_NODE_STYLE: dict
"""Base styling for all nodes."""
# PROV-specific DOT styling
DOT_PROV_STYLE: dict
"""PROV element type specific styling including colors and shapes."""
# Annotation styling
ANNOTATION_STYLE: dict
"""Styling for annotation elements."""

from prov.model import ProvDocument
from prov.graph import prov_to_graph, graph_to_prov
import networkx as nx
# Create a PROV document
doc = ProvDocument()
doc.add_namespace('ex', 'http://example.org/')
entity1 = doc.entity('ex:entity1')
activity1 = doc.activity('ex:activity1')
agent1 = doc.agent('ex:agent1')
doc.generation(entity1, activity1)
doc.association(activity1, agent1)
# Convert to NetworkX graph
graph = prov_to_graph(doc)
# Analyze graph properties
print(f"Nodes: {graph.number_of_nodes()}")
print(f"Edges: {graph.number_of_edges()}")
print(f"Node types: {[graph.nodes[n].get('prov:type') for n in graph.nodes()]}")
# Graph analysis with NetworkX
print(f"Is directed acyclic graph: {nx.is_directed_acyclic_graph(graph)}")
print(f"Weakly connected components: {nx.number_weakly_connected_components(graph)}")

import networkx as nx
# Convert document to graph for analysis
graph = prov_to_graph(doc)
# Find paths between elements
# Collect the nodes tagged as PROV entities, then count the simple paths
# between the first two of them (if at least two exist).
entity_nodes = [n for n in graph.nodes() if graph.nodes[n].get('prov:type') == 'prov:Entity']
if len(entity_nodes) >= 2:
    # nx.all_simple_paths() yields nothing when no path exists; it does not
    # raise NetworkXNoPath, so the "no path" case is detected by checking
    # for an empty result rather than by catching an exception.
    paths = list(nx.all_simple_paths(graph, entity_nodes[0], entity_nodes[1]))
    if paths:
        print(f"Paths between entities: {len(paths)}")
    else:
        print("No path found between entities")
# Analyze centrality
centrality = nx.degree_centrality(graph)
most_central = max(centrality, key=centrality.get)
print(f"Most central node: {most_central} (centrality: {centrality[most_central]:.3f})")
# Find strongly connected components
scc = list(nx.strongly_connected_components(graph))
print(f"Strongly connected components: {len(scc)}")

from prov.dot import prov_to_dot
# Create DOT graph for visualization
dot_graph = prov_to_dot(doc)
# Save to various formats
dot_graph.write_png('provenance.png')
dot_graph.write_svg('provenance.svg')
dot_graph.write_pdf('provenance.pdf')
dot_graph.write_dot('provenance.dot')
# Custom visualization options
custom_dot = prov_to_dot(doc,
show_nary=False, # Hide n-ary relations
use_labels=True, # Use labels instead of IDs
direction="LR", # Left-to-right layout
show_element_attributes=False, # Hide element attrs
show_relation_attributes=False) # Hide relation attrs
custom_dot.write_png('provenance_simple.png')

# Direct plotting from bundle
doc.plot('visualization.png') # Save to PNG
doc.plot('visualization.svg', direction="TB") # Top-to-bottom layout
doc.plot('visualization.pdf', use_labels=True) # Use labels
# Interactive plotting (returns graph object)
graph_obj = doc.plot()
# Customize the returned graph object further
graph_obj.set_bgcolor('lightgray')
graph_obj.write_png('custom_viz.png')

# For large documents, visualize specific bundles
large_doc = ProvDocument()
# ... populate with many records ...
# Create bundle with subset of data
analysis_bundle = large_doc.bundle('ex:analysis_subset')
# Add only relevant records to bundle
entities_of_interest = ['ex:dataset1', 'ex:result1', 'ex:report1']
for entity_id in entities_of_interest:
records = large_doc.get_record(entity_id)
for record in records:
analysis_bundle.add_record(record)
# Visualize the subset
analysis_bundle.plot('analysis_subset.png')

from prov.dot import prov_to_dot, DOT_PROV_STYLE
# Examine default styling
print("Default PROV styling:")
for prov_type, style in DOT_PROV_STYLE.items():
print(f" {prov_type}: {style}")
# Create custom visualization with modified styling
dot_graph = prov_to_dot(doc)
# Customize graph attributes
dot_graph.set_bgcolor('white')
dot_graph.set_fontsize('12')
dot_graph.set_rankdir('TB') # Top-to-bottom
# Save customized version
dot_graph.write_svg('custom_styled.svg')

# Convert to graph for detailed analysis
graph = prov_to_graph(doc)
# Calculate various graph metrics
metrics = {
'nodes': graph.number_of_nodes(),
'edges': graph.number_of_edges(),
'density': nx.density(graph),
'is_dag': nx.is_directed_acyclic_graph(graph),
'weak_components': nx.number_weakly_connected_components(graph),
'strong_components': nx.number_strongly_connected_components(graph)
}
print("Graph Metrics:")
for metric, value in metrics.items():
print(f" {metric}: {value}")
# Analyze node types
node_types = {}
for node in graph.nodes():
prov_type = graph.nodes[node].get('prov:type', 'unknown')
node_types[prov_type] = node_types.get(prov_type, 0) + 1
print("\nNode Type Distribution:")
for node_type, count in node_types.items():
print(f" {node_type}: {count}")

# Test round-trip conversion (PROV -> Graph -> PROV)
original_doc = ProvDocument()
# ... create some PROV content ...
# Convert to graph and back
graph = prov_to_graph(original_doc)
reconstructed_doc = graph_to_prov(graph)
# Compare documents
print(f"Original records: {len(original_doc.records)}")
print(f"Reconstructed records: {len(reconstructed_doc.records)}")
# Check if documents are equivalent
print(f"Documents equal: {original_doc == reconstructed_doc}")

from IPython.display import Image, SVG
import tempfile
import os
def display_prov_graph(bundle, format='svg'):
    """Display PROV graph inline in Jupyter notebook.

    The bundle is rendered to a temporary file via ``bundle.plot`` and the
    result is wrapped in an IPython display object. The temporary file is
    always removed before returning, including when plotting fails.

    Args:
        bundle: Object exposing ``plot(filename, use_labels=...)``.
        format (str): Output format; ``'svg'`` and ``'png'`` are displayable.

    Returns:
        An ``SVG`` object for ``'svg'``, an ``Image`` object for ``'png'``,
        or ``None`` for any other format.
    """
    # Reserve a unique path, then close the handle straight away so the
    # plotting backend can reopen and write the file itself.
    with tempfile.NamedTemporaryFile(suffix=f'.{format}', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        bundle.plot(tmp_path, use_labels=True)
        # NOTE(review): relies on the display objects loading the file
        # contents when constructed, so the file can be deleted afterwards.
        if format == 'svg':
            return SVG(filename=tmp_path)
        elif format == 'png':
            return Image(filename=tmp_path)
        return None
    finally:
        # Clean up on every exit path; previously this sat after the
        # return statements and never ran for 'svg' or 'png'.
        os.unlink(tmp_path)
# In Jupyter notebook cell:
# display_prov_graph(doc)

# Create subgraphs based on element types
graph = prov_to_graph(doc)
# Extract entity-only subgraph
entity_nodes = [n for n in graph.nodes()
if graph.nodes[n].get('prov:type') == 'prov:Entity']
entity_subgraph = graph.subgraph(entity_nodes)
# Extract activity workflow
activity_nodes = [n for n in graph.nodes()
if graph.nodes[n].get('prov:type') == 'prov:Activity']
activity_subgraph = graph.subgraph(activity_nodes)
# Analyze workflows
if activity_subgraph.number_of_nodes() > 0:
workflow_length = nx.dag_longest_path_length(activity_subgraph)
print(f"Longest workflow path: {workflow_length}")

Install with Tessl CLI
npx tessl i tessl/pypi-prov