0
# Statistics and Analysis
1
2
Comprehensive statistical analysis functions and advanced analytical operations for process behavior, performance metrics, model analysis, and process intelligence. PM4PY provides both descriptive statistics and advanced analytical capabilities.
3
4
## Capabilities
5
6
### Basic Statistics
7
8
Fundamental statistical functions for extracting basic information from event logs.
9
10
```python { .api }
11
def get_start_activities(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
12
"""
13
Get start activities and their frequencies across all cases.
14
15
Parameters:
16
- log (Union[EventLog, pd.DataFrame]): Event log data
17
- activity_key (str): Activity attribute name
18
- timestamp_key (str): Timestamp attribute name
19
- case_id_key (str): Case ID attribute name
20
21
Returns:
22
Dict[str, int]: Start activities with their frequencies
23
"""
24
25
def get_end_activities(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
26
"""
27
Get end activities and their frequencies across all cases.
28
29
Parameters:
30
- log (Union[EventLog, pd.DataFrame]): Event log data
31
- activity_key (str): Activity attribute name
32
- timestamp_key (str): Timestamp attribute name
33
- case_id_key (str): Case ID attribute name
34
35
Returns:
36
Dict[str, int]: End activities with their frequencies
37
"""
38
39
def get_event_attributes(log):
40
"""
41
Get list of all event attribute names in the log.
42
43
Parameters:
44
- log (Union[EventLog, pd.DataFrame]): Event log data
45
46
Returns:
47
List[str]: List of event attribute names
48
"""
49
50
def get_event_attribute_values(log, attribute_key):
51
"""
52
Get all unique values for a specific event attribute.
53
54
Parameters:
55
- log (Union[EventLog, pd.DataFrame]): Event log data
56
- attribute_key (str): Attribute name to extract values for
57
58
Returns:
59
List[Any]: Unique values of the specified attribute
60
"""
61
62
def get_trace_attributes(log):
63
"""
64
Get list of all trace (case-level) attribute names.
65
66
Parameters:
67
- log (Union[EventLog, pd.DataFrame]): Event log data
68
69
Returns:
70
List[str]: List of trace attribute names
71
"""
72
73
def get_trace_attribute_values(log, attribute_key):
74
"""
75
Get all unique values for a specific trace attribute.
76
77
Parameters:
78
- log (Union[EventLog, pd.DataFrame]): Event log data
79
- attribute_key (str): Trace attribute name
80
81
Returns:
82
List[Any]: Unique values of the specified trace attribute
83
"""
84
```
85
86
### Variant Analysis
87
88
Analyze process variants (unique activity sequences) and their characteristics.
89
90
```python { .api }
91
def get_variants(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
92
"""
93
Get trace variants with their corresponding case IDs.
94
95
Parameters:
96
- log (Union[EventLog, pd.DataFrame]): Event log data
97
- activity_key (str): Activity attribute name
98
- timestamp_key (str): Timestamp attribute name
99
- case_id_key (str): Case ID attribute name
100
101
Returns:
102
Dict[Tuple[str, ...], List[str]]: Variants mapped to list of case IDs
103
"""
104
105
def get_variants_as_tuples(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
106
"""
107
Get variants as tuples with their frequencies.
108
109
Parameters:
110
- log (Union[EventLog, pd.DataFrame]): Event log data
111
- activity_key (str): Activity attribute name
112
- timestamp_key (str): Timestamp attribute name
113
- case_id_key (str): Case ID attribute name
114
115
Returns:
116
Dict[Tuple[str, ...], int]: Variants with their frequencies
117
"""
118
119
def split_by_process_variant(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
120
"""
121
Split log into separate logs by process variant.
122
123
Parameters:
124
- log (Union[EventLog, pd.DataFrame]): Event log data
125
- activity_key (str): Activity attribute name
126
- timestamp_key (str): Timestamp attribute name
127
- case_id_key (str): Case ID attribute name
128
129
Returns:
130
Dict[Tuple[str, ...], Union[EventLog, pd.DataFrame]]: Variants mapped to their sub-logs
131
"""
132
```
133
134
### Temporal Statistics
135
136
Analyze temporal patterns including case durations, arrival rates, and performance metrics.
137
138
```python { .api }
139
def get_case_arrival_average(log, timestamp_key='time:timestamp', case_id_key='case:concept:name'):
140
"""
141
Calculate the average time between the arrivals (start times) of two consecutive cases.
142
143
Parameters:
144
- log (Union[EventLog, pd.DataFrame]): Event log data
145
- timestamp_key (str): Timestamp attribute name
146
- case_id_key (str): Case ID attribute name
147
148
Returns:
149
float: Average inter-arrival time between consecutive cases, in seconds
150
"""
151
152
def get_all_case_durations(log, timestamp_key='time:timestamp', case_id_key='case:concept:name'):
153
"""
154
Get durations of all cases in the log.
155
156
Parameters:
157
- log (Union[EventLog, pd.DataFrame]): Event log data
158
- timestamp_key (str): Timestamp attribute name
159
- case_id_key (str): Case ID attribute name
160
161
Returns:
162
List[float]: List of case durations in seconds
163
"""
164
165
def get_case_duration(log, timestamp_key='time:timestamp', case_id_key='case:concept:name'):
166
"""
167
Calculate average case duration across all cases.
168
169
Parameters:
170
- log (Union[EventLog, pd.DataFrame]): Event log data
171
- timestamp_key (str): Timestamp attribute name
172
- case_id_key (str): Case ID attribute name
173
174
Returns:
175
float: Average case duration in seconds
176
"""
177
178
def get_cycle_time(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
179
"""
180
Calculate cycle time of the process (end-to-end duration).
181
182
Parameters:
183
- log (Union[EventLog, pd.DataFrame]): Event log data
184
- activity_key (str): Activity attribute name
185
- timestamp_key (str): Timestamp attribute name
186
- case_id_key (str): Case ID attribute name
187
188
Returns:
189
float: Average cycle time in seconds
190
"""
191
192
def get_service_time(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
193
"""
194
Calculate service time for each activity.
195
196
Parameters:
197
- log (Union[EventLog, pd.DataFrame]): Event log data
198
- activity_key (str): Activity attribute name
199
- timestamp_key (str): Timestamp attribute name
200
- case_id_key (str): Case ID attribute name
201
202
Returns:
203
Dict[str, float]: Service times per activity in seconds
204
"""
205
206
def get_variants_paths_duration(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
207
"""
208
Get durations for each variant path.
209
210
Parameters:
211
- log (Union[EventLog, pd.DataFrame]): Event log data
212
- activity_key (str): Activity attribute name
213
- timestamp_key (str): Timestamp attribute name
214
- case_id_key (str): Case ID attribute name
215
216
Returns:
217
Dict[Tuple[str, ...], List[float]]: Durations per variant
218
"""
219
```
220
221
### Advanced Statistics
222
223
Complex statistical analysis including loops, segments, and behavioral patterns.
224
225
```python { .api }
226
def get_minimum_self_distances(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
227
"""
228
Calculate minimum self-distances for activities (loop detection).
229
230
Parameters:
231
- log (Union[EventLog, pd.DataFrame]): Event log data
232
- activity_key (str): Activity attribute name
233
- timestamp_key (str): Timestamp attribute name
234
- case_id_key (str): Case ID attribute name
235
236
Returns:
237
Dict[str, int]: Minimum self-distances per activity
238
"""
239
240
def get_minimum_self_distance_witnesses(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
241
"""
242
Get witness traces for minimum self-distances.
243
244
Parameters:
245
- log (Union[EventLog, pd.DataFrame]): Event log data
246
- activity_key (str): Activity attribute name
247
- timestamp_key (str): Timestamp attribute name
248
- case_id_key (str): Case ID attribute name
249
250
Returns:
251
Dict[str, List[str]]: Witness cases per activity
252
"""
253
254
def get_frequent_trace_segments(log, min_length=2, max_length=5, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
255
"""
256
Extract frequent trace segments of specified lengths.
257
258
Parameters:
259
- log (Union[EventLog, pd.DataFrame]): Event log data
260
- min_length (int): Minimum segment length
261
- max_length (int): Maximum segment length
262
- activity_key (str): Activity attribute name
263
- timestamp_key (str): Timestamp attribute name
264
- case_id_key (str): Case ID attribute name
265
266
Returns:
267
Dict[Tuple[str, ...], int]: Frequent segments with frequencies
268
"""
269
270
def get_rework_cases_per_activity(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
271
"""
272
Get count of cases with rework per activity.
273
274
Parameters:
275
- log (Union[EventLog, pd.DataFrame]): Event log data
276
- activity_key (str): Activity attribute name
277
- timestamp_key (str): Timestamp attribute name
278
- case_id_key (str): Case ID attribute name
279
280
Returns:
281
Dict[str, int]: Rework cases count per activity
282
"""
283
284
def get_case_overlap(log, timestamp_key='time:timestamp', case_id_key='case:concept:name'):
285
"""
286
Calculate case overlap measure (parallel case execution).
287
288
Parameters:
289
- log (Union[EventLog, pd.DataFrame]): Event log data
290
- timestamp_key (str): Timestamp attribute name
291
- case_id_key (str): Case ID attribute name
292
293
Returns:
294
List[int]: For each case, the number of cases concurrently open during its execution
295
"""
296
297
def get_activity_position_summary(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
298
"""
299
Get position summary statistics for each activity.
300
301
Parameters:
302
- log (Union[EventLog, pd.DataFrame]): Event log data
303
- activity_key (str): Activity attribute name
304
- timestamp_key (str): Timestamp attribute name
305
- case_id_key (str): Case ID attribute name
306
307
Returns:
308
Dict[str, Dict[str, Any]]: Position statistics per activity
309
"""
310
```
311
312
### Stochastic Language
313
314
Generate probabilistic representations of process behavior.
315
316
```python { .api }
317
def get_stochastic_language(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
318
"""
319
Generate stochastic language from log or model.
320
Creates probabilistic representation of process behavior.
321
322
Parameters:
323
- log (Union[EventLog, pd.DataFrame]): Event log data
324
- activity_key (str): Activity attribute name
325
- timestamp_key (str): Timestamp attribute name
326
- case_id_key (str): Case ID attribute name
327
328
Returns:
329
Dict[Tuple[str, ...], float]: Traces (as tuples of activity names) mapped to their probabilities
330
"""
331
```
332
333
### Model Analysis
334
335
Advanced analytical functions for process model evaluation and manipulation.
336
337
```python { .api }
338
def check_soundness(petri_net, initial_marking, final_marking):
339
"""
340
Check if Petri net is sound (proper termination, no deadlocks).
341
342
Parameters:
343
- petri_net (PetriNet): Petri net model
344
- initial_marking (Marking): Initial marking
345
- final_marking (Marking): Final marking
346
347
Returns:
348
bool: True if the Petri net is sound
349
"""
350
351
def check_is_workflow_net(petri_net):
352
"""
353
Check if Petri net is a workflow net (single source, single sink).
354
355
Parameters:
356
- petri_net (PetriNet): Petri net model
357
358
Returns:
359
bool: True if it's a workflow net
360
"""
361
362
def simplicity_petri_net(petri_net):
363
"""
364
Calculate simplicity metric of Petri net.
365
366
Parameters:
367
- petri_net (PetriNet): Petri net model
368
369
Returns:
370
float: Simplicity value between 0 and 1
371
"""
372
```
373
374
### Mathematical Operations
375
376
Mathematical analysis functions for process models and languages.
377
378
```python { .api }
379
def compute_emd(language1, language2):
380
"""
381
Compute Earth Mover Distance between two stochastic languages.
382
383
Parameters:
384
- language1 (Dict): First stochastic language
385
- language2 (Dict): Second stochastic language
386
387
Returns:
388
float: Earth Mover Distance value
389
"""
390
391
def solve_marking_equation(petri_net, initial_marking, final_marking, cost_function=None):
392
"""
393
Solve marking equation for Petri net reachability.
394
395
Parameters:
396
- petri_net (PetriNet): Petri net model
397
- initial_marking (Marking): Initial marking
398
- final_marking (Marking): Target marking
399
- cost_function (Optional[Callable]): Cost function for optimization
400
401
Returns:
402
float: Solution cost or distance
403
"""
404
405
def solve_extended_marking_equation(petri_net, initial_marking, final_marking, **kwargs):
406
"""
407
Solve extended marking equation with additional constraints.
408
409
Parameters:
410
- petri_net (PetriNet): Petri net model
411
- initial_marking (Marking): Initial marking
412
- final_marking (Marking): Target marking
413
- **kwargs: Additional parameters and constraints
414
415
Returns:
416
Dict[str, Any]: Solution with detailed information
417
"""
418
```
419
420
### Similarity Analysis
421
422
Calculate similarity between models, logs, and process representations.
423
424
```python { .api }
425
def behavioral_similarity(model1, model2, **kwargs):
426
"""
427
Calculate behavioral similarity between two process models.
428
429
Parameters:
430
- model1 (Any): First process model
431
- model2 (Any): Second process model
432
- **kwargs: Similarity computation parameters
433
434
Returns:
435
float: Behavioral similarity score (0-1)
436
"""
437
438
def structural_similarity(model1, model2, **kwargs):
439
"""
440
Calculate structural similarity between two process models.
441
442
Parameters:
443
- model1 (Any): First process model
444
- model2 (Any): Second process model
445
- **kwargs: Similarity computation parameters
446
447
Returns:
448
float: Structural similarity score (0-1)
449
"""
450
451
def embeddings_similarity(log1, log2, **kwargs):
452
"""
453
Calculate embeddings-based similarity between event logs.
454
455
Parameters:
456
- log1 (Union[EventLog, pd.DataFrame]): First event log
457
- log2 (Union[EventLog, pd.DataFrame]): Second event log
458
- **kwargs: Embedding parameters
459
460
Returns:
461
float: Embeddings similarity score (0-1)
462
"""
463
464
def label_sets_similarity(model1, model2, **kwargs):
465
"""
466
Calculate label set similarity between models.
467
468
Parameters:
469
- model1 (Any): First process model
470
- model2 (Any): Second process model
471
- **kwargs: Similarity parameters
472
473
Returns:
474
float: Label set similarity score (0-1)
475
"""
476
```
477
478
### Utility Analysis Functions
479
480
Utility functions for model manipulation and analysis.
481
482
```python { .api }
483
def get_enabled_transitions(petri_net, marking):
484
"""
485
Get list of transitions enabled in specific marking.
486
487
Parameters:
488
- petri_net (PetriNet): Petri net model
489
- marking (Marking): Current marking
490
491
Returns:
492
List[PetriNet.Transition]: List of enabled transitions
493
"""
494
495
def get_activity_labels(model):
496
"""
497
Get set of activity labels from process model.
498
499
Parameters:
500
- model (Any): Process model (Petri net, process tree, etc.)
501
502
Returns:
503
Set[str]: Set of activity labels
504
"""
505
506
def replace_activity_labels(model, replacement_dict):
507
"""
508
Replace activity labels in process model.
509
510
Parameters:
511
- model (Any): Process model to modify
512
- replacement_dict (Dict[str, str]): Label replacement mapping
513
514
Returns:
515
Any: Modified process model
516
"""
517
518
def map_labels_from_second_model(model1, model2):
519
"""
520
Create label mapping between two models.
521
522
Parameters:
523
- model1 (Any): First process model
524
- model2 (Any): Second process model
525
526
Returns:
527
Dict[str, str]: Label mapping from model1 to model2
528
"""
529
```
530
531
## Usage Examples
532
533
### Basic Statistical Analysis
534
535
```python
536
import pm4py
537
538
# Load event log
539
log = pm4py.read_xes('event_log.xes')
540
541
# Basic statistics
542
start_activities = pm4py.get_start_activities(log)
543
end_activities = pm4py.get_end_activities(log)
544
545
print("Start Activities:")
546
for activity, count in sorted(start_activities.items(), key=lambda x: x[1], reverse=True):
547
print(f" {activity}: {count}")
548
549
print("End Activities:")
550
for activity, count in sorted(end_activities.items(), key=lambda x: x[1], reverse=True):
551
print(f" {activity}: {count}")
552
553
# Attribute analysis
554
event_attributes = pm4py.get_event_attributes(log)
555
trace_attributes = pm4py.get_trace_attributes(log)
556
557
print(f"Event attributes: {event_attributes}")
558
print(f"Trace attributes: {trace_attributes}")
559
```
560
561
### Variant Analysis
562
563
```python
564
import pm4py
565
566
# Get variants with frequencies
567
variants = pm4py.get_variants_as_tuples(log)
568
569
print(f"Total variants: {len(variants)}")
570
print("Top 10 variants:")
571
for variant, count in sorted(variants.items(), key=lambda x: x[1], reverse=True)[:10]:
572
print(f" {' -> '.join(variant)}: {count} cases")
573
574
# Split log by variants
575
variant_logs = pm4py.split_by_process_variant(log)
576
577
print("Variant analysis:")
578
for variant, sub_log in variant_logs.items():
579
case_count = len(sub_log)
580
avg_duration = pm4py.get_case_duration(sub_log)
581
print(f" Variant with {len(variant)} steps: {case_count} cases, avg duration: {avg_duration/3600:.1f} hours")
582
```
583
584
### Temporal Analysis
585
586
```python
587
import pm4py
588
589
# Case duration analysis
590
all_durations = pm4py.get_all_case_durations(log)
591
avg_duration = pm4py.get_case_duration(log)
592
cycle_time = pm4py.get_cycle_time(log)
593
594
print(f"Case Duration Statistics:")
595
print(f" Average: {avg_duration/3600:.1f} hours")
596
print(f" Cycle time: {cycle_time/3600:.1f} hours")
597
print(f" Min: {min(all_durations)/3600:.1f} hours")
598
print(f" Max: {max(all_durations)/3600:.1f} hours")
599
600
# Arrival rate analysis
601
arrival_rate = pm4py.get_case_arrival_average(log)
602
print(f" Arrival rate: {arrival_rate*3600:.1f} cases/hour")
603
604
# Service time analysis
605
service_times = pm4py.get_service_time(log)
606
print("Service Times:")
607
for activity, time in sorted(service_times.items(), key=lambda x: x[1], reverse=True):
608
print(f" {activity}: {time/60:.1f} minutes")
609
```
610
611
### Advanced Behavioral Analysis
612
613
```python
614
import pm4py
import pandas as pd
615
616
# Loop analysis
617
self_distances = pm4py.get_minimum_self_distances(log)
618
witnesses = pm4py.get_minimum_self_distance_witnesses(log)
619
620
print("Loop Analysis:")
621
for activity, distance in self_distances.items():
622
if distance > 1: # Activity can loop
623
print(f" {activity}: min distance {distance} (witness cases: {len(witnesses[activity])})")
624
625
# Rework analysis
626
rework_cases = pm4py.get_rework_cases_per_activity(log)
627
total_cases = len(set(log['case:concept:name']) if isinstance(log, pd.DataFrame) else log)
628
629
print("Rework Analysis:")
630
for activity, rework_count in rework_cases.items():
631
rework_percentage = (rework_count / total_cases) * 100
632
print(f" {activity}: {rework_count} cases ({rework_percentage:.1f}%)")
633
634
# Case overlap analysis
635
overlap = pm4py.get_case_overlap(log)  # list: per-case count of concurrently open cases
print(f"Avg concurrently open cases: {sum(overlap)/len(overlap):.1f}")
637
638
# Activity position analysis
639
position_summary = pm4py.get_activity_position_summary(log)
640
print("Activity Position Summary:")
641
for activity, stats in position_summary.items():
642
print(f" {activity}:")
643
print(f" Avg position: {stats['mean_position']:.1f}")
644
print(f" Position range: {stats['min_position']} - {stats['max_position']}")
645
```
646
647
### Frequent Pattern Mining
648
649
```python
650
import pm4py
651
652
# Find frequent trace segments
653
frequent_segments = pm4py.get_frequent_trace_segments(
654
log,
655
min_length=2,
656
max_length=4
657
)
658
659
print("Frequent Trace Segments:")
660
for segment, frequency in sorted(frequent_segments.items(), key=lambda x: x[1], reverse=True)[:20]:
661
print(f" {' -> '.join(segment)}: {frequency} occurrences")
662
663
# Variant duration analysis
664
variant_durations = pm4py.get_variants_paths_duration(log)
665
print("Variant Performance Analysis:")
666
for variant, durations in variant_durations.items():
667
if len(durations) >= 5: # Only variants with sufficient data
668
avg_duration = sum(durations) / len(durations)
669
print(f" {' -> '.join(variant[:3])}: {avg_duration/3600:.1f}h avg ({len(durations)} cases)")
670
```
671
672
### Model Quality Assessment
673
674
```python
675
import pm4py
676
677
# Discover model
678
net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(log)
679
680
# Check model properties
681
is_sound = pm4py.check_soundness(net, initial_marking, final_marking)
682
is_workflow = pm4py.check_is_workflow_net(net)
683
simplicity = pm4py.simplicity_petri_net(net)
684
685
print("Model Quality Assessment:")
686
print(f" Sound: {is_sound}")
687
print(f" Workflow net: {is_workflow}")
688
print(f" Simplicity: {simplicity:.3f}")
689
690
# Get model labels
691
activity_labels = pm4py.get_activity_labels(net)
692
print(f" Activities in model: {len(activity_labels)}")
693
print(f" Activity labels: {sorted(activity_labels)}")
694
695
# Check enabled transitions in initial marking
696
enabled = pm4py.get_enabled_transitions(net, initial_marking)
697
print(f" Initially enabled transitions: {len(enabled)}")
698
```
699
700
### Stochastic Language Analysis
701
702
```python
703
import pm4py
704
705
# Generate stochastic language
706
stochastic_lang = pm4py.get_stochastic_language(log)
707
708
print("Stochastic Language Analysis:")
709
print(f" Unique traces: {len(stochastic_lang)}")
710
print(f" Most probable traces:")
711
712
# Show top traces by probability
713
sorted_traces = sorted(stochastic_lang.items(), key=lambda x: x[1], reverse=True)[:10]
714
for trace, prob in sorted_traces:
715
trace_str = ' -> '.join(trace[:5]) # Limit length for display
716
if len(trace) > 5:
717
trace_str += "..."
718
print(f" {trace_str}: {prob:.4f}")
719
720
# Calculate entropy
721
import math
722
entropy = -sum(p * math.log2(p) for p in stochastic_lang.values() if p > 0)
723
print(f" Process entropy: {entropy:.3f} bits")
724
```
725
726
### Model Comparison and Similarity
727
728
```python
729
import pm4py
730
731
# Discover two different models
732
net1, im1, fm1 = pm4py.discover_petri_net_inductive(log)
733
net2, im2, fm2 = pm4py.discover_petri_net_heuristics(log)
734
735
# Calculate similarities
736
behavioral_sim = pm4py.behavioral_similarity(net1, net2)
737
structural_sim = pm4py.structural_similarity(net1, net2)
738
label_sim = pm4py.label_sets_similarity(net1, net2)
739
740
print("Model Similarity Analysis:")
741
print(f" Behavioral similarity: {behavioral_sim:.3f}")
742
print(f" Structural similarity: {structural_sim:.3f}")
743
print(f" Label set similarity: {label_sim:.3f}")
744
745
# Create label mapping
746
label_mapping = pm4py.map_labels_from_second_model(net1, net2)
747
print(f" Common labels: {len(label_mapping)}")
748
749
# Compare model languages
750
lang1 = pm4py.get_stochastic_language(log) # Would use model if available
751
lang2 = pm4py.get_stochastic_language(log) # Would use different model
752
753
# emd_distance = pm4py.compute_emd(lang1, lang2)
754
# print(f" Earth Mover Distance: {emd_distance:.3f}")
755
```
756
757
### Comprehensive Process Analysis Dashboard
758
759
```python
760
import pm4py
import pandas as pd
761
762
def comprehensive_process_analysis(log):
763
"""Generate comprehensive process analysis report."""
764
765
print("=" * 60)
766
print("COMPREHENSIVE PROCESS ANALYSIS REPORT")
767
print("=" * 60)
768
769
# Basic statistics
770
total_cases = len(set(log['case:concept:name']) if isinstance(log, pd.DataFrame) else log)
771
total_events = len(log)
772
773
print(f"Dataset Overview:")
774
print(f" Cases: {total_cases:,}")
775
print(f" Events: {total_events:,}")
776
print(f" Events per case: {total_events/total_cases:.1f}")
777
778
# Temporal analysis
779
durations = pm4py.get_all_case_durations(log)
780
avg_duration = sum(durations) / len(durations)
781
782
print(f"\nTemporal Analysis:")
783
print(f" Average case duration: {avg_duration/3600:.1f} hours")
784
print(f" Shortest case: {min(durations)/60:.1f} minutes")
785
print(f" Longest case: {max(durations)/3600:.1f} hours")
786
787
# Variant analysis
788
variants = pm4py.get_variants_as_tuples(log)
789
variant_coverage = sum(sorted(variants.values(), reverse=True)[:10]) / total_cases
790
791
print(f"\nVariant Analysis:")
792
print(f" Total variants: {len(variants)}")
793
print(f" Top 10 variants cover: {variant_coverage:.1%} of cases")
794
795
# Behavioral patterns
796
rework = pm4py.get_rework_cases_per_activity(log)
797
total_rework = sum(rework.values())
798
799
print(f"\nBehavioral Patterns:")
800
print(f" Cases with rework: {total_rework} ({total_rework/total_cases:.1%})")
801
802
# Process model quality
803
net, im, fm = pm4py.discover_petri_net_inductive(log)
804
fitness = pm4py.fitness_alignments(log, net, im, fm)
805
precision = pm4py.precision_alignments(log, net, im, fm)
806
807
print(f"\nProcess Model Quality:")
808
print(f" Fitness: {fitness['log_fitness']:.3f}")
809
print(f" Precision: {precision:.3f}")
810
print(f" Soundness: {pm4py.check_soundness(net, im, fm)}")
811
812
return {
813
'cases': total_cases,
814
'events': total_events,
815
'avg_duration': avg_duration,
816
'variants': len(variants),
817
'rework_rate': total_rework/total_cases,
818
'fitness': fitness['log_fitness'],
819
'precision': precision
820
}
821
822
# Run comprehensive analysis
823
analysis_results = comprehensive_process_analysis(log)
824
```