CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pm4py

Process mining library for discovering, analyzing and visualizing business processes from event data

Pending
Overview
Eval results
Files

docs/filtering.md

Filtering Operations

Comprehensive filtering capabilities for event logs and Object-Centric Event Logs (OCEL). PM4Py provides behavioral, temporal, organizational, and structural filters to preprocess data and focus analysis on specific aspects of process behavior.

Capabilities

Event and Case Filtering

Filter events and cases based on attribute values and occurrence patterns.

def filter_log_relative_occurrence_event_attribute(log, min_relative_stake, attribute_key='concept:name', level='cases', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Filter the log using the relative occurrence of an event attribute's values.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - min_relative_stake (float): Lowest accepted relative occurrence, from 0.0 to 1.0
    - attribute_key (str): Event attribute the filter is applied to
    - level (str): Level at which filtering happens ('cases' or 'events')
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_start_activities(log, activities, retain=True, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Select cases according to the activity they start with.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activities (List[str]): Start activities the filter matches against
    - retain (bool): Keep the matching cases when True, drop them when False
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_end_activities(log, activities, retain=True, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Select cases according to the activity they end with.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activities (List[str]): End activities the filter matches against
    - retain (bool): Keep the matching cases when True, drop them when False
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_event_attribute_values(log, attribute_values, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', retain=True):
    """
    Select events according to the values of an attribute.

    NOTE(review): this signature has no parameter naming the attribute to
    compare; upstream pm4py appears to take an `attribute_key` argument —
    confirm against the installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - attribute_values (List[Any]): Values the filter matches against
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - retain (bool): Keep the matching events when True, drop them when False

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_trace_attribute_values(log, attribute_values, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', retain=True):
    """
    Select traces according to the values of an attribute.

    NOTE(review): this signature has no parameter naming the attribute to
    compare; upstream pm4py appears to take an `attribute_key` argument —
    confirm against the installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - attribute_values (List[Any]): Values the filter matches against
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - retain (bool): Keep the matching traces when True, drop them when False

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

Behavioral Filtering

Filter based on process behavior patterns including variants and activity relationships.

def filter_variants(log, variants, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', retain=True):
    """
    Select cases according to their variant, i.e. their sequence of activities.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - variants (List[Tuple[str, ...]]): Variants the filter matches against
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - retain (bool): Keep the matching variants when True, drop them when False

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_variants_by_coverage_percentage(log, percentage, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Retain the variants that cover the given percentage of cases.

    NOTE(review): upstream pm4py appears to keep every variant whose own share
    of cases is at least the threshold (argument `min_coverage_percentage`),
    rather than the smallest set of variants that jointly reaches it — confirm
    which reading applies before choosing a value.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - percentage (float): Coverage percentage, from 0.0 to 1.0
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_variants_top_k(log, k, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Retain only the k variants that occur most frequently.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - k (int): How many of the top variants are kept
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_directly_follows_relation(log, relations, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', retain=True):
    """
    Select cases according to directly-follows relations between activities.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - relations (List[Tuple[str, str]]): Directly-follows relations to look for
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - retain (bool): Keep cases with the relations when True, drop them when False

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_eventually_follows_relation(log, relations, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', retain=True):
    """
    Select cases according to eventually-follows relations between activities.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - relations (List[Tuple[str, str]]): Eventually-follows relations to look for
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - retain (bool): Keep cases with the relations when True, drop them when False

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

Time-Based Filtering

Filter events and cases based on temporal criteria and performance metrics.

def filter_time_range(log, dt1, dt2, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Retain the events that fall inside a given time range.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - dt1 (datetime): Beginning of the time range
    - dt2 (datetime): End of the time range
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_case_performance(log, min_performance, max_performance, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Select cases whose performance (duration) lies within the given thresholds.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - min_performance (float): Shortest accepted case duration, in seconds
    - max_performance (float): Longest accepted case duration, in seconds
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

Structural Filtering

Filter based on structural properties like case size and activity patterns.

def filter_case_size(log, min_size, max_size, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Select cases by their size, i.e. how many events they contain.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - min_size (int): Fewest events a case may have
    - max_size (int): Most events a case may have
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_between(log, activity1, activity2, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Retain the events occurring between two given activities.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activity1 (str): Activity marking the start of the span
    - activity2 (str): Activity marking the end of the span
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_activities_rework(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', min_occurrences=2):
    """
    Select the cases that show rework, meaning activities that are repeated.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - min_occurrences (int): Number of occurrences from which an activity counts as rework

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_paths_performance(log, paths, min_performance, max_performance, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Filter the log on the performance measured along given activity paths.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - paths (List[Tuple[str, str]]): Activity paths whose performance is measured
    - min_performance (float): Lowest accepted path performance, in seconds
    - max_performance (float): Highest accepted path performance, in seconds
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

Trace Segment Filtering

Extract specific segments of traces for focused analysis.

def filter_prefixes(log, length, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Cut every trace down to a prefix of the requested length.

    NOTE(review): upstream pm4py appears to cut prefixes at a named `activity`
    rather than at a numeric length — confirm against the installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - length (int): Prefix length to extract
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log reduced to the prefixes
    """

def filter_suffixes(log, length, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Cut every trace down to a suffix of the requested length.

    NOTE(review): upstream pm4py appears to cut suffixes at a named `activity`
    rather than at a numeric length — confirm against the installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - length (int): Suffix length to extract
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log reduced to the suffixes
    """

def filter_trace_segments(log, min_prefix_length, max_prefix_length, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name'):
    """
    Extract the segments of each trace lying between the given lengths.

    NOTE(review): upstream pm4py appears to select segments by a list of
    admitted activity patterns instead of two lengths — confirm against the
    installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - min_prefix_length (int): Smallest prefix length
    - max_prefix_length (int): Largest prefix length
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log reduced to the segments
    """

Organizational Filtering

Filter based on organizational patterns and resource behavior.

def filter_four_eyes_principle(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', resource_key='org:resource'):
    """
    Select cases that break the four-eyes principle, i.e. where the same
    resource performs the critical activities.

    NOTE(review): the activities being checked are not parameters of this
    signature; upstream pm4py appears to take `activity1` and `activity2` —
    confirm against the installed release.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - resource_key (str): Name of the resource attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

def filter_activity_done_different_resources(log, activity, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', resource_key='org:resource'):
    """
    Select cases in which the given activity is carried out by different resources.

    Parameters:
    - log (Union[EventLog, pd.DataFrame]): Event log to filter
    - activity (str): Activity inspected for resource diversity
    - activity_key (str): Name of the activity attribute
    - timestamp_key (str): Name of the timestamp attribute
    - case_id_key (str): Name of the case identifier attribute
    - resource_key (str): Name of the resource attribute

    Returns:
    Union[EventLog, pd.DataFrame]: The event log after filtering
    """

OCEL Filtering

Specialized filtering operations for Object-Centric Event Logs.

def filter_ocel_event_attribute(ocel, attribute_key, attribute_values):
    """
    Select the events of an OCEL according to the values of an event attribute.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - attribute_key (str): Event attribute the filter is applied to
    - attribute_values (List[Any]): Values that are kept

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_object_attribute(ocel, attribute_key, attribute_values):
    """
    Select the objects of an OCEL according to the values of an object attribute.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - attribute_key (str): Object attribute the filter is applied to
    - attribute_values (List[Any]): Values that are kept

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_object_types_allowed_activities(ocel, object_types_allowed_activities):
    """
    Filter an OCEL using the activities allowed for each object type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_types_allowed_activities (Dict[str, List[str]]): For each object type, its allowed activities

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_object_per_type_count(ocel, object_type_count):
    """
    Filter an OCEL on the number of objects of each type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_type_count (Dict[str, Tuple[int, int]]): For each object type, the minimum and maximum object count

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_start_events_per_object_type(ocel, start_events):
    """
    Filter an OCEL on the start events of each object type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - start_events (Dict[str, List[str]]): For each object type, its start events

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_end_events_per_object_type(ocel, end_events):
    """
    Filter an OCEL on the end events of each object type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - end_events (Dict[str, List[str]]): For each object type, its end events

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_events_timestamp(ocel, timestamp_from, timestamp_to):
    """
    Select the events of an OCEL that fall inside a timestamp range.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - timestamp_from (datetime): Timestamp at which the range starts
    - timestamp_to (datetime): Timestamp at which the range ends

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_events(ocel, event_ids):
    """
    Filter an OCEL down to the events with the given identifiers.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - event_ids (List[str]): Identifiers of the events that are kept

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_objects(ocel, object_ids):
    """
    Filter an OCEL down to the objects with the given identifiers.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_ids (List[str]): Identifiers of the objects that are kept

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_object_types(ocel, object_types):
    """
    Filter an OCEL down to the given object types.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_types (List[str]): Object types that are kept

    Returns:
    OCEL: The object-centric event log after filtering
    """

OCEL Connected Component Filtering

Filter OCEL based on connected component analysis.

def filter_ocel_cc_object(ocel, object_id):
    """
    Filter an OCEL down to the connected component that contains a given object.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_id (str): Identifier of the object whose connected component is wanted

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_cc_length(ocel, min_length, max_length):
    """
    Filter an OCEL on the length of its connected components.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - min_length (int): Smallest accepted component length
    - max_length (int): Largest accepted component length

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_cc_otype(ocel, object_type):
    """
    Filter an OCEL on connected components that contain a given object type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_type (str): Object type the components are filtered by

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_cc_activity(ocel, activity):
    """
    Filter an OCEL on connected components that contain a given activity.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - activity (str): Activity the components are filtered by

    Returns:
    OCEL: The object-centric event log after filtering
    """

def filter_ocel_activities_connected_object_type(ocel, object_type):
    """
    Filter the activities of an OCEL that are connected to a given object type.

    Parameters:
    - ocel (OCEL): Object-centric event log to filter
    - object_type (str): Object type whose connected activities are filtered

    Returns:
    OCEL: The object-centric event log after filtering
    """

DFG Filtering

Filter Directly-Follows Graphs based on activity and path frequencies.

def filter_dfg_activities_percentage(dfg, start_activities, end_activities, percentage):
    """
    Filter a directly-follows graph with a percentage threshold on activities.

    Parameters:
    - dfg (dict): The directly-follows graph
    - start_activities (dict): Start activities with their frequencies
    - end_activities (dict): End activities with their frequencies
    - percentage (float): Threshold percentage, from 0.0 to 1.0

    Returns:
    Tuple[dict, dict, dict]: The filtered (dfg, start_activities, end_activities)
    """

def filter_dfg_paths_percentage(dfg, start_activities, end_activities, percentage):
    """
    Filter a directly-follows graph with a percentage threshold on paths.

    Parameters:
    - dfg (dict): The directly-follows graph
    - start_activities (dict): Start activities with their frequencies
    - end_activities (dict): End activities with their frequencies
    - percentage (float): Threshold percentage, from 0.0 to 1.0

    Returns:
    Tuple[dict, dict, dict]: The filtered (dfg, start_activities, end_activities)
    """

Usage Examples

Basic Filtering Operations

import pm4py

# Read the event log from an XES file
log = pm4py.read_xes('event_log.xes')

# Retain only the 10 variants that occur most often
top_k = 10
filtered_log = pm4py.filter_variants_top_k(log, top_k)

# Keep cases whose start activity is one of the listed ones
allowed_starts = ['Start Process', 'Initialize']
filtered_log = pm4py.filter_start_activities(log, allowed_starts)

# Keep cases lasting between one hour and one week (durations in seconds)
one_hour = 60 * 60
one_week = 7 * 24 * one_hour
filtered_log = pm4py.filter_case_performance(log, one_hour, one_week)

# Keep cases made of 5 to 50 events
min_events, max_events = 5, 50
filtered_log = pm4py.filter_case_size(log, min_events, max_events)

Advanced Behavioral Filtering

import pm4py

# Select cases that contain the listed directly-follows relations
relations = [
    ('Submit Application', 'Review Application'),
    ('Review Application', 'Make Decision'),
]
filtered_log = pm4py.filter_directly_follows_relation(log, relations, retain=True)

# Retain variants by coverage, with the threshold set to 80% of cases
coverage = 0.8
filtered_log = pm4py.filter_variants_by_coverage_percentage(log, coverage)

# Select cases showing rework (an activity occurring at least twice)
rework_log = pm4py.filter_activities_rework(log, min_occurrences=2)

Time-Based Filtering

import pm4py
from datetime import datetime

# Retain the events that fall inside calendar year 2023
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
time_filtered_log = pm4py.filter_time_range(log, start_date, end_date)

# Filter on the performance of selected paths (durations in seconds)
paths = [('Submit', 'Approve'), ('Review', 'Decision')]
one_hour = 3600
one_day = 86400
perf_filtered_log = pm4py.filter_paths_performance(
    log,
    paths,
    min_performance=one_hour,
    max_performance=one_day,
)

Trace Segment Analysis

import pm4py

# Prefixes of length 5, e.g. as input for predictive modeling
prefix_length = 5
prefixes = pm4py.filter_prefixes(log, prefix_length)

# Suffixes of length 3
suffix_length = 3
suffixes = pm4py.filter_suffixes(log, suffix_length)

# Segments between positions 2 and 8
first_position, last_position = 2, 8
segments = pm4py.filter_trace_segments(log, first_position, last_position)

Organizational Filtering

import pm4py

# Select the cases that break the four-eyes principle
violations = pm4py.filter_four_eyes_principle(log)

# Select the cases in which different resources perform the 'Approval' activity
checked_activity = 'Approval'
diverse_approval = pm4py.filter_activity_done_different_resources(log, checked_activity)

OCEL Filtering

from datetime import datetime

import pm4py

# Read the object-centric event log
ocel = pm4py.read_ocel('ocel_data.csv')

# Restrict the log to selected object types
kept_types = ['Order', 'Invoice']
filtered_ocel = pm4py.filter_ocel_object_types(ocel, kept_types)

# Restrict the events to a timestamp range
start_time = datetime(2023, 1, 1)
end_time = datetime(2023, 6, 30)
time_filtered_ocel = pm4py.filter_ocel_events_timestamp(ocel, start_time, end_time)

# Restrict by connected component length
cc_filtered_ocel = pm4py.filter_ocel_cc_length(ocel, min_length=10, max_length=100)

# Restrict each object type to its allowed activities
constraints = dict(
    Order=['Create Order', 'Process Payment', 'Ship Order'],
    Product=['Add to Cart', 'Remove from Cart', 'Purchase'],
)
constrained_ocel = pm4py.filter_ocel_object_types_allowed_activities(ocel, constraints)

Combining Multiple Filters

import pm4py

def create_analysis_subset(log):
    """
    Narrow *log* down to a focused subset for detailed analysis.

    The filters run in this order: variant coverage (0.9), case size
    (3 to 30 events), case duration (3600 to 2592000 seconds) and finally
    the start activity of each case.
    """
    min_events, max_events = 3, 30
    shortest, longest = 3600, 2592000  # one hour and thirty days, in seconds
    allowed_starts = ['Register', 'Submit Application', 'Create Order']

    subset = pm4py.filter_variants_by_coverage_percentage(log, 0.9)
    subset = pm4py.filter_case_size(subset, min_events, max_events)
    subset = pm4py.filter_case_performance(subset, shortest, longest)
    return pm4py.filter_start_activities(subset, allowed_starts)

def _count_cases(event_log, case_id_key='case:concept:name'):
    """
    Return how many cases *event_log* holds.

    On a DataFrame-style log `len()` counts rows, i.e. events, so the distinct
    values of the case-id column are counted instead. Any other log object
    falls back to `len()`.
    NOTE(review): assumes a legacy EventLog is a sequence of traces, so that
    `len()` is its case count — confirm for the pm4py release in use.
    """
    if hasattr(event_log, 'columns'):
        return event_log[case_id_key].nunique()
    return len(event_log)


analysis_log = create_analysis_subset(log)
print(f"Original log: {_count_cases(log)} cases")
print(f"Filtered log: {_count_cases(analysis_log)} cases")

DFG Filtering

import pm4py

# Discover the DFG together with its start and end activities
dfg, start_activities, end_activities = pm4py.discover_dfg(log)

# Keep the top 80% of activities by frequency
activity_threshold = 0.8
filtered_dfg, filtered_start, filtered_end = pm4py.filter_dfg_activities_percentage(
    dfg,
    start_activities,
    end_activities,
    activity_threshold,
)

# Keep the top 90% of paths by frequency
path_threshold = 0.9
path_filtered_dfg, path_start, path_end = pm4py.filter_dfg_paths_percentage(
    dfg,
    start_activities,
    end_activities,
    path_threshold,
)

Install with Tessl CLI

npx tessl i tessl/pypi-pm4py

docs

conformance-checking.md

filtering.md

index.md

ml-organizational.md

object-centric.md

process-discovery.md

reading-writing.md

statistics-analysis.md

utilities-conversion.md

visualization.md

tile.json