solving merge conflict. adding headers
fit-alessandro-berti committed Jul 31, 2023
2 parents dd65a31 + 787d032 commit 393ef3d
Showing 17 changed files with 474 additions and 40 deletions.
18 changes: 16 additions & 2 deletions CHANGELOG.md
@@ -1,13 +1,17 @@
# Changelog of pm4py


## pm4py 2.7.6 (2023.07.XX)
## pm4py 2.7.6 (2023.08.XX)

### Added
* 6760518dea19334a21442200bef647e4c07f3636
* LLM abstraction of the temporal profile model
* 13d001c76e3de40786dce75e76e56a13a821173a
* set of event logs for fairness assessment (hospital, hiring, lending, renting)
* e3044278b3e7d984c7fdf9e39554cc4551332739
* added OCEL filters:
* length of a connected component
* presence of at least one object of a given object type

### Changed
* 84629e2ea342348e30aa04a7d41ad7b39159b400
@@ -16,7 +20,11 @@
* refactored log_to_interval_tree methods into two methods
(log to intervals, and intervals to tree)
* added queue-related examples

* da3a12f615dba3c46793a2d9977dfca11dad85b0
* avoid annotation start/end edges in DFG with performance metrics
* 37fba9285cfde95309142e4404f9cfbcb2b9296c
* visualizations support nanoseconds granularity when needed

### Deprecated

### Fixed
@@ -34,14 +42,20 @@
* fixed path to Graphviz.JS
* ca79aa9b9e51ba3a95665d5d53c8e5ab5028bf12
* minor fix TBR generalization parameters
* 57a30fb452a759bc71f707e67bf0f63118194b7f
* method to sample OCEL connected components is fixed

### Removed
* bf5574a34a31b93024dd9feb54acc5cc475640bd
* change-of-mind on format_dataframe deprecation warning

### Other
* 916ea3163119afe7aa0fc9f6c43624147d6c0f9f
* reference to published paper in OCEL feature extraction
* 549aa7c6766f1a51425a7a65673173c55d9731e9
* updated reference to PM4Py website
* 20ce84db4e195937c77280c950ff12083fc5833b
* example for log granularity change

---

5 changes: 4 additions & 1 deletion docs/source/api.rst
@@ -222,7 +222,8 @@ Also, some filtering techniques are offered on top of object-centric event logs:
* :meth:`pm4py.filtering.filter_ocel_events`; filters a specified collection of event identifiers from the object-centric event log.
* :meth:`pm4py.filtering.filter_ocel_objects`; filters a specified collection of object identifiers from the object-centric event log.
* :meth:`pm4py.filtering.filter_ocel_cc_object`; filters a connected component from the object-centric event log to which the object with the provided identifier belongs.

* :meth:`pm4py.filtering.filter_ocel_cc_length`; filters the connected components from an object-centric event log having a number of objects falling in a provided range.
* :meth:`pm4py.filtering.filter_ocel_cc_otype`; filters the connected components from an object-centric event log containing at least one object of the specified object type.

Machine Learning (:mod:`pm4py.ml`)
------------------------------------------
@@ -547,6 +548,8 @@ Overall List of Methods
pm4py.filtering.filter_ocel_events
pm4py.filtering.filter_ocel_objects
pm4py.filtering.filter_ocel_cc_object
pm4py.filtering.filter_ocel_cc_length
pm4py.filtering.filter_ocel_cc_otype
pm4py.ml
pm4py.ml.split_train_test
pm4py.ml.get_prefixes_from_log
13 changes: 13 additions & 0 deletions examples/activities_to_alphabet.py
@@ -0,0 +1,13 @@
import pm4py
from pm4py.objects.log.util import activities_to_alphabet
from pm4py.util import constants


def execute_script():
    dataframe = pm4py.read_xes("../tests/input_data/running-example.xes")
    renamed_dataframe = activities_to_alphabet.apply(dataframe, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: "concept:name"})
    print(renamed_dataframe)


if __name__ == "__main__":
    execute_script()
19 changes: 19 additions & 0 deletions examples/ocel_filter_cc.py
@@ -0,0 +1,19 @@
import pm4py
import sys


def execute_script():
    ocel = pm4py.read_ocel("../tests/input_data/ocel/example_log.jsonocel")
    print(ocel)
    # filters the connected components of the OCEL in which there is at least one delivery,
    # obtaining a filtered OCEL back.
    ocel_with_del = pm4py.filter_ocel_cc_otype(ocel, "delivery")
    print(ocel_with_del)
    # filters the connected components of the OCEL with at least five different objects,
    # obtaining a filtered OCEL back.
    ocel_with_five_objs = pm4py.filter_ocel_cc_length(ocel, 5, sys.maxsize)
    print(ocel_with_five_objs)


if __name__ == "__main__":
    execute_script()
41 changes: 41 additions & 0 deletions examples/timestamp_granularity.py
@@ -0,0 +1,41 @@
import pandas as pd
import pm4py
import time


def execute_script():
    dataframe = pd.read_csv("../tests/input_data/receipt.csv")
    dataframe = pm4py.format_dataframe(dataframe)

    # prints the original timestamp column of the dataframe
    print(dataframe["time:timestamp"])

    # Here are some common options that you can use as a granularity:
    #
    # 'D': Day
    # 'H': Hour
    # 'T' or 'min': Minute
    # 'S': Second
    # 'L' or 'ms': Millisecond
    # 'U': Microsecond
    # 'N': Nanosecond

    st = time.time_ns()
    # cast on the minute
    dataframe["time:timestamp"] = dataframe["time:timestamp"].dt.floor('T')
    ct = time.time_ns()

    print("required time for the timestamp casting: %.2f seconds" % ((ct-st)/10**9))

    # prints the new timestamp column of the dataframe
    print(dataframe["time:timestamp"])

    # for completeness, we report some alternative methods in Pandas to obtain a similar result (casting on the minute):
    #
    # dataframe["time:timestamp"] = dataframe["time:timestamp"].apply(lambda x: x.replace(second=0, microsecond=0))
    #
    # dataframe["time:timestamp"] = dataframe["time:timestamp"].dt.round('min')
    # (note: .round rounds to the nearest minute rather than flooring)


if __name__ == "__main__":
    execute_script()
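Outside of pandas, the same minute-level flooring can be sketched with the standard library alone (a minimal illustration; the timestamp value is made up):

```python
from datetime import datetime

# flooring a timestamp on the minute: drop seconds and microseconds
ts = datetime(2011, 10, 30, 11, 50, 17, 123456)
floored = ts.replace(second=0, microsecond=0)
print(floored.isoformat())  # 2011-10-30T11:50:00
```

This is the per-value equivalent of the `.apply(lambda x: x.replace(second=0, microsecond=0))` alternative mentioned in the script; `dt.floor` is preferable on large dataframes because it is vectorized.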
2 changes: 1 addition & 1 deletion pm4py/__init__.py
@@ -30,7 +30,7 @@
filter_ocel_object_types_allowed_activities, filter_ocel_object_per_type_count, filter_ocel_start_events_per_object_type, \
filter_ocel_end_events_per_object_type, filter_ocel_events_timestamp, filter_prefixes, filter_suffixes, \
filter_four_eyes_principle, filter_activity_done_different_resources, filter_ocel_events, filter_ocel_objects, \
filter_ocel_object_types, filter_ocel_cc_object
filter_ocel_object_types, filter_ocel_cc_object, filter_ocel_cc_length, filter_ocel_cc_otype
from pm4py.discovery import discover_petri_net_alpha, discover_petri_net_alpha_plus, discover_petri_net_ilp, discover_petri_net_heuristics, \
discover_petri_net_inductive, discover_process_tree_inductive, discover_heuristics_net, \
discover_dfg, discover_footprints, discover_eventually_follows_graph, discover_directly_follows_graph, discover_bpmn_inductive, \
9 changes: 7 additions & 2 deletions pm4py/algo/discovery/batches/variants/pandas.py
@@ -94,9 +94,14 @@ def apply(log: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], A
attributes_to_consider.add(event_id_key)

log = log[list(attributes_to_consider)]
log[timestamp_key] = log[timestamp_key].values.astype(np.int64) // 10**9
# the timestamp columns are expressed in nanosecond values
# here, we want them to have second granularity, so we divide by 10**9
# for example, 1001000000 nanoseconds (value stored in the column)
# is equivalent to 1.001 seconds.
log[timestamp_key] = log[timestamp_key].values.astype(np.int64) / 10**9
if start_timestamp_key != timestamp_key:
log[start_timestamp_key] = log[start_timestamp_key].values.astype(np.int64) // 10**9
# see the aforementioned explanation.
log[start_timestamp_key] = log[start_timestamp_key].values.astype(np.int64) / 10**9

actres_grouping0 = log.groupby([activity_key, resource_key]).agg(list).to_dict()
start_timestamps = actres_grouping0[start_timestamp_key]
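A minimal sketch of why the hunk above switches from floor division (`//`) to true division (`/`) when converting nanoseconds to seconds:

```python
ns = 1_001_000_000  # nanoseconds, i.e. 1.001 seconds

print(ns // 10**9)  # 1 -- floor division truncates the sub-second part
print(ns / 10**9)   # 1.001 -- true division preserves it
```

With floor division, two events 1.001 s apart could collapse onto the same integer second, so the batch detection would lose sub-second precision.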
13 changes: 9 additions & 4 deletions pm4py/analysis.py
@@ -17,7 +17,7 @@
__doc__ = """
"""

from typing import List, Optional, Tuple, Dict, Union, Generator, Set
from typing import List, Optional, Tuple, Dict, Union, Generator, Set, Any

from pm4py.objects.log.obj import Trace, EventLog, EventStream
from pm4py.objects.conversion.log import converter as log_converter
@@ -153,7 +153,7 @@ def solve_extended_marking_equation(trace: Trace, sync_net: PetriNet, sync_im: M


def check_soundness(petri_net: PetriNet, initial_marking: Marking,
final_marking: Marking) -> bool:
final_marking: Marking, print_diagnostics: bool = False) -> Tuple[bool, Dict[str, Any]]:
"""
Check if a given Petri net is a sound WF-net.
A Petri net is a WF-net iff:
@@ -165,11 +165,15 @@ def check_soundness(petri_net: PetriNet, initial_marking: Marking,
- it contains no deadlocks
- we are able to always reach the final marking
For a formal definition of sound WF-net, consider: http://www.padsweb.rwth-aachen.de/wvdaalst/publications/p628.pdf
In the returned object, the first element is a boolean indicating whether the Petri net is a sound workflow net.
The second element is a dictionary of diagnostics collected while running WOFLAN
(associating the keys [name of the diagnostics] with the corresponding diagnostics).
:param petri_net: petri net
:param initial_marking: initial marking
:param final_marking: final marking
:rtype: ``bool``
:param print_diagnostics: boolean value that enables additional prints during the execution of WOFLAN
:rtype: ``Tuple[bool, Dict[str, Any]]``
.. code-block:: python3
@@ -179,7 +183,8 @@ def check_soundness(petri_net: PetriNet, initial_marking: Marking,
is_sound, diagnostics = pm4py.check_soundness(net, im, fm)
"""
from pm4py.algo.analysis.woflan import algorithm as woflan
return woflan.apply(petri_net, initial_marking, final_marking, parameters={"return_asap_when_not_sound": True, "return_diagnostics": True})
return woflan.apply(petri_net, initial_marking, final_marking,
parameters={"return_asap_when_not_sound": True, "return_diagnostics": True, "print_diagnostics": print_diagnostics})


def cluster_log(log: Union[EventLog, EventStream, pd.DataFrame], sklearn_clusterer=None, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Generator[EventLog, None, None]:
78 changes: 78 additions & 0 deletions pm4py/filtering.py
@@ -1114,3 +1114,81 @@ def filter_ocel_cc_object(ocel: OCEL, object_id: str) -> OCEL:
    for cc in ocel_splits:
        if object_id in cc.objects[ocel.object_id_column].unique():
            return cc


def filter_ocel_cc_length(ocel: OCEL, min_cc_length: int, max_cc_length: int) -> OCEL:
    """
    Keeps only the objects in an OCEL belonging to a connected component with a length
    falling in a specified range.

    Paper:
    Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.

    :param ocel: object-centric event log
    :param min_cc_length: minimum allowed length for the connected component
    :param max_cc_length: maximum allowed length for the connected component
    :rtype: ``OCEL``

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')
        filtered_ocel = pm4py.filter_ocel_cc_length(ocel, 2, 10)
    """
    from pm4py.algo.transformation.ocel.graphs import object_interaction_graph
    import networkx as nx

    g0 = object_interaction_graph.apply(ocel)
    g = nx.Graph()

    for edge in g0:
        g.add_edge(edge[0], edge[1])

    conn_comp = list(nx.connected_components(g))
    conn_comp = [x for x in conn_comp if min_cc_length <= len(x) <= max_cc_length]
    objs = [y for x in conn_comp for y in x]

    return filter_ocel_objects(ocel, objs)


def filter_ocel_cc_otype(ocel: OCEL, otype: str, positive: bool = True) -> OCEL:
    """
    Filters the objects belonging to the connected components having at least one object
    of the provided object type.

    Paper:
    Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.

    :param ocel: object-centric event log
    :param otype: object type
    :param positive: boolean that keeps or discards the objects of these components
    :rtype: ``OCEL``

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')
        filtered_ocel = pm4py.filter_ocel_cc_otype(ocel, 'order')
    """
    if positive:
        objs = set(ocel.objects[ocel.objects[ocel.object_type_column] == otype][ocel.object_id_column])
    else:
        objs = set(ocel.objects[~(ocel.objects[ocel.object_type_column] == otype)][ocel.object_id_column])

    from pm4py.algo.transformation.ocel.graphs import object_interaction_graph
    import networkx as nx

    g0 = object_interaction_graph.apply(ocel)
    g = nx.Graph()

    for edge in g0:
        g.add_edge(edge[0], edge[1])

    conn_comp = list(nx.connected_components(g))
    conn_comp = [x for x in conn_comp if len(set(x).intersection(objs)) > 0]

    objs = [y for x in conn_comp for y in x]

    return filter_ocel_objects(ocel, objs)
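The two filters above delegate component enumeration to networkx. As an illustration of the underlying idea, here is a minimal plain-Python sketch (with hypothetical edges standing in for the object interaction graph) that enumerates connected components and keeps those whose size falls in a range, as `filter_ocel_cc_length` does:

```python
from collections import defaultdict

# hypothetical object-interaction edges between object identifiers
edges = [("o1", "i1"), ("o1", "i2"), ("o2", "d1"), ("d1", "i3")]

# build an undirected adjacency map
adj = defaultdict(set)
for a, b in edges:
    adj[a].add(b)
    adj[b].add(a)

# enumerate connected components with an iterative DFS
components, seen = [], set()
for node in adj:
    if node in seen:
        continue
    comp, stack = set(), [node]
    while stack:
        n = stack.pop()
        if n not in comp:
            comp.add(n)
            stack.extend(adj[n] - comp)
    seen |= comp
    components.append(comp)

# keep only components whose number of objects falls in a range
kept = [c for c in components if 3 <= len(c) <= 10]
print(sorted(len(c) for c in components))  # [3, 3]
```

`filter_ocel_cc_otype` works the same way, except the kept components are those intersecting the set of objects of the requested type.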
78 changes: 78 additions & 0 deletions pm4py/objects/log/util/activities_to_alphabet.py
@@ -0,0 +1,78 @@
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''

from enum import Enum
from pm4py.util import exec_utils, xes_constants
import pandas as pd
from typing import Optional, Dict, Any, Union, Tuple


class Parameters(Enum):
    ACTIVITY_KEY = "activity_key"
    RETURN_MAPPING = "return_mapping"


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Union[
        pd.DataFrame, Tuple[pd.DataFrame, Dict[str, str]]]:
    """
    Remap the activities in a dataframe using an augmented alphabet to minimize the size of the encoding

    Running example:

        import pm4py
        from pm4py.objects.log.util import activities_to_alphabet
        from pm4py.util import constants

        dataframe = pm4py.read_xes("tests/input_data/running-example.xes")
        renamed_dataframe = activities_to_alphabet.apply(dataframe, parameters={constants.PARAMETER_CONSTANT_ACTIVITY_KEY: "concept:name"})
        print(renamed_dataframe)

    Parameters
    --------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the method, including:
        - Parameters.ACTIVITY_KEY => attribute to be used as activity
        - Parameters.RETURN_MAPPING => (boolean) enables returning the mapping dictionary (so the original activities can be reconstructed)

    Returns
    --------------
    ren_dataframe
        Pandas dataframe in which the activities have been remapped to the (augmented) alphabet
    inv_mapping
        (if required) Dictionary associating to every letter of the (augmented) alphabet the original activity
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    return_mapping = exec_utils.get_param_value(Parameters.RETURN_MAPPING, parameters, False)

    activities_count = list(dataframe[activity_key].value_counts().to_dict())
    remap_dict = {}
    for index, act in enumerate(activities_count):
        # encode the activity's frequency rank as a bijective base-26 string ('A'..'Z', 'AA', 'AB', ...)
        result = ''
        while index >= 0:
            result = chr((index % 26) + ord('A')) + result
            index = index // 26 - 1
        remap_dict[act] = result
    dataframe[activity_key] = dataframe[activity_key].map(remap_dict)
    if return_mapping:
        inverse_dct = {y: x for x, y in remap_dict.items()}
        return dataframe, inverse_dct
    return dataframe
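The inner loop above encodes each activity's frequency rank in a bijective base-26 alphabet, so the 27th distinct activity becomes 'AA' rather than overflowing. A standalone sketch of the same scheme (the helper name is hypothetical, not part of pm4py):

```python
def index_to_letters(index: int) -> str:
    # bijective base-26: 0 -> 'A', 25 -> 'Z', 26 -> 'AA', ...
    result = ''
    while index >= 0:
        result = chr((index % 26) + ord('A')) + result
        index = index // 26 - 1
    return result

print([index_to_letters(i) for i in (0, 25, 26, 51, 52)])
# ['A', 'Z', 'AA', 'AZ', 'BA']
```

Because the most frequent activities get the shortest codes, the total size of the encoded column is minimized.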