Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/hotfixes' into release
Browse files Browse the repository at this point in the history
  • Loading branch information
fit-alessandro-berti committed Sep 12, 2023
2 parents 189ab51 + 282c82d commit f8d5f16
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
15 changes: 9 additions & 6 deletions pm4py/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,14 @@ def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activ

from pm4py.util import pandas_utils

fea_df = extract_features_dataframe(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)
fea_df = extract_features_dataframe(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True)
log2 = pandas_utils.insert_case_arrival_finish_rate(log.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)
log2 = pandas_utils.insert_case_service_waiting_time(log2.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key)

return log2.merge(fea_df, left_on=case_id_key, right_on=case_id_key)


def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name", resource_key="org:resource", **kwargs) -> pd.DataFrame:
def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame:
"""
Extracts a dataframe containing the features of each case of the provided log object
Expand All @@ -149,8 +149,9 @@ def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=N
:param num_ev_attr: (if provided) numeric attributes at the event level which should be extracted as features (last value per attribute in a case)
:param activity_key: the attribute to be used as activity
:param timestamp_key: the attribute to be used as timestamp
:param case_id_key: the attribute to be used as case identifier
:param case_id_key: the attribute to be used as case identifier (if not provided, the default is used)
:param resource_key: the attribute to be used as resource
:param include_case_id: whether to include the case identifier column in the features table
:rtype: ``pd.DataFrame``
.. code-block:: python3
Expand All @@ -175,6 +176,7 @@ def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=N
parameters["str_ev_attr"] = str_ev_attr
parameters["num_ev_attr"] = num_ev_attr
parameters["str_evsucc_attr"] = str_evsucc_attr
parameters["add_case_identifier_column"] = include_case_id

from pm4py.algo.transformation.log_to_features import algorithm as log_to_features

Expand Down Expand Up @@ -243,7 +245,7 @@ def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_pat
return dataframe


def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name", start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame:
def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame:
"""
Extracts a dataframe containing the temporal features of the provided log object
Expand All @@ -254,7 +256,7 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou
:param grouper_freq: the grouping frequency (D, W, M, Y) to use
:param activity_key: the attribute to be used as activity
:param timestamp_key: the attribute to be used as timestamp
:param case_id_key: the attribute to be used as case identifier
:param case_id_key: the attribute to be used as case identifier (if not provided, the default is used)
:param resource_key: the attribute to be used as resource
:param start_timestamp_key: the attribute to be used as start timestamp
:rtype: ``pd.DataFrame``
Expand All @@ -275,7 +277,8 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou
parameters[temporal.Parameters.GROUPER_FREQ] = grouper_freq
parameters[temporal.Parameters.ACTIVITY_COLUMN] = activity_key
parameters[temporal.Parameters.TIMESTAMP_COLUMN] = timestamp_key
parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key
if case_id_key is not None:
parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key
parameters[temporal.Parameters.START_TIMESTAMP_COLUMN] = start_timestamp_key
parameters[temporal.Parameters.RESOURCE_COLUMN] = resource_key

Expand Down
5 changes: 5 additions & 0 deletions pm4py/objects/log/util/dataframe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Parameters(Enum):
INDEX_KEY = "index_key"
CASE_INDEX_KEY = "case_index_key"
USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"
ADD_CASE_IDENTIFIER_COLUMN = "add_case_identifier_column"


def insert_partitioning(df, num_partitions, parameters=None):
Expand Down Expand Up @@ -353,6 +354,7 @@ def get_features_df(df: pd.DataFrame, list_columns: List[str],
parameters = {}

case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
add_case_identifier_column = exec_utils.get_param_value(Parameters.ADD_CASE_IDENTIFIER_COLUMN, parameters, False)

fea_df = pd.DataFrame({case_id_key: sorted(list(df[case_id_key].unique()))})
for col in list_columns:
Expand All @@ -361,6 +363,9 @@ def get_features_df(df: pd.DataFrame, list_columns: List[str],
elif "float" in str(df[col].dtype) or "int" in str(df[col].dtype):
fea_df = select_number_column(df, fea_df, col, case_id_key=case_id_key)
fea_df = fea_df.sort_values(case_id_key)
if not add_case_identifier_column:
del fea_df[case_id_key]

return fea_df


Expand Down

0 comments on commit f8d5f16

Please sign in to comment.