diff --git a/pm4py/ml.py b/pm4py/ml.py index b6327e4e6..2d7e4c3b2 100644 --- a/pm4py/ml.py +++ b/pm4py/ml.py @@ -131,14 +131,14 @@ def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activ from pm4py.util import pandas_utils - fea_df = extract_features_dataframe(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + fea_df = extract_features_dataframe(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True) log2 = pandas_utils.insert_case_arrival_finish_rate(log.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key) log2 = pandas_utils.insert_case_service_waiting_time(log2.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key) return log2.merge(fea_df, left_on=case_id_key, right_on=case_id_key) -def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name", resource_key="org:resource", **kwargs) -> pd.DataFrame: +def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame: """ Extracts a dataframe containing the features of each case of the provided log object @@ -149,8 +149,9 @@ def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=N :param num_ev_attr: (if provided) numeric attributes at the event level which should be extracted as features (last value per attribute in a case) :param activity_key: the attribute to be used as activity :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: the attribute to be used as case identifier + :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier :param resource_key: the attribute to be used as resource + :param include_case_id: includes the case identifier column in the features table :rtype: ``pd.DataFrame`` .. code-block:: python3 @@ -175,6 +176,7 @@ def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=N parameters["str_ev_attr"] = str_ev_attr parameters["num_ev_attr"] = num_ev_attr parameters["str_evsucc_attr"] = str_evsucc_attr + parameters["add_case_identifier_column"] = include_case_id from pm4py.algo.transformation.log_to_features import algorithm as log_to_features @@ -243,7 +245,7 @@ def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_pat return dataframe -def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name", start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame: +def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame: """ Extracts a dataframe containing the temporal features of the provided log object @@ -254,7 +256,7 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou :param grouper_freq: the grouping frequency (D, W, M, Y) to use :param activity_key: the attribute to be used as activity :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: the attribute to be used as case identifier + :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier :param resource_key: the attribute to be used as resource :param start_timestamp_key: the attribute to be used as start timestamp :rtype: ``pd.DataFrame`` @@ -275,7 +277,8 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou parameters[temporal.Parameters.GROUPER_FREQ] = grouper_freq parameters[temporal.Parameters.ACTIVITY_COLUMN] = activity_key parameters[temporal.Parameters.TIMESTAMP_COLUMN] = timestamp_key - parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key + if case_id_key is not None: + parameters[temporal.Parameters.CASE_ID_COLUMN] = case_id_key parameters[temporal.Parameters.START_TIMESTAMP_COLUMN] = start_timestamp_key parameters[temporal.Parameters.RESOURCE_COLUMN] = resource_key diff --git a/pm4py/objects/log/util/dataframe_utils.py b/pm4py/objects/log/util/dataframe_utils.py index 9921a4171..0bf5e0a88 100644 --- a/pm4py/objects/log/util/dataframe_utils.py +++ b/pm4py/objects/log/util/dataframe_utils.py @@ -47,6 +47,7 @@ class Parameters(Enum): INDEX_KEY = "index_key" CASE_INDEX_KEY = "case_index_key" USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp" + ADD_CASE_IDENTIFIER_COLUMN = "add_case_identifier_column" def insert_partitioning(df, num_partitions, parameters=None): @@ -353,6 +354,7 @@ def get_features_df(df: pd.DataFrame, list_columns: List[str], parameters = {} case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) + add_case_identifier_column = exec_utils.get_param_value(Parameters.ADD_CASE_IDENTIFIER_COLUMN, parameters, False) fea_df = pd.DataFrame({case_id_key: sorted(list(df[case_id_key].unique()))}) for col in list_columns: @@ -361,6 +363,9 @@ def get_features_df(df: pd.DataFrame, list_columns: List[str], elif "float" in str(df[col].dtype) or "int" in str(df[col].dtype): fea_df = select_number_column(df, fea_df, col, case_id_key=case_id_key) fea_df = fea_df.sort_values(case_id_key) + if not add_case_identifier_column: + del fea_df[case_id_key] + return fea_df