Commit

examples for log_to_acti_summary, log_to_top_resources, log_to_var_paths abstractions
fit-alessandro-berti committed Apr 1, 2024
1 parent 070031d commit fc3a408
Showing 3 changed files with 365 additions and 0 deletions.
124 changes: 124 additions & 0 deletions examples/llm/abstractions/log_to_acti_summary.py
@@ -0,0 +1,124 @@
from pm4py.objects.conversion.log import converter as log_converter
from typing import Union, Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
from pm4py.statistics.service_time.pandas import get as service_time_get
from pm4py.statistics.eventually_follows.pandas import get as eventually_follows
import pandas as pd


class Parameters(Enum):
MAX_LEN = "max_len"
RESPONSE_HEADER = "response_header"
PERFORMANCE_AGGREGATION = "performance_aggregation"
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
START_TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY


def apply(log_obj: Union[EventLog, EventStream, pd.DataFrame],
parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> str:
"""
Provides an abstraction of the activities' frequency and performance.
Minimum Viable Example:
import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_acti_summary
log = pm4py.read_xes("tests/input_data/receipt.xes")
print(log_to_acti_summary.apply(log))
Example output:
Below you find the top activities of the event log, specified with their total number of occurrences, the number of cases in which they occur, an aggregation of the service time, an aggregation of the times from the preceding activities and to the succeeding activities.
Confirmation of receipt [tot.occ=1434; num.cases=1434; service time=0.00 s; time from prec.=0.00 s; time to succ.=76014.05 s]
T02 Check confirmation of receipt [tot.occ=1368; num.cases=1316; service time=0.00 s; time from prec.=100581.24 s; time to succ.=44701.62 s]
T06 Determine necessity of stop advice [tot.occ=1416; num.cases=1309; service time=0.00 s; time from prec.=186313.29 s; time to succ.=44757.08 s]
T04 Determine confirmation of receipt [tot.occ=1307; num.cases=1303; service time=0.00 s; time from prec.=36815.50 s; time to succ.=65668.55 s]
Parameters
---------------
log_obj
Log object
parameters
Optional parameters of the algorithm, including:
- Parameters.MAX_LEN => desired length of the textual abstraction
- Parameters.RESPONSE_HEADER => includes a header in the textual abstraction, which explains the context
- Parameters.PERFORMANCE_AGGREGATION => performance metric to be used to express the performance (e.g., mean). Available options: mean, median, stdev, min, max, sum
- Parameters.ACTIVITY_KEY => the attribute of the log to be used as activity
- Parameters.TIMESTAMP_KEY => the attribute of the log to be used as timestamp
- Parameters.START_TIMESTAMP_KEY => the attribute of the log to be used as start timestamp
- Parameters.CASE_ID_KEY => the attribute of the log to be used as case identifier
Returns
--------------
textual_abstraction
Textual abstraction of the activities frequency and of their performance.
"""
if parameters is None:
parameters = {}

response_header = exec_utils.get_param_value(Parameters.RESPONSE_HEADER, parameters, True)
max_len = exec_utils.get_param_value(Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN)

activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
performance_aggregation = exec_utils.get_param_value(Parameters.PERFORMANCE_AGGREGATION, parameters, "mean")

log_obj = log_converter.apply(log_obj, variant=log_converter.Variants.TO_DATA_FRAME, parameters=parameters)
log_obj = log_obj[list({activity_key, timestamp_key, start_timestamp_key, case_id_key})]

num_occ = log_obj[activity_key].value_counts().to_dict()
num_cases = log_obj.groupby([case_id_key, activity_key]).first().reset_index()[
activity_key].value_counts().to_dict()

# pass the chosen aggregation (mean, median, ...) to the service-time statistic under the 'aggregationMeasure' key
parameters["aggregationMeasure"] = performance_aggregation
service_times = service_time_get.apply(log_obj, parameters=parameters)

# dataframe of ordered event pairs within each case, annotated with the flow time between the two events;
# grouping by the first activity gives the times to succeeding activities, grouping by the second ("_2") the times from preceding ones
dir_follo_dataframe = eventually_follows.get_partial_order_dataframe(log_obj.copy(), activity_key=activity_key,
start_timestamp_key=start_timestamp_key,
timestamp_key=timestamp_key,
case_id_glue=case_id_key,
sort_caseid_required=False,
sort_timestamp_along_case_id=False,
reduce_dataframe=False)

post_times = dir_follo_dataframe.groupby(activity_key)[constants.DEFAULT_FLOW_TIME].agg(
performance_aggregation).to_dict()
pre_times = dir_follo_dataframe.groupby(activity_key + "_2")[constants.DEFAULT_FLOW_TIME].agg(
performance_aggregation).to_dict()

activities_list = []
for act in num_occ:
activities_list.append({"activity": act, "num_occ": num_occ[act], "num_cases": num_cases[act],
"agg_service_time": service_times[act] if act in service_times else 0.0,
"agg_pre_times": pre_times[act] if act in pre_times else 0.0,
"agg_post_times": post_times[act] if act in post_times else 0.0})

activities_list = sorted(activities_list, key=lambda x: (x["num_cases"], x["num_occ"], x["activity"]), reverse=True)

ret = "\n\n"

if response_header:
ret += "Below you find the top activities of the event log, specified with their total number of occurrences, the number of cases in which they occur, an aggregation of the service time, an aggregation of the times from the preceding activities and to the succeeding activities.\n\n"

for dct in activities_list:
ths = "%s [tot.occ=%d; num.cases=%d; service time=%.2f s; time from prec.=%.2f s; time to succ.=%.2f s]\n" % (
dct["activity"], dct["num_occ"], dct["num_cases"], dct["agg_service_time"], dct["agg_pre_times"],
dct["agg_post_times"])
if len(ret) + len(ths) < max_len:
ret = ret + ths
else:
break

ret += "\n\n"

return ret
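A minimal usage sketch for the abstraction above, assuming the module is importable under the path used in its docstring (pm4py.algo.querying.llm.abstractions) and using illustrative parameter values; any event log readable by pm4py works in place of the example path:

import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_acti_summary

log = pm4py.read_xes("tests/input_data/receipt.xes")
# Parameters enum members (or their string values) can be used as dictionary keys
text = log_to_acti_summary.apply(log, parameters={
    log_to_acti_summary.Parameters.PERFORMANCE_AGGREGATION: "median",  # instead of the default mean
    log_to_acti_summary.Parameters.MAX_LEN: 5000,  # illustrative cap on the abstraction length
    log_to_acti_summary.Parameters.RESPONSE_HEADER: False  # omit the explanatory header
})
print(text)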
140 changes: 140 additions & 0 deletions examples/llm/abstractions/log_to_top_resources.py
@@ -0,0 +1,140 @@
from pm4py.objects.conversion.log import converter as log_converter
from typing import Union, Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
import pandas as pd
from copy import copy


class Parameters(Enum):
MAX_LEN = "max_len"
RESPONSE_HEADER = "response_header"
DEFAULT_MIN_ACTIVITIES = "default_min_activities"
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
RESOURCE_KEY = constants.PARAMETER_CONSTANT_RESOURCE_KEY


def get_abstr_from_dict(ret, activities_dict, response_header):
"""
Internal method to get the textual abstraction starting from the computations already performed.
"""
abstr = ["\n\n"]

if response_header:
abstr.append(
"In the following text, you find the top activities along with their number of occurrences in the event log and the number of unique resources performing them.")
abstr.append(" The top resources for such activities are included.\n\n")

sort_act = sorted([(x, activities_dict[x][0], activities_dict[x][1], ret[x]) for x in ret],
key=lambda x: (x[1], x[2], x[0]), reverse=True)

for el in sort_act:
abstr.append("%s (num.occ=%d ; num.resources=%d)" % (el[0], el[1], el[2]))

if el[3]:
abstr.append(" top resources=[")

this_res = sorted([(x, y) for x, y in el[3].items()], key=lambda z: (z[1], z[0]), reverse=True)

for i in range(len(this_res)):
if i > 0:
abstr.append("; ")
abstr.append("%s=%d" % (this_res[i][0], this_res[i][1]))
abstr.append("]")

abstr.append("\n")

abstr.append("\n\n")

abstr1 = "".join(abstr)
return abstr1


def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> str:
"""
Textually abstracts the top activities/resources combinations in the event log.
Minimum Viable Example:
import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_top_resources
log = pm4py.read_xes("C:/receipt.xes")
res = log_to_top_resources.apply(log)
print(res)
Example output:
In the following text, you find the top activities along with their number of occurrences in the event log and the number of unique resources performing them. The top resources for such activities are included.
Confirmation of receipt (num.occ=1434 ; num.resources=41) top resources=[Resource01=195; admin2=114; Resource02=102; Resource03=87; Resource04=81; Resource07=78; Resource08=74; Resource06=70; Resource05=65; Resource11=58; Resource09=55; Resource15=51; Resource12=49; Resource13=47; Resource14=44; Resource17=43; Resource27=37; Resource16=35; Resource18=29; Resource10=21; Resource21=19; Resource20=18; Resource23=14; Resource22=12; Resource26=7; Resource25=7; Resource30=4; Resource33=2; Resource31=2; Resource29=2; Resource28=2; admin3=1; admin1=1; Resource43=1; Resource42=1; Resource38=1; Resource37=1; Resource36=1; Resource35=1; Resource34=1; Resource19=1]
T06 Determine necessity of stop advice (num.occ=1416 ; num.resources=34) top resources=[Resource01=203; Resource02=114; Resource04=85; Resource03=85; Resource05=84; Resource07=83; Resource08=75; Resource06=75; Resource11=74; Resource12=72; Resource09=67; Resource15=58; Resource13=53; Resource14=48; Resource17=43; Resource16=36; Resource18=28; admin2=20; Resource20=18; Resource21=16; Resource22=15; Resource23=14; Resource26=12; Resource25=12; Resource29=6; Resource28=6; Resource37=2; Resource35=2; Resource34=2; Resource33=2; Resource31=2; Resource30=2; test=1; Resource36=1]
T02 Check confirmation of receipt (num.occ=1368 ; num.resources=40) top resources=[Resource01=209; Resource02=95; Resource04=91; Resource03=86; Resource06=73; Resource08=65; Resource05=65; Resource19=64; Resource10=62; Resource13=55; Resource09=51; Resource07=50; Resource24=44; Resource12=44; Resource14=43; Resource16=36; Resource17=32; Resource15=32; Resource18=30; Resource11=30; Resource21=18; Resource20=18; Resource22=13; Resource23=12; admin2=9; Resource32=9; Resource25=6; Resource26=5; Resource28=4; Resource30=3; Resource39=2; Resource34=2; Resource31=2; Resource29=2; admin1=1; TEST=1; Resource38=1; Resource36=1; Resource35=1; Resource33=1]
T04 Determine confirmation of receipt (num.occ=1307 ; num.resources=37) top resources=[Resource10=240; Resource01=184; Resource03=81; Resource04=68; Resource02=67; Resource06=66; Resource19=61; Resource05=60; Resource07=58; Resource09=46; Resource14=41; Resource12=41; Resource13=40; Resource18=36; Resource16=36; Resource08=31; Resource11=29; Resource15=28; Resource20=18; Resource21=15; Resource17=13; Resource22=12; Resource23=11; admin2=3; Resource26=3; Resource25=3; admin3=2; admin1=2; Resource31=2; Resource29=2; Resource28=2; Resource38=1; Resource36=1; Resource35=1; Resource34=1; Resource33=1; Resource24=1]
Parameters
----------------
log
Log object
parameters
Parameters of the algorithm, including:
- Parameters.MAX_LEN => desired length of the textual abstraction
- Parameters.RESPONSE_HEADER => includes a header in the textual abstraction, which explains the context
- Parameters.DEFAULT_MIN_ACTIVITIES => minimum number of different activities to include in the textual abstraction
- Parameters.ACTIVITY_KEY => the attribute of the log to be used as activity
- Parameters.RESOURCE_KEY => the attribute of the log to be used as resource
Returns
----------------
textual_abstraction
Textual abstraction
"""
if parameters is None:
parameters = {}

max_len = exec_utils.get_param_value(Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN)
response_header = exec_utils.get_param_value(Parameters.RESPONSE_HEADER, parameters, True)
default_min_activities = exec_utils.get_param_value(Parameters.DEFAULT_MIN_ACTIVITIES, parameters, 15)
activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY)

log = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=parameters)

activities = log[activity_key].value_counts().to_dict()
activities_unq_resources = log.groupby(activity_key)[resource_key].nunique().to_dict()
activities = [(x, y, activities_unq_resources[x]) for x, y in activities.items()]
activities_dict = {x[0]: (x[1], x[2]) for x in activities}

activities = sorted(activities, key=lambda z: (z[1], z[2], z[0]), reverse=True)

ret = {}
# greedily add the most frequent activities while the rendered abstraction stays within max_len
for i in range(min(len(activities), default_min_activities)):
new_ret = copy(ret)
new_ret[activities[i][0]] = {}

if len(get_abstr_from_dict(new_ret, activities_dict, response_header)) > max_len:
break

ret = new_ret

activities_resources = log.groupby([activity_key, resource_key]).size().to_dict()
activities_resources = sorted([(x, y) for x, y in activities_resources.items()], key=lambda z: (z[1], z[0]),
reverse=True)

# then greedily add the most frequent (activity, resource) pairs, again respecting max_len
for el in activities_resources:
new_ret = copy(ret)
if el[0][0] not in new_ret:
new_ret[el[0][0]] = {}
new_ret[el[0][0]][el[0][1]] = el[1]

if len(get_abstr_from_dict(new_ret, activities_dict, response_header)) > max_len:
break

ret = new_ret

return get_abstr_from_dict(ret, activities_dict, response_header)
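A usage sketch for the abstraction above, under the same assumptions as the docstring example (importable module path, an event log readable by pm4py) and with illustrative parameter values:

import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_top_resources

log = pm4py.read_xes("tests/input_data/receipt.xes")
text = log_to_top_resources.apply(log, parameters={
    log_to_top_resources.Parameters.DEFAULT_MIN_ACTIVITIES: 5,  # consider at most the top 5 activities (illustrative)
    log_to_top_resources.Parameters.MAX_LEN: 3000,  # illustrative cap on the abstraction length
    log_to_top_resources.Parameters.RESOURCE_KEY: "org:resource"  # default resource attribute, shown explicitly
})
print(text)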
101 changes: 101 additions & 0 deletions examples/llm/abstractions/log_to_var_paths.py
@@ -0,0 +1,101 @@
from pm4py.objects.conversion.log import converter as log_converter
from typing import Union, Optional, Dict, Any
from pm4py.objects.log.obj import EventLog, EventStream
from enum import Enum
from pm4py.util import exec_utils, constants, xes_constants
import pandas as pd


class Parameters(Enum):
MAX_LEN = "max_len"
RESPONSE_HEADER = "response_header"
PERFORMANCE_AGGREGATION = "performance_aggregation"
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY


def apply(log_obj: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[Dict[Any, Any]] = None) -> str:
"""
Provides an abstraction of the paths performance for the process variants of the provided event log.
Minimum Viable Example:
import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_var_paths
log = pm4py.read_xes("tests/input_data/receipt.xes")
print(log_to_var_paths.apply(log))
Example output:
Below you find a description of the top process variants of the event log, along with their frequency. The paths of every reported variant are decorated with an aggregation (mean) of the performance of the path in the given variant.
Confirmation of receipt -(8064.44 s)-> T02 Check confirmation of receipt -(29250.98 s)-> T04 Determine confirmation of receipt -(218458.20 s)-> T05 Print and send confirmation of receipt -(39225.28 s)-> T06 Determine necessity of stop advice [frequency=713]
Confirmation of receipt -(3621.01 s)-> T06 Determine necessity of stop advice -(157907.57 s)-> T10 Determine necessity to stop indication -(116514.54 s)-> T02 Check confirmation of receipt -(144858.47 s)-> T04 Determine confirmation of receipt [frequency=123]
Confirmation of receipt -(79543.37 s)-> T02 Check confirmation of receipt -(169.38 s)-> T06 Determine necessity of stop advice -(144037.68 s)-> T10 Determine necessity to stop indication -(86823.89 s)-> T04 Determine confirmation of receipt [frequency=115]
Parameters
---------------
log_obj
Log object
parameters
Optional parameters of the algorithm, including:
- Parameters.MAX_LEN => desired length of the textual abstraction
- Parameters.RESPONSE_HEADER => includes a header in the textual abstraction, which explains the context
- Parameters.PERFORMANCE_AGGREGATION => performance metric to be used to express the performance (e.g., mean). Available options: mean, median, stdev, min, max, sum
- Parameters.ACTIVITY_KEY => the attribute of the log to be used as activity
- Parameters.TIMESTAMP_KEY => the attribute of the log to be used as timestamp
- Parameters.CASE_ID_KEY => the attribute of the log to be used as case identifier
Returns
--------------
textual_abstraction
Textual abstraction of the paths' performance for every process variant
"""
if parameters is None:
parameters = {}

activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
xes_constants.DEFAULT_TIMESTAMP_KEY)
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
performance_aggregation = exec_utils.get_param_value(Parameters.PERFORMANCE_AGGREGATION, parameters, "mean")

response_header = exec_utils.get_param_value(Parameters.RESPONSE_HEADER, parameters, True)
max_len = exec_utils.get_param_value(Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN)

log_obj = log_converter.apply(log_obj, variant=log_converter.Variants.TO_DATA_FRAME, parameters=parameters)

import pm4py.stats
var_paths0 = pm4py.stats.get_variants_paths_duration(log_obj, activity_key=activity_key,
timestamp_key=timestamp_key, case_id_key=case_id_key,
times_agg=performance_aggregation).to_dict("records")
var_paths = []
# group the flat list of path records into one list per variant; a new variant starts when @@index_in_trace is 0
for el in var_paths0:
if el["@@index_in_trace"] == 0:
var_paths.append([])
var_paths[-1].append(el)

ret = "\n\n"

if response_header:
ret += "Below you find a description of the top process variants of the event log, along with their frequency. The paths of every reported variant are decorated with an aggregation (" + performance_aggregation + ") of the performance of the path in the given variant.\n\n"

for var in var_paths:
ths = var[0][activity_key]
for i in range(1, len(var)):
ths += " -(%.2f s)-> %s" % (var[i]["@@flow_time"], var[i][activity_key])

ths += " [frequency=%d]\n" % (var[0]["@@variant_count"])

if len(ret) + len(ths) < max_len:
ret = ret + ths
else:
break

ret = ret + "\n\n"

return ret
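A usage sketch for the variants/paths abstraction above, again with illustrative parameter values (the aggregation string must be one of the options listed in the docstring):

import pm4py
from pm4py.algo.querying.llm.abstractions import log_to_var_paths

log = pm4py.read_xes("tests/input_data/receipt.xes")
text = log_to_var_paths.apply(log, parameters={
    log_to_var_paths.Parameters.PERFORMANCE_AGGREGATION: "max",  # worst-case path time per variant
    log_to_var_paths.Parameters.RESPONSE_HEADER: False  # omit the explanatory header
})
print(text)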
