From 1ab741c58d061a83b3978d03396ad0f3fe667e70 Mon Sep 17 00:00:00 2001 From: Chakrit Thong Ek <49002416+thongekchakrit@users.noreply.github.com> Date: Wed, 12 Apr 2023 08:33:40 +0800 Subject: [PATCH] updated main --- __pycache__/plot.cpython-311.pyc | Bin 9401 -> 10367 bytes main.py | 93 ++++++++------ plot.py | 211 +++++++++++++++++-------------- 3 files changed, 167 insertions(+), 137 deletions(-) diff --git a/__pycache__/plot.cpython-311.pyc b/__pycache__/plot.cpython-311.pyc index b336651cd241d25256fb365ea69420ffac8244b7..0b39312000c229b82651f1febafd2bc4afcf32fa 100644 GIT binary patch delta 3588 zcmaKuYitwQ6@cfCJ)XoKJGSFT9D5Qwv7Ok7LqcAVgapDPJOUd;32e2znwx}pNdgm7 zAZyk^%SsV~Vs?;lD@&v;OC#;J4TT@6v_Al$vfAArV>D8yL6MN!{nbB%%1WqG)t);} zLU?S)*Wa9T?mg$;xpU67|GeYBeU{&u%|;5YJ6|;8Z0L^V_e{Yfnxbw{Nu4-_8g;rP zB6ozuk5Ow6^ekWT6}g#0RDxZ$Q&e2PEGP9?hxPDbumQ7K=F5WH;PnM9_SyX6m68&D zlA2+}3&=s^Lh%{x(_>?j5uX}BFu}ZbxGo)fF(dnQ9ZgKTI{8DDHtFy1XF+obW-Sqd&DRR#8ytw zK;R(wMC`ZkX0Cwv(cM}5L)z&EQ3v0DRU7iDBzW`b#6miOtfdr#lCz$(UDV7S@Y*oWuD zHxR0W)CQ@HtRD0-Tz(wEozw=6p(e6A(1NdNp>;S6#(YUcvoy%0Bw{1PkrV`!gH~i~ zwZ2B&gqz_*C(22oIa_cmUYBKI?0Vd`(gVfDx8U}ro({YLcj7MS-w0*3DFK-#eq2=2 zgu7u?n_;~kyajIsZo@s8gCX0s_pu{eG)xw;6Ym1{CgF*Gfkd03L_BFLv%EX8+E=HW zb!EGDzqPHaH+@gO7Y?v4u5VR2i@l|L4GZ)L76fs&bUR9m&r7!izJ;h0C4ebPiuUK* zoNcyYMG^bG8-rJ2fIudR(1HlJ1@z$rIU@Xi!9=>Q14PanS*L~bfd=7P_UGIEhPhMo zj(cZT3dgZ`m$ z!9Ou@ERh%#{3lP0Cng>H29Ak;;N4!EcE}1mxGb;}Cr8G{1_gm^t8hVl$_G-fpM3&J zlLZqvlTdvSNg#E$z<|@Z5HY=ZBCaKY^2Z_>W*Iv@`m;gd$oSBa6C)D|Ax(@N5s3+p z3?vrVf$`I05L>tgeG?`zQ5H-6iHLXL0ERQ4wWt(gsujEEM(nfLj4o#n%J$&QUfEtX z%_=NMdV`A9dBgOX34R%@e%VTX8LXAl%>VC&$j_-bVxo)YqRT&(O4jGhZL+ydr3O=| z5GEZ2E^;=>z9GkT%3Pa?^O8m={<_G92Q~j(vU_iLVys z-+yo2{lNXLkE*5EE@(Mluk7oUw6?UIr83k?>1z%uDA%%I1})m3%0>I-Xn!tzKn@?s z1rExAgE{_vnSWpUApV4pOMG0ZX@%%LkD}7yBht}9`S6f5Iwp-LUite+3e_k0zV`VUA456S(9rNJR-_=G$-D)o(#_FigSYCK8W z+Ecm8Q|aDCt1~@(&2k;KQn9)-wrj>Tn`XbaIj)zH4YO-lv^uUAC=SoF%BT{qQ6hCp zRjpDTQ%wesOQi^yLYw_L1$aTAYS1maYcZaaXfLSLYsOboypmqNeECXwCH>%{%y!A% znd7=-u1n&&zUQp)mqewykr^`T6%}V7{eh|@hG*_TdQxSyJ_F_PXIM4GYFYv1ubACD z-}p5xxp(E8TB!9HsZu_}WEjuE%-QB!BzJedsbY~O|es< znz??-+mUZ-aW_iAVjYwxFcXp7wa`?jHnN1`gQ|hzLyE6lWptiWRmW7owifWPnJaK> zW201gmEsR5l|dyKS}Y69gnk*%#+7iL5)7+F1wP`jjELsZL6wJ=JW_AERDnw=aal&Y z=F*2??Mg*V3B{CPbhdk@PN|G8R)l7Te_ea4RtZOxaFtRWQ9{*9xLS#tn}fJdmR zdlqb%YfyD`v_fSV9@2SoLBSjw+E4fBY%A4`CL0nUT9ox|4})m4b^h&%~)^y{IEwyS*))eV02+rr2vbZ z$w1E+TOem02FC4S!C2UI0>gcg^^t(3ba_{~JRmcETzC zV9i#`&Su?THt;=R=5Z+p^5d|+CtCP8sv~j?5xK6r4U})1^*h(I-*g}c+`e@Mds*Fg ztiIR8erG|wCh=C~Cc{MtDO?f%S$S#8L4*t{wTcP43BkUvU<^$l1W)pAYJU{|2g7E* Ag8%>k delta 3033 zcmai$YfK!+9l&SyZn-_~1suoi!Y%jWfWz1cU>*k7HV#ieXn3R#$BKnL5Dwco?42E# zo^vE}Boa~UoQV}GT_qwFmnv>WbHU>(1v9D?6yu@YxCKSL8k zIm2~&%zcu-0pja-Gwwfgas`O2H2(b%l0h+T?>v+YrdeLB-Cap!*Meb`M9U`HROF!@ zUKhRTjoPe`aO5Ak(Zkuy}W?lS8qNFe+yxDeAxQl#wJ%7L@+^u5qDjGKSR$rvIT_)927=EE@;b20=W=x=kpV4YMNCb@e(u`&}kXdOq##kULOejm^2s9bKX3|3T z!^fPE*)d49k~Y$wu|VGra-iTTO!FI#LM!Rq>fc2UlJAj25Oo-e{s`%YDISH%i{u5; z1MDSz#0H*zV@n3uaez?<$uZ#ZB)r}YjFEHag+xx!@B4bWp&`2L8@4IzMp@@#M2XY? z_y+JzdPMFC{t=9E5ipO`+L3%)9bTT@R_Ka+q$Uj=6bm5ADeuG2IzAs~=>h?T?WS@I zKwp-lEk-&YXppXXIN$Egt~ITVebT;dP}sgfCL?Q{PE?N`U^lGN0$8_P?YpF098b*g zhxYDko>ZooXFo$LLmNXoLkGhZx>y6oN- zE&qF)0p!#E);80 z)q+zit>mQ2sFuv!`)qCUj`A?_RXg{>9IFJ(uRHGTwNv(vp3e6~%`zN4A zvdzOe&}v7rwIexTXfziZ&B|jrc`Pfvl#^ake>C|}npCApt*!;KTRxpuPra<39?zXR ztH!3)^RMM%v+9__+Gt#DPq5Z@Ia_ym$^JyQ8gF1nfyTA*dzREE)9T2mI`-q-$SHOF zta@%LHy%?@OtW_Uy!zY)*7jY@`Y)#K8}`z4-8(~zu#}qJogRMY*h2q8|Cf%E#aGx; zISp*sOBM&T63-)ltrn`&A`M!&UaN`fwjxiNju?2W!?}q7j{zx&0#P>=;*BJmEZyVn zoKLgwlAb(yvddzqUsWKhQ+0M`g|3{?r3zhN3ig~J>ZluAi+55hP04AWZesRFZaLkl z^G25<14X@F+0ab-HpRD-AUntp?B>1_>w^*#liirf+cK`qY diff --git a/main.py b/main.py index 7484732..71a2ac3 100644 --- a/main.py +++ b/main.py @@ -395,7 +395,7 @@ def recursion_batch(list_of_df, list_of_result, new_question, query_recommendati print("Recursive batch: ", list_of_df[0]) print("Length: ", len(list_of_result)) print("Content: ", list_of_result) - if len(list_of_df) <= 3: + if len(list_of_df) <= 10: if len(list_of_df) < 2: dataframe_json = list_of_df[0].to_json() prompt = f"You are an actuary, " \ @@ -420,35 +420,18 @@ def recursion_batch(list_of_df, list_of_result, new_question, query_recommendati return "Sorry, we've disabled huge processing of large file insights for now..." @st.cache_data -def recursive_summarizer_sub(list_of_response, new_question): +def recursive_summarizer_sub(list_of_response, list_of_result_response, new_question): if len(list_of_response) < 2: - data = '\n'.join(list_of_response) - return data + list_of_result_response = list_of_result_response + list_of_response + return list_of_result_response else: - # if len('\n'.join(list_of_response)) < 4000: - # data = '\n'.join(list_of_response) - # else: - pass - - # data = '\n'.join(list_of_response) - # print("Question in recursive_summarizer_sub: ", new_question) - # print("Answer in data: ", data) - # prompt = f"Give a report on the passage to answer the question: {new_question}." \ - # f"The passage: {data}" - # print(f"Prompt being asked: {prompt}") - # list_of_result = [gpt3.gpt_promt_davinci(prompt)] - # print(f"Answer: {list_of_result}") - # return '\n'.join(list_of_result) - # raise - # response_extract = list_of_response[0] - # prompt = f"You are an actuary" \ - # f"The main goal is to answer {new_question}" \ - # f"Please summarize the passage:" \ - # f"{response_extract}" - # list_of_result = list_of_summarize_text + [gpt3.gpt_promt_davinci(prompt)] - # new_list = list_of_response[1:] - # return recursion_batch(new_list, list_of_result) + data = '\n'.join(list_of_response[0]) + prompt = f"Given the question is {new_question}." \ + f"Summarize the following text after: {data}" + list_of_result_response = list_of_result_response + [gpt3.gpt_promt_davinci(prompt)] + new_list = list_of_response[1:] + return recursive_summarizer_sub(new_list, list_of_result_response, new_question) # def recursive_summarizer_main(response, list_of_response, new_question): # if len(response) < 2: @@ -458,6 +441,27 @@ def recursive_summarizer_sub(list_of_response, new_question): # response = recursive_summarizer_sub(response, list_of_summarize_text, new_question) # return recursive_summarizer_main(response, list_of_response, new_question) +@st.cache_data +def split_words_into_sublists(word_list, max_words_per_list): + """ + Joins words in a list together and splits them into sublists with a maximum word count + of `max_words_per_list`. + + Args: + word_list (list): List of words. + max_words_per_list (int): Maximum word count per sublist. + + Returns: + list: List of sublists containing words. + """ + # Join words into a single string + joined_words = ' '.join(word_list) + + # Split words into sublists of max_words_per_list each + sublists = [joined_words[i:i + max_words_per_list] for i in range(0, len(joined_words), max_words_per_list)] + + return sublists + @st.cache_data def explain_result(query_recommendation, new_question, dataframe_new): @@ -465,15 +469,24 @@ def explain_result(query_recommendation, new_question, dataframe_new): print(f"Batch size: {batch_size}") list_of_df = np.array_split(dataframe_new, batch_size) # sample data to first 10 dataframe to get result, to remove in prod - list_of_df = list_of_df[:2] + list_of_df = list_of_df[:3] list_of_result = [] with st.spinner("Working on the analysis, please wait..."): response = recursion_batch(list_of_df, list_of_result, new_question, query_recommendation) if response: + list_of_result_response = [] st.success('Done!') - response = recursive_summarizer_sub(response, new_question) + if len(response) >= 2: + print("Processing sub explaination") + max_words_per_list = 3500 + sublists = split_words_into_sublists(response, max_words_per_list) + response = recursive_summarizer_sub(sublists, list_of_result_response, new_question) + response = '\n'.join(response) + else: + print("Combining the response") + response = '\n'.join(response) return response @@ -752,17 +765,17 @@ def handle_layout_change(updated_layout): # Create a text element and let the reader know the data is loading. DATA, sample_data_overview = load_data(UPLOADED_FILE) - ##################################################### - # with st.expander("See data explaination"): - # get_data_overview(sample_data_overview) - # - # # Inspecting raw data - # with st.expander("See raw data"): - # get_raw_table(DATA) - # - # # Inspecting summary statistics - # with st.expander("See summary statistics"): - # get_summary_statistics(DATA) + #################################################### + with st.expander("See data explaination"): + get_data_overview(sample_data_overview) + + # Inspecting raw data + with st.expander("See raw data"): + get_raw_table(DATA) + + # Inspecting summary statistics + with st.expander("See summary statistics"): + get_summary_statistics(DATA) data_schema = convert_datatype(DATA) schema_data = str(data_schema.dtypes.to_dict().items()) diff --git a/plot.py b/plot.py index 2544c98..c67d7f3 100644 --- a/plot.py +++ b/plot.py @@ -32,6 +32,10 @@ def plot_metrics(dataframe, label, x_var): def create_bar_chart(data, x_var, y_var, hue_var, label): + hue_var = hue_var.split(",")[0] + x_var = x_var.split(",")[0] + y_var = y_var.split(",")[0] + with mui.Typography: html.div( label, @@ -172,106 +176,119 @@ def create_metric_chart(data, x_var, y_var, label): def create_scatter_plot(data, x_var, y_var, hue_var, label): - with st.spinner("Cooking the scatter plot now..."): - print("Scatterplot: Starting data transformation") - data_chart = data.to_dict('records') - number_of_list = [] - for x in data_chart: - number_of_list = number_of_list + [x[hue_var]] - number_of_list = len(list(set(number_of_list))) - list_of_dict = [] - counter = 0 - for x in data_chart: - if list_of_dict: - for y in list_of_dict: - if y['id'] == x[hue_var]: - y['data'].append({"x": x[x_var], "y": x[y_var]}) - elif len(list(set([x for x in [k['id'] for k in list_of_dict]]))) < number_of_list: - list_of_dict = list_of_dict + [{'id': x[hue_var], 'data' : [{"x": x[x_var], "y": x[y_var]}]}] - else: - list_of_dict = list_of_dict + [{'id': x[hue_var], 'data' : [{"x": x[x_var], "y": x[y_var]}]}] - print(counter) - counter+=1 + if hue_var: + hue_var = hue_var.split(",")[0] + if x_var: + x_var = x_var.split(",")[0] + if y_var: + y_var = y_var.split(",")[0] - with mui.Typography: - html.div( - label, - css={ - "display": "block", - "margin-top": "1em", - "margin-bottom": "1em", - "margin-left": "2em", - "margin-right": "0em" - } - ) - print("Scatterplot: Completed data transformation") + if hue_var: + with st.spinner("Cooking the scatter plot now..."): + print("Scatterplot: Starting data transformation") + data_chart = data.to_dict('records') + number_of_list = [] + for x in data_chart: + number_of_list = number_of_list + [x[hue_var]] + number_of_list = len(list(set(number_of_list))) + list_of_dict = [] + counter = 0 - nivo.ScatterPlot( - data=list_of_dict, - layout="vertical", - xFormat=">-.2f", - margin={"top": 20, "right": 130, "bottom": 100, "left": 60}, - padding={0.4}, - xScale={"type": 'linear', "min": 0, "max": 'auto'}, - yScale={"type": 'linear', "min": 0, "max": 'auto'}, - blendMode="multiply", - indexScale={"type": 'band', "round": "true"}, - colors={"scheme": 'pastel1'}, - borderColor={ - "from": 'color', - "modifiers": [ - [ - 'darker', - 1.6 + for x in data_chart: + if list_of_dict: + for y in list_of_dict: + if y['id'] == x[hue_var]: + y['data'].append({"x": x[x_var], "y": x[y_var]}) + elif len(list(set([x for x in [k['id'] for k in list_of_dict]]))) < number_of_list: + list_of_dict = list_of_dict + [{'id': x[hue_var], 'data' : [{"x": x[x_var], "y": x[y_var]}]}] + else: + list_of_dict = list_of_dict + [{'id': x[hue_var], 'data' : [{"x": x[x_var], "y": x[y_var]}]}] + print(counter) + counter+=1 + + with mui.Typography: + html.div( + label, + css={ + "display": "block", + "margin-top": "1em", + "margin-bottom": "1em", + "margin-left": "2em", + "margin-right": "0em" + } + ) + print("Scatterplot: Completed data transformation") + + nivo.ScatterPlot( + data=list_of_dict, + layout="vertical", + xFormat=">-.2f", + margin={"top": 20, "right": 130, "bottom": 100, "left": 60}, + padding={0.4}, + xScale={"type": 'linear', "min": 0, "max": 'auto'}, + yScale={"type": 'linear', "min": 0, "max": 'auto'}, + blendMode="multiply", + indexScale={"type": 'band', "round": "true"}, + colors={"scheme": 'pastel1'}, + borderColor={ + "from": 'color', + "modifiers": [ + [ + 'darker', + 1.6 + ] ] - ] - }, - axisBottom={ - 'orient': 'bottom', - "tickSize": 5, - "tickPadding": 5, - "tickRotation": 0, - "legend": str(x_var), - "legendPosition": 'middle', - "legendOffset": 32 - }, - axisLeft={ - 'orient': 'left', - "tickSize": 5, - "tickPadding": 5, - "tickRotation": 0, - "legend": str(y_var), - "legendPosition": 'middle', - "legendOffset": -40 - }, - legends=[ - { - "dataFrom": 'keys', - "anchor": 'top-right', - "direction": 'column', - "margin": { "left": 10 }, - "justify": "false", - "translateX": 120, - "translateY": 0, - "itemsSpacing": 2, - "itemWidth": 100, - "itemHeight": 20, - "itemDirection": 'left-to-right', - "itemOpacity": 0.85, - "symbolSize": 20, - "effects": [ - { - "on": 'hover', - "style": { - "itemOpacity": 1 + }, + axisBottom={ + 'orient': 'bottom', + "tickSize": 5, + "tickPadding": 5, + "tickRotation": 0, + "legend": str(x_var), + "legendPosition": 'middle', + "legendOffset": 32 + }, + axisLeft={ + 'orient': 'left', + "tickSize": 5, + "tickPadding": 5, + "tickRotation": 0, + "legend": str(y_var), + "legendPosition": 'middle', + "legendOffset": -40 + }, + legends=[ + { + "dataFrom": 'keys', + "anchor": 'top-right', + "direction": 'column', + "margin": { "left": 10 }, + "justify": "false", + "legend": str(hue_var), + "translateX": 120, + "translateY": 0, + "itemsSpacing": 2, + "itemWidth": 100, + "itemHeight": 20, + "itemDirection": 'left-to-right', + "itemOpacity": 0.85, + "symbolSize": 20, + "effects": [ + { + "on": 'hover', + "style": { + "itemOpacity": 1 + } } - } - ] - } - ], - role="application", - ariaLabel=label - ) - print("Scatterplot: Plotted") + ] + } + ], + role="application", + ariaLabel=label + ) + else: + with st.spinner("Cooking the scatter plot now..."): + st.error("Missing hue for scatter plot") + print("Scatterplot: Plotted")