diff --git a/README.md b/README.md
index ab6f579..7175228 100644
--- a/README.md
+++ b/README.md
@@ -202,4 +202,4 @@ This presentation, demonstration, and demonstration model do not give you or you
 The information contained in this presentation, demonstration and demonstration model represents the current view of Microsoft on the issues discussed as of the date of presentation and/or demonstration, for the duration of your access to the demonstration model. Because Microsoft must respond to changing market conditions, it should not be interpreted to be a commitment on the part of Microsoft, and Microsoft cannot guarantee the accuracy of any information presented after the date of presentation and/or demonstration and for the duration of your access to the demonstration model.
 
-No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies.
+No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies.
\ No newline at end of file
diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py
index f7b101e..be54b01 100644
--- a/code/OpenAI_Queries.py
+++ b/code/OpenAI_Queries.py
@@ -2,10 +2,14 @@
 load_dotenv()
 
 import streamlit as st
+import streamlit.components.v1 as components
 import os
 import traceback
 from utilities.helper import LLMHelper
+import requests
+import regex as re
+
 import logging
 logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
@@ -65,6 +69,58 @@ def check_deployment():
             st.error(traceback.format_exc())
 
+def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''):
+    htmlstr = """ """
+
+    htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize)
+    components.html(f"{htmlstr}", height=0, width=0)
+
 @st.cache_data()
 def get_languages():
     return llm_helper.translator.get_available_languages()
@@ -75,14 +131,34 @@ def get_languages():
 default_question = ""
 default_answer = ""
 
+
 if 'question' not in st.session_state:
     st.session_state['question'] = default_question
-    # if 'prompt' not in st.session_state:
-    #     st.session_state['prompt'] = os.getenv("QUESTION_PROMPT", "Please reply to the question using only the information present in the text above. If you can't find it, reply 'Not in the text'.\nQuestion: _QUESTION_\nAnswer:").replace(r'\n', '\n')
 if 'response' not in st.session_state:
     st.session_state['response'] = default_answer
 if 'context' not in st.session_state:
     st.session_state['context'] = ""
+if 'sources' not in st.session_state:
+    st.session_state['sources'] = ""
+if 'followup_questions' not in st.session_state:
+    st.session_state['followup_questions'] = []
+if 'input_message_key' not in st.session_state:
+    st.session_state['input_message_key'] = 1
+if 'do_not_process_question' not in st.session_state:
+    st.session_state['do_not_process_question'] = False
+
+if 'askedquestion' not in st.session_state:
+    st.session_state.askedquestion = default_question
+
+if 'context_show_option' not in st.session_state:
+    st.session_state['context_show_option'] = 'context within full source document'
+
+if 'tab_context' not in st.session_state:
+    st.session_state['tab_context'] = 'Not opened yet'
+else:
+    if st.session_state['question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Chat':
+        st.session_state['tab_context'] = 'Open_Queries'
 
 # Set page layout to wide screen and menu item
 menu_items = {
@@ -106,6 +182,7 @@ def get_languages():
     col1, col2, col3 = st.columns([2,2,2])
     with col1:
+        ChangeButtonStyle("Check deployment", "#ADCDE7", wch_border_style="none", wch_textsize="10px")
         st.button("Check deployment", on_click=check_deployment)
     with col3:
         with st.expander("Settings"):
@@ -118,16 +195,159 @@ def get_languages():
             # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1)
             st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language')
 
-    question = st.text_input("OpenAI Semantic Answer", default_question)
+    # Callback to display document sources
+    def show_document_source(filename, link, contextList):
+        st.session_state['do_not_process_question'] = True
+        display_iframe(filename, link, contextList)
+
+    # Callback to assign the follow-up question when it is selected by the user
+    def ask_followup_question(followup_question):
+        st.session_state.askedquestion = followup_question
+        st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1
+
+    def questionAsked():
+        st.session_state.askedquestion = st.session_state["input"+str(st.session_state['input_message_key'])]
+
+    question = st.text_input("Azure OpenAI Semantic Answer", value=st.session_state['askedquestion'], key="input"+str(st.session_state['input_message_key']), on_change=questionAsked)
+
+    # Display the context(s) associated with a source document used to answer, with automatic scroll to the yellow highlighted context
+    def display_iframe(filename, link, contextList):
+        st.session_state['do_not_process_question'] = True
+        st.session_state['askedquestion'] = st.session_state.question
+        if st.session_state['context_show_option'] == 'context within full source document':
+            try:
+                response = requests.get(link)
+                text = response.text
+                text = llm_helper.clean_encoding(text)
+                for i, context in enumerate(contextList):
+                    context = llm_helper.clean_encoding(context)
+                    contextSpan = f" {context}"
+                    text = text.replace(context, contextSpan)
+                text = text.replace('\n', '<br><br>')
+
+            except Exception as e:
+                text = "Could not load the document source content"
+        else:
+            text = ""
+            for context in contextList:
+                text = text + context.replace('\n', '<br><br>') + '<br>'
+
+        html_content = """
+
+        """
+
+        def close_iframe():
+            placeholder.empty()
+            st.session_state['do_not_process_question'] = True
+
+        st.button("Close", on_click=close_iframe)
+
+        placeholder = st.empty()
+        with placeholder:
+            htmlcontent = html_content.format(filename=filename, text=text)
+            components.html(htmlcontent, height=500)
+
+        pass
", unsafe_allow_html=True) + st.markdown("**Answer:**" + st.session_state['response']) + + # Display proposed follow-up questions which can be clicked on to ask that question automatically + if len(st.session_state['followup_questions']) > 0: + st.write("
", unsafe_allow_html=True) + st.markdown('**Proposed follow-up questions:**') + with st.container(): + for questionId, followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) + st.button(str_followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, )) + + if st.session_state['sources'] or st.session_state['context']: + # Buttons to display the context used to answer + st.write("
", unsafe_allow_html=True) + st.markdown('**Document sources:**') + for id in range(len(sourceList)): + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]], )) + + # Details on the question and answer context + st.write("

", unsafe_allow_html=True) with st.expander("Question and Answer Context"): - st.markdown(st.session_state['context'].replace('$', '\$')) - st.markdown(f"SOURCES: {sources}") + if not st.session_state['context'] is None and st.session_state['context'] != []: + for content_source in st.session_state['context'].keys(): + st.markdown(f"#### {content_source}") + for context_text in st.session_state['context'][content_source]: + context_text = llm_helper.clean_encoding(context_text) + st.markdown(f"{context_text}") + + st.markdown(f"SOURCES: {st.session_state['sources']}") + + # Source Buttons Styles + for id in range(len(sourceList)): + if filenameList[id] in matchedSourcesList: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px') + else: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px') + + for questionId, followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) + ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px') if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index fbfa51c..42d2cd3 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -1,33 +1,282 @@ import streamlit as st from streamlit_chat import message +import streamlit.components.v1 as components from utilities.helper import LLMHelper +import requests +import regex as re +import os def clear_chat_data(): - st.session_state['input'] = "" st.session_state['chat_history'] = [] - st.session_state['source_documents'] = [] + st.session_state['chat_source_documents'] = [] + st.session_state['chat_context'] = [] + st.session_state['chat_context_show_option'] = 'context within full source document' + st.session_state['chat_askedquestion'] = '' + st.session_state['chat_question'] = '' + st.session_state['chat_followup_questions'] = [] + st.session_state['do_not_process_question'] = False + st.session_state['tab_context'] = 'Not opened yet' + answer_with_citations = "" + # Initialize chat history +if 'chat_question' not in st.session_state: + st.session_state['chat_question'] = '' +if 'chat_askedquestion' not in st.session_state: + st.session_state.chat_askedquestion = '' if 'chat_history' not in st.session_state: st.session_state['chat_history'] = [] -if 'source_documents' not in st.session_state: - st.session_state['source_documents'] = [] +if 'chat_source_documents' not in st.session_state: + st.session_state['chat_source_documents'] = [] +if 'chat_context' not in st.session_state: + st.session_state['chat_context'] = [] +if 'chat_followup_questions' not in st.session_state: + st.session_state['chat_followup_questions'] = [] +if 'input_message_key' not in st.session_state: + st.session_state ['input_message_key'] = 1 + +if 'do_not_process_question' not in st.session_state: + st.session_state['do_not_process_question'] = False + +chat_context_show_options = ('extracted context only', 'context within full source document') +if 'chat_context_show_option' not in st.session_state: + st.session_state['chat_context_show_option'] = 'context within full source document' + +if 'tab_context' not in st.session_state: + st.session_state['tab_context'] = 'Not opened yet' +else: + if 
diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index fbfa51c..42d2cd3 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -1,33 +1,282 @@
 import streamlit as st
 from streamlit_chat import message
+import streamlit.components.v1 as components
 from utilities.helper import LLMHelper
+import requests
+import regex as re
+import os
 
 def clear_chat_data():
-    st.session_state['input'] = ""
     st.session_state['chat_history'] = []
-    st.session_state['source_documents'] = []
+    st.session_state['chat_source_documents'] = []
+    st.session_state['chat_context'] = []
+    st.session_state['chat_context_show_option'] = 'context within full source document'
+    st.session_state['chat_askedquestion'] = ''
+    st.session_state['chat_question'] = ''
+    st.session_state['chat_followup_questions'] = []
+    st.session_state['do_not_process_question'] = False
+    st.session_state['tab_context'] = 'Not opened yet'
+    answer_with_citations = ""
+
 # Initialize chat history
+if 'chat_question' not in st.session_state:
+    st.session_state['chat_question'] = ''
+if 'chat_askedquestion' not in st.session_state:
+    st.session_state.chat_askedquestion = ''
 if 'chat_history' not in st.session_state:
     st.session_state['chat_history'] = []
-if 'source_documents' not in st.session_state:
-    st.session_state['source_documents'] = []
+if 'chat_source_documents' not in st.session_state:
+    st.session_state['chat_source_documents'] = []
+if 'chat_context' not in st.session_state:
+    st.session_state['chat_context'] = []
+if 'chat_followup_questions' not in st.session_state:
+    st.session_state['chat_followup_questions'] = []
+if 'input_message_key' not in st.session_state:
+    st.session_state['input_message_key'] = 1
+
+if 'do_not_process_question' not in st.session_state:
+    st.session_state['do_not_process_question'] = False
+
+chat_context_show_options = ('extracted context only', 'context within full source document')
+if 'chat_context_show_option' not in st.session_state:
+    st.session_state['chat_context_show_option'] = 'context within full source document'
+
+if 'tab_context' not in st.session_state:
+    st.session_state['tab_context'] = 'Not opened yet'
+else:
+    if st.session_state['chat_question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Open_Queries':
+        st.session_state['tab_context'] = 'Chat'
 
 llm_helper = LLMHelper()
 
+def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''):
+    htmlstr = """ """
+
+    htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize)
+    components.html(f"{htmlstr}", height=0, width=0)
+
+
+def questionAsked():
+    st.session_state.chat_askedquestion = st.session_state["input"+str(st.session_state['input_message_key'])]
+    st.session_state.chat_question = st.session_state.chat_askedquestion
+
+# Display the context(s) associated with a source document used to answer, with automatic scroll to the yellow highlighted context
+def display_iframe(filename, link, contextList):
+    st.session_state['do_not_process_question'] = True
+    st.session_state['chat_askedquestion'] = st.session_state.chat_question
+    if st.session_state['chat_context_show_option'] == 'context within full source document':
+        try:
+            response = requests.get(link)
+            text = response.text
+            text = llm_helper.clean_encoding(text)
+            for i, context in enumerate(contextList):
+                context = llm_helper.clean_encoding(context)
+                contextSpan = f" {context}"
+                text = text.replace(context, contextSpan)
+            text = text.replace('\n', '<br><br>')
+
+        except Exception as e:
+            text = "Could not load the document source content"
+    else:
+        text = ""
+        for context in contextList:
+            text = text + context.replace('\n', '<br><br>') + '<br>'
+
+    html_content = """
+
+    """
+
+    def close_iframe():
+        placeholder.empty()
+        st.session_state['do_not_process_question'] = True
+
+    st.button("Close", on_click=close_iframe)
+
+    placeholder = st.empty()
+    with placeholder:
+        htmlcontent = html_content.format(filename=filename, text=text)
+        components.html(htmlcontent, height=500)
+
+    pass
+
+
+# Callback to assign the follow-up question when it is selected by the user
+def ask_followup_question(followup_question):
+    st.session_state['tab_context'] = 'Chat' # Prevents a side effect on the first click after loading the page
+    st.session_state.chat_askedquestion = followup_question
+    st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1
+
+# Reset the right asked question to the input box when this page is reopened after switching to the OpenAI_Queries page
+if st.session_state['tab_context'] != 'Chat' and st.session_state['chat_question'] != '' and st.session_state['chat_question'] != st.session_state['chat_askedquestion']:
+    st.session_state['tab_context'] = 'Chat'
+    st.session_state['do_not_process_question'] = True
+    ask_followup_question(st.session_state['chat_question'])
+
+
 # Chat
-input_text = st.text_input("You: ", placeholder="type your question", key="input")
 clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data)
+ChangeButtonStyle("Clear chat", "#ADCDE7", wch_border_style="none", wch_textsize="10px")
+
+input_text = st.text_input("You: ", placeholder="type your question", value=st.session_state.chat_askedquestion, key="input"+str(st.session_state['input_message_key']), on_change=questionAsked)
 
-if input_text:
-    question = input_text
-    input_text = ""
-    question, result, _, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history'])
-    st.session_state['chat_history'].append((question, result))
-    st.session_state['source_documents'].append(sources)
+def show_document_source(filename, link, contextList):
+    st.session_state['do_not_process_question'] = True
+    display_iframe(filename, link, contextList)
+
+# If a question is asked, execute the request to get the result, context, sources and up to 3 follow-up question proposals
+if st.session_state.chat_askedquestion and st.session_state.do_not_process_question != True:
+    st.session_state['chat_question'] = st.session_state.chat_askedquestion
+    st.session_state.chat_askedquestion = ""
+    st.session_state['chat_question'], result, context, sources = llm_helper.get_semantic_answer_lang_chain(st.session_state['chat_question'], st.session_state['chat_history'])
+    result = llm_helper.clean_encoding(result)
+    context = llm_helper.clean_encoding(context)
+    result, chat_followup_questions_list = llm_helper.extract_followupquestions(result)
+    st.session_state['chat_history'].append((st.session_state['chat_question'], result))
+    st.session_state['chat_source_documents'].append(sources)
+    st.session_state['chat_context'].append(context)
+    st.session_state['chat_followup_questions'] = chat_followup_questions_list
+
+st.session_state['do_not_process_question'] = False
+
+# Display the chat history
 if st.session_state['chat_history']:
+    history_range = range(len(st.session_state['chat_history'])-1, -1, -1)
     for i in range(len(st.session_state['chat_history'])-1, -1, -1):
-        message(st.session_state['chat_history'][i][1], key=str(i))
-        st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}')
-        message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user')
+
+        # This history entry is the latest one - also show the follow-up questions and the buttons to access the source(s) context(s)
+        if i == history_range.start:
+            answer_with_citations, sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i])
+            st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,)
+
+            answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]).strip() # message() renders neither LaTeX nor HTML
+            # message(answer_with_citations, key=str(i))
+            answer_message_height = int((len(answer_with_citations) / 22) * 1.1 * 8)
+            st.text_area(label='', value=answer_with_citations, height=answer_message_height, key=str(i))
+            st.write("<br>", unsafe_allow_html=True)
+
+            # Display proposed follow-up questions which can be clicked on to ask that question automatically
+            if len(st.session_state['chat_followup_questions']) > 0:
+                st.markdown('**Proposed follow-up questions:**')
+                with st.container():
+                    for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']):
+                        if followup_question:
+                            str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
+                            st.button(str_followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, ))
+
+            if len(sourceList) > 0:
+                st.write("<br><br>", unsafe_allow_html=True)
+                # Selectbox to choose how to display the context(s) associated with the clicked source document name
+                st.session_state['chat_context_show_option'] = st.selectbox(
+                    'Choose how to display context used to answer the question when clicking on a document source below:',
+                    chat_context_show_options,
+                    index=chat_context_show_options.index(st.session_state['chat_context_show_option'])
+                )
+
+            # Buttons to display the context(s) associated with the clicked source document name
+            for id in range(len(sourceList)):
+                st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], ))
+
+            # Source Buttons Styles
+            for id in range(len(sourceList)):
+                if filenameList[id] in matchedSourcesList:
+                    ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px')
+                else:
+                    ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px')
+
+            for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']):
+                if followup_question:
+                    str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
+                    ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px')
+
+        # The older questions and answers in the history
+        else:
+            answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() renders neither LaTeX nor HTML
+            message(answer_with_citations, key=str(i))
+            st.markdown(f'\n\nSources: {st.session_state["chat_source_documents"][i]}')
+        message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user')
diff --git a/code/pages/01_Add_Document.py b/code/pages/01_Add_Document.py
index ffa00c4..ea0461c 100644
--- a/code/pages/01_Add_Document.py
+++ b/code/pages/01_Add_Document.py
@@ -140,4 +140,4 @@ def upload_file(bytes_data: bytes, file_name: str):
 
 except Exception as e:
-    st.error(traceback.format_exc())
+    st.error(traceback.format_exc())
\ No newline at end of file
diff --git a/code/pages/04_Index_Management.py b/code/pages/04_Index_Management.py
index e2e7892..b86ca87 100644
--- a/code/pages/04_Index_Management.py
+++ b/code/pages/04_Index_Management.py
@@ -2,6 +2,7 @@
 import os
 import traceback
 from utilities.helper import LLMHelper
+import streamlit.components.v1 as components
 
 def delete_embedding():
     llm_helper.vector_store.delete_keys([f"doc:{st.session_state['embedding_to_drop']}"])
@@ -11,6 +12,58 @@ def delete_file():
     embeddings_to_delete = list(map(lambda x: f"doc:{x}", embeddings_to_delete))
     llm_helper.vector_store.delete_keys(embeddings_to_delete)
 
+
+def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''):
+    htmlstr = """ """
+    htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize)
+    components.html(f"{htmlstr}", height=0, width=0)
+
 try:
     # Set page layout to wide screen and menu item
     menu_items = {
@@ -28,32 +81,40 @@ def delete_file():
     # Query RediSearch to get all the embeddings
     data = llm_helper.get_all_documents(k=1000)
 
-    if len(data) == 0:
+    nb_embeddings = len(data)
+
+    if nb_embeddings == 0:
         st.warning("No embeddings found. Go to the 'Add Document' tab to insert your docs.")
     else:
         st.dataframe(data, use_container_width=True)
         st.download_button("Download data", data.to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings')
+        ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="12px")
 
         st.text("")
         st.text("")
         col1, col2, col3, col4 = st.columns([3,2,2,1])
         with col1:
             st.selectbox("Embedding id to delete", data.get('key',[]), key="embedding_to_drop")
+            # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="12px")
         with col2:
             st.text("")
             st.text("")
             st.button("Delete embedding", on_click=delete_embedding)
+            ChangeButtonStyle("Delete embedding", "#ADCDE7", wch_textsize="10px")
         with col3:
             st.selectbox("File name to delete", set(data.get('filename',[])), key="file_to_drop")
+            # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="12px")
         with col4:
             st.text("")
             st.text("")
             st.button("Delete file", on_click=delete_file)
+            ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="12px")
 
         st.text("")
         st.text("")
         st.button("Delete all embeddings", on_click=llm_helper.vector_store.delete_keys_pattern, args=("doc*",), type="secondary")
+        ChangeButtonStyle("Delete all embeddings", "#ADCDE7", wch_textsize="10px")
 
 except Exception as e:
     st.error(traceback.format_exc())
diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py
index 9a70f83..a709863 100644
--- a/code/utilities/customprompt.py
+++ b/code/utilities/customprompt.py
@@ -2,9 +2,15 @@ from langchain.prompts import PromptTemplate
 
 template = """{summaries}
 
-Please reply to the question using only the information present in the text above.
-Include references to the sources you used to create the answer if those are relevant ("SOURCES").
+Please reply to the question using only the information present in the text above.
+Detect the language of the question and answer in the same language.
 If you can't find it, reply politely that the information is not in the knowledge base.
+Each source has a name followed by a colon and the actual information; always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]].
+If asked for enumerations list all of them and do not invent any.
+After answering the question generate three very brief follow-up questions that the user would likely ask next.
+Only use double angle brackets to reference the questions, e.g. <<>>.
+Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'.
+Try not to repeat questions that have already been asked.
 
 Question: {question}
 Answer:"""
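The rewritten prompt defines a small output protocol: facts are tagged with `[[filename]]` citations and the answer ends with up to three `<<...>>` follow-up questions, which `insert_citations_in_answer` and `extract_followupquestions` in helper.py then parse. A toy answer showing the expected shape (the content is invented purely to illustrate the format):

```python
import re

answer = ("Azure OpenAI provides embeddings [[info1.pdf]] and completions [[info2.txt]]. "
          "<<Which models are available?>><<How is it priced?>><<Is there a free tier?>>")

print(re.findall(r'\[\[(.*?)\]\]', answer))  # ['info1.pdf', 'info2.txt']
print(re.findall(r'<<(.*?)>>', answer))      # ['Which models are available?', 'How is it priced?', 'Is there a free tier?']
```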
diff --git a/code/utilities/helper.py b/code/utilities/helper.py
index d08625b..72da2e4 100644
--- a/code/utilities/helper.py
+++ b/code/utilities/helper.py
@@ -101,14 +101,13 @@ def add_embeddings_lc(self, source_url):
                 document.page_content = document.page_content.encode("iso-8859-1").decode("utf-8", errors="ignore")
             except:
                 pass
-
         docs = self.text_splitter.split_documents(documents)
 
         # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
-        pattern = re.compile(r'[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')
+        pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r)
         for(doc) in docs:
             doc.page_content = re.sub(pattern, '', doc.page_content)
-
+
         keys = []
         for i, doc in enumerate(docs):
             # Create a unique key for the document
@@ -130,8 +129,12 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translation):
         text = list(map(lambda x: self.translator.translate(x), text)) if self.enable_translation else text
 
         # Upload the text to Azure Blob Storage
+        converted_text = "\n".join(text)
+        # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
+        pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r)
+        converted_text = re.sub(pattern, '', converted_text)
         converted_filename = f"converted/{filename}.txt"
-        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
+        source_url = self.blob_client.upload_file(converted_text, f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
         print(f"Converted file uploaded to {source_url} with filename {filename}")
 
         # Update the metadata to indicate that the file has been converted
@@ -151,6 +154,69 @@ def get_all_documents(self, k: int = None):
                 'metadata' : x.metadata,
             }, result)))
 
+    # Remove paths from sources to keep only the filename
+    def filter_sourcesLinks(self, sources):
+        # use regex to replace all occurrences of '[anypath/anypath/somefilename.xxx](the_link)' with '[somefilename](the_link)' in sources
+        pattern = r'\[[^\]]*?/([^/\]]*?)\]'
+
+        match = re.search(pattern, sources)
+        while match:
+            withoutExtensions = match.group(1).split('.')[0] # remove any extension from the name of the source document
+            sources = sources[:match.start()] + f'[{withoutExtensions}]' + sources[match.end():]
+            match = re.search(pattern, sources)
+
+        sources = ' \n ' + sources.replace('\n', ' \n ') # add a line break after each source
+
+        return sources
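What `filter_sourcesLinks` does to one markdown source link, rerun standalone on an invented path and URL:

```python
import re

# '[any/path/name.ext](link)' becomes '[name](link)': the directory part and the
# file extension are dropped, the link target is kept untouched.
sources = "[converted/subdir/report.pdf.txt](https://blob.example/report?sas=abc)"
pattern = r'\[[^\]]*?/([^/\]]*?)\]'
match = re.search(pattern, sources)
while match:
    without_ext = match.group(1).split('.')[0]
    sources = sources[:match.start()] + f'[{without_ext}]' + sources[match.end():]
    match = re.search(pattern, sources)
print(sources)  # [report](https://blob.example/report?sas=abc)
```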
+    def extract_followupquestions(self, answer):
+        followupTag = answer.find('Follow-up Questions')
+        followupQuestions = answer.find('<<')
+
+        # take the min of followupTag and followupQuestions when both exist, otherwise whichever is present, so the follow-up questions are found even without a followupTag
+        followupTag = min(followupTag, followupQuestions) if followupTag != -1 and followupQuestions != -1 else max(followupTag, followupQuestions)
+        answer_without_followupquestions = answer[:followupTag] if followupTag != -1 else answer
+        followup_questions = answer[followupTag:].strip() if followupTag != -1 else ''
+
+        # Extract the follow-up questions as a list
+        pattern = r'\<\<(.*?)\>\>'
+        match = re.search(pattern, followup_questions)
+        followup_questions_list = []
+        while match:
+            followup_questions_list.append(followup_questions[match.start()+2:match.end()-2])
+            followup_questions = followup_questions[match.end():]
+            match = re.search(pattern, followup_questions)
+
+        # Special case when 'Follow-up Questions:' appears in the answer after the <<
+        followupTag = answer_without_followupquestions.find('Follow-up Questions')
+        if followupTag != -1:
+            answer_without_followupquestions = answer_without_followupquestions[:followupTag]
+
+        return answer_without_followupquestions, followup_questions_list
+
+    # Insert citations in the answer - find filenames in the answer matching sources from filenameList and replace them with '$^{(id+1)}$'
+    def insert_citations_in_answer(self, answer, filenameList):
+        matched_sources = []
+        pattern = r'\[\[(.*?)\]\]'
+        match = re.search(pattern, answer)
+        while match:
+            filename = match.group(1).split('.')[0] # remove any extension from the name of the source document
+            if filename in filenameList:
+                matched_sources.append(filename)
+                filenameIndex = filenameList.index(filename) + 1
+                answer = answer[:match.start()] + '$^{' + f'{filenameIndex}' + '}$' + answer[match.end():]
+            else:
+                answer = answer[:match.start()] + '$^{' + f'{filename}' + '}$' + answer[match.end():]
+            match = re.search(pattern, answer)
+
+        # When the page is reloaded, search for references already added to the answer (e.g. '$^{(id+1)}$')
+        for id, filename in enumerate(filenameList):
+            reference = '$^{' + f'{id+1}' + '}$'
+            if reference in answer and filename not in matched_sources:
+                matched_sources.append(filename)
+
+        return answer, matched_sources
+
     def get_semantic_answer_lang_chain(self, question, chat_history):
         question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False)
         doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT)
@@ -162,15 +228,23 @@ def get_semantic_answer_lang_chain(self, question, chat_history):
             # top_k_docs_for_context= self.k
         )
         result = chain({"question": question, "chat_history": chat_history})
-        context = "\n".join(list(map(lambda x: x.page_content, result['source_documents'])))
-        sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents'])))
-
         container_sas = self.blob_client.get_container_sas()
+
+        contextDict = {}
+        for res in result['source_documents']:
+            source_key = self.filter_sourcesLinks(res.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)).replace('\n', '').replace(' ', '')
+            if source_key not in contextDict:
+                contextDict[source_key] = []
+            contextDict[source_key].append(res.page_content)
+
+        sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents'])))
+
         result['answer'] = result['answer'].split('SOURCES:')[0].split('Sources:')[0].split('SOURCE:')[0].split('Source:')[0]
         sources = sources.replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)
-        return question, result['answer'], context, sources
+        sources = self.filter_sourcesLinks(sources)
+
+        return question, result['answer'], contextDict, sources
 
     def get_embeddings_model(self):
         OPENAI_EMBEDDINGS_ENGINE_DOC = os.getenv('OPENAI_EMEBDDINGS_ENGINE', os.getenv('OPENAI_EMBEDDINGS_ENGINE_DOC', 'text-embedding-ada-002'))
@@ -185,3 +259,28 @@ def get_completion(self, prompt, **kwargs):
             return self.llm([HumanMessage(content=prompt)]).content
         else:
             return self.llm(prompt)
+
+    def get_links_filenames(self, answer, sources):
+        split_sources = sources.split(' \n ') # sources are expected to be of the format ' \n [filename1.ext](sourcelink1) \n [filename2.ext](sourcelink2) \n [filename3.ext](sourcelink3) \n '
+        srcList = []
+        linkList = []
+        filenameList = []
+        for src in split_sources:
+            if src != '':
+                srcList.append(src)
+                link = src[1:].split('(')[1][:-1].split(')')[0] # get the link
+                linkList.append(link)
+                filename = src[1:].split(']')[0] # retrieve the source filename
+                filenameList.append(filename)
+        answer, matchedSourcesList = self.insert_citations_in_answer(answer, filenameList) # add citation markers to the answer to indicate the source of each fact
+        return answer, srcList, matchedSourcesList, linkList, filenameList
+
+    def clean_encoding(self, text):
+        encoding = 'ISO-8859-1'
+        try:
+            reencodedtext = text.encode(encoding)
+            reencodedtext = reencodedtext.decode('utf-8')
+        except Exception as e:
+            reencodedtext = text
+        return reencodedtext
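`clean_encoding` repairs mojibake: text whose UTF-8 bytes were mis-decoded as ISO-8859-1 is re-encoded and decoded properly, and anything that fails the round-trip is returned unchanged. A quick check of the round-trip on a typical sample:

```python
sample = "rÃ©sumÃ©"  # the UTF-8 bytes of 'résumé' mis-read as ISO-8859-1
print(sample.encode('ISO-8859-1').decode('utf-8'))  # résumé
```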