From 330e5651337a6cbc3ab352ed81bb113f44575d2c Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 4 Apr 2023 16:51:22 +0200 Subject: [PATCH 01/17] Enhanced UI interface --- code/OpenAI_Queries.py | 101 ++++++++++++++++++++++++++++++++++++--- code/pages/00_Chat.py | 101 ++++++++++++++++++++++++++++++++++++--- code/utilities/helper.py | 34 +++++++++++-- 3 files changed, 219 insertions(+), 17 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index bfe31ec..6bef743 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -2,10 +2,13 @@ load_dotenv() import streamlit as st +import streamlit.components.v1 as components import os import traceback from utilities.helper import LLMHelper +import requests + import logging logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) @@ -81,6 +84,8 @@ def get_languages(): st.session_state['response'] = default_answer if 'context' not in st.session_state: st.session_state['context'] = "" + if 'sources' not in st.session_state: + st.session_state['sources'] = "" # Set page layout to wide screen and menu item menu_items = { @@ -116,16 +121,98 @@ def get_languages(): # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1) st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language') - question = st.text_input("OpenAI Semantic Answer", default_question) + if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = '' + + def questionAsked(): + st.session_state.askedquestion = st.session_state.inputquestion + + question = st.text_input("OpenAI Semantic Answer", default_question, key='inputquestion', on_change=questionAsked) - if question != '': - st.session_state['question'] = question - st.session_state['question'], st.session_state['response'], st.session_state['context'], sources = llm_helper.get_semantic_answer_lang_chain(question, []) + def display_iframe(filename, link, contextList): + if st.session_state['context_show_option'] == 'context within full source document': + try: + response = requests.get(link) + text = response.text + for i, context in enumerate(contextList): + contextSpan = f" {context}" + text = text.replace(context, contextSpan) + text = text.replace('\n', '
<br><br>')
+
+            except Exception as e:
+                text = "Could not load the document source content"
+        else:
+            text = ""
+            for context in contextList:
+                text = text + context.replace('\n', '<br><br>') + '<br>'
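+
+        # Note: html_content below is filled in via str.format(filename=..., text=...),
+        # so literal braces inside the embedded JavaScript must be doubled as "{{" and "}}".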
+        html_content = """
+            <!-- Viewer markup reconstructed; the original template's tags were not preserved in this text. -->
+            <html><head><script>
+                // Scroll to the first highlighted context once the document loads.
+                window.onload = function() {{
+                    var el = document.getElementById('ContextSpan0');
+                    if (el) {{ el.scrollIntoView(); }}
+                }};
+            </script></head>
+            <body>
+                <h4>{filename}</h4>
+                <div style="height: 400px; overflow: auto;">{text}</div>
+            </body></html>
+ +
+ + """ + + if st.button("Close"): + st.placeholder.empty() + + placeholder = st.empty() + with placeholder: + # htmlcontent = html_content.format(link=link, filename=filename) + htmlcontent = html_content.format(filename=filename, text=text) + components.html(htmlcontent, height=500) + pass + + + if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' + + # Answer the question if any + if st.session_state.askedquestion != '': + st.session_state['question'] = st.session_state.askedquestion + st.session_state.askedquestion = "" + st.session_state['question'], st.session_state['response'], st.session_state['context'], st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + + # Display the sources and context - even if the page is reloaded + if st.session_state['sources'] or st.session_state['context']: st.markdown("Answer:" + st.session_state['response']) - st.markdown(f'\n\nSources: {sources}') + # st.markdown(f'\n\nSources: {sources}') + split_sources = st.session_state['sources'].split(' \n ') + for src in split_sources: + if src != '': + link = src[1:].split('(')[1][:-1].split(')')[0] + filename = src[1:].split(']')[0] + if st.button(filename, key=filename): + context = st.session_state['context'] + display_iframe(filename, link, st.session_state['context'][src]) with st.expander("Question and Answer Context"): - st.markdown(st.session_state['context'].replace('$', '\$')) - st.markdown(f"SOURCES: {sources}") + if not st.session_state['context'] is None and st.session_state['context'] != []: + for content_source in st.session_state['context'].keys(): + st.markdown(f"#### {content_source}") + for context_text in st.session_state['context'][content_source]: + st.markdown(f"{context_text}") + + # theContext = llm_helper.filter_sourcesLinks(st.session_state['context'].replace('$', '\$')) + # st.markdown(theContext) + st.markdown(f"SOURCES: {st.session_state['sources']}") if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index fbfa51c..83ca9cf 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -1,33 +1,122 @@ import streamlit as st from streamlit_chat import message +import streamlit.components.v1 as components from utilities.helper import LLMHelper +import requests def clear_chat_data(): st.session_state['input'] = "" st.session_state['chat_history'] = [] st.session_state['source_documents'] = [] + st.session_state['chat_context'] = [] + st.session_state['context_show_option'] = 'context within full source document' + st.session_state['askedquestion'] = '' # Initialize chat history if 'chat_history' not in st.session_state: st.session_state['chat_history'] = [] if 'source_documents' not in st.session_state: st.session_state['source_documents'] = [] +if 'chat_context' not in st.session_state: + st.session_state['chat_context'] = [] + +context_show_options = ('extracted context only', 'context within full source document') +if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' llm_helper = LLMHelper() +if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = '' + +def questionAsked(): + st.session_state.askedquestion = st.session_state.input + # Chat -input_text = 
st.text_input("You: ", placeholder="type your question", key="input")
+input_text = st.text_input("You: ", placeholder="type your question", key="input", on_change=questionAsked)
 clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data)
 
-if input_text:
-    question = input_text
-    input_text = ""
-    question, result, _, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history'])
+def display_iframe(filename, link, contextList):
+    if st.session_state['context_show_option'] == 'context within full source document':
+        try:
+            response = requests.get(link)
+            text = response.text
+            for i, context in enumerate(contextList):
+                # Highlight markup reconstructed (the original tags were lost): it gives each context a scroll anchor.
+                contextSpan = f"<span id='ContextSpan{i}' style='background-color: yellow'> {context}</span>"
+                text = text.replace(context, contextSpan)
+                text = text.replace('\n', '<br><br>')
+
+        except Exception as e:
+            text = "Could not load the document source content"
+    else:
+        text = ""
+        for context in contextList:
+            text = text + context.replace('\n', '<br><br>') + '<br>'
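+
+    # Note: html_content below is filled in via str.format(filename=..., text=...),
+    # so literal braces inside the embedded JavaScript must be doubled as "{{" and "}}".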
+    html_content = """
+        <!-- Viewer markup reconstructed; the original template's tags were not preserved in this text. -->
+        <html><head><script>
+            // Scroll to the first highlighted context once the document loads.
+            window.onload = function() {{
+                var el = document.getElementById('ContextSpan0');
+                if (el) {{ el.scrollIntoView(); }}
+            }};
+        </script></head>
+        <body>
+            <h4>{filename}</h4>
+            <div style="height: 400px; overflow: auto;">{text}</div>
+        </body></html>
+ +
+ + """ + + if st.button("Close"): + placeholder.empty() + + placeholder = st.empty() + with placeholder: + # htmlcontent = html_content.format(link=link, filename=filename) + htmlcontent = html_content.format(filename=filename, text=text) + components.html(htmlcontent, height=500) + pass + + +if st.session_state.askedquestion: + question = st.session_state.askedquestion + st.session_state.askedquestion = "" + question, result, context, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history']) st.session_state['chat_history'].append((question, result)) st.session_state['source_documents'].append(sources) + st.session_state['chat_context'].append(context) + if st.session_state['chat_history']: + history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): message(st.session_state['chat_history'][i][1], key=str(i)) - st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') + if i == history_range.start: + + st.session_state['context_show_option'] = st.selectbox( + 'Choose how to display context used to answer the question when clicking on a document source below:', + context_show_options, + index=context_show_options.index(st.session_state['context_show_option']) + ) + + split_sources = st.session_state['source_documents'][i].split(' \n ') + for src in split_sources: + if src != '': + link = src[1:].split('(')[1][:-1].split(')')[0] + filename = src[1:].split(']')[0] + if st.button(filename, key=filename): + display_iframe(filename, link, st.session_state['chat_context'][i][src]) + else: + st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 153f39b..4e5b92b 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -123,6 +123,21 @@ def get_all_documents(self, k: int = None): 'metadata' : x.metadata, }, result))) + # remove paths from sources to only keep the filename + def filter_sourcesLinks(self, sources): + # use regex to replace all occurences of '[anypath/anypath/somefilename.xxx](the_link)' to '[somefilename](thelink)' in sources + pattern = r'\[[^\]]*?/([^/\]]*?)\]' + + match = re.search(pattern, sources) + while match: + withoutExtensions = match.group(1).split('.')[0] # remove any extension to the name of the source document + sources = sources[:match.start()] + f'[{withoutExtensions}]' + sources[match.end():] + match = re.search(pattern, sources) + + sources = ' \n ' + sources.replace('\n', ' \n ') # add a carriage return after each source + + return sources + def get_semantic_answer_lang_chain(self, question, chat_history): question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False) doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT) @@ -134,15 +149,26 @@ def get_semantic_answer_lang_chain(self, question, chat_history): top_k_docs_for_context= self.k ) result = chain({"question": question, "chat_history": chat_history}) - context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) - sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) - container_sas = self.blob_client.get_container_sas() + # context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) + # context = "\n".join(list(map(lambda x: "{} \n {} 
\n".format(x.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas), x.page_content), result['source_documents']))) + + contextDict ={} + for res in result['source_documents']: + source_key = self.filter_sourcesLinks(res.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)).replace('\n', '').replace(' ', '') + if source_key not in contextDict: + contextDict[source_key] = [] + contextDict[source_key].append(res.page_content) + + sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) + result['answer'] = result['answer'].split('SOURCES:')[0].split('Sources:')[0].split('SOURCE:')[0].split('Source:')[0] sources = sources.replace('_SAS_TOKEN_PLACEHOLDER_', container_sas) - return question, result['answer'], context, sources + sources = self.filter_sourcesLinks(sources) + + return question, result['answer'], contextDict, sources def get_embeddings_model(self): OPENAI_EMBEDDINGS_ENGINE_DOC = os.getenv('OPENAI_EMEBDDINGS_ENGINE', os.getenv('OPENAI_EMBEDDINGS_ENGINE_DOC', 'text-embedding-ada-002')) From 4acf9f434e4f88aabe1f8c952f5403e3bbcfecd6 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 4 Apr 2023 17:23:12 +0200 Subject: [PATCH 02/17] UI Enhancements From 866232ab16f57e0ea198603016842dc6df5584d1 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Thu, 6 Apr 2023 10:33:38 +0200 Subject: [PATCH 03/17] Adding citations reference in the answer --- code/OpenAI_Queries.py | 15 ++++++--------- code/pages/00_Chat.py | 21 ++++++++++++--------- code/pages/01_Add_Document.py | 19 +++++++++++++++++-- code/utilities/customprompt.py | 2 +- code/utilities/helper.py | 31 ++++++++++++++++++++++++++++--- 5 files changed, 64 insertions(+), 24 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 6bef743..35abd31 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -193,16 +193,13 @@ def display_iframe(filename, link, contextList): # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: + st.session_state['response'], sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) st.markdown("Answer:" + st.session_state['response']) - # st.markdown(f'\n\nSources: {sources}') - split_sources = st.session_state['sources'].split(' \n ') - for src in split_sources: - if src != '': - link = src[1:].split('(')[1][:-1].split(')')[0] - filename = src[1:].split(']')[0] - if st.button(filename, key=filename): - context = st.session_state['context'] - display_iframe(filename, link, st.session_state['context'][src]) + + for id in range(len(sourceList)): + if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): + display_iframe(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]]) + with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 83ca9cf..85c578b 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -3,6 +3,7 @@ import streamlit.components.v1 as components from utilities.helper import LLMHelper import requests +import regex as re def clear_chat_data(): st.session_state['input'] = "" @@ -83,7 +84,6 @@ def display_iframe(filename, link, contextList): placeholder = st.empty() with placeholder: - # htmlcontent = 
html_content.format(link=link, filename=filename) htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) pass @@ -101,7 +101,8 @@ def display_iframe(filename, link, contextList): if st.session_state['chat_history']: history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): - message(st.session_state['chat_history'][i][1], key=str(i)) + # message(st.session_state['chat_history'][i][1], key=str(i)) + if i == history_range.start: st.session_state['context_show_option'] = st.selectbox( @@ -110,13 +111,15 @@ def display_iframe(filename, link, contextList): index=context_show_options.index(st.session_state['context_show_option']) ) - split_sources = st.session_state['source_documents'][i].split(' \n ') - for src in split_sources: - if src != '': - link = src[1:].split('(')[1][:-1].split(')')[0] - filename = src[1:].split(']')[0] - if st.button(filename, key=filename): - display_iframe(filename, link, st.session_state['chat_context'][i][src]) + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) + answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + message(answer_with_citations, key=str(i)) + + for id in range(len(sourceList)): + if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): + display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) + else: st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/pages/01_Add_Document.py b/code/pages/01_Add_Document.py index d2c8c25..7a85b8b 100644 --- a/code/pages/01_Add_Document.py +++ b/code/pages/01_Add_Document.py @@ -32,6 +32,12 @@ def delete_row(): st.session_state['data_to_drop'] redisembeddings.delete_document(st.session_state['data_to_drop']) +def add_urls(): + urls = st.session_state['urls'].split('\n') + for url in urls: + if url: + llm_helper.add_embeddings_lc(url) + st.success(f"Embeddings added successfully for {url}") try: # Set page layout to wide screen and menu item @@ -78,7 +84,7 @@ def delete_row(): with st.expander("Add text to the knowledge base", expanded=False): col1, col2 = st.columns([3,1]) with col1: - st.session_state['doc_text'] = st.text_area("Add a new text content and the click on 'Compute Embeddings'", height=600) + st.session_state['doc_text'] = st.text_area("Add a new text content and them click on 'Compute Embeddings'", height=600) with col2: st.session_state['embeddings_model'] = st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], disabled=True) @@ -106,6 +112,15 @@ def delete_row(): with col3: st.button("Convert all files and add embeddings", on_click=remote_convert_files_and_add_embeddings, args=(True,)) + with st.expander("Add URLs to the knowledge base", expanded=True): + col1, col2 = st.columns([3,1]) + with col1: + st.session_state['urls'] = st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100) + + with col2: + st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], 
disabled=True, key="embeddings_model_url") + st.button("Compute Embeddings", on_click=add_urls, key="add_url") + with st.expander("View documents in the knowledge base", expanded=False): # Query RediSearch to get all the embeddings try: @@ -122,4 +137,4 @@ def delete_row(): except Exception as e: - st.error(traceback.format_exc()) + st.error(traceback.format_exc()) \ No newline at end of file diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index 9a70f83..2f27fd1 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -3,7 +3,7 @@ template = """{summaries} Please reply to the question using only the information present in the text above. -Include references to the sources you used to create the answer if those are relevant ("SOURCES"). +Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. If you can't find it, reply politely that the information is not in the knowledge base. Question: {question} Answer:""" diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 4e5b92b..4caa736 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -138,6 +138,19 @@ def filter_sourcesLinks(self, sources): return sources + def insert_citations_in_answer(self, answer, filenameList): + pattern = r'\[\[(.*?)\]\]' + match = re.search(pattern, answer) + while match: + filename = match.group(1).split('.')[0] # remove any extension to the name of the source document + if filename in filenameList: + filenameIndex = filenameList.index(filename) + 1 + answer = answer[:match.start()] + '$^{' + f'{filenameIndex}' + '}$' + answer[match.end():] + else: + answer = answer[:match.start()] + '$^{' + f'{filename}' + '}$' + answer[match.end():] + match = re.search(pattern, answer) + return answer + def get_semantic_answer_lang_chain(self, question, chat_history): question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False) doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT) @@ -151,9 +164,6 @@ def get_semantic_answer_lang_chain(self, question, chat_history): result = chain({"question": question, "chat_history": chat_history}) container_sas = self.blob_client.get_container_sas() - # context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) - # context = "\n".join(list(map(lambda x: "{} \n {} \n".format(x.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas), x.page_content), result['source_documents']))) - contextDict ={} for res in result['source_documents']: source_key = self.filter_sourcesLinks(res.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)).replace('\n', '').replace(' ', '') @@ -180,3 +190,18 @@ def get_embeddings_model(self): def get_completion(self, prompt, **kwargs): return self.llm(prompt) + + def get_links_filenames(self, answer, sources): + split_sources = sources.split(' \n ') # soures are expected to be of format ' \n [filename1.ext](sourcelink1) \n [filename2.ext](sourcelink2) \n [filename3.ext](sourcelink3) \n ' + srcList = [] + linkList = [] + filenameList = [] + for src in split_sources: + if src != '': + srcList.append(src) + link = src[1:].split('(')[1][:-1].split(')')[0] # get the link + linkList.append(link) + filename = 
src[1:].split(']')[0] # retrieve the source filename + filenameList.append(filename) + answer = self.insert_citations_in_answer(answer, filenameList) # Add (1), (2), (3) to the answer to indicate the source of the answer + return answer, srcList, linkList, filenameList From e17812e0b5871b2e09ffdca6f5e5fe0aab6825ad Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Thu, 6 Apr 2023 11:03:19 +0200 Subject: [PATCH 04/17] Display answer first, selection menu of context after the answer --- code/pages/00_Chat.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 85c578b..dac5219 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -104,18 +104,17 @@ def display_iframe(filename, link, contextList): # message(st.session_state['chat_history'][i][1], key=str(i)) if i == history_range.start: + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) + answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + message(answer_with_citations, key=str(i)) st.session_state['context_show_option'] = st.selectbox( 'Choose how to display context used to answer the question when clicking on a document source below:', context_show_options, index=context_show_options.index(st.session_state['context_show_option']) ) - - answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) - st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html - message(answer_with_citations, key=str(i)) - + for id in range(len(sourceList)): if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) From 8deb4836c5c75b10d600f57271955e4afb220085 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Fri, 7 Apr 2023 20:30:28 +0200 Subject: [PATCH 05/17] Adding references, citations, and follow-up questions --- README.md | 2 +- code/OpenAI_Queries.py | 109 ++++++++++++++++++---- code/pages/00_Chat.py | 160 +++++++++++++++++++++++++-------- code/pages/01_Add_Document.py | 23 ++--- code/utilities/customprompt.py | 4 + code/utilities/helper.py | 24 ++++- 6 files changed, 253 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index baad9b1..49a380c 100644 --- a/README.md +++ b/README.md @@ -198,4 +198,4 @@ This presentation, demonstration, and demonstration model do not give you or you The information contained in this presentation, demonstration and demonstration model represents the current view of Microsoft on the issues discussed as of the date of presentation and/or demonstration, for the duration of your access to the demonstration model. 
Because Microsoft must respond to changing market conditions, it should not be interpreted to be a commitment on the part of Microsoft, and Microsoft cannot guarantee the accuracy of any information presented after the date of presentation and/or demonstration and for the duration of your access to the demonstration model. -No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies. +No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies. \ No newline at end of file diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 35abd31..1dfff4c 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -66,6 +66,21 @@ def check_deployment(): st.error(traceback.format_exc()) +def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): + htmlstr = """ """ + + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + components.html(f"{htmlstr}", height=0, width=0) + @st.cache_data() def get_languages(): return llm_helper.translator.get_available_languages() @@ -76,16 +91,36 @@ def get_languages(): default_question = "" default_answer = "" + if 'question' not in st.session_state: st.session_state['question'] = default_question - # if 'prompt' not in st.session_state: - # st.session_state['prompt'] = os.getenv("QUESTION_PROMPT", "Please reply to the question using only the information present in the text above. 
If you can't find it, reply 'Not in the text'.\nQuestion: _QUESTION_\nAnswer:").replace(r'\n', '\n') if 'response' not in st.session_state: st.session_state['response'] = default_answer if 'context' not in st.session_state: st.session_state['context'] = "" if 'sources' not in st.session_state: st.session_state['sources'] = "" + if 'followup_questions' not in st.session_state: + st.session_state['followup_questions'] = [] + if 'input_message_key' not in st.session_state: + st.session_state ['input_message_key'] = 1 + if 'do_not_process_question' not in st.session_state: + st.session_state['do_not_process_question'] = False + + if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = default_question + + if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' + + + if 'tab_context' not in st.session_state: + st.session_state['tab_context'] = 'Not opened yet' + else: + tmp=st.session_state['tab_context'] + tmp2=st.session_state['question'] + if st.session_state['question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Chat': + st.session_state['tab_context'] = 'Open_Queries' # Set page layout to wide screen and menu item menu_items = { @@ -109,6 +144,7 @@ def get_languages(): col1, col2, col3 = st.columns([2,2,2]) with col1: + ChangeButtonStyle("Check deployment", "#885555") st.button("Check deployment", on_click=check_deployment) with col3: with st.expander("Settings"): @@ -121,15 +157,25 @@ def get_languages(): # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1) st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language') - if 'askedquestion' not in st.session_state: - st.session_state.askedquestion = '' + # Callback to display document sources + def show_document_source(filename, link, contextList): + st.session_state['do_not_process_question'] = True + display_iframe(filename, link, contextList) + + # Callback to assign the follow-up question is selected by the user + def ask_followup_question(followup_question): + st.session_state.askedquestion = followup_question + st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 def questionAsked(): - st.session_state.askedquestion = st.session_state.inputquestion + st.session_state.askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])] - question = st.text_input("OpenAI Semantic Answer", default_question, key='inputquestion', on_change=questionAsked) + question = st.text_input("Azure OpenAI Semantic Answer", value=st.session_state['askedquestion'], key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) + # Display the context(s) associated with a source document used to andwer, with automaic scroll to the yellow highlighted context def display_iframe(filename, link, contextList): + st.session_state['do_not_process_question'] = True + st.session_state['chat_askedquestion'] = st.session_state.question if st.session_state['context_show_option'] == 'context within full source document': try: response = requests.get(link) @@ -171,35 +217,50 @@ def display_iframe(filename, link, contextList): """ - if st.button("Close"): - st.placeholder.empty() + def close_iframe(): + placeholder.empty() + st.session_state['do_not_process_question'] = True + st.button("Close", on_click=close_iframe) + placeholder = st.empty() with placeholder: - # htmlcontent = html_content.format(link=link, 
filename=filename) htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) + pass - - if 'context_show_option' not in st.session_state: - st.session_state['context_show_option'] = 'context within full source document' + tmp=st.session_state['tab_context'] + tmp2=st.session_state['question'] + if st.session_state['tab_context'] != 'Open_Queries' and st.session_state['question'] != '' and st.session_state['question'] != st.session_state['followup_questions']: + st.session_state['tab_context'] = 'Open_Queries' + st.session_state['do_not_process_question'] = True + ask_followup_question(st.session_state['question']) # Answer the question if any - if st.session_state.askedquestion != '': + if st.session_state.askedquestion != '' and st.session_state['do_not_process_question'] != True: st.session_state['question'] = st.session_state.askedquestion st.session_state.askedquestion = "" - st.session_state['question'], st.session_state['response'], st.session_state['context'], st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + st.session_state['question'], \ + st.session_state['response'], \ + st.session_state['context'], \ + st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + st.session_state['response'], followup_questions_list = llm_helper.extract_followupquestions(st.session_state['response']) + st.session_state['followup_questions'] = followup_questions_list + + st.session_state['do_not_process_question'] = False # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: st.session_state['response'], sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) - st.markdown("Answer:" + st.session_state['response']) + st.markdown("**Answer:**" + st.session_state['response']) + if st.session_state['sources'] or st.session_state['context']: + # Buttons to display the context used to answer for id in range(len(sourceList)): - if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): - display_iframe(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]]) + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]], )) + # Details on the question and answer context with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): @@ -207,10 +268,20 @@ def display_iframe(filename, link, contextList): for context_text in st.session_state['context'][content_source]: st.markdown(f"{context_text}") - # theContext = llm_helper.filter_sourcesLinks(st.session_state['context'].replace('$', '\$')) - # st.markdown(theContext) st.markdown(f"SOURCES: {st.session_state['sources']}") + # Display proposed follow-up questions which can be clicked on to ask that question automatically + if len(st.session_state['followup_questions']) > 0: + st.markdown('**Proposed follow-up questions:**') + with st.container(): + for questionId, followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + st.button(followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, )) + + for questionId, 
followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + ChangeButtonStyle(followup_question, "#5555FF", wch_border_style='none') + if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") st.write(f"{llm_helper.translator.translate(st.session_state['response'], available_languages[st.session_state['translation_language']])}") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index dac5219..c0b11df 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -6,39 +6,75 @@ import regex as re def clear_chat_data(): - st.session_state['input'] = "" st.session_state['chat_history'] = [] - st.session_state['source_documents'] = [] + st.session_state['chat_source_documents'] = [] st.session_state['chat_context'] = [] - st.session_state['context_show_option'] = 'context within full source document' - st.session_state['askedquestion'] = '' + st.session_state['chat_context_show_option'] = 'context within full source document' + st.session_state['chat_askedquestion'] = '' + st.session_state['chat_question'] = '' + st.session_state['chat_followup_questions'] = [] + st.session_state['do_not_process_question'] = False + st.session_state['tab_context'] = 'Not opened yet' + # Initialize chat history +if 'chat_question' not in st.session_state: + st.session_state['chat_question'] = '' +if 'chat_askedquestion' not in st.session_state: + st.session_state.chat_askedquestion = '' if 'chat_history' not in st.session_state: st.session_state['chat_history'] = [] -if 'source_documents' not in st.session_state: - st.session_state['source_documents'] = [] +if 'chat_source_documents' not in st.session_state: + st.session_state['chat_source_documents'] = [] if 'chat_context' not in st.session_state: st.session_state['chat_context'] = [] - -context_show_options = ('extracted context only', 'context within full source document') -if 'context_show_option' not in st.session_state: - st.session_state['context_show_option'] = 'context within full source document' +if 'chat_followup_questions' not in st.session_state: + st.session_state['chat_followup_questions'] = [] +if 'input_message_key' not in st.session_state: + st.session_state ['input_message_key'] = 1 + +if 'do_not_process_question' not in st.session_state: + st.session_state['do_not_process_question'] = False + +chat_context_show_options = ('extracted context only', 'context within full source document') +if 'chat_context_show_option' not in st.session_state: + st.session_state['chat_context_show_option'] = 'context within full source document' + +if 'tab_context' not in st.session_state: + st.session_state['tab_context'] = 'Not opened yet' +else: + if st.session_state['chat_question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Open_Queries': + st.session_state['tab_context'] = 'Chat' +tmp=st.session_state['tab_context'] +tmp2=st.session_state['chat_question'] llm_helper = LLMHelper() -if 'askedquestion' not in st.session_state: - st.session_state.askedquestion = '' -def questionAsked(): - st.session_state.askedquestion = st.session_state.input +def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): + htmlstr = """ """ + + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + components.html(f"{htmlstr}", height=0, width=0) -# Chat -input_text = 
st.text_input("You: ", placeholder="type your question", key="input", on_change=questionAsked) -clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +def questionAsked(): + st.session_state.chat_askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])] + +# Display the context(s) associated with a source document used to andwer, with automaic scroll to the yellow highlighted context def display_iframe(filename, link, contextList): - if st.session_state['context_show_option'] == 'context within full source document': + st.session_state['do_not_process_question'] = True + st.session_state['chat_askedquestion'] = st.session_state.chat_question + if st.session_state['chat_context_show_option'] == 'context within full source document': try: response = requests.get(link) text = response.text @@ -79,46 +115,96 @@ def display_iframe(filename, link, contextList): """ - if st.button("Close"): + def close_iframe(): placeholder.empty() + st.session_state['do_not_process_question'] = True + + st.button("Close", on_click=close_iframe) placeholder = st.empty() with placeholder: htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) + pass -if st.session_state.askedquestion: - question = st.session_state.askedquestion - st.session_state.askedquestion = "" - question, result, context, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history']) - st.session_state['chat_history'].append((question, result)) - st.session_state['source_documents'].append(sources) +# Callback to assign the follow-up question is selected by the user +def ask_followup_question(followup_question): + st.session_state.chat_askedquestion = followup_question + st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 + +tmp=st.session_state['tab_context'] +tmp2=st.session_state['chat_question'] +# Reset the right asked question to the input box when this page is reopened after switching to the OpenAI_Queries page +if st.session_state['tab_context'] != 'Chat' and st.session_state['chat_question'] != '' and st.session_state['chat_question'] != st.session_state['chat_askedquestion']: + st.session_state['tab_context'] = 'Chat' + st.session_state['do_not_process_question'] = True + ask_followup_question(st.session_state['chat_question']) + + +# Chat +input_text = st.text_input("You: ", placeholder="type your question", value=st.session_state.chat_askedquestion, key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) + +clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +ChangeButtonStyle("Clear chat", "#885555") + +def show_document_source(filename, link, contextList): + st.session_state['do_not_process_question'] = True + display_iframe(filename, link, contextList) + +# If a question is asked execute the request to get the result, context, sources and up to 3 follow-up questions proposals +if st.session_state.chat_askedquestion and st.session_state.do_not_process_question != True: + st.session_state['chat_question'] = st.session_state.chat_askedquestion + st.session_state.chat_askedquestion = "" + st.session_state['chat_question'], result, context, sources = llm_helper.get_semantic_answer_lang_chain(st.session_state['chat_question'], st.session_state['chat_history']) + result, chat_followup_questions_list = llm_helper.extract_followupquestions(result) + 
st.session_state['chat_history'].append((st.session_state['chat_question'], result)) + st.session_state['chat_source_documents'].append(sources) st.session_state['chat_context'].append(context) + st.session_state['chat_followup_questions'] = chat_followup_questions_list + +st.session_state['do_not_process_question'] = False - +# Displays the chat history if st.session_state['chat_history']: history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): - # message(st.session_state['chat_history'][i][1], key=str(i)) + # This history entry is the latest one - also show follow-up questions, buttons to access source(s) context(s) if i == history_range.start: - answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i]) st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html message(answer_with_citations, key=str(i)) - st.session_state['context_show_option'] = st.selectbox( + # Selectbox to choose how to display the context(s) associated with the clicked source document name + st.session_state['chat_context_show_option'] = st.selectbox( 'Choose how to display context used to answer the question when clicking on a document source below:', - context_show_options, - index=context_show_options.index(st.session_state['context_show_option']) + chat_context_show_options, + index=chat_context_show_options.index(st.session_state['chat_context_show_option']) ) - + + # Buttons to display the context(s) associated with the clicked source document name for id in range(len(sourceList)): - if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): - display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) + + # Display proposed follow-up questions which can be clicked on to ask that question automatically + if len(st.session_state['chat_followup_questions']) > 0: + st.markdown('**Proposed follow-up questions:**') + with st.container(): + for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): + if followup_question: + st.button(followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, )) + for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): + if followup_question: + ChangeButtonStyle(followup_question, "#5555FF", wch_border_style='none') + + # The old questions and answers within the history else: - st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') - message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does 
not get Latex nor html + message(answer_with_citations, key=str(i)) + st.markdown(f'\n\nSources: {st.session_state["chat_source_documents"][i]}') + message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/pages/01_Add_Document.py b/code/pages/01_Add_Document.py index 7a85b8b..ea0461c 100644 --- a/code/pages/01_Add_Document.py +++ b/code/pages/01_Add_Document.py @@ -4,13 +4,15 @@ import requests import mimetypes import traceback +import chardet from utilities.helper import LLMHelper import uuid from redis.exceptions import ResponseError + def upload_text_and_embeddings(): file_name = f"{uuid.uuid4()}.txt" - source_url = llm_helper.blob_client.upload_file(st.session_state['doc_text'], file_name=file_name, content_type='text/plain') + source_url = llm_helper.blob_client.upload_file(st.session_state['doc_text'], file_name=file_name, content_type='text/plain; charset=utf-8') llm_helper.add_embeddings_lc(source_url) st.success("Embeddings added successfully.") @@ -27,7 +29,6 @@ def remote_convert_files_and_add_embeddings(process_all=False): except Exception as e: st.error(traceback.format_exc()) - def delete_row(): st.session_state['data_to_drop'] redisembeddings.delete_document(st.session_state['data_to_drop']) @@ -39,6 +40,14 @@ def add_urls(): llm_helper.add_embeddings_lc(url) st.success(f"Embeddings added successfully for {url}") +def upload_file(bytes_data: bytes, file_name: str): + # Upload a new file + st.session_state['filename'] = file_name + content_type = mimetypes.MimeTypes().guess_type(file_name)[0] + charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else '' + st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type+charset) + + try: # Set page layout to wide screen and menu item menu_items = { @@ -62,11 +71,7 @@ def add_urls(): bytes_data = uploaded_file.getvalue() if st.session_state.get('filename', '') != uploaded_file.name: - # Upload a new file - st.session_state['filename'] = uploaded_file.name - content_type = mimetypes.MimeTypes().guess_type(uploaded_file.name)[0] - st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type) - + upload_file(bytes_data, uploaded_file.name) converted_filename = '' if uploaded_file.name.endswith('.txt'): # Add the text to the embeddings @@ -99,9 +104,7 @@ def add_urls(): if st.session_state.get('filename', '') != up.name: # Upload a new file - st.session_state['filename'] = up.name - content_type = mimetypes.MimeTypes().guess_type(up.name)[0] - st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type) + upload_file(bytes_data, up.name) if up.name.endswith('.txt'): # Add the text to the embeddings llm_helper.blob_client.upsert_blob_metadata(up.name, {'converted': "true"}) diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index 2f27fd1..d854dac 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -5,6 +5,10 @@ Please reply to the question using only the information present in the text above. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. 
[[info1.pdf]][[info2.txt]]. If you can't find it, reply politely that the information is not in the knowledge base. +After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook. +Only use double angle brackets to reference the questions, e.g. <>. +Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'. +Try not to repeat questions that have already been asked. Question: {question} Answer:""" diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 4caa736..2bcf8f7 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -138,6 +138,26 @@ def filter_sourcesLinks(self, sources): return sources + def extract_followupquestions(self, answer): + followupTag = answer.find('Follow-up Questions') + folloupQuestions = answer.find('<<') + + # take min of followupTag and folloupQuestions if not -1 to avoid taking the followup questions if there is no followupTag + followupTag = min(followupTag, folloupQuestions) if followupTag != -1 and folloupQuestions != -1 else max(followupTag, folloupQuestions) + answer_without_followupquestions = answer[:followupTag] if followupTag != -1 else answer + followup_questions = answer[followupTag:].strip() if followupTag != -1 else '' + + # Extract the followup questions as a list + pattern = r'\<\<(.*?)\>\>' + match = re.search(pattern, followup_questions) + followup_questions_list = [] + while match: + followup_questions_list.append(followup_questions[match.start()+2:match.end()-2]) + followup_questions = followup_questions[match.end():] + match = re.search(pattern, followup_questions) + + return answer_without_followupquestions, followup_questions_list + def insert_citations_in_answer(self, answer, filenameList): pattern = r'\[\[(.*?)\]\]' match = re.search(pattern, answer) @@ -172,7 +192,7 @@ def get_semantic_answer_lang_chain(self, question, chat_history): contextDict[source_key].append(res.page_content) sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) - + result['answer'] = result['answer'].split('SOURCES:')[0].split('Sources:')[0].split('SOURCE:')[0].split('Source:')[0] sources = sources.replace('_SAS_TOKEN_PLACEHOLDER_', container_sas) @@ -204,4 +224,4 @@ def get_links_filenames(self, answer, sources): filename = src[1:].split(']')[0] # retrieve the source filename filenameList.append(filename) answer = self.insert_citations_in_answer(answer, filenameList) # Add (1), (2), (3) to the answer to indicate the source of the answer - return answer, srcList, linkList, filenameList + return answer, srcList, linkList, filenameList \ No newline at end of file From b244859d8ccd0a27931eb1b6474afd508e39a05d Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 11 Apr 2023 13:57:04 +0200 Subject: [PATCH 06/17] Source Buttons styling and handling buttons with quotes in the name and handling latin characters in answers --- code/OpenAI_Queries.py | 36 +++++++++++++++++++++++++----------- code/pages/00_Chat.py | 29 ++++++++++++++++++++--------- code/utilities/helper.py | 25 ++++++++++++++++++++++--- 3 files changed, 67 insertions(+), 23 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 1dfff4c..cb5d785 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -8,6 +8,7 @@ from utilities.helper import LLMHelper import requests +import regex as re import logging logger = 
logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) @@ -68,8 +69,10 @@ def check_deployment(): def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): htmlstr = """ """ + // console.log(str_wgt_txt + ' ( ' + element_type + ' ) : ' + parentNode + ' ( ' + parent_type + ' , ' + parentNode.innerText + ' )'); + if (element_type == 'BUTTON') {{ + elements[i].style.color = '{wch_hex_colour}'; + let border_style = '{wch_border_style}'; + if (border_style.length > 0) {{ + elements[i].style.border ='{wch_border_style}'; + elements[i].style.outline ='{wch_border_style}'; + elements[i].addEventListener('focus', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px #FFFFFF'; + this.style.backgroundColor = "#FFFFFF"; + // console.log(this.innerText + ' FOCUS'); + }}); + elements[i].addEventListener('hover', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px #FFFFFF'; + this.style.backgroundColor = "#FFFFFF"; + // console.log(this.innerText + ' HOVER'); + }}); + }} + if ('{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} + else if (element_type == 'P' && '{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} }} + """ - htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize) components.html(f"{htmlstr}", height=0, width=0) @@ -134,6 +166,7 @@ def close_iframe(): # Callback to assign the follow-up question is selected by the user def ask_followup_question(followup_question): + st.session_state['tab_context'] = 'Chat' # Prevents side effect when first click after loading the page st.session_state.chat_askedquestion = followup_question st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 @@ -145,10 +178,11 @@ def ask_followup_question(followup_question): # Chat +clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +ChangeButtonStyle("Clear chat", "#ADCDE7", wch_border_style="none", wch_textsize="10px") + input_text = st.text_input("You: ", placeholder="type your question", value=st.session_state.chat_askedquestion, key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) -clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) -ChangeButtonStyle("Clear chat", "#885555") def show_document_source(filename, link, contextList): st.session_state['do_not_process_question'] = True @@ -179,8 +213,11 @@ def show_document_source(filename, link, contextList): answer_with_citations, sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i]) st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html - message(answer_with_citations, key=str(i)) + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]).strip() # message() does not get Latex nor html + # message(answer_with_citations key=str(i)) + answer_message_height = int((len(answer_with_citations) / 22) * 1.1 * 8) + 
st.text_area(label='', value=answer_with_citations, height=answer_message_height, key=str(i))
+            st.write("<br>", unsafe_allow_html=True)
+
+            # Display proposed follow-up questions which can be clicked on to ask that question automatically
+            if len(st.session_state['chat_followup_questions']) > 0:
+                st.markdown('**Proposed follow-up questions:**')
+                with st.container():
+                    for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']):
+                        if followup_question:
+                            str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
+                            st.button(str_followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, ))
+
+            if len(sourceList) > 0:
+                st.write("<br><br>
", unsafe_allow_html=True) + # Selectbox to choose how to display the context(s) associated with the clicked source document name + st.session_state['chat_context_show_option'] = st.selectbox( + 'Choose how to display context used to answer the question when clicking on a document source below:', + chat_context_show_options, + index=chat_context_show_options.index(st.session_state['chat_context_show_option']) + ) - # Buttons to display the context(s) associated with the clicked source document name - for id in range(len(sourceList)): - st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) + # Buttons to display the context(s) associated with the clicked source document name + for id in range(len(sourceList)): + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) - # Source Buttons Styles - for id in range(len(sourceList)): - if filenameList[id] in matchedSourcesList: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none') - else: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#884422", wch_border_style='none') + # Source Buttons Styles + for id in range(len(sourceList)): + if filenameList[id] in matchedSourcesList: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px') + else: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px') for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): if followup_question: str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) - ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none') + ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px') # The old questions and answers within the history From def582eabb7a5bcf967b3156130f25cd9b251efc Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Mon, 17 Apr 2023 10:12:36 +0200 Subject: [PATCH 10/17] Synching with latest main branch --- README.md | 6 +++++- code/OpenAI_Queries.py | 10 ++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 49a380c..7175228 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,9 @@ Therefore, we have provided a way for you to continue using the previous format If you want to move to the new format, please go to: - "Add Document" -> "Add documents in Batch" and click on "Convert all files and add embeddings" to reprocess your documents. +# Use the Repo with Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4) +By default, the repo uses an Instruction based model (like text-davinci-003) for QnA and Chat experience. +If you want to use a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4), please change the environment variables as described [here](#environment-variables) # Running this repo You have multiple options to run the code: @@ -159,7 +162,8 @@ Here is the explanation of the parameters: | App Setting | Value | Note | | --- | --- | ------------- | -|OPENAI_ENGINE|text-davinci-003|Instruction engine deployed in your Azure OpenAI resource| +|OPENAI_ENGINE|text-davinci-003|Engine deployed in your Azure OpenAI resource. E.g. 
Instruction based model: text-davinci-003 or Chat based model: gpt-35-turbo or gpt-4-32k or gpt-4. Please use the deployment name and not the model name.| +|OPENAI_DEPLOYMENT_TYPE | Text | Text for Instruction engines (text-davinci-003),
Chat for Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4) | |OPENAI_EMBEDDINGS_ENGINE_DOC | text-embedding-ada-002 | Embedding engine for documents deployed in your Azure OpenAI resource| |OPENAI_EMBEDDINGS_ENGINE_QUERY | text-embedding-ada-002 | Embedding engine for query deployed in your Azure OpenAI resource| |OPENAI_API_BASE | https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/ | Your Azure OpenAI Resource name. Get it in the [Azure Portal](https://portal.azure.com)| diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index a5f046c..05205a5 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -18,11 +18,13 @@ def check_deployment(): #\ 1. Check if the llm is working try: llm_helper = LLMHelper() - llm_helper.llm("Generate a joke!") + llm_helper.get_completion("Generate a joke!") st.success("LLM is working!") except Exception as e: - st.error(f"""LLM is not working. - Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}. + st.error(f"""LLM is not working. + Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}. + If you are using an Instructions based deployment (text-davinci-003), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Text or delete the environment variable OPENAI_DEPLOYMENT_TYPE. + If you are using a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Chat. Then restart your application. """) st.error(traceback.format_exc()) @@ -33,7 +35,7 @@ def check_deployment(): st.success("Embedding is working!") except Exception as e: st.error(f"""Embedding model is not working. - Please check you have a deployment name {llm_helper.model} in your Azure OpenAI resource {llm_helper.api_base}. + Please check you have a deployment named "text-embedding-ada-002" for "text-embedding-ada-002" model in your Azure OpenAI resource {llm_helper.api_base}. Then restart your application. 
""") st.error(traceback.format_exc()) From 849a6f935c5923885f9f4a143cac5d7064130a0e Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Mon, 17 Apr 2023 16:07:18 +0200 Subject: [PATCH 11/17] Handling brwoser's dark mode and enhancing Prompt engineering --- code/OpenAI_Queries.py | 75 ++++++++++++++++++++++++++++------ code/pages/00_Chat.py | 26 +++++++++--- code/utilities/customprompt.py | 6 ++- 3 files changed, 88 insertions(+), 19 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 05205a5..e9862fb 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -69,21 +69,56 @@ def check_deployment(): st.error(traceback.format_exc()) -def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): - htmlstr = """ """ + // console.log(str_wgt_txt + ' ( ' + element_type + ' ) : ' + parentNode + ' ( ' + parent_type + ' , ' + parentNode.innerText + ' )'); + if (element_type == 'BUTTON') {{ + elements[i].style.color = '{wch_hex_colour}'; + let border_style = '{wch_border_style}'; + if (border_style.length > 0) {{ + elements[i].style.border ='{wch_border_style}'; + elements[i].style.outline ='{wch_border_style}'; + elements[i].addEventListener('focus', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px ' + backgroundColor; + this.style.backgroundColor = '"' + backgroundColor + '"'; + // console.log(this.innerText + ' FOCUS'); + }}); + elements[i].addEventListener('hover', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px ' + backgroundColor; + this.style.backgroundColor = '"' + backgroundColor + '"'; + // console.log(this.innerText + ' HOVER'); + }}); + }} + if ('{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} + else if (element_type == 'P' && '{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} }} + """ - htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize) components.html(f"{htmlstr}", height=0, width=0) @st.cache_data() @@ -147,7 +182,7 @@ def get_languages(): col1, col2, col3 = st.columns([2,2,2]) with col1: - ChangeButtonStyle("Check deployment", "#885555") + ChangeButtonStyle("Check deployment", "#ADCDE7", wch_border_style="none", wch_textsize="10px") st.button("Check deployment", on_click=check_deployment) with col3: with st.expander("Settings"): @@ -218,6 +253,18 @@ def display_iframe(filename, link, contextList):
+
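The OPENAI_DEPLOYMENT_TYPE switch documented above boils down to choosing between the completion endpoint and the chat endpoint of the same Azure OpenAI resource. The sketch below illustrates the idea with the openai 0.x SDK called directly; the names and fallback defaults are illustrative, not the repo's exact helper code.

```python
import os
import openai

# Azure OpenAI wiring, mirroring the README's environment variables.
openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_version = "2023-03-15-preview"

DEPLOYMENT = os.getenv("OPENAI_ENGINE", "text-davinci-003")
DEPLOYMENT_TYPE = os.getenv("OPENAI_DEPLOYMENT_TYPE", "Text")

def get_completion(prompt: str) -> str:
    if DEPLOYMENT_TYPE == "Chat":
        # Chat deployments (gpt-35-turbo, gpt-4) only accept the chat endpoint.
        response = openai.ChatCompletion.create(
            engine=DEPLOYMENT,
            messages=[{"role": "user", "content": prompt}],
        )
        return response["choices"][0]["message"]["content"]
    # Instruction deployments (text-davinci-003) use plain completions.
    response = openai.Completion.create(engine=DEPLOYMENT, prompt=prompt, max_tokens=200)
    return response["choices"][0]["text"]
```

Keeping the branch in one place is what lets the "Check deployment" button surface a wrong OPENAI_DEPLOYMENT_TYPE as a single readable error instead of a stack trace.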
""" @@ -260,10 +307,12 @@ def close_iframe(): # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: st.session_state['response'], sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) + st.write("
", unsafe_allow_html=True) st.markdown("**Answer:**" + st.session_state['response']) # Display proposed follow-up questions which can be clicked on to ask that question automatically if len(st.session_state['followup_questions']) > 0: + st.write("
", unsafe_allow_html=True) st.markdown('**Proposed follow-up questions:**') with st.container(): for questionId, followup_question in enumerate(st.session_state['followup_questions']): @@ -273,11 +322,13 @@ def close_iframe(): if st.session_state['sources'] or st.session_state['context']: # Buttons to display the context used to answer + st.write("
", unsafe_allow_html=True) st.markdown('**Document sources:**') for id in range(len(sourceList)): st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]], )) # Details on the question and answer context + st.write("

", unsafe_allow_html=True) with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): @@ -291,14 +342,14 @@ def close_iframe(): # Source Buttons Styles for id in range(len(sourceList)): if filenameList[id] in matchedSourcesList: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none') + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px') else: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none') + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px') for questionId, followup_question in enumerate(st.session_state['followup_questions']): if followup_question: str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) - ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none') + ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px') if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index baf5b63..cdaa62e 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -52,7 +52,11 @@ def clear_chat_data(): def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''): - htmlstr = """ """ diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index d854dac..17a3fab 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -2,9 +2,11 @@ from langchain.prompts import PromptTemplate template = """{summaries} -Please reply to the question using only the information present in the text above. -Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. +Please reply to the question using only the information present in the text above. +Detect the langage of the question and answer in the same language. If you can't find it, reply politely that the information is not in the knowledge base. +Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. +If asked for enumerations list all of them and do not invent any. After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook. Only use double angle brackets to reference the questions, e.g. <>. Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'. 
From 733a60cd8a6f8f16dd56630c2c4fd63a8b865af9 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Mon, 17 Apr 2023 20:30:20 +0200
Subject: [PATCH 12/17] Updating the prompt to be generic

---
 code/utilities/customprompt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py
index 17a3fab..a709863 100644
--- a/code/utilities/customprompt.py
+++ b/code/utilities/customprompt.py
@@ -7,7 +7,7 @@
 If you can't find it, reply politely that the information is not in the knowledge base.
 Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]].
 If asked for enumerations list all of them and do not invent any.
-After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook.
+After answering the question generate three very brief follow-up questions that the user would likely ask next.
 Only use double angle brackets to reference the questions, e.g. <<Are there exclusions for prescriptions?>>.
 Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'.
 Try not to repeat questions that have already been asked.
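The template is consumed through LangChain's PromptTemplate, so the placeholders are ordinary format variables. A toy render, with a shortened stand-in template and a hypothetical source line, shows the mechanics:

```python
from langchain.prompts import PromptTemplate

# Shortened stand-in for the repo's template; only the plumbing is the point here.
template = """{summaries}

Please reply to the question using only the information present in the text above.

Question: {question}
Answer:"""

PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
print(PROMPT.format(summaries="plan.pdf: The deductible is $500.",
                    question="What is the deductible?"))
```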
From e8fdcde11fda37ee899a26465ea969eed67e9d29 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Mon, 17 Apr 2023 23:13:15 +0200
Subject: [PATCH 13/17] Removing CR from pattern for non-ascii characters

---
 code/OpenAI_Queries.py            |  2 --
 code/pages/00_Chat.py             |  2 --
 code/pages/04_Index_Management.py | 63 ++++++++++++++++++++++++++++-
 code/utilities/helper.py          | 55 ++++++++++++++++++++++-----
 4 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py
index e9862fb..fc349d7 100644
--- a/code/OpenAI_Queries.py
+++ b/code/OpenAI_Queries.py
@@ -260,9 +260,7 @@ def display_iframe(filename, link, contextList):
             let textColor = '#222222';
             if (prefersDark) {{ textColor = '#EEEEEE'; }}
             var body = frame.contentWindow.document.querySelector('body');
-            console.log(textColor);
             body.style.color = textColor;
-            console.log(body.style.color);
         }};

diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index cdaa62e..7843f73 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -157,9 +157,7 @@ def display_iframe(filename, link, contextList):
             let textColor = '#222222';
             if (prefersDark) {{ textColor = '#EEEEEE'; }}
             var body = frame.contentWindow.document.querySelector('body');
-            console.log(textColor);
             body.style.color = textColor;
-            console.log(body.style.color);
         }};

diff --git a/code/pages/04_Index_Management.py b/code/pages/04_Index_Management.py
index e2e7892..0cb61cf 100644
--- a/code/pages/04_Index_Management.py
+++ b/code/pages/04_Index_Management.py
@@ -2,6 +2,7 @@
 import os
 import traceback
 from utilities.helper import LLMHelper
+import streamlit.components.v1 as components
 
 def delete_embedding():
     llm_helper.vector_store.delete_keys([f"doc:{st.session_state['embedding_to_drop']}"])
@@ -11,6 +12,58 @@ def delete_file():
     embeddings_to_delete = list(map(lambda x: f"doc:{x}", embeddings_to_delete))
     llm_helper.vector_store.delete_keys(embeddings_to_delete)
 
+def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''):
+    htmlstr = """ """
+    htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize)
+    components.html(f"{htmlstr}", height=0, width=0)
+
 try:
     # Set page layout to wide screen and menu item
     menu_items = {
@@ -28,32 +81,40 @@ def delete_file():
 
     # Query RediSearch to get all the embeddings
     data = llm_helper.get_all_documents(k=1000)
 
-    if len(data) == 0:
+    nb_embeddings = len(data)
+
+    if nb_embeddings == 0:
         st.warning("No embeddings found. Go to the 'Add Document' tab to insert your docs.")
     else:
         st.dataframe(data, use_container_width=True)
         st.download_button("Download data", data.to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings')
+        ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="10px")
 
         st.text("")
         st.text("")
         col1, col2, col3, col4 = st.columns([3,2,2,1])
         with col1:
             st.selectbox("Embedding id to delete", data.get('key',[]), key="embedding_to_drop")
+            # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="10px")
         with col2:
             st.text("")
             st.text("")
             st.button("Delete embedding", on_click=delete_embedding)
+            ChangeButtonStyle("Delete embedding", "#ADCDE7", wch_textsize="10px")
         with col3:
             st.selectbox("File name to delete", set(data.get('filename',[])), key="file_to_drop")
+            # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="10px")
         with col4:
             st.text("")
             st.text("")
             st.button("Delete file", on_click=delete_file)
+            ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="10px")
 
         st.text("")
         st.text("")
         st.button("Delete all embeddings", on_click=llm_helper.vector_store.delete_keys_pattern, args=("doc*",), type="secondary")
+        ChangeButtonStyle("Delete all embeddings", "#ADCDE7", wch_textsize="10px")
 
 except Exception as e:
     st.error(traceback.format_exc())

diff --git a/code/utilities/helper.py b/code/utilities/helper.py
index 2334d21..965b277 100644
--- a/code/utilities/helper.py
+++ b/code/utilities/helper.py
@@ -9,6 +9,7 @@
 from langchain.llms import AzureOpenAI
 from langchain.vectorstores.base import VectorStore
 from langchain.chains import ChatVectorDBChain
+from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from langchain.chains.llm import LLMChain
 from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT
@@ -17,6 +18,9 @@
 from langchain.text_splitter import TokenTextSplitter, TextSplitter
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders import TextLoader
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import AIMessage, HumanMessage, SystemMessage
+
 from utilities.formrecognizer import AzureFormRecognizerClient
 from utilities.azureblobstorage import AzureBlobStorageClient
@@ -27,6 +31,7 @@
 import pandas as pd
 import urllib
+from fake_useragent import UserAgent
 
 class LLMHelper:
     def __init__(self, document_loaders : BaseLoader = None,
@@ -43,7 +48,7 @@ def __init__(self,
         load_dotenv()
         openai.api_type = "azure"
         openai.api_base = os.getenv('OPENAI_API_BASE')
-        openai.api_version = "2022-12-01"
+        openai.api_version = "2023-03-15-preview"
         openai.api_key = os.getenv("OPENAI_API_KEY")
 
         # Azure OpenAI settings
@@ -52,6 +57,7 @@
         self.index_name: str = "embeddings"
         self.model: str = os.getenv('OPENAI_EMBEDDINGS_ENGINE_DOC', "text-embedding-ada-002")
         self.deployment_name: str = os.getenv("OPENAI_ENGINE", os.getenv("OPENAI_ENGINES", "text-davinci-003"))
+        self.deployment_type: str = os.getenv("OPENAI_DEPLOYMENT_TYPE", "Text")
 
         # Vector store settings
         self.vector_store_address: str = os.getenv('REDIS_ADDRESS', "localhost")
@@ -69,7 +75,10 @@
         self.document_loaders: BaseLoader = WebBaseLoader if document_loaders is None else document_loaders
         self.text_splitter: TextSplitter = TokenTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) if text_splitter is None else text_splitter
         self.embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=self.model, chunk_size=1) if embeddings is None else embeddings
-        self.llm: AzureOpenAI = AzureOpenAI(deployment_name=self.deployment_name) if llm is None else llm
+        if self.deployment_type == "Chat":
+            self.llm: ChatOpenAI = ChatOpenAI(model_name=self.deployment_name, engine=self.deployment_name) if llm is None else llm
+        else:
+            self.llm: AzureOpenAI = AzureOpenAI(deployment_name=self.deployment_name) if llm is None else llm
 
         self.vector_store: RedisExtended = RedisExtended(redis_url=self.vector_store_full_address, index_name=self.index_name, embedding_function=self.embeddings.embed_query) if vector_store is None else vector_store
         self.k : int = 3 if k is None else k
@@ -78,16 +87,34 @@
         self.enable_translation : bool = False if enable_translation is None else enable_translation
         self.translator : AzureTranslatorClient = AzureTranslatorClient() if translator is None else translator
 
+        self.user_agent: UserAgent = UserAgent()
+        self.user_agent.random
 
     def add_embeddings_lc(self, source_url):
         try:
             documents = self.document_loaders(source_url).load()
+
+            # Convert to UTF-8 encoding for non-ascii text
+            for(document) in documents:
+                try:
+                    if document.page_content.encode("iso-8859-1") == document.page_content.encode("latin-1"):
+                        document.page_content = document.page_content.encode("iso-8859-1").decode("utf-8", errors="ignore")
+                except:
+                    pass
 
             docs = self.text_splitter.split_documents(documents)
+
+            # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
+            # pattern = re.compile(r'[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')
+            pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')    # do not remove \x0a (\n) nor \x0d (\r)
+            for(doc) in docs:
+                doc.page_content = re.sub(pattern, '', doc.page_content)
+
             keys = []
             for i, doc in enumerate(docs):
                 # Create a unique key for the document
                 source_url = source_url.split('?')[0]
                 filename = "/".join(source_url.split('/')[4:])
                 hash_key = hashlib.sha1(f"{source_url}_{i}".encode('utf-8')).hexdigest()
+                hash_key = f"doc:{self.index_name}:{hash_key}"
                 keys.append(hash_key)
                 doc.metadata = {"source": f"[{source_url}]({source_url}_SAS_TOKEN_PLACEHOLDER_)" , "chunk": i, "key": hash_key, "filename": filename}
             self.vector_store.add_documents(documents=docs, redis_url=self.vector_store_full_address, index_name=self.index_name, keys=keys)
@@ -103,7 +130,7 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translation
 
         # Upload the text to Azure Blob Storage
         converted_filename = f"converted/{filename}.txt"
-        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain')
+        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
         print(f"Converted file uploaded to {source_url} with filename {filename}")
 
         # Update the metadata to indicate that the file has been converted
@@ -140,10 +167,10 @@ def filter_sourcesLinks(self, sources):
 
     def extract_followupquestions(self, answer):
         followupTag = answer.find('Follow-up Questions')
-        folloupQuestions = answer.find('<<')
+        followupQuestions = answer.find('<<')
 
         # take min of followupTag and followupQuestions if not -1 to avoid taking the followup questions if there is no followupTag
-        followupTag = min(followupTag, folloupQuestions) if followupTag != -1 and folloupQuestions != -1 else max(followupTag, folloupQuestions)
+        followupTag = min(followupTag, followupQuestions) if followupTag != -1 and followupQuestions != -1 else max(followupTag, followupQuestions)
 
         answer_without_followupquestions = answer[:followupTag] if followupTag != -1 else answer
         followup_questions = answer[followupTag:].strip() if followupTag != -1 else ''
@@ -155,6 +182,11 @@ def extract_followupquestions(self, answer):
                 followup_questions_list.append(followup_questions[match.start()+2:match.end()-2])
                 followup_questions = followup_questions[match.end():]
                 match = re.search(pattern, followup_questions)
+
+        # Special case when 'Follow-up questions:' appears in the answer after the <<
+        followupTag = answer_without_followupquestions.find('Follow-up Questions')
+        if followupTag != -1:
+            answer_without_followupquestions = answer_without_followupquestions[:followupTag]
 
         return answer_without_followupquestions, followup_questions_list
@@ -184,12 +216,12 @@ def insert_citations_in_answer(self, answer, filenameList):
 
     def get_semantic_answer_lang_chain(self, question, chat_history):
         question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False)
         doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT)
-        chain = ChatVectorDBChain(
-            vectorstore=self.vector_store,
+        chain = ConversationalRetrievalChain(
+            retriever=self.vector_store.as_retriever(),
             question_generator=question_generator,
             combine_docs_chain=doc_chain,
             return_source_documents=True,
-            top_k_docs_for_context= self.k
+            # top_k_docs_for_context= self.k
         )
         result = chain({"question": question, "chat_history": chat_history})
         container_sas = self.blob_client.get_container_sas()
@@ -219,7 +251,10 @@ def get_embeddings_model(self):
         }
 
     def get_completion(self, prompt, **kwargs):
-        return self.llm(prompt)
+        if self.deployment_type == 'Chat':
+            return self.llm([HumanMessage(content=prompt)]).content
+        else:
+            return self.llm(prompt)
 
     def get_links_filenames(self, answer, sources):
         split_sources = sources.split(' \n ')   # sources are expected to be of format ' \n [filename1.ext](sourcelink1) \n [filename2.ext](sourcelink2) \n [filename3.ext](sourcelink3) \n '
@@ -243,4 +278,4 @@ def clean_encoding(self, text):
             reencodedtext = reencodedtext.decode('utf-8')
         except Exception as e:
             reencodedtext = text
-        return reencodedtext
\ No newline at end of file
+        return reencodedtext
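The biggest structural change in this patch is swapping the deprecated ChatVectorDBChain for ConversationalRetrievalChain. The shape of that migration, reduced to a sketch (parameter names from langchain 0.0.x; the llm, vector store, and prompt are assumed to exist):

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT

def build_qa_chain(llm, vector_store, prompt):
    """Sketch: ChatVectorDBChain -> ConversationalRetrievalChain migration."""
    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
    doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)
    return ConversationalRetrievalChain(
        retriever=vector_store.as_retriever(),  # was: vectorstore=vector_store
        question_generator=question_generator,
        combine_docs_chain=doc_chain,
        return_source_documents=True,
    )

# Usage: result = build_qa_chain(llm, store, PROMPT)({"question": q, "chat_history": []})
```

Note the trade-off the patch comments out: top_k_docs_for_context has no direct equivalent on the new chain; the document count is now configured on the retriever instead (e.g. as_retriever(search_kwargs={'k': 3})).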
contextSpan = f" {context}" + contextSpan = f" {context}" text = text.replace(context, contextSpan) text = text.replace('\n', '

') diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 7843f73..57b6d71 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -118,7 +118,7 @@ def display_iframe(filename, link, contextList): text = llm_helper.clean_encoding(text) for i, context in enumerate(contextList): context = llm_helper.clean_encoding(context) - contextSpan = f" {context}" + contextSpan = f" {context}" text = text.replace(context, contextSpan) text = text.replace('\n', '

') diff --git a/code/pages/04_Index_Management.py b/code/pages/04_Index_Management.py index 0cb61cf..b86ca87 100644 --- a/code/pages/04_Index_Management.py +++ b/code/pages/04_Index_Management.py @@ -89,14 +89,14 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '' st.dataframe(data, use_container_width=True) st.download_button("Download data", data.to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings') - ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="10px") + ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="12px") st.text("") st.text("") col1, col2, col3, col4 = st.columns([3,2,2,1]) with col1: st.selectbox("Embedding id to delete", data.get('key',[]), key="embedding_to_drop") - # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="10px") + # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="12px") with col2: st.text("") st.text("") @@ -104,12 +104,12 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '' ChangeButtonStyle("Delete embedding", "#ADCDE7", wch_textsize="10px") with col3: st.selectbox("File name to delete", set(data.get('filename',[])), key="file_to_drop") - # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="10px") + # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="12px") with col4: st.text("") st.text("") st.button("Delete file", on_click=delete_file) - ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="10px") + ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="12px") st.text("") st.text("") diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 965b277..6989138 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -103,7 +103,6 @@ def add_embeddings_lc(self, source_url): docs = self.text_splitter.split_documents(documents) # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half) - # pattern = re.compile(r'[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r) for(doc) in docs: doc.page_content = re.sub(pattern, '', doc.page_content) From 028858ec8d1a492a2169f99f32c4f414a12ea566 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 18 Apr 2023 09:39:56 +0200 Subject: [PATCH 15/17] Applying pattern to converted txt --- code/utilities/helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 6989138..5088e5c 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -128,8 +128,12 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translati text = list(map(lambda x: self.translator.translate(x), text)) if self.enable_translation else text # Upload the text to Azure Blob Storage + converted_text = "n".join(text) + # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half) + pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r) + converted_text = re.sub(pattern, '', converted_text) converted_filename = f"converted/{filename}.txt" - source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; 
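The control-character class that this patch settles on (and that patch 15 below reuses for converted text) is easier to verify in isolation. This snippet uses the exact pattern from the diff with made-up sample input:

```python
import re

# Exact pattern from the patch: strips control characters and stray symbol
# ranges, but deliberately keeps \x0a (\n) and \x0d (\r) so line breaks survive.
pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')

sample = "Line one\x00\x08\nLine two\u2028still here\uffff"
print(re.sub(pattern, '', sample))  # -> "Line one\nLine twostill here"
```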
From 028858ec8d1a492a2169f99f32c4f414a12ea566 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 09:39:56 +0200
Subject: [PATCH 15/17] Applying pattern to converted txt

---
 code/utilities/helper.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/code/utilities/helper.py b/code/utilities/helper.py
index 6989138..5088e5c 100644
--- a/code/utilities/helper.py
+++ b/code/utilities/helper.py
@@ -128,8 +128,12 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translation
         text = list(map(lambda x: self.translator.translate(x), text)) if self.enable_translation else text
 
         # Upload the text to Azure Blob Storage
+        converted_text = "\n".join(text)
+        # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
+        pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')   # do not remove \x0a (\n) nor \x0d (\r)
+        converted_text = re.sub(pattern, '', converted_text)
         converted_filename = f"converted/{filename}.txt"
-        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
+        source_url = self.blob_client.upload_file(converted_text, f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
         print(f"Converted file uploaded to {source_url} with filename {filename}")
 
         # Update the metadata to indicate that the file has been converted

From 718bbd5fae90fa1164f1e237f718083ae899ed95 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 13:23:50 +0200
Subject: [PATCH 16/17] Bug correction on asking second question of the session

---
 code/pages/00_Chat.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index 57b6d71..f32e81e 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -106,6 +106,7 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''
 
 def questionAsked():
     st.session_state.chat_askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])]
+    st.session_state.chat_question = st.session_state.chat_askedquestion
 
 # Display the context(s) associated with a source document used to answer, with automatic scroll to the yellow highlighted context
 def display_iframe(filename, link, contextList):

From 2b8cee2b7ee251d9a1617f4c498d181e26a29ee3 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 13:41:27 +0200
Subject: [PATCH 17/17] Increasing timeout before autoscroll to leave time for
 content to be fully loaded

---
 code/OpenAI_Queries.py | 4 ++--
 code/pages/00_Chat.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py
index 1adfe4b..ae0381e 100644
--- a/code/OpenAI_Queries.py
+++ b/code/OpenAI_Queries.py
@@ -238,7 +238,7 @@ def display_iframe(filename, link, contextList):
 
diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index f32e81e..42d2cd3 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -140,7 +140,7 @@ def display_iframe(filename, link, contextList):
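The session-state fix in patch 16 is a standard Streamlit callback pattern: copy the submitted text into its own session key so the next rerun neither loses nor re-asks the question. Reduced to a self-contained sketch with simplified names:

```python
import streamlit as st

if 'asked_question' not in st.session_state:
    st.session_state.asked_question = ''

def question_asked():
    # Copy the widget value aside; widget state can be reset on the next rerun.
    st.session_state.asked_question = st.session_state.input_question

st.text_input("Ask a question", key='input_question', on_change=question_asked)

if st.session_state.asked_question:
    st.write(f"Answering: {st.session_state.asked_question}")
    st.session_state.asked_question = ''  # consume it so a rerun doesn't re-answer
```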