From 330e5651337a6cbc3ab352ed81bb113f44575d2c Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 4 Apr 2023 16:51:22 +0200 Subject: [PATCH 01/17] Enhanced UI interface --- code/OpenAI_Queries.py | 101 ++++++++++++++++++++++++++++++++++++--- code/pages/00_Chat.py | 101 ++++++++++++++++++++++++++++++++++++--- code/utilities/helper.py | 34 +++++++++++-- 3 files changed, 219 insertions(+), 17 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index bfe31ec..6bef743 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -2,10 +2,13 @@ load_dotenv() import streamlit as st +import streamlit.components.v1 as components import os import traceback from utilities.helper import LLMHelper +import requests + import logging logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) @@ -81,6 +84,8 @@ def get_languages(): st.session_state['response'] = default_answer if 'context' not in st.session_state: st.session_state['context'] = "" + if 'sources' not in st.session_state: + st.session_state['sources'] = "" # Set page layout to wide screen and menu item menu_items = { @@ -116,16 +121,98 @@ def get_languages(): # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1) st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language') - question = st.text_input("OpenAI Semantic Answer", default_question) + if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = '' + + def questionAsked(): + st.session_state.askedquestion = st.session_state.inputquestion + + question = st.text_input("OpenAI Semantic Answer", default_question, key='inputquestion', on_change=questionAsked) - if question != '': - st.session_state['question'] = question - st.session_state['question'], st.session_state['response'], st.session_state['context'], sources = llm_helper.get_semantic_answer_lang_chain(question, []) + def display_iframe(filename, link, contextList): + if st.session_state['context_show_option'] == 'context within full source document': + try: + response = requests.get(link) + text = response.text + for i, context in enumerate(contextList): + contextSpan = f" {context}" + text = text.replace(context, contextSpan) + text = text.replace('\n', '
<br><br>')
+
+            except Exception as e:
+                text = "Could not load the document source content"
+        else:
+            text = ""
+            for context in contextList:
+                text = text + context.replace('\n', '<br><br>') + '<br>'
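+
+        # Note: html_content below is filled in via str.format(filename=..., text=...),
+        # so literal braces inside the embedded JavaScript must be doubled as "{{" and "}}".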
+        html_content = """
+            <!-- Viewer markup reconstructed; the original template's tags were not preserved in this text. -->
+            <html><head><script>
+                // Scroll to the first highlighted context once the document loads.
+                window.onload = function() {{
+                    var el = document.getElementById('ContextSpan0');
+                    if (el) {{ el.scrollIntoView(); }}
+                }};
+            </script></head>
+            <body>
+                <h4>{filename}</h4>
+                <div style="height: 400px; overflow: auto;">{text}</div>
+            </body></html>
+ +
+ + """ + + if st.button("Close"): + st.placeholder.empty() + + placeholder = st.empty() + with placeholder: + # htmlcontent = html_content.format(link=link, filename=filename) + htmlcontent = html_content.format(filename=filename, text=text) + components.html(htmlcontent, height=500) + pass + + + if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' + + # Answer the question if any + if st.session_state.askedquestion != '': + st.session_state['question'] = st.session_state.askedquestion + st.session_state.askedquestion = "" + st.session_state['question'], st.session_state['response'], st.session_state['context'], st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + + # Display the sources and context - even if the page is reloaded + if st.session_state['sources'] or st.session_state['context']: st.markdown("Answer:" + st.session_state['response']) - st.markdown(f'\n\nSources: {sources}') + # st.markdown(f'\n\nSources: {sources}') + split_sources = st.session_state['sources'].split(' \n ') + for src in split_sources: + if src != '': + link = src[1:].split('(')[1][:-1].split(')')[0] + filename = src[1:].split(']')[0] + if st.button(filename, key=filename): + context = st.session_state['context'] + display_iframe(filename, link, st.session_state['context'][src]) with st.expander("Question and Answer Context"): - st.markdown(st.session_state['context'].replace('$', '\$')) - st.markdown(f"SOURCES: {sources}") + if not st.session_state['context'] is None and st.session_state['context'] != []: + for content_source in st.session_state['context'].keys(): + st.markdown(f"#### {content_source}") + for context_text in st.session_state['context'][content_source]: + st.markdown(f"{context_text}") + + # theContext = llm_helper.filter_sourcesLinks(st.session_state['context'].replace('$', '\$')) + # st.markdown(theContext) + st.markdown(f"SOURCES: {st.session_state['sources']}") if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index fbfa51c..83ca9cf 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -1,33 +1,122 @@ import streamlit as st from streamlit_chat import message +import streamlit.components.v1 as components from utilities.helper import LLMHelper +import requests def clear_chat_data(): st.session_state['input'] = "" st.session_state['chat_history'] = [] st.session_state['source_documents'] = [] + st.session_state['chat_context'] = [] + st.session_state['context_show_option'] = 'context within full source document' + st.session_state['askedquestion'] = '' # Initialize chat history if 'chat_history' not in st.session_state: st.session_state['chat_history'] = [] if 'source_documents' not in st.session_state: st.session_state['source_documents'] = [] +if 'chat_context' not in st.session_state: + st.session_state['chat_context'] = [] + +context_show_options = ('extracted context only', 'context within full source document') +if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' llm_helper = LLMHelper() +if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = '' + +def questionAsked(): + st.session_state.askedquestion = st.session_state.input + # Chat -input_text = 
st.text_input("You: ", placeholder="type your question", key="input")
+input_text = st.text_input("You: ", placeholder="type your question", key="input", on_change=questionAsked)
 clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data)
 
-if input_text:
-    question = input_text
-    input_text = ""
-    question, result, _, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history'])
+def display_iframe(filename, link, contextList):
+    if st.session_state['context_show_option'] == 'context within full source document':
+        try:
+            response = requests.get(link)
+            text = response.text
+            for i, context in enumerate(contextList):
+                # Highlight markup reconstructed (the original tags were lost): it gives each context a scroll anchor.
+                contextSpan = f"<span id='ContextSpan{i}' style='background-color: yellow'> {context}</span>"
+                text = text.replace(context, contextSpan)
+                text = text.replace('\n', '<br><br>')
+
+        except Exception as e:
+            text = "Could not load the document source content"
+    else:
+        text = ""
+        for context in contextList:
+            text = text + context.replace('\n', '<br><br>') + '<br>'
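+
+    # Note: html_content below is filled in via str.format(filename=..., text=...),
+    # so literal braces inside the embedded JavaScript must be doubled as "{{" and "}}".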
+    html_content = """
+        <!-- Viewer markup reconstructed; the original template's tags were not preserved in this text. -->
+        <html><head><script>
+            // Scroll to the first highlighted context once the document loads.
+            window.onload = function() {{
+                var el = document.getElementById('ContextSpan0');
+                if (el) {{ el.scrollIntoView(); }}
+            }};
+        </script></head>
+        <body>
+            <h4>{filename}</h4>
+            <div style="height: 400px; overflow: auto;">{text}</div>
+        </body></html>
+ +
+ + """ + + if st.button("Close"): + placeholder.empty() + + placeholder = st.empty() + with placeholder: + # htmlcontent = html_content.format(link=link, filename=filename) + htmlcontent = html_content.format(filename=filename, text=text) + components.html(htmlcontent, height=500) + pass + + +if st.session_state.askedquestion: + question = st.session_state.askedquestion + st.session_state.askedquestion = "" + question, result, context, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history']) st.session_state['chat_history'].append((question, result)) st.session_state['source_documents'].append(sources) + st.session_state['chat_context'].append(context) + if st.session_state['chat_history']: + history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): message(st.session_state['chat_history'][i][1], key=str(i)) - st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') + if i == history_range.start: + + st.session_state['context_show_option'] = st.selectbox( + 'Choose how to display context used to answer the question when clicking on a document source below:', + context_show_options, + index=context_show_options.index(st.session_state['context_show_option']) + ) + + split_sources = st.session_state['source_documents'][i].split(' \n ') + for src in split_sources: + if src != '': + link = src[1:].split('(')[1][:-1].split(')')[0] + filename = src[1:].split(']')[0] + if st.button(filename, key=filename): + display_iframe(filename, link, st.session_state['chat_context'][i][src]) + else: + st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 153f39b..4e5b92b 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -123,6 +123,21 @@ def get_all_documents(self, k: int = None): 'metadata' : x.metadata, }, result))) + # remove paths from sources to only keep the filename + def filter_sourcesLinks(self, sources): + # use regex to replace all occurences of '[anypath/anypath/somefilename.xxx](the_link)' to '[somefilename](thelink)' in sources + pattern = r'\[[^\]]*?/([^/\]]*?)\]' + + match = re.search(pattern, sources) + while match: + withoutExtensions = match.group(1).split('.')[0] # remove any extension to the name of the source document + sources = sources[:match.start()] + f'[{withoutExtensions}]' + sources[match.end():] + match = re.search(pattern, sources) + + sources = ' \n ' + sources.replace('\n', ' \n ') # add a carriage return after each source + + return sources + def get_semantic_answer_lang_chain(self, question, chat_history): question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False) doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT) @@ -134,15 +149,26 @@ def get_semantic_answer_lang_chain(self, question, chat_history): top_k_docs_for_context= self.k ) result = chain({"question": question, "chat_history": chat_history}) - context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) - sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) - container_sas = self.blob_client.get_container_sas() + # context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) + # context = "\n".join(list(map(lambda x: "{} \n {} 
\n".format(x.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas), x.page_content), result['source_documents']))) + + contextDict ={} + for res in result['source_documents']: + source_key = self.filter_sourcesLinks(res.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)).replace('\n', '').replace(' ', '') + if source_key not in contextDict: + contextDict[source_key] = [] + contextDict[source_key].append(res.page_content) + + sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) + result['answer'] = result['answer'].split('SOURCES:')[0].split('Sources:')[0].split('SOURCE:')[0].split('Source:')[0] sources = sources.replace('_SAS_TOKEN_PLACEHOLDER_', container_sas) - return question, result['answer'], context, sources + sources = self.filter_sourcesLinks(sources) + + return question, result['answer'], contextDict, sources def get_embeddings_model(self): OPENAI_EMBEDDINGS_ENGINE_DOC = os.getenv('OPENAI_EMEBDDINGS_ENGINE', os.getenv('OPENAI_EMBEDDINGS_ENGINE_DOC', 'text-embedding-ada-002')) From 4acf9f434e4f88aabe1f8c952f5403e3bbcfecd6 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 4 Apr 2023 17:23:12 +0200 Subject: [PATCH 02/17] UI Enhancements From 866232ab16f57e0ea198603016842dc6df5584d1 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Thu, 6 Apr 2023 10:33:38 +0200 Subject: [PATCH 03/17] Adding citations reference in the answer --- code/OpenAI_Queries.py | 15 ++++++--------- code/pages/00_Chat.py | 21 ++++++++++++--------- code/pages/01_Add_Document.py | 19 +++++++++++++++++-- code/utilities/customprompt.py | 2 +- code/utilities/helper.py | 31 ++++++++++++++++++++++++++++--- 5 files changed, 64 insertions(+), 24 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 6bef743..35abd31 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -193,16 +193,13 @@ def display_iframe(filename, link, contextList): # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: + st.session_state['response'], sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) st.markdown("Answer:" + st.session_state['response']) - # st.markdown(f'\n\nSources: {sources}') - split_sources = st.session_state['sources'].split(' \n ') - for src in split_sources: - if src != '': - link = src[1:].split('(')[1][:-1].split(')')[0] - filename = src[1:].split(']')[0] - if st.button(filename, key=filename): - context = st.session_state['context'] - display_iframe(filename, link, st.session_state['context'][src]) + + for id in range(len(sourceList)): + if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): + display_iframe(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]]) + with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 83ca9cf..85c578b 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -3,6 +3,7 @@ import streamlit.components.v1 as components from utilities.helper import LLMHelper import requests +import regex as re def clear_chat_data(): st.session_state['input'] = "" @@ -83,7 +84,6 @@ def display_iframe(filename, link, contextList): placeholder = st.empty() with placeholder: - # htmlcontent = 
html_content.format(link=link, filename=filename) htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) pass @@ -101,7 +101,8 @@ def display_iframe(filename, link, contextList): if st.session_state['chat_history']: history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): - message(st.session_state['chat_history'][i][1], key=str(i)) + # message(st.session_state['chat_history'][i][1], key=str(i)) + if i == history_range.start: st.session_state['context_show_option'] = st.selectbox( @@ -110,13 +111,15 @@ def display_iframe(filename, link, contextList): index=context_show_options.index(st.session_state['context_show_option']) ) - split_sources = st.session_state['source_documents'][i].split(' \n ') - for src in split_sources: - if src != '': - link = src[1:].split('(')[1][:-1].split(')')[0] - filename = src[1:].split(']')[0] - if st.button(filename, key=filename): - display_iframe(filename, link, st.session_state['chat_context'][i][src]) + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) + answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + message(answer_with_citations, key=str(i)) + + for id in range(len(sourceList)): + if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): + display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) + else: st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/pages/01_Add_Document.py b/code/pages/01_Add_Document.py index d2c8c25..7a85b8b 100644 --- a/code/pages/01_Add_Document.py +++ b/code/pages/01_Add_Document.py @@ -32,6 +32,12 @@ def delete_row(): st.session_state['data_to_drop'] redisembeddings.delete_document(st.session_state['data_to_drop']) +def add_urls(): + urls = st.session_state['urls'].split('\n') + for url in urls: + if url: + llm_helper.add_embeddings_lc(url) + st.success(f"Embeddings added successfully for {url}") try: # Set page layout to wide screen and menu item @@ -78,7 +84,7 @@ def delete_row(): with st.expander("Add text to the knowledge base", expanded=False): col1, col2 = st.columns([3,1]) with col1: - st.session_state['doc_text'] = st.text_area("Add a new text content and the click on 'Compute Embeddings'", height=600) + st.session_state['doc_text'] = st.text_area("Add a new text content and them click on 'Compute Embeddings'", height=600) with col2: st.session_state['embeddings_model'] = st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], disabled=True) @@ -106,6 +112,15 @@ def delete_row(): with col3: st.button("Convert all files and add embeddings", on_click=remote_convert_files_and_add_embeddings, args=(True,)) + with st.expander("Add URLs to the knowledge base", expanded=True): + col1, col2 = st.columns([3,1]) + with col1: + st.session_state['urls'] = st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100) + + with col2: + st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], 
disabled=True, key="embeddings_model_url") + st.button("Compute Embeddings", on_click=add_urls, key="add_url") + with st.expander("View documents in the knowledge base", expanded=False): # Query RediSearch to get all the embeddings try: @@ -122,4 +137,4 @@ def delete_row(): except Exception as e: - st.error(traceback.format_exc()) + st.error(traceback.format_exc()) \ No newline at end of file diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index 9a70f83..2f27fd1 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -3,7 +3,7 @@ template = """{summaries} Please reply to the question using only the information present in the text above. -Include references to the sources you used to create the answer if those are relevant ("SOURCES"). +Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. If you can't find it, reply politely that the information is not in the knowledge base. Question: {question} Answer:""" diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 4e5b92b..4caa736 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -138,6 +138,19 @@ def filter_sourcesLinks(self, sources): return sources + def insert_citations_in_answer(self, answer, filenameList): + pattern = r'\[\[(.*?)\]\]' + match = re.search(pattern, answer) + while match: + filename = match.group(1).split('.')[0] # remove any extension to the name of the source document + if filename in filenameList: + filenameIndex = filenameList.index(filename) + 1 + answer = answer[:match.start()] + '$^{' + f'{filenameIndex}' + '}$' + answer[match.end():] + else: + answer = answer[:match.start()] + '$^{' + f'{filename}' + '}$' + answer[match.end():] + match = re.search(pattern, answer) + return answer + def get_semantic_answer_lang_chain(self, question, chat_history): question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False) doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT) @@ -151,9 +164,6 @@ def get_semantic_answer_lang_chain(self, question, chat_history): result = chain({"question": question, "chat_history": chat_history}) container_sas = self.blob_client.get_container_sas() - # context = "\n".join(list(map(lambda x: x.page_content, result['source_documents']))) - # context = "\n".join(list(map(lambda x: "{} \n {} \n".format(x.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas), x.page_content), result['source_documents']))) - contextDict ={} for res in result['source_documents']: source_key = self.filter_sourcesLinks(res.metadata['source'].replace('_SAS_TOKEN_PLACEHOLDER_', container_sas)).replace('\n', '').replace(' ', '') @@ -180,3 +190,18 @@ def get_embeddings_model(self): def get_completion(self, prompt, **kwargs): return self.llm(prompt) + + def get_links_filenames(self, answer, sources): + split_sources = sources.split(' \n ') # soures are expected to be of format ' \n [filename1.ext](sourcelink1) \n [filename2.ext](sourcelink2) \n [filename3.ext](sourcelink3) \n ' + srcList = [] + linkList = [] + filenameList = [] + for src in split_sources: + if src != '': + srcList.append(src) + link = src[1:].split('(')[1][:-1].split(')')[0] # get the link + linkList.append(link) + filename = 
src[1:].split(']')[0] # retrieve the source filename + filenameList.append(filename) + answer = self.insert_citations_in_answer(answer, filenameList) # Add (1), (2), (3) to the answer to indicate the source of the answer + return answer, srcList, linkList, filenameList From e17812e0b5871b2e09ffdca6f5e5fe0aab6825ad Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Thu, 6 Apr 2023 11:03:19 +0200 Subject: [PATCH 04/17] Display answer first, selection menu of context after the answer --- code/pages/00_Chat.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 85c578b..dac5219 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -104,18 +104,17 @@ def display_iframe(filename, link, contextList): # message(st.session_state['chat_history'][i][1], key=str(i)) if i == history_range.start: + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) + answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + message(answer_with_citations, key=str(i)) st.session_state['context_show_option'] = st.selectbox( 'Choose how to display context used to answer the question when clicking on a document source below:', context_show_options, index=context_show_options.index(st.session_state['context_show_option']) ) - - answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) - st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html - message(answer_with_citations, key=str(i)) - + for id in range(len(sourceList)): if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) From 8deb4836c5c75b10d600f57271955e4afb220085 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Fri, 7 Apr 2023 20:30:28 +0200 Subject: [PATCH 05/17] Adding references, citations, and follow-up questions --- README.md | 2 +- code/OpenAI_Queries.py | 109 ++++++++++++++++++---- code/pages/00_Chat.py | 160 +++++++++++++++++++++++++-------- code/pages/01_Add_Document.py | 23 ++--- code/utilities/customprompt.py | 4 + code/utilities/helper.py | 24 ++++- 6 files changed, 253 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index baad9b1..49a380c 100644 --- a/README.md +++ b/README.md @@ -198,4 +198,4 @@ This presentation, demonstration, and demonstration model do not give you or you The information contained in this presentation, demonstration and demonstration model represents the current view of Microsoft on the issues discussed as of the date of presentation and/or demonstration, for the duration of your access to the demonstration model. 
Because Microsoft must respond to changing market conditions, it should not be interpreted to be a commitment on the part of Microsoft, and Microsoft cannot guarantee the accuracy of any information presented after the date of presentation and/or demonstration and for the duration of your access to the demonstration model. -No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies. +No Microsoft technology, nor any of its component technologies, including the demonstration model, is intended or made available as a substitute for the professional advice, opinion, or judgment of (1) a certified financial services professional, or (2) a certified medical professional. Partners or customers are responsible for ensuring the regulatory compliance of any solution they build using Microsoft technologies. \ No newline at end of file diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 35abd31..1dfff4c 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -66,6 +66,21 @@ def check_deployment(): st.error(traceback.format_exc()) +def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): + htmlstr = """ """ + + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + components.html(f"{htmlstr}", height=0, width=0) + @st.cache_data() def get_languages(): return llm_helper.translator.get_available_languages() @@ -76,16 +91,36 @@ def get_languages(): default_question = "" default_answer = "" + if 'question' not in st.session_state: st.session_state['question'] = default_question - # if 'prompt' not in st.session_state: - # st.session_state['prompt'] = os.getenv("QUESTION_PROMPT", "Please reply to the question using only the information present in the text above. 
If you can't find it, reply 'Not in the text'.\nQuestion: _QUESTION_\nAnswer:").replace(r'\n', '\n') if 'response' not in st.session_state: st.session_state['response'] = default_answer if 'context' not in st.session_state: st.session_state['context'] = "" if 'sources' not in st.session_state: st.session_state['sources'] = "" + if 'followup_questions' not in st.session_state: + st.session_state['followup_questions'] = [] + if 'input_message_key' not in st.session_state: + st.session_state ['input_message_key'] = 1 + if 'do_not_process_question' not in st.session_state: + st.session_state['do_not_process_question'] = False + + if 'askedquestion' not in st.session_state: + st.session_state.askedquestion = default_question + + if 'context_show_option' not in st.session_state: + st.session_state['context_show_option'] = 'context within full source document' + + + if 'tab_context' not in st.session_state: + st.session_state['tab_context'] = 'Not opened yet' + else: + tmp=st.session_state['tab_context'] + tmp2=st.session_state['question'] + if st.session_state['question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Chat': + st.session_state['tab_context'] = 'Open_Queries' # Set page layout to wide screen and menu item menu_items = { @@ -109,6 +144,7 @@ def get_languages(): col1, col2, col3 = st.columns([2,2,2]) with col1: + ChangeButtonStyle("Check deployment", "#885555") st.button("Check deployment", on_click=check_deployment) with col3: with st.expander("Settings"): @@ -121,15 +157,25 @@ def get_languages(): # st.temperature = st.slider("Temperature", 0.0, 1.0, 0.1) st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language') - if 'askedquestion' not in st.session_state: - st.session_state.askedquestion = '' + # Callback to display document sources + def show_document_source(filename, link, contextList): + st.session_state['do_not_process_question'] = True + display_iframe(filename, link, contextList) + + # Callback to assign the follow-up question is selected by the user + def ask_followup_question(followup_question): + st.session_state.askedquestion = followup_question + st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 def questionAsked(): - st.session_state.askedquestion = st.session_state.inputquestion + st.session_state.askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])] - question = st.text_input("OpenAI Semantic Answer", default_question, key='inputquestion', on_change=questionAsked) + question = st.text_input("Azure OpenAI Semantic Answer", value=st.session_state['askedquestion'], key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) + # Display the context(s) associated with a source document used to andwer, with automaic scroll to the yellow highlighted context def display_iframe(filename, link, contextList): + st.session_state['do_not_process_question'] = True + st.session_state['chat_askedquestion'] = st.session_state.question if st.session_state['context_show_option'] == 'context within full source document': try: response = requests.get(link) @@ -171,35 +217,50 @@ def display_iframe(filename, link, contextList): """ - if st.button("Close"): - st.placeholder.empty() + def close_iframe(): + placeholder.empty() + st.session_state['do_not_process_question'] = True + st.button("Close", on_click=close_iframe) + placeholder = st.empty() with placeholder: - # htmlcontent = html_content.format(link=link, 
filename=filename) htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) + pass - - if 'context_show_option' not in st.session_state: - st.session_state['context_show_option'] = 'context within full source document' + tmp=st.session_state['tab_context'] + tmp2=st.session_state['question'] + if st.session_state['tab_context'] != 'Open_Queries' and st.session_state['question'] != '' and st.session_state['question'] != st.session_state['followup_questions']: + st.session_state['tab_context'] = 'Open_Queries' + st.session_state['do_not_process_question'] = True + ask_followup_question(st.session_state['question']) # Answer the question if any - if st.session_state.askedquestion != '': + if st.session_state.askedquestion != '' and st.session_state['do_not_process_question'] != True: st.session_state['question'] = st.session_state.askedquestion st.session_state.askedquestion = "" - st.session_state['question'], st.session_state['response'], st.session_state['context'], st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + st.session_state['question'], \ + st.session_state['response'], \ + st.session_state['context'], \ + st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], []) + st.session_state['response'], followup_questions_list = llm_helper.extract_followupquestions(st.session_state['response']) + st.session_state['followup_questions'] = followup_questions_list + + st.session_state['do_not_process_question'] = False # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: st.session_state['response'], sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) - st.markdown("Answer:" + st.session_state['response']) + st.markdown("**Answer:**" + st.session_state['response']) + if st.session_state['sources'] or st.session_state['context']: + # Buttons to display the context used to answer for id in range(len(sourceList)): - if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): - display_iframe(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]]) + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]], )) + # Details on the question and answer context with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): @@ -207,10 +268,20 @@ def display_iframe(filename, link, contextList): for context_text in st.session_state['context'][content_source]: st.markdown(f"{context_text}") - # theContext = llm_helper.filter_sourcesLinks(st.session_state['context'].replace('$', '\$')) - # st.markdown(theContext) st.markdown(f"SOURCES: {st.session_state['sources']}") + # Display proposed follow-up questions which can be clicked on to ask that question automatically + if len(st.session_state['followup_questions']) > 0: + st.markdown('**Proposed follow-up questions:**') + with st.container(): + for questionId, followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + st.button(followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, )) + + for questionId, 
followup_question in enumerate(st.session_state['followup_questions']): + if followup_question: + ChangeButtonStyle(followup_question, "#5555FF", wch_border_style='none') + if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") st.write(f"{llm_helper.translator.translate(st.session_state['response'], available_languages[st.session_state['translation_language']])}") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index dac5219..c0b11df 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -6,39 +6,75 @@ import regex as re def clear_chat_data(): - st.session_state['input'] = "" st.session_state['chat_history'] = [] - st.session_state['source_documents'] = [] + st.session_state['chat_source_documents'] = [] st.session_state['chat_context'] = [] - st.session_state['context_show_option'] = 'context within full source document' - st.session_state['askedquestion'] = '' + st.session_state['chat_context_show_option'] = 'context within full source document' + st.session_state['chat_askedquestion'] = '' + st.session_state['chat_question'] = '' + st.session_state['chat_followup_questions'] = [] + st.session_state['do_not_process_question'] = False + st.session_state['tab_context'] = 'Not opened yet' + # Initialize chat history +if 'chat_question' not in st.session_state: + st.session_state['chat_question'] = '' +if 'chat_askedquestion' not in st.session_state: + st.session_state.chat_askedquestion = '' if 'chat_history' not in st.session_state: st.session_state['chat_history'] = [] -if 'source_documents' not in st.session_state: - st.session_state['source_documents'] = [] +if 'chat_source_documents' not in st.session_state: + st.session_state['chat_source_documents'] = [] if 'chat_context' not in st.session_state: st.session_state['chat_context'] = [] - -context_show_options = ('extracted context only', 'context within full source document') -if 'context_show_option' not in st.session_state: - st.session_state['context_show_option'] = 'context within full source document' +if 'chat_followup_questions' not in st.session_state: + st.session_state['chat_followup_questions'] = [] +if 'input_message_key' not in st.session_state: + st.session_state ['input_message_key'] = 1 + +if 'do_not_process_question' not in st.session_state: + st.session_state['do_not_process_question'] = False + +chat_context_show_options = ('extracted context only', 'context within full source document') +if 'chat_context_show_option' not in st.session_state: + st.session_state['chat_context_show_option'] = 'context within full source document' + +if 'tab_context' not in st.session_state: + st.session_state['tab_context'] = 'Not opened yet' +else: + if st.session_state['chat_question'] != '' and st.session_state['tab_context'] != 'Not opened yet' and st.session_state['tab_context'] != 'Open_Queries': + st.session_state['tab_context'] = 'Chat' +tmp=st.session_state['tab_context'] +tmp2=st.session_state['chat_question'] llm_helper = LLMHelper() -if 'askedquestion' not in st.session_state: - st.session_state.askedquestion = '' -def questionAsked(): - st.session_state.askedquestion = st.session_state.input +def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): + htmlstr = """ """ + + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + components.html(f"{htmlstr}", height=0, width=0) -# Chat -input_text = 
st.text_input("You: ", placeholder="type your question", key="input", on_change=questionAsked) -clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +def questionAsked(): + st.session_state.chat_askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])] + +# Display the context(s) associated with a source document used to andwer, with automaic scroll to the yellow highlighted context def display_iframe(filename, link, contextList): - if st.session_state['context_show_option'] == 'context within full source document': + st.session_state['do_not_process_question'] = True + st.session_state['chat_askedquestion'] = st.session_state.chat_question + if st.session_state['chat_context_show_option'] == 'context within full source document': try: response = requests.get(link) text = response.text @@ -79,46 +115,96 @@ def display_iframe(filename, link, contextList): """ - if st.button("Close"): + def close_iframe(): placeholder.empty() + st.session_state['do_not_process_question'] = True + + st.button("Close", on_click=close_iframe) placeholder = st.empty() with placeholder: htmlcontent = html_content.format(filename=filename, text=text) components.html(htmlcontent, height=500) + pass -if st.session_state.askedquestion: - question = st.session_state.askedquestion - st.session_state.askedquestion = "" - question, result, context, sources = llm_helper.get_semantic_answer_lang_chain(question, st.session_state['chat_history']) - st.session_state['chat_history'].append((question, result)) - st.session_state['source_documents'].append(sources) +# Callback to assign the follow-up question is selected by the user +def ask_followup_question(followup_question): + st.session_state.chat_askedquestion = followup_question + st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 + +tmp=st.session_state['tab_context'] +tmp2=st.session_state['chat_question'] +# Reset the right asked question to the input box when this page is reopened after switching to the OpenAI_Queries page +if st.session_state['tab_context'] != 'Chat' and st.session_state['chat_question'] != '' and st.session_state['chat_question'] != st.session_state['chat_askedquestion']: + st.session_state['tab_context'] = 'Chat' + st.session_state['do_not_process_question'] = True + ask_followup_question(st.session_state['chat_question']) + + +# Chat +input_text = st.text_input("You: ", placeholder="type your question", value=st.session_state.chat_askedquestion, key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) + +clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +ChangeButtonStyle("Clear chat", "#885555") + +def show_document_source(filename, link, contextList): + st.session_state['do_not_process_question'] = True + display_iframe(filename, link, contextList) + +# If a question is asked execute the request to get the result, context, sources and up to 3 follow-up questions proposals +if st.session_state.chat_askedquestion and st.session_state.do_not_process_question != True: + st.session_state['chat_question'] = st.session_state.chat_askedquestion + st.session_state.chat_askedquestion = "" + st.session_state['chat_question'], result, context, sources = llm_helper.get_semantic_answer_lang_chain(st.session_state['chat_question'], st.session_state['chat_history']) + result, chat_followup_questions_list = llm_helper.extract_followupquestions(result) + 
st.session_state['chat_history'].append((st.session_state['chat_question'], result)) + st.session_state['chat_source_documents'].append(sources) st.session_state['chat_context'].append(context) + st.session_state['chat_followup_questions'] = chat_followup_questions_list + +st.session_state['do_not_process_question'] = False - +# Displays the chat history if st.session_state['chat_history']: history_range = range(len(st.session_state['chat_history'])-1, -1, -1) for i in range(len(st.session_state['chat_history'])-1, -1, -1): - # message(st.session_state['chat_history'][i][1], key=str(i)) + # This history entry is the latest one - also show follow-up questions, buttons to access source(s) context(s) if i == history_range.start: - answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['source_documents'][i]) + answer_with_citations, sourceList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i]) st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(\d+)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html + + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html message(answer_with_citations, key=str(i)) - st.session_state['context_show_option'] = st.selectbox( + # Selectbox to choose how to display the context(s) associated with the clicked source document name + st.session_state['chat_context_show_option'] = st.selectbox( 'Choose how to display context used to answer the question when clicking on a document source below:', - context_show_options, - index=context_show_options.index(st.session_state['context_show_option']) + chat_context_show_options, + index=chat_context_show_options.index(st.session_state['chat_context_show_option']) ) - + + # Buttons to display the context(s) associated with the clicked source document name for id in range(len(sourceList)): - if st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id]): - display_iframe(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]]) + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) + + # Display proposed follow-up questions which can be clicked on to ask that question automatically + if len(st.session_state['chat_followup_questions']) > 0: + st.markdown('**Proposed follow-up questions:**') + with st.container(): + for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): + if followup_question: + st.button(followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, )) + for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): + if followup_question: + ChangeButtonStyle(followup_question, "#5555FF", wch_border_style='none') + + # The old questions and answers within the history else: - st.markdown(f'\n\nSources: {st.session_state["source_documents"][i]}') - message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does 
not get Latex nor html + message(answer_with_citations, key=str(i)) + st.markdown(f'\n\nSources: {st.session_state["chat_source_documents"][i]}') + message(st.session_state['chat_history'][i][0], is_user=True, key=str(i) + '_user') diff --git a/code/pages/01_Add_Document.py b/code/pages/01_Add_Document.py index 7a85b8b..ea0461c 100644 --- a/code/pages/01_Add_Document.py +++ b/code/pages/01_Add_Document.py @@ -4,13 +4,15 @@ import requests import mimetypes import traceback +import chardet from utilities.helper import LLMHelper import uuid from redis.exceptions import ResponseError + def upload_text_and_embeddings(): file_name = f"{uuid.uuid4()}.txt" - source_url = llm_helper.blob_client.upload_file(st.session_state['doc_text'], file_name=file_name, content_type='text/plain') + source_url = llm_helper.blob_client.upload_file(st.session_state['doc_text'], file_name=file_name, content_type='text/plain; charset=utf-8') llm_helper.add_embeddings_lc(source_url) st.success("Embeddings added successfully.") @@ -27,7 +29,6 @@ def remote_convert_files_and_add_embeddings(process_all=False): except Exception as e: st.error(traceback.format_exc()) - def delete_row(): st.session_state['data_to_drop'] redisembeddings.delete_document(st.session_state['data_to_drop']) @@ -39,6 +40,14 @@ def add_urls(): llm_helper.add_embeddings_lc(url) st.success(f"Embeddings added successfully for {url}") +def upload_file(bytes_data: bytes, file_name: str): + # Upload a new file + st.session_state['filename'] = file_name + content_type = mimetypes.MimeTypes().guess_type(file_name)[0] + charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else '' + st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type+charset) + + try: # Set page layout to wide screen and menu item menu_items = { @@ -62,11 +71,7 @@ def add_urls(): bytes_data = uploaded_file.getvalue() if st.session_state.get('filename', '') != uploaded_file.name: - # Upload a new file - st.session_state['filename'] = uploaded_file.name - content_type = mimetypes.MimeTypes().guess_type(uploaded_file.name)[0] - st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type) - + upload_file(bytes_data, uploaded_file.name) converted_filename = '' if uploaded_file.name.endswith('.txt'): # Add the text to the embeddings @@ -99,9 +104,7 @@ def add_urls(): if st.session_state.get('filename', '') != up.name: # Upload a new file - st.session_state['filename'] = up.name - content_type = mimetypes.MimeTypes().guess_type(up.name)[0] - st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type) + upload_file(bytes_data, up.name) if up.name.endswith('.txt'): # Add the text to the embeddings llm_helper.blob_client.upsert_blob_metadata(up.name, {'converted': "true"}) diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index 2f27fd1..d854dac 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -5,6 +5,10 @@ Please reply to the question using only the information present in the text above. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. 
[[info1.pdf]][[info2.txt]]. If you can't find it, reply politely that the information is not in the knowledge base. +After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook. +Only use double angle brackets to reference the questions, e.g. <>. +Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'. +Try not to repeat questions that have already been asked. Question: {question} Answer:""" diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 4caa736..2bcf8f7 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -138,6 +138,26 @@ def filter_sourcesLinks(self, sources): return sources + def extract_followupquestions(self, answer): + followupTag = answer.find('Follow-up Questions') + folloupQuestions = answer.find('<<') + + # take min of followupTag and folloupQuestions if not -1 to avoid taking the followup questions if there is no followupTag + followupTag = min(followupTag, folloupQuestions) if followupTag != -1 and folloupQuestions != -1 else max(followupTag, folloupQuestions) + answer_without_followupquestions = answer[:followupTag] if followupTag != -1 else answer + followup_questions = answer[followupTag:].strip() if followupTag != -1 else '' + + # Extract the followup questions as a list + pattern = r'\<\<(.*?)\>\>' + match = re.search(pattern, followup_questions) + followup_questions_list = [] + while match: + followup_questions_list.append(followup_questions[match.start()+2:match.end()-2]) + followup_questions = followup_questions[match.end():] + match = re.search(pattern, followup_questions) + + return answer_without_followupquestions, followup_questions_list + def insert_citations_in_answer(self, answer, filenameList): pattern = r'\[\[(.*?)\]\]' match = re.search(pattern, answer) @@ -172,7 +192,7 @@ def get_semantic_answer_lang_chain(self, question, chat_history): contextDict[source_key].append(res.page_content) sources = "\n".join(set(map(lambda x: x.metadata["source"], result['source_documents']))) - + result['answer'] = result['answer'].split('SOURCES:')[0].split('Sources:')[0].split('SOURCE:')[0].split('Source:')[0] sources = sources.replace('_SAS_TOKEN_PLACEHOLDER_', container_sas) @@ -204,4 +224,4 @@ def get_links_filenames(self, answer, sources): filename = src[1:].split(']')[0] # retrieve the source filename filenameList.append(filename) answer = self.insert_citations_in_answer(answer, filenameList) # Add (1), (2), (3) to the answer to indicate the source of the answer - return answer, srcList, linkList, filenameList + return answer, srcList, linkList, filenameList \ No newline at end of file From b244859d8ccd0a27931eb1b6474afd508e39a05d Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 11 Apr 2023 13:57:04 +0200 Subject: [PATCH 06/17] Source Buttons styling and handling buttons with quotes in the name and handling latin characters in answers --- code/OpenAI_Queries.py | 36 +++++++++++++++++++++++++----------- code/pages/00_Chat.py | 29 ++++++++++++++++++++--------- code/utilities/helper.py | 25 ++++++++++++++++++++++--- 3 files changed, 67 insertions(+), 23 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 1dfff4c..cb5d785 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -8,6 +8,7 @@ from utilities.helper import LLMHelper import requests +import regex as re import logging logger = 
logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING) @@ -68,8 +69,10 @@ def check_deployment(): def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): htmlstr = """ """ + // console.log(str_wgt_txt + ' ( ' + element_type + ' ) : ' + parentNode + ' ( ' + parent_type + ' , ' + parentNode.innerText + ' )'); + if (element_type == 'BUTTON') {{ + elements[i].style.color = '{wch_hex_colour}'; + let border_style = '{wch_border_style}'; + if (border_style.length > 0) {{ + elements[i].style.border ='{wch_border_style}'; + elements[i].style.outline ='{wch_border_style}'; + elements[i].addEventListener('focus', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px #FFFFFF'; + this.style.backgroundColor = "#FFFFFF"; + // console.log(this.innerText + ' FOCUS'); + }}); + elements[i].addEventListener('hover', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px #FFFFFF'; + this.style.backgroundColor = "#FFFFFF"; + // console.log(this.innerText + ' HOVER'); + }}); + }} + if ('{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} + else if (element_type == 'P' && '{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} }} + """ - htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize) components.html(f"{htmlstr}", height=0, width=0) @@ -134,6 +166,7 @@ def close_iframe(): # Callback to assign the follow-up question is selected by the user def ask_followup_question(followup_question): + st.session_state['tab_context'] = 'Chat' # Prevents side effect when first click after loading the page st.session_state.chat_askedquestion = followup_question st.session_state['input_message_key'] = st.session_state['input_message_key'] + 1 @@ -145,10 +178,11 @@ def ask_followup_question(followup_question): # Chat +clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) +ChangeButtonStyle("Clear chat", "#ADCDE7", wch_border_style="none", wch_textsize="10px") + input_text = st.text_input("You: ", placeholder="type your question", value=st.session_state.chat_askedquestion, key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked) -clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data) -ChangeButtonStyle("Clear chat", "#885555") def show_document_source(filename, link, contextList): st.session_state['do_not_process_question'] = True @@ -179,8 +213,11 @@ def show_document_source(filename, link, contextList): answer_with_citations, sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i]) st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,) - answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html - message(answer_with_citations, key=str(i)) + answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]).strip() # message() does not get Latex nor html + # message(answer_with_citations key=str(i)) + answer_message_height = int((len(answer_with_citations) / 22) * 1.1 * 8) + 
st.text_area(label='', value=answer_with_citations, height=answer_message_height, key=str(i))
+            st.write("<br>", unsafe_allow_html=True)
+
+            # Display proposed follow-up questions which can be clicked on to ask that question automatically
+            if len(st.session_state['chat_followup_questions']) > 0:
+                st.markdown('**Proposed follow-up questions:**')
+                with st.container():
+                    for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']):
+                        if followup_question:
+                            str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
+                            st.button(str_followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, ))
+
+            if len(sourceList) > 0:
+                st.write("<br><br>
", unsafe_allow_html=True) + # Selectbox to choose how to display the context(s) associated with the clicked source document name + st.session_state['chat_context_show_option'] = st.selectbox( + 'Choose how to display context used to answer the question when clicking on a document source below:', + chat_context_show_options, + index=chat_context_show_options.index(st.session_state['chat_context_show_option']) + ) - # Buttons to display the context(s) associated with the clicked source document name - for id in range(len(sourceList)): - st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) + # Buttons to display the context(s) associated with the clicked source document name + for id in range(len(sourceList)): + st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['chat_context'][i][sourceList[id]], )) - # Source Buttons Styles - for id in range(len(sourceList)): - if filenameList[id] in matchedSourcesList: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none') - else: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#884422", wch_border_style='none') + # Source Buttons Styles + for id in range(len(sourceList)): + if filenameList[id] in matchedSourcesList: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px') + else: + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px') for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']): if followup_question: str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) - ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none') + ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px') # The old questions and answers within the history From def582eabb7a5bcf967b3156130f25cd9b251efc Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Mon, 17 Apr 2023 10:12:36 +0200 Subject: [PATCH 10/17] Synching with latest main branch --- README.md | 6 +++++- code/OpenAI_Queries.py | 10 ++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 49a380c..7175228 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,9 @@ Therefore, we have provided a way for you to continue using the previous format If you want to move to the new format, please go to: - "Add Document" -> "Add documents in Batch" and click on "Convert all files and add embeddings" to reprocess your documents. +# Use the Repo with Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4) +By default, the repo uses an Instruction based model (like text-davinci-003) for QnA and Chat experience. +If you want to use a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4), please change the environment variables as described [here](#environment-variables) # Running this repo You have multiple options to run the code: @@ -159,7 +162,8 @@ Here is the explanation of the parameters: | App Setting | Value | Note | | --- | --- | ------------- | -|OPENAI_ENGINE|text-davinci-003|Instruction engine deployed in your Azure OpenAI resource| +|OPENAI_ENGINE|text-davinci-003|Engine deployed in your Azure OpenAI resource. E.g. 
Instruction based model: text-davinci-003 or Chat based model: gpt-35-turbo or gpt-4-32k or gpt-4. Please use the deployment name and not the model name.| +|OPENAI_DEPLOYMENT_TYPE | Text | Text for Instruction engines (text-davinci-003),
Chat for Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4) | |OPENAI_EMBEDDINGS_ENGINE_DOC | text-embedding-ada-002 | Embedding engine for documents deployed in your Azure OpenAI resource| |OPENAI_EMBEDDINGS_ENGINE_QUERY | text-embedding-ada-002 | Embedding engine for query deployed in your Azure OpenAI resource| |OPENAI_API_BASE | https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/ | Your Azure OpenAI Resource name. Get it in the [Azure Portal](https://portal.azure.com)| diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index a5f046c..05205a5 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -18,11 +18,13 @@ def check_deployment(): #\ 1. Check if the llm is working try: llm_helper = LLMHelper() - llm_helper.llm("Generate a joke!") + llm_helper.get_completion("Generate a joke!") st.success("LLM is working!") except Exception as e: - st.error(f"""LLM is not working. - Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}. + st.error(f"""LLM is not working. + Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}. + If you are using an Instructions based deployment (text-davinci-003), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Text or delete the environment variable OPENAI_DEPLOYMENT_TYPE. + If you are using a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Chat. Then restart your application. """) st.error(traceback.format_exc()) @@ -33,7 +35,7 @@ def check_deployment(): st.success("Embedding is working!") except Exception as e: st.error(f"""Embedding model is not working. - Please check you have a deployment name {llm_helper.model} in your Azure OpenAI resource {llm_helper.api_base}. + Please check you have a deployment named "text-embedding-ada-002" for "text-embedding-ada-002" model in your Azure OpenAI resource {llm_helper.api_base}. Then restart your application. 
""") st.error(traceback.format_exc()) From 849a6f935c5923885f9f4a143cac5d7064130a0e Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Mon, 17 Apr 2023 16:07:18 +0200 Subject: [PATCH 11/17] Handling brwoser's dark mode and enhancing Prompt engineering --- code/OpenAI_Queries.py | 75 ++++++++++++++++++++++++++++------ code/pages/00_Chat.py | 26 +++++++++--- code/utilities/customprompt.py | 6 ++- 3 files changed, 88 insertions(+), 19 deletions(-) diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py index 05205a5..e9862fb 100644 --- a/code/OpenAI_Queries.py +++ b/code/OpenAI_Queries.py @@ -69,21 +69,56 @@ def check_deployment(): st.error(traceback.format_exc()) -def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''): - htmlstr = """ """ + // console.log(str_wgt_txt + ' ( ' + element_type + ' ) : ' + parentNode + ' ( ' + parent_type + ' , ' + parentNode.innerText + ' )'); + if (element_type == 'BUTTON') {{ + elements[i].style.color = '{wch_hex_colour}'; + let border_style = '{wch_border_style}'; + if (border_style.length > 0) {{ + elements[i].style.border ='{wch_border_style}'; + elements[i].style.outline ='{wch_border_style}'; + elements[i].addEventListener('focus', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px ' + backgroundColor; + this.style.backgroundColor = '"' + backgroundColor + '"'; + // console.log(this.innerText + ' FOCUS'); + }}); + elements[i].addEventListener('hover', function() {{ + this.style.outline = '{wch_border_style}'; + this.style.boxShadow = '0px 0px 0px ' + backgroundColor; + this.style.backgroundColor = '"' + backgroundColor + '"'; + // console.log(this.innerText + ' HOVER'); + }}); + }} + if ('{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} + else if (element_type == 'P' && '{wch_textsize}' != '') {{ + elements[i].style.fontSize = '{wch_textsize}'; + }} + }} }} + """ - htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style) + htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize) components.html(f"{htmlstr}", height=0, width=0) @st.cache_data() @@ -147,7 +182,7 @@ def get_languages(): col1, col2, col3 = st.columns([2,2,2]) with col1: - ChangeButtonStyle("Check deployment", "#885555") + ChangeButtonStyle("Check deployment", "#ADCDE7", wch_border_style="none", wch_textsize="10px") st.button("Check deployment", on_click=check_deployment) with col3: with st.expander("Settings"): @@ -218,6 +253,18 @@ def display_iframe(filename, link, contextList):
+
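The OPENAI_DEPLOYMENT_TYPE switch documented above boils down to choosing between the completion endpoint and the chat endpoint of the same Azure OpenAI resource. The sketch below illustrates the idea with the openai 0.x SDK called directly; the names and fallback defaults are illustrative, not the repo's exact helper code.

```python
import os
import openai

# Azure OpenAI wiring, mirroring the README's environment variables.
openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_version = "2023-03-15-preview"

DEPLOYMENT = os.getenv("OPENAI_ENGINE", "text-davinci-003")
DEPLOYMENT_TYPE = os.getenv("OPENAI_DEPLOYMENT_TYPE", "Text")

def get_completion(prompt: str) -> str:
    if DEPLOYMENT_TYPE == "Chat":
        # Chat deployments (gpt-35-turbo, gpt-4) only accept the chat endpoint.
        response = openai.ChatCompletion.create(
            engine=DEPLOYMENT,
            messages=[{"role": "user", "content": prompt}],
        )
        return response["choices"][0]["message"]["content"]
    # Instruction deployments (text-davinci-003) use plain completions.
    response = openai.Completion.create(engine=DEPLOYMENT, prompt=prompt, max_tokens=200)
    return response["choices"][0]["text"]
```

Keeping the branch in one place is what lets the "Check deployment" button surface a wrong OPENAI_DEPLOYMENT_TYPE as a single readable error instead of a stack trace.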
""" @@ -260,10 +307,12 @@ def close_iframe(): # Display the sources and context - even if the page is reloaded if st.session_state['sources'] or st.session_state['context']: st.session_state['response'], sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources']) + st.write("
", unsafe_allow_html=True) st.markdown("**Answer:**" + st.session_state['response']) # Display proposed follow-up questions which can be clicked on to ask that question automatically if len(st.session_state['followup_questions']) > 0: + st.write("
", unsafe_allow_html=True) st.markdown('**Proposed follow-up questions:**') with st.container(): for questionId, followup_question in enumerate(st.session_state['followup_questions']): @@ -273,11 +322,13 @@ def close_iframe(): if st.session_state['sources'] or st.session_state['context']: # Buttons to display the context used to answer + st.write("
", unsafe_allow_html=True) st.markdown('**Document sources:**') for id in range(len(sourceList)): st.button(f'({id+1}) {filenameList[id]}', key=filenameList[id], on_click=show_document_source, args=(filenameList[id], linkList[id], st.session_state['context'][sourceList[id]], )) # Details on the question and answer context + st.write("

", unsafe_allow_html=True) with st.expander("Question and Answer Context"): if not st.session_state['context'] is None and st.session_state['context'] != []: for content_source in st.session_state['context'].keys(): @@ -291,14 +342,14 @@ def close_iframe(): # Source Buttons Styles for id in range(len(sourceList)): if filenameList[id] in matchedSourcesList: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none') + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#228822", wch_border_style='none', wch_textsize='10px') else: - ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none') + ChangeButtonStyle(f'({id+1}) {filenameList[id]}', "#AAAAAA", wch_border_style='none', wch_textsize='10px') for questionId, followup_question in enumerate(st.session_state['followup_questions']): if followup_question: str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question) - ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none') + ChangeButtonStyle(str_followup_question, "#5555FF", wch_border_style='none', wch_textsize='14px') if st.session_state['translation_language'] and st.session_state['translation_language'] != '': st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية") diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index baf5b63..cdaa62e 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -52,7 +52,11 @@ def clear_chat_data(): def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''): - htmlstr = """ """ diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py index d854dac..17a3fab 100644 --- a/code/utilities/customprompt.py +++ b/code/utilities/customprompt.py @@ -2,9 +2,11 @@ from langchain.prompts import PromptTemplate template = """{summaries} -Please reply to the question using only the information present in the text above. -Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. +Please reply to the question using only the information present in the text above. +Detect the langage of the question and answer in the same language. If you can't find it, reply politely that the information is not in the knowledge base. +Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]]. +If asked for enumerations list all of them and do not invent any. After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook. Only use double angle brackets to reference the questions, e.g. <>. Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'. 
From 733a60cd8a6f8f16dd56630c2c4fd63a8b865af9 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Mon, 17 Apr 2023 20:30:20 +0200
Subject: [PATCH 12/17] Updating the prompt to be generic

---
 code/utilities/customprompt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/utilities/customprompt.py b/code/utilities/customprompt.py
index 17a3fab..a709863 100644
--- a/code/utilities/customprompt.py
+++ b/code/utilities/customprompt.py
@@ -7,7 +7,7 @@
 If you can't find it, reply politely that the information is not in the knowledge base.
 Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]].
 If asked for enumerations list all of them and do not invent any.
-After answering the question generate three very brief follow-up questions that the user would likely ask next about their healthcare plan and employee handbook.
+After answering the question generate three very brief follow-up questions that the user would likely ask next.
 Only use double angle brackets to reference the questions, e.g. <<Are there exclusions for prescriptions?>>.
 Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'.
 Try not to repeat questions that have already been asked.
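The template is consumed through LangChain's PromptTemplate, so the placeholders are ordinary format variables. A toy render, with a shortened stand-in template and a hypothetical source line, shows the mechanics:

```python
from langchain.prompts import PromptTemplate

# Shortened stand-in for the repo's template; only the plumbing is the point here.
template = """{summaries}

Please reply to the question using only the information present in the text above.

Question: {question}
Answer:"""

PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
print(PROMPT.format(summaries="plan.pdf: The deductible is $500.",
                    question="What is the deductible?"))
```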
From e8fdcde11fda37ee899a26465ea969eed67e9d29 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Mon, 17 Apr 2023 23:13:15 +0200
Subject: [PATCH 13/17] Removing CR from pattern for non-ascii characters

---
 code/OpenAI_Queries.py            |  2 --
 code/pages/00_Chat.py             |  2 --
 code/pages/04_Index_Management.py | 63 ++++++++++++++++++++++++++++-
 code/utilities/helper.py          | 55 ++++++++++++++++++++++-----
 4 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py
index e9862fb..fc349d7 100644
--- a/code/OpenAI_Queries.py
+++ b/code/OpenAI_Queries.py
@@ -260,9 +260,7 @@ def display_iframe(filename, link, contextList):
             let textColor = '#222222';
             if (prefersDark) {{ textColor = '#EEEEEE'; }}
             var body = frame.contentWindow.document.querySelector('body');
-            console.log(textColor);
             body.style.color = textColor;
-            console.log(body.style.color);
         }};

diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index cdaa62e..7843f73 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -157,9 +157,7 @@ def display_iframe(filename, link, contextList):
             let textColor = '#222222';
             if (prefersDark) {{ textColor = '#EEEEEE'; }}
             var body = frame.contentWindow.document.querySelector('body');
-            console.log(textColor);
             body.style.color = textColor;
-            console.log(body.style.color);
         }};

diff --git a/code/pages/04_Index_Management.py b/code/pages/04_Index_Management.py
index e2e7892..0cb61cf 100644
--- a/code/pages/04_Index_Management.py
+++ b/code/pages/04_Index_Management.py
@@ -2,6 +2,7 @@
 import os
 import traceback
 from utilities.helper import LLMHelper
+import streamlit.components.v1 as components
 
 def delete_embedding():
     llm_helper.vector_store.delete_keys([f"doc:{st.session_state['embedding_to_drop']}"])
@@ -11,6 +12,58 @@ def delete_file():
     embeddings_to_delete = list(map(lambda x: f"doc:{x}", embeddings_to_delete))
     llm_helper.vector_store.delete_keys(embeddings_to_delete)
 
+def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '', wch_textsize=''):
+    htmlstr = """ """
+    htmlstr = htmlstr.format(wgt_txt=wgt_txt, wch_hex_colour=wch_hex_colour, wch_border_style=wch_border_style, wch_textsize=wch_textsize)
+    components.html(f"{htmlstr}", height=0, width=0)
+
 try:
     # Set page layout to wide screen and menu item
     menu_items = {
@@ -28,32 +81,40 @@ def delete_file():
 
     # Query RediSearch to get all the embeddings
     data = llm_helper.get_all_documents(k=1000)
 
-    if len(data) == 0:
+    nb_embeddings = len(data)
+
+    if nb_embeddings == 0:
         st.warning("No embeddings found. Go to the 'Add Document' tab to insert your docs.")
     else:
         st.dataframe(data, use_container_width=True)
         st.download_button("Download data", data.to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings')
+        ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="10px")
 
         st.text("")
         st.text("")
         col1, col2, col3, col4 = st.columns([3,2,2,1])
         with col1:
             st.selectbox("Embedding id to delete", data.get('key',[]), key="embedding_to_drop")
+            # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="10px")
         with col2:
             st.text("")
             st.text("")
             st.button("Delete embedding", on_click=delete_embedding)
+            ChangeButtonStyle("Delete embedding", "#ADCDE7", wch_textsize="10px")
         with col3:
             st.selectbox("File name to delete", set(data.get('filename',[])), key="file_to_drop")
+            # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="10px")
         with col4:
             st.text("")
             st.text("")
             st.button("Delete file", on_click=delete_file)
+            ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="10px")
 
         st.text("")
         st.text("")
         st.button("Delete all embeddings", on_click=llm_helper.vector_store.delete_keys_pattern, args=("doc*",), type="secondary")
+        ChangeButtonStyle("Delete all embeddings", "#ADCDE7", wch_textsize="10px")
 
 except Exception as e:
     st.error(traceback.format_exc())

diff --git a/code/utilities/helper.py b/code/utilities/helper.py
index 2334d21..965b277 100644
--- a/code/utilities/helper.py
+++ b/code/utilities/helper.py
@@ -9,6 +9,7 @@
 from langchain.llms import AzureOpenAI
 from langchain.vectorstores.base import VectorStore
 from langchain.chains import ChatVectorDBChain
+from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from langchain.chains.llm import LLMChain
 from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT
@@ -17,6 +18,9 @@
 from langchain.text_splitter import TokenTextSplitter, TextSplitter
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders import TextLoader
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import AIMessage, HumanMessage, SystemMessage
+
 from utilities.formrecognizer import AzureFormRecognizerClient
 from utilities.azureblobstorage import AzureBlobStorageClient
@@ -27,6 +31,7 @@
 import pandas as pd
 import urllib
+from fake_useragent import UserAgent
 
 class LLMHelper:
     def __init__(self, document_loaders : BaseLoader = None,
@@ -43,7 +48,7 @@ def __init__(self,
         load_dotenv()
         openai.api_type = "azure"
         openai.api_base = os.getenv('OPENAI_API_BASE')
-        openai.api_version = "2022-12-01"
+        openai.api_version = "2023-03-15-preview"
         openai.api_key = os.getenv("OPENAI_API_KEY")
 
         # Azure OpenAI settings
@@ -52,6 +57,7 @@
         self.index_name: str = "embeddings"
         self.model: str = os.getenv('OPENAI_EMBEDDINGS_ENGINE_DOC', "text-embedding-ada-002")
         self.deployment_name: str = os.getenv("OPENAI_ENGINE", os.getenv("OPENAI_ENGINES", "text-davinci-003"))
+        self.deployment_type: str = os.getenv("OPENAI_DEPLOYMENT_TYPE", "Text")
 
         # Vector store settings
         self.vector_store_address: str = os.getenv('REDIS_ADDRESS', "localhost")
@@ -69,7 +75,10 @@
         self.document_loaders: BaseLoader = WebBaseLoader if document_loaders is None else document_loaders
         self.text_splitter: TextSplitter = TokenTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) if text_splitter is None else text_splitter
         self.embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=self.model, chunk_size=1) if embeddings is None else embeddings
-        self.llm: AzureOpenAI = AzureOpenAI(deployment_name=self.deployment_name) if llm is None else llm
+        if self.deployment_type == "Chat":
+            self.llm: ChatOpenAI = ChatOpenAI(model_name=self.deployment_name, engine=self.deployment_name) if llm is None else llm
+        else:
+            self.llm: AzureOpenAI = AzureOpenAI(deployment_name=self.deployment_name) if llm is None else llm
 
         self.vector_store: RedisExtended = RedisExtended(redis_url=self.vector_store_full_address, index_name=self.index_name, embedding_function=self.embeddings.embed_query) if vector_store is None else vector_store
         self.k : int = 3 if k is None else k
@@ -78,16 +87,34 @@
         self.enable_translation : bool = False if enable_translation is None else enable_translation
         self.translator : AzureTranslatorClient = AzureTranslatorClient() if translator is None else translator
 
+        self.user_agent: UserAgent = UserAgent()
+        self.user_agent.random
 
     def add_embeddings_lc(self, source_url):
         try:
             documents = self.document_loaders(source_url).load()
+
+            # Convert to UTF-8 encoding for non-ascii text
+            for(document) in documents:
+                try:
+                    if document.page_content.encode("iso-8859-1") == document.page_content.encode("latin-1"):
+                        document.page_content = document.page_content.encode("iso-8859-1").decode("utf-8", errors="ignore")
+                except:
+                    pass
 
             docs = self.text_splitter.split_documents(documents)
+
+            # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
+            # pattern = re.compile(r'[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')
+            pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')    # do not remove \x0a (\n) nor \x0d (\r)
+            for(doc) in docs:
+                doc.page_content = re.sub(pattern, '', doc.page_content)
+
             keys = []
             for i, doc in enumerate(docs):
                 # Create a unique key for the document
                 source_url = source_url.split('?')[0]
                 filename = "/".join(source_url.split('/')[4:])
                 hash_key = hashlib.sha1(f"{source_url}_{i}".encode('utf-8')).hexdigest()
+                hash_key = f"doc:{self.index_name}:{hash_key}"
                 keys.append(hash_key)
                 doc.metadata = {"source": f"[{source_url}]({source_url}_SAS_TOKEN_PLACEHOLDER_)" , "chunk": i, "key": hash_key, "filename": filename}
             self.vector_store.add_documents(documents=docs, redis_url=self.vector_store_full_address, index_name=self.index_name, keys=keys)
@@ -103,7 +130,7 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translation
 
         # Upload the text to Azure Blob Storage
         converted_filename = f"converted/{filename}.txt"
-        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain')
+        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
         print(f"Converted file uploaded to {source_url} with filename {filename}")
 
         # Update the metadata to indicate that the file has been converted
@@ -140,10 +167,10 @@ def filter_sourcesLinks(self, sources):
 
     def extract_followupquestions(self, answer):
         followupTag = answer.find('Follow-up Questions')
-        folloupQuestions = answer.find('<<')
+        followupQuestions = answer.find('<<')
 
         # take min of followupTag and followupQuestions if not -1 to avoid taking the followup questions if there is no followupTag
-        followupTag = min(followupTag, folloupQuestions) if followupTag != -1 and folloupQuestions != -1 else max(followupTag, folloupQuestions)
+        followupTag = min(followupTag, followupQuestions) if followupTag != -1 and followupQuestions != -1 else max(followupTag, followupQuestions)
 
         answer_without_followupquestions = answer[:followupTag] if followupTag != -1 else answer
         followup_questions = answer[followupTag:].strip() if followupTag != -1 else ''
@@ -155,6 +182,11 @@ def extract_followupquestions(self, answer):
                 followup_questions_list.append(followup_questions[match.start()+2:match.end()-2])
                 followup_questions = followup_questions[match.end():]
                 match = re.search(pattern, followup_questions)
+
+        # Special case when 'Follow-up questions:' appears in the answer after the <<
+        followupTag = answer_without_followupquestions.find('Follow-up Questions')
+        if followupTag != -1:
+            answer_without_followupquestions = answer_without_followupquestions[:followupTag]
 
         return answer_without_followupquestions, followup_questions_list
@@ -184,12 +216,12 @@ def insert_citations_in_answer(self, answer, filenameList):
 
     def get_semantic_answer_lang_chain(self, question, chat_history):
         question_generator = LLMChain(llm=self.llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False)
         doc_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff", verbose=False, prompt=PROMPT)
-        chain = ChatVectorDBChain(
-            vectorstore=self.vector_store,
+        chain = ConversationalRetrievalChain(
+            retriever=self.vector_store.as_retriever(),
             question_generator=question_generator,
             combine_docs_chain=doc_chain,
             return_source_documents=True,
-            top_k_docs_for_context= self.k
+            # top_k_docs_for_context= self.k
         )
         result = chain({"question": question, "chat_history": chat_history})
         container_sas = self.blob_client.get_container_sas()
@@ -219,7 +251,10 @@ def get_embeddings_model(self):
         }
 
     def get_completion(self, prompt, **kwargs):
-        return self.llm(prompt)
+        if self.deployment_type == 'Chat':
+            return self.llm([HumanMessage(content=prompt)]).content
+        else:
+            return self.llm(prompt)
 
     def get_links_filenames(self, answer, sources):
         split_sources = sources.split(' \n ')   # sources are expected to be of format ' \n [filename1.ext](sourcelink1) \n [filename2.ext](sourcelink2) \n [filename3.ext](sourcelink3) \n '
@@ -243,4 +278,4 @@ def clean_encoding(self, text):
             reencodedtext = reencodedtext.decode('utf-8')
         except Exception as e:
             reencodedtext = text
-        return reencodedtext
\ No newline at end of file
+        return reencodedtext
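The biggest structural change in this patch is swapping the deprecated ChatVectorDBChain for ConversationalRetrievalChain. The shape of that migration, reduced to a sketch (parameter names from langchain 0.0.x; the llm, vector store, and prompt are assumed to exist):

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT

def build_qa_chain(llm, vector_store, prompt):
    """Sketch: ChatVectorDBChain -> ConversationalRetrievalChain migration."""
    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
    doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)
    return ConversationalRetrievalChain(
        retriever=vector_store.as_retriever(),  # was: vectorstore=vector_store
        question_generator=question_generator,
        combine_docs_chain=doc_chain,
        return_source_documents=True,
    )

# Usage: result = build_qa_chain(llm, store, PROMPT)({"question": q, "chat_history": []})
```

Note the trade-off the patch comments out: top_k_docs_for_context has no direct equivalent on the new chain; the document count is now configured on the retriever instead (e.g. as_retriever(search_kwargs={'k': 3})).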
contextSpan = f" {context}" + contextSpan = f" {context}" text = text.replace(context, contextSpan) text = text.replace('\n', '

') diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py index 7843f73..57b6d71 100644 --- a/code/pages/00_Chat.py +++ b/code/pages/00_Chat.py @@ -118,7 +118,7 @@ def display_iframe(filename, link, contextList): text = llm_helper.clean_encoding(text) for i, context in enumerate(contextList): context = llm_helper.clean_encoding(context) - contextSpan = f" {context}" + contextSpan = f" {context}" text = text.replace(context, contextSpan) text = text.replace('\n', '

') diff --git a/code/pages/04_Index_Management.py b/code/pages/04_Index_Management.py index 0cb61cf..b86ca87 100644 --- a/code/pages/04_Index_Management.py +++ b/code/pages/04_Index_Management.py @@ -89,14 +89,14 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '' st.dataframe(data, use_container_width=True) st.download_button("Download data", data.to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings') - ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="10px") + ChangeButtonStyle("Download data", "#ADCDE7", wch_textsize="12px") st.text("") st.text("") col1, col2, col3, col4 = st.columns([3,2,2,1]) with col1: st.selectbox("Embedding id to delete", data.get('key',[]), key="embedding_to_drop") - # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="10px") + # ChangeButtonStyle("Embedding id to delete", "#ADCDE7", wch_textsize="12px") with col2: st.text("") st.text("") @@ -104,12 +104,12 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = '' ChangeButtonStyle("Delete embedding", "#ADCDE7", wch_textsize="10px") with col3: st.selectbox("File name to delete", set(data.get('filename',[])), key="file_to_drop") - # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="10px") + # ChangeButtonStyle("File name to delete", "#ADCDE7", wch_textsize="12px") with col4: st.text("") st.text("") st.button("Delete file", on_click=delete_file) - ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="10px") + ChangeButtonStyle("Delete file", "#ADCDE7", wch_textsize="12px") st.text("") st.text("") diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 965b277..6989138 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -103,7 +103,6 @@ def add_embeddings_lc(self, source_url): docs = self.text_splitter.split_documents(documents) # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half) - # pattern = re.compile(r'[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r) for(doc) in docs: doc.page_content = re.sub(pattern, '', doc.page_content) From 028858ec8d1a492a2169f99f32c4f414a12ea566 Mon Sep 17 00:00:00 2001 From: Philippe Limantour Date: Tue, 18 Apr 2023 09:39:56 +0200 Subject: [PATCH 15/17] Applying pattern to converted txt --- code/utilities/helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/utilities/helper.py b/code/utilities/helper.py index 6989138..5088e5c 100644 --- a/code/utilities/helper.py +++ b/code/utilities/helper.py @@ -128,8 +128,12 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translati text = list(map(lambda x: self.translator.translate(x), text)) if self.enable_translation else text # Upload the text to Azure Blob Storage + converted_text = "n".join(text) + # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half) + pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]') # do not remove \x0a (\n) nor \x0d (\r) + converted_text = re.sub(pattern, '', converted_text) converted_filename = f"converted/{filename}.txt" - source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; 
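The control-character class that this patch settles on (and that patch 15 below reuses for converted text) is easier to verify in isolation. This snippet uses the exact pattern from the diff with made-up sample input:

```python
import re

# Exact pattern from the patch: strips control characters and stray symbol
# ranges, but deliberately keeps \x0a (\n) and \x0d (\r) so line breaks survive.
pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')

sample = "Line one\x00\x08\nLine two\u2028still here\uffff"
print(re.sub(pattern, '', sample))  # -> "Line one\nLine twostill here"
```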
From 028858ec8d1a492a2169f99f32c4f414a12ea566 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 09:39:56 +0200
Subject: [PATCH 15/17] Applying pattern to converted txt

---
 code/utilities/helper.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/code/utilities/helper.py b/code/utilities/helper.py
index 6989138..5088e5c 100644
--- a/code/utilities/helper.py
+++ b/code/utilities/helper.py
@@ -128,8 +128,12 @@ def convert_file_and_add_embeddings(self, source_url, filename, enable_translation
         text = list(map(lambda x: self.translator.translate(x), text)) if self.enable_translation else text
 
         # Upload the text to Azure Blob Storage
+        converted_text = "\n".join(text)
+        # Remove half non-ascii character from start/end of doc content (langchain TokenTextSplitter may split a non-ascii character in half)
+        pattern = re.compile(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]')   # do not remove \x0a (\n) nor \x0d (\r)
+        converted_text = re.sub(pattern, '', converted_text)
         converted_filename = f"converted/{filename}.txt"
-        source_url = self.blob_client.upload_file("\n".join(text), f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
+        source_url = self.blob_client.upload_file(converted_text, f"converted/{filename}.txt", content_type='text/plain; charset=utf-8')
         print(f"Converted file uploaded to {source_url} with filename {filename}")
 
         # Update the metadata to indicate that the file has been converted

From 718bbd5fae90fa1164f1e237f718083ae899ed95 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 13:23:50 +0200
Subject: [PATCH 16/17] Bug correction on asking second question of the session

---
 code/pages/00_Chat.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index 57b6d71..f32e81e 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -106,6 +106,7 @@ def ChangeButtonStyle(wgt_txt, wch_hex_colour = '#000000', wch_border_style = ''
 
 def questionAsked():
     st.session_state.chat_askedquestion = st.session_state["input"+str(st.session_state ['input_message_key'])]
+    st.session_state.chat_question = st.session_state.chat_askedquestion
 
 # Display the context(s) associated with a source document used to answer, with automatic scroll to the yellow highlighted context
 def display_iframe(filename, link, contextList):

From 2b8cee2b7ee251d9a1617f4c498d181e26a29ee3 Mon Sep 17 00:00:00 2001
From: Philippe Limantour
Date: Tue, 18 Apr 2023 13:41:27 +0200
Subject: [PATCH 17/17] Increasing timeout before autoscroll to leave time for
 content to be fully loaded

---
 code/OpenAI_Queries.py | 4 ++--
 code/pages/00_Chat.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/code/OpenAI_Queries.py b/code/OpenAI_Queries.py
index 1adfe4b..ae0381e 100644
--- a/code/OpenAI_Queries.py
+++ b/code/OpenAI_Queries.py
@@ -238,7 +238,7 @@ def display_iframe(filename, link, contextList):
 
diff --git a/code/pages/00_Chat.py b/code/pages/00_Chat.py
index f32e81e..42d2cd3 100644
--- a/code/pages/00_Chat.py
+++ b/code/pages/00_Chat.py
@@ -140,7 +140,7 @@ def display_iframe(filename, link, contextList):
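The session-state fix in patch 16 is a standard Streamlit callback pattern: copy the submitted text into its own session key so the next rerun neither loses nor re-asks the question. Reduced to a self-contained sketch with simplified names:

```python
import streamlit as st

if 'asked_question' not in st.session_state:
    st.session_state.asked_question = ''

def question_asked():
    # Copy the widget value aside; widget state can be reset on the next rerun.
    st.session_state.asked_question = st.session_state.input_question

st.text_input("Ask a question", key='input_question', on_change=question_asked)

if st.session_state.asked_question:
    st.write(f"Answering: {st.session_state.asked_question}")
    st.session_state.asked_question = ''  # consume it so a rerun doesn't re-answer
```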