From a180f7bef10c1907c7ef10d833c5b63d9153def0 Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Mon, 19 Jun 2023 09:11:05 -0400 Subject: [PATCH 1/7] chatgpt can talk to acl --- pyserini/demo/acl-chatgpt.py | 27 ++++++++ pyserini/demo/acl-server.py | 116 +++++++++++++++++++++++++++++++++++ pyserini/demo/skill.py | 48 +++++++++++++++ 3 files changed, 191 insertions(+) create mode 100644 pyserini/demo/acl-chatgpt.py create mode 100644 pyserini/demo/acl-server.py create mode 100644 pyserini/demo/skill.py diff --git a/pyserini/demo/acl-chatgpt.py b/pyserini/demo/acl-chatgpt.py new file mode 100644 index 000000000..e5320c77f --- /dev/null +++ b/pyserini/demo/acl-chatgpt.py @@ -0,0 +1,27 @@ +import semantic_kernel as sk +from semantic_kernel.connectors.ai.open_ai import OpenAITextCompletion, AzureTextCompletion + +from pyserini.demo.skill import PyseriniSkill + +kernel = sk.Kernel() + +api_key, org_id = sk.openai_settings_from_dot_env() +kernel.add_text_completion_service("dv", OpenAITextCompletion("text-davinci-003", api_key, org_id)) + +kernel.import_skill(PyseriniSkill(),"pyserini") + + +sk_prompt = """ +{{pyserini.search $input}} + +Based on the information provided above, please answer the following question: + +{{$input}} +""" + +context = kernel.create_new_context() +context["url"] = "http://127.0.0.1:8080/search" + +acl_function = kernel.create_semantic_function(sk_prompt, max_tokens=200, temperature=0, top_p=0.5) + +print(acl_function("Tell me about the Computational Power of Transformers and Its Implications in Sequence Modeling", context=context)) \ No newline at end of file diff --git a/pyserini/demo/acl-server.py b/pyserini/demo/acl-server.py new file mode 100644 index 000000000..bca8eed6b --- /dev/null +++ b/pyserini/demo/acl-server.py @@ -0,0 +1,116 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script provides an interactive web interface demo for retrieval on the ACL dataset. +It requires `flask` (`pip install flask~=2.2.0`). +An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080. +The demo can be accessed via "http://localhost:8080" in a web browser. +Additional arguments include: + --port [PORT] --hits [Number of hits] + --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda] +""" +import json +import logging +from argparse import ArgumentParser +from functools import partial +from typing import Callable, Optional, Tuple, Union + +from flask import Flask, render_template, request, flash, jsonify +from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder + +logging.basicConfig( + format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logger = logging.getLogger('acl-demo') + +VERSION = '1.0' +Searcher = Union[FaissSearcher, LuceneSearcher] + + +def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]): + app = Flask(__name__) + + lang = 'en' + searcher, retriever = load_searcher_fn(lang) + + @app.route('/search', methods=['POST']) + def search(): + nonlocal lang, searcher, retriever + query = request.json['query'] + if not query: + search_results = [] + else: + hits = searcher.search(query, k=k) + docs = [searcher.doc(hit.docid) for hit in hits] + search_results = [ + { + 'rank': r + 1, + 'docid': hit.docid, + 'doc': docs[r].contents(), + 'score': hit.score, + } + for r, hit in 
enumerate(hits) + ] + return search_results + + + return app + + +def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> (Searcher, str): + searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph') + searcher.set_language(language) + if k1 is not None and b is not None: + searcher.set_bm25(k1, b) + retriever_name = f'BM25 (k1={k1}, b={b})' + else: + retriever_name = 'BM25' + + return searcher, retriever_name + + +def main(): + parser = ArgumentParser() + + parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') + parser.add_argument('--b', type=float, help='BM25 b parameter.') + parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever') + parser.add_argument( + '--device', + type=str, + default='cpu', + help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)', + ) + parser.add_argument( + '--port', + default=8080, + type=int, + help='Web server port', + ) + + args = parser.parse_args() + + load_fn = partial(_load_sparse_searcher, k1=args.k1, b=args.b) + + app = create_app(args.hits, load_fn) + app.run(host='0.0.0.0', port=args.port) + + +if __name__ == '__main__': + main() diff --git a/pyserini/demo/skill.py b/pyserini/demo/skill.py new file mode 100644 index 000000000..df512e527 --- /dev/null +++ b/pyserini/demo/skill.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft. All rights reserved. + +import json + +import aiohttp + +from semantic_kernel.orchestration.sk_context import SKContext +from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter + + +class PyseriniSkill: + """ + A skill that uses Pyserini to search a corpus of documents. 
+ + Usage: + kernel.import_skill(PyseriniSkill(), "pyserini") + + Examples: + + {{pyserini.search $query}} + """ + + @sk_function(description="Searches a corpus of documents using Pyserini using the specified query.", name="search") + @sk_function_context_parameter(name="url", description="The url of the request") + async def search(self, query: str, context: SKContext) -> str: + """ + Searches a corpus of documents using Pyserini using the specified query. + Return the response body as a string. + + params: + query: The query to search for. + context: The SKContext containing the url of the request. + returns: + The response body as a string. + """ + _, url = context.variables.get("url") + if not url: + raise ValueError("url cannot be `None` or empty") + + headers = {"Content-Type": "application/json"} + body = {"query": query} + data = json.dumps(body) + async with aiohttp.ClientSession() as session: + async with session.post( + url, headers=headers, data=data, raise_for_status=True + ) as response: + return await response.text() + From 2ce0747a9da02ed8763e4d352f0e67b55efe8afb Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Mon, 19 Jun 2023 09:51:44 -0400 Subject: [PATCH 2/7] chatbot done? 
--- pyserini/demo/acl-chatgpt.py | 27 ----------- pyserini/demo/aclchatgpt/chatbot.py | 46 +++++++++++++++++++ .../{acl-server.py => aclchatgpt/server.py} | 0 pyserini/demo/{ => aclchatgpt}/skill.py | 10 ++-- 4 files changed, 49 insertions(+), 34 deletions(-) delete mode 100644 pyserini/demo/acl-chatgpt.py create mode 100644 pyserini/demo/aclchatgpt/chatbot.py rename pyserini/demo/{acl-server.py => aclchatgpt/server.py} (100%) rename pyserini/demo/{ => aclchatgpt}/skill.py (78%) diff --git a/pyserini/demo/acl-chatgpt.py b/pyserini/demo/acl-chatgpt.py deleted file mode 100644 index e5320c77f..000000000 --- a/pyserini/demo/acl-chatgpt.py +++ /dev/null @@ -1,27 +0,0 @@ -import semantic_kernel as sk -from semantic_kernel.connectors.ai.open_ai import OpenAITextCompletion, AzureTextCompletion - -from pyserini.demo.skill import PyseriniSkill - -kernel = sk.Kernel() - -api_key, org_id = sk.openai_settings_from_dot_env() -kernel.add_text_completion_service("dv", OpenAITextCompletion("text-davinci-003", api_key, org_id)) - -kernel.import_skill(PyseriniSkill(),"pyserini") - - -sk_prompt = """ -{{pyserini.search $input}} - -Based on the information provided above, please answer the following question: - -{{$input}} -""" - -context = kernel.create_new_context() -context["url"] = "http://127.0.0.1:8080/search" - -acl_function = kernel.create_semantic_function(sk_prompt, max_tokens=200, temperature=0, top_p=0.5) - -print(acl_function("Tell me about the Computational Power of Transformers and Its Implications in Sequence Modeling", context=context)) \ No newline at end of file diff --git a/pyserini/demo/aclchatgpt/chatbot.py b/pyserini/demo/aclchatgpt/chatbot.py new file mode 100644 index 000000000..e5d147558 --- /dev/null +++ b/pyserini/demo/aclchatgpt/chatbot.py @@ -0,0 +1,46 @@ +import semantic_kernel as sk +from semantic_kernel.connectors.ai.open_ai import OpenAITextCompletion + +from pyserini.demo.aclchatgpt.skill import PyseriniSkill + +kernel = sk.Kernel() + +api_key, 
org_id = sk.openai_settings_from_dot_env() +kernel.add_text_completion_service("dv", OpenAITextCompletion("text-davinci-003", api_key, org_id)) + +kernel.import_skill(PyseriniSkill(),"pyserini") + + +sk_prompt = """ +ChatBot can answer questions based on query_results. +It can only answer questions based on query_results. +If query_results is empty, say "I don't know" + +============ +query_results: {{pyserini.search $input}} +============ + +Question: {{$input}} +""" + + + +context = kernel.create_new_context() +context["url"] = "http://127.0.0.1:8080/search" + +acl_chat_function = kernel.create_semantic_function(sk_prompt, max_tokens=200, temperature=0, top_p=0.5) + + +def chat(input_text: str) -> None: + # Save new message in the context variables + + + # Process the user message and get an answer + answer = acl_chat_function(input_text,context=context) + + # Show the response + print(f"ChatBot: {answer}") + + +while True: + chat(input("User: ")) \ No newline at end of file diff --git a/pyserini/demo/acl-server.py b/pyserini/demo/aclchatgpt/server.py similarity index 100% rename from pyserini/demo/acl-server.py rename to pyserini/demo/aclchatgpt/server.py diff --git a/pyserini/demo/skill.py b/pyserini/demo/aclchatgpt/skill.py similarity index 78% rename from pyserini/demo/skill.py rename to pyserini/demo/aclchatgpt/skill.py index df512e527..fa27ed9de 100644 --- a/pyserini/demo/skill.py +++ b/pyserini/demo/aclchatgpt/skill.py @@ -3,6 +3,7 @@ import json import aiohttp +import requests from semantic_kernel.orchestration.sk_context import SKContext from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter @@ -37,12 +38,7 @@ async def search(self, query: str, context: SKContext) -> str: if not url: raise ValueError("url cannot be `None` or empty") - headers = {"Content-Type": "application/json"} body = {"query": query} - data = json.dumps(body) - async with aiohttp.ClientSession() as session: - async with session.post( - url, 
headers=headers, data=data, raise_for_status=True - ) as response: - return await response.text() + result = requests.post(url, json = body).json()[0]["doc"] + return result From 4c296598ad84597b04bd8d868acef4611be370d8 Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Mon, 19 Jun 2023 11:45:39 -0400 Subject: [PATCH 3/7] remove answer --- pyserini/demo/aclchatgpt/chatbot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyserini/demo/aclchatgpt/chatbot.py b/pyserini/demo/aclchatgpt/chatbot.py index e5d147558..222956b2f 100644 --- a/pyserini/demo/aclchatgpt/chatbot.py +++ b/pyserini/demo/aclchatgpt/chatbot.py @@ -20,7 +20,7 @@ query_results: {{pyserini.search $input}} ============ -Question: {{$input}} +{{$input}} """ From 00f397568a96c7123fbebc226178389804dc9366 Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Mon, 19 Jun 2023 11:54:06 -0400 Subject: [PATCH 4/7] documents --- pyserini/demo/aclchatgpt/readme.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 pyserini/demo/aclchatgpt/readme.md diff --git a/pyserini/demo/aclchatgpt/readme.md b/pyserini/demo/aclchatgpt/readme.md new file mode 100644 index 000000000..64b49b409 --- /dev/null +++ b/pyserini/demo/aclchatgpt/readme.md @@ -0,0 +1,19 @@ +# Talking to ACL Anthology with GPT + +By default, GPT-3 text-davinci-003 is used. You can change this in `chatbot.py`. + +`chatbot.py` is the front-end script. As a CLI, the user can input a query and the chatbot will return a response. + +`server.py` is the back-end script. It is a Flask server that will return a response to a query sent to the server. + +You must have both running for `chatbot.py` to work. + +## Setup + +1. Follow the instructions in [Indexing the ACL Anthology with Anserini](https://github.com/castorini/pyserini/blob/master/docs/working-with-acl-anthology.md) to setup the project and generate a `lucene-index-acl-paragraph` index. +2. 
Copy the generated `lucene-index-acl-paragraph` index from the `acl-anthology` folder to `pyserini/indexes` +3. You will need Semantic Kernel as well. + - `pip3 install --upgrade semantic-kernel` +4. Start the server with `python -m pyserini.demo.aclchatgpt.server` +5. Start the chatbot with `python -m pyserini.demo.aclchatgpt.chatbot` +6. Start chatting! \ No newline at end of file From 2fd2a61cfd8dcc2814b6bd29f081ae7f42f8c17c Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Tue, 20 Jun 2023 21:26:39 -0400 Subject: [PATCH 5/7] added conversation capability --- pyserini/demo/aclchatgpt/chatbot.py | 47 +++++++++++++++++++++++------ pyserini/demo/aclchatgpt/readme.md | 8 +++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/pyserini/demo/aclchatgpt/chatbot.py b/pyserini/demo/aclchatgpt/chatbot.py index 222956b2f..ec3d4cb53 100644 --- a/pyserini/demo/aclchatgpt/chatbot.py +++ b/pyserini/demo/aclchatgpt/chatbot.py @@ -11,36 +11,63 @@ kernel.import_skill(PyseriniSkill(),"pyserini") -sk_prompt = """ -ChatBot can answer questions based on query_results. -It can only answer questions based on query_results. -If query_results is empty, say "I don't know" -============ + +acl_chat_prompt = """ +Given the `query_results` below, your task is to formulate an answer using only the information +provided in these results. You should not draw from other sources or attempt to provide information that is not +contained within the `query_results`. If the `query_results` are empty, simply state "I'm sorry, but I do not have +enough information to provide an answer." + +=================== query_results: {{pyserini.search $input}} -============ +=================== -{{$input}} +Based on the above `query_results`, what is your response to {{$input}}? """ +absolute_question_prompt = """ +Task: You are an AI language model tasked with transforming given questions into +absolute questions. 
An absolute question is a question that can stand on its own and carries all the context needed +to be answered. Here's an example: + +User: Who is Alan Turing? +ChatBot: Who is Alan Turing? +User: How old is he? +ChatBot: How old is Alan Turing? + +=================== +History: {{$history}} +=================== + +Using the history as context, transform the following question into an absolute question: {{$input}} +""" context = kernel.create_new_context() context["url"] = "http://127.0.0.1:8080/search" +context["history"] = "" -acl_chat_function = kernel.create_semantic_function(sk_prompt, max_tokens=200, temperature=0, top_p=0.5) +acl_chat_function = kernel.create_semantic_function(acl_chat_prompt, max_tokens=200, temperature=0, top_p=0.5) +absolute_question_function = kernel.create_semantic_function(absolute_question_prompt, max_tokens=200, temperature=0, top_p=0.5) def chat(input_text: str) -> None: - # Save new message in the context variables + print("---------------------------------------------") + absolute_question = absolute_question_function(input_text,context=context) + + print (f"Absolute Question: {absolute_question}") # Process the user message and get an answer - answer = acl_chat_function(input_text,context=context) + answer = acl_chat_function(str(absolute_question),context=context) # Show the response print(f"ChatBot: {answer}") + context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" + while True: + print("=============================================") chat(input("User: ")) \ No newline at end of file diff --git a/pyserini/demo/aclchatgpt/readme.md b/pyserini/demo/aclchatgpt/readme.md index 64b49b409..df9c7cbb4 100644 --- a/pyserini/demo/aclchatgpt/readme.md +++ b/pyserini/demo/aclchatgpt/readme.md @@ -8,6 +8,14 @@ By default, GPT-3 text-davinci-003 is used. You can change this in `chatbot.py`. You must have both running for `chatbot.py` to work. 
+## Environment Varaibles +You will need to create an `.env` file in directory that you are running in. +The `.env` file should contain the following: +``` +OPENAI_API_KEY="" +OPENAI_ORG_ID="" +``` + ## Setup 1. Follow the instructions in [Indexing the ACL Anthology with Anserini](https://github.com/castorini/pyserini/blob/master/docs/working-with-acl-anthology.md) to setup the project and generate a `lucene-index-acl-paragraph` index. From c18e506129f0e01bb034d7c4e8c24977ff8f4a16 Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Wed, 21 Jun 2023 18:50:01 -0400 Subject: [PATCH 6/7] pyserini skill --- pyserini/demo/aclchatgpt/chatbot.py | 120 +++++++++++++++++----------- pyserini/demo/aclchatgpt/server.py | 116 --------------------------- pyserini/demo/aclchatgpt/skill.py | 38 ++++++++- 3 files changed, 110 insertions(+), 164 deletions(-) delete mode 100644 pyserini/demo/aclchatgpt/server.py diff --git a/pyserini/demo/aclchatgpt/chatbot.py b/pyserini/demo/aclchatgpt/chatbot.py index ec3d4cb53..6e80cae66 100644 --- a/pyserini/demo/aclchatgpt/chatbot.py +++ b/pyserini/demo/aclchatgpt/chatbot.py @@ -1,73 +1,103 @@ +from dataclasses import dataclass +from argparse import ArgumentParser +from typing import Optional + import semantic_kernel as sk from semantic_kernel.connectors.ai.open_ai import OpenAITextCompletion -from pyserini.demo.aclchatgpt.skill import PyseriniSkill +from pyserini.demo.aclchatgpt.skill import PyseriniSkill, PyseriniConfig + + +@dataclass +class OpenAIConfig: + api_key: str + org_id: str -kernel = sk.Kernel() +class ChatBot: -api_key, org_id = sk.openai_settings_from_dot_env() -kernel.add_text_completion_service("dv", OpenAITextCompletion("text-davinci-003", api_key, org_id)) + acl_chat_prompt = """ + Given the `query_results` below, your task is to formulate an answer using only the information + provided in these results. You should not draw from other sources or attempt to provide information that is not + contained within the `query_results`. 
If the `query_results` are empty, simply state "I'm sorry, but I do not have + enough information to provide an answer." + + =================== + query_results: {{pyserini.search $input}} + =================== + + Based on the above `query_results`, what is your response to {{$input}}? + """ -kernel.import_skill(PyseriniSkill(),"pyserini") + absolute_question_prompt = """ + Task: You are an AI language model tasked with transforming given questions into + absolute questions. An absolute question is a question that can stand on its own and carries all the context needed + to be answered. Here's an example: + + User: Who is Alan Turing? + ChatBot: Who is Alan Turing? + User: How old is he? + ChatBot: How old is Alan Turing? + + =================== + History: {{$history}} + =================== + + Using the history as context, transform the following question into an absolute question: {{$input}} + """ + def __init__(self, pyserini_config: PyseriniConfig, openai_config: OpenAIConfig): + self.kernel = sk.Kernel() + self.kernel.add_text_completion_service("dv", OpenAITextCompletion("text-davinci-003", openai_config.api_key, openai_config.org_id)) + self.kernel.import_skill(PyseriniSkill(pyserini_config),"pyserini") + self.context = self.kernel.create_new_context() + self.context["url"] = "http://127.0.0.1:8080/search" + self.context["history"] = "" + self.acl_chat_function = self.kernel.create_semantic_function(self.acl_chat_prompt, max_tokens=200, temperature=0, top_p=0.5) + self.absolute_question_function = self.kernel.create_semantic_function(self.absolute_question_prompt, max_tokens=200, temperature=0, top_p=0.5) -acl_chat_prompt = """ -Given the `query_results` below, your task is to formulate an answer using only the information -provided in these results. You should not draw from other sources or attempt to provide information that is not -contained within the `query_results`. 
If the `query_results` are empty, simply state "I'm sorry, but I do not have -enough information to provide an answer." -=================== -query_results: {{pyserini.search $input}} -=================== + def _chat(self,input_text: str) -> None: -Based on the above `query_results`, what is your response to {{$input}}? -""" + print("---------------------------------------------") + absolute_question = self.absolute_question_function(input_text,context=self.context) -absolute_question_prompt = """ -Task: You are an AI language model tasked with transforming given questions into -absolute questions. An absolute question is a question that can stand on its own and carries all the context needed -to be answered. Here's an example: + print (f"Absolute Question: {absolute_question}") -User: Who is Alan Turing? -ChatBot: Who is Alan Turing? -User: How old is he? -ChatBot: How old is Alan Turing? + # Process the user message and get an answer + answer = self.acl_chat_function(str(absolute_question),context=self.context) -=================== -History: {{$history}} -=================== + # Show the response + print(f"ChatBot: {answer}") -Using the history as context, transform the following question into an absolute question: {{$input}} -""" + self.context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" + def chat(self) -> None: -context = kernel.create_new_context() -context["url"] = "http://127.0.0.1:8080/search" -context["history"] = "" + while True: + print("=============================================") + self._chat(input("User: ")) -acl_chat_function = kernel.create_semantic_function(acl_chat_prompt, max_tokens=200, temperature=0, top_p=0.5) -absolute_question_function = kernel.create_semantic_function(absolute_question_prompt, max_tokens=200, temperature=0, top_p=0.5) +def main(): -def chat(input_text: str) -> None: + parser = ArgumentParser() - print("---------------------------------------------") - absolute_question = 
absolute_question_function(input_text,context=context) + parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') + parser.add_argument('--b', type=float, help='BM25 b parameter.') + parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever') - print (f"Absolute Question: {absolute_question}") + args = parser.parse_args() + api_key, org_id = sk.openai_settings_from_dot_env() + open_ai_config = OpenAIConfig(api_key,org_id) + pyserini_config = PyseriniConfig(args.k1, args.b, args.hits) - # Process the user message and get an answer - answer = acl_chat_function(str(absolute_question),context=context) + chatbot = ChatBot(pyserini_config=pyserini_config,openai_config=open_ai_config) + chatbot.chat() - # Show the response - print(f"ChatBot: {answer}") - context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" -while True: - print("=============================================") - chat(input("User: ")) \ No newline at end of file +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/pyserini/demo/aclchatgpt/server.py b/pyserini/demo/aclchatgpt/server.py deleted file mode 100644 index bca8eed6b..000000000 --- a/pyserini/demo/aclchatgpt/server.py +++ /dev/null @@ -1,116 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -""" -This script provides an interactive web interface demo for retrieval on the ACL dataset. -It requires `flask` (`pip install flask~=2.2.0`). -An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080. -The demo can be accessed via "http://localhost:8080" in a web browser. -Additional arguments include: - --port [PORT] --hits [Number of hits] - --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda] -""" -import json -import logging -from argparse import ArgumentParser -from functools import partial -from typing import Callable, Optional, Tuple, Union - -from flask import Flask, render_template, request, flash, jsonify -from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder - -logging.basicConfig( - format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logger = logging.getLogger('acl-demo') - -VERSION = '1.0' -Searcher = Union[FaissSearcher, LuceneSearcher] - - -def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]): - app = Flask(__name__) - - lang = 'en' - searcher, retriever = load_searcher_fn(lang) - - @app.route('/search', methods=['POST']) - def search(): - nonlocal lang, searcher, retriever - query = request.json['query'] - if not query: - search_results = [] - else: - hits = searcher.search(query, k=k) - docs = [searcher.doc(hit.docid) for hit in hits] - search_results = [ - { - 'rank': r + 1, - 'docid': hit.docid, - 'doc': docs[r].contents(), - 'score': hit.score, - } - for r, hit in enumerate(hits) - ] - return search_results - - - return app - - -def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> (Searcher, str): - searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph') - searcher.set_language(language) - if k1 is not None and b is not None: - searcher.set_bm25(k1, b) - retriever_name = f'BM25 (k1={k1}, b={b})' - else: - retriever_name = 'BM25' - - 
return searcher, retriever_name - - -def main(): - parser = ArgumentParser() - - parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') - parser.add_argument('--b', type=float, help='BM25 b parameter.') - parser.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever') - parser.add_argument( - '--device', - type=str, - default='cpu', - help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)', - ) - parser.add_argument( - '--port', - default=8080, - type=int, - help='Web server port', - ) - - args = parser.parse_args() - - load_fn = partial(_load_sparse_searcher, k1=args.k1, b=args.b) - - app = create_app(args.hits, load_fn) - app.run(host='0.0.0.0', port=args.port) - - -if __name__ == '__main__': - main() diff --git a/pyserini/demo/aclchatgpt/skill.py b/pyserini/demo/aclchatgpt/skill.py index fa27ed9de..52300a86a 100644 --- a/pyserini/demo/aclchatgpt/skill.py +++ b/pyserini/demo/aclchatgpt/skill.py @@ -1,13 +1,22 @@ # Copyright (c) Microsoft. All rights reserved. 
import json +import logging +from dataclasses import dataclass import aiohttp import requests from semantic_kernel.orchestration.sk_context import SKContext +from typing import Callable, Optional, Tuple, Union from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter +from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder +@dataclass +class PyseriniConfig: + k1: Optional[float]=None + b: Optional[float]=None + hits: Optional[int]=1 class PyseriniSkill: """ @@ -21,6 +30,18 @@ class PyseriniSkill: {{pyserini.search $query}} """ + + def __init__(self,pyserini_config:PyseriniConfig): + self.lang = 'en' + self.searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph') + self.searcher.set_language(self.lang) + if pyserini_config.k1 is not None and pyserini_config.b is not None: + self.searcher.set_bm25(pyserini_config.k1, pyserini_config.b) + self.retriever_name = f'BM25 (k1={pyserini_config.k1}, b={pyserini_config.b})' + else: + self.retriever_name = 'BM25' + self.hits = pyserini_config.hits + @sk_function(description="Searches a corpus of documents using Pyserini using the specified query.", name="search") @sk_function_context_parameter(name="url", description="The url of the request") async def search(self, query: str, context: SKContext) -> str: @@ -38,7 +59,18 @@ async def search(self, query: str, context: SKContext) -> str: if not url: raise ValueError("url cannot be `None` or empty") - body = {"query": query} - result = requests.post(url, json = body).json()[0]["doc"] - return result + if not query: + search_results = [] + else: + hits = self.searcher.search(query, k=self.hits) + docs = [self.searcher.doc(hit.docid) for hit in hits] + search_results = [ + { + 'rank': r + 1, + 'docid': hit.docid, + 'doc': docs[r].contents(), + } + for r, hit in enumerate(hits) + ] + return search_results[0]["doc"] From 39070d8095f65c22674b531dc125b2dd88ce20fb Mon Sep 17 00:00:00 2001 From: Bill Cui Date: Wed, 21 Jun 2023 
22:36:19 -0400 Subject: [PATCH 7/7] made chatbot smarter --- pyserini/demo/aclchatgpt/chatbot.py | 15 ++++++++------- pyserini/demo/aclchatgpt/readme.md | 11 ++--------- pyserini/demo/aclchatgpt/skill.py | 3 +-- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pyserini/demo/aclchatgpt/chatbot.py b/pyserini/demo/aclchatgpt/chatbot.py index 6e80cae66..770e59172 100644 --- a/pyserini/demo/aclchatgpt/chatbot.py +++ b/pyserini/demo/aclchatgpt/chatbot.py @@ -15,17 +15,16 @@ class OpenAIConfig: class ChatBot: - acl_chat_prompt = """ - Given the `query_results` below, your task is to formulate an answer using only the information - provided in these results. You should not draw from other sources or attempt to provide information that is not - contained within the `query_results`. If the `query_results` are empty, simply state "I'm sorry, but I do not have - enough information to provide an answer." + acl_chat_prompt = """Given the query_results below, your task is to formulate an answer. You may choose to use or not to use the + information in the query_results. + If you use the query_results, you must reference the docid of the document used by appending to the answer with "(docid: doc-id-here)". + If you do not use the query_results, do not reference it in your answer. =================== query_results: {{pyserini.search $input}} =================== - Based on the above `query_results`, what is your response to {{$input}}? + What is your response to {{$input}}? """ absolute_question_prompt = """ @@ -74,6 +73,7 @@ def _chat(self,input_text: str) -> None: self.context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" def chat(self) -> None: + print("Hi, I'm the ACL ChatBot. 
Ask me a question about ACL Anthology papers and I'll do my best to answer it.") while True: print("=============================================") @@ -81,7 +81,6 @@ def chat(self) -> None: def main(): - parser = ArgumentParser() parser.add_argument('--k1', type=float, help='BM25 k1 parameter.') @@ -93,6 +92,8 @@ def main(): open_ai_config = OpenAIConfig(api_key,org_id) pyserini_config = PyseriniConfig(args.k1, args.b, args.hits) + print("Starting ChatBot...") + chatbot = ChatBot(pyserini_config=pyserini_config,openai_config=open_ai_config) chatbot.chat() diff --git a/pyserini/demo/aclchatgpt/readme.md b/pyserini/demo/aclchatgpt/readme.md index df9c7cbb4..0f3a5809a 100644 --- a/pyserini/demo/aclchatgpt/readme.md +++ b/pyserini/demo/aclchatgpt/readme.md @@ -2,12 +2,6 @@ By default, GPT-3 text-davinci-003 is used. You can change this in `chatbot.py`. -`chatbot.py` is the front-end script. As a CLI, the user can input a query and the chatbot will return a response. - -`server.py` is the back-end script. It is a Flask server that will return a response to a query sent to the server. - -You must have both running for `chatbot.py` to work. - ## Environment Varaibles You will need to create an `.env` file in directory that you are running in. The `.env` file should contain the following: @@ -22,6 +16,5 @@ OPENAI_ORG_ID="" 2. Copy the generated `lucene-index-acl-paragraph` index from the `acl-anthology` folder to `pyserini/indexes` 3. You will need Semantic Kernel as well. - `pip3 install --upgrade semantic-kernel` -4. Start the server with `python -m pyserini.demo.aclchatgpt.server` -5. Start the chatbot with `python -m pyserini.demo.aclchatgpt.chatbot` -6. Start chatting! \ No newline at end of file +4. Start the chatbot with `python -m pyserini.demo.aclchatgpt.chatbot` +5. Start chatting! 
\ No newline at end of file diff --git a/pyserini/demo/aclchatgpt/skill.py b/pyserini/demo/aclchatgpt/skill.py index 52300a86a..3209d0373 100644 --- a/pyserini/demo/aclchatgpt/skill.py +++ b/pyserini/demo/aclchatgpt/skill.py @@ -30,7 +30,6 @@ class PyseriniSkill: {{pyserini.search $query}} """ - def __init__(self,pyserini_config:PyseriniConfig): self.lang = 'en' self.searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph') @@ -72,5 +71,5 @@ async def search(self, query: str, context: SKContext) -> str: } for r, hit in enumerate(hits) ] - return search_results[0]["doc"] + return "docid:" + search_results[0]["docid"] + ",doc:" + search_results[0]["doc"]