From 7c17d1706896cbddb5ce6b49a44db6c622ca2733 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Nov 2023 01:03:30 -0400 Subject: [PATCH 1/4] initial working version of better index chat --- cogs/commands.py | 16 +-- cogs/index_service_cog.py | 119 ++++++++++++++++------- gpt3discord.py | 2 +- models/index_model.py | 199 +++++++++++++++++++++++++++++--------- 4 files changed, 238 insertions(+), 98 deletions(-) diff --git a/cogs/commands.py b/cogs/commands.py index 47456808..3e9651c0 100644 --- a/cogs/commands.py +++ b/cogs/commands.py @@ -739,18 +739,6 @@ async def load_index( guild_ids=ALLOWED_GUILDS, ) @discord.guild_only() - @discord.option( - name="user_index", - description="Which user file to load the index from", - required=False, - autocomplete=File_autocompleter.get_user_indexes, - ) - @discord.option( - name="search_index", - description="Which search index file to load the index from", - required=False, - autocomplete=File_autocompleter.get_user_search_indexes, - ) @discord.option( name="model", description="The model to use for the conversation", @@ -761,12 +749,10 @@ async def load_index( async def talk( self, ctx: discord.ApplicationContext, - user_index: str, - search_index: str, model: str, ): await ctx.defer() - await self.index_cog.index_chat_command(ctx, user_index, search_index, model) + await self.index_cog.index_chat_command(ctx, model) @add_to_group("index") @discord.slash_command( diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index c4a1d382..a86f83cf 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -1,6 +1,7 @@ import datetime import traceback +import aiofiles import discord import os @@ -23,10 +24,10 @@ class IndexService(discord.Cog, name="IndexService"): """Cog containing gpt-index commands""" def __init__( - self, - bot, - usage_service, - deletion_queue, + self, + bot, + usage_service, + deletion_queue, ): super().__init__() self.bot = bot @@ -79,19 +80,63 @@ async def on_message(self, message): except: pass + # Handle file uploads + file = message.attachments[0] if len(message.attachments) > 0 else None + + # File operations, allow for user file upload + if file: + # We will attempt to upload the file to the execution environment + thinking_embed = discord.Embed( + title=f"🤖💬 Indexing file and saving to agent knowledge", + color=0x808080, + ) + + thinking_embed.set_footer(text="This may take a few seconds.") + try: + thinking_message = await message.reply(embed=thinking_embed) + except: + traceback.print_exc() + pass + + indexing_result, summary = await self.index_handler.index_chat_file(message, file) + + try: + await thinking_message.delete() + except: + pass + + if not indexing_result: + failure_embed = discord.Embed( + title="Indexing Error", + description=f"Your file could not be indexed", + color=discord.Color.red(), + ) + failure_embed.set_thumbnail(url="https://i.imgur.com/hbdBZfG.png") + await message.reply(failure_embed) + self.thread_awaiting_responses.remove(message.channel.id) + return + + prompt += "\n{System Message: the user has just uploaded the file " + str(file.filename) + "}" + + success_embed = discord.Embed( + title="Document Interpreted", + description=f"The file you've uploaded has successfully been interpreted. 
The summary is below:\n`{summary}`", + color=discord.Color.green(), + ) + # thumbnail of https://i.imgur.com/I5dIdg6.png + success_embed.set_thumbnail(url="https://i.imgur.com/I5dIdg6.png") + await message.reply(embed=success_embed) + chat_result = await self.index_handler.execute_index_chat_message( message, prompt ) + if chat_result: - await message.channel.send(chat_result) + await message.reply(chat_result) self.thread_awaiting_responses.remove(message.channel.id) - async def index_chat_command(self, ctx, user_index, search_index, model): - if not user_index and not search_index: - await ctx.respond("Please provide a valid user index or search index") - return - - await self.index_handler.start_index_chat(ctx, search_index, user_index, model) + async def index_chat_command(self, ctx, model): + await self.index_handler.start_index_chat(ctx, model) pass @@ -109,9 +154,9 @@ async def rename_user_index_command(self, ctx, user_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.user.id}/{user_index}", - f"indexes/{ctx.user.id}/{new_name}", + ctx, + f"indexes/{ctx.user.id}/{user_index}", + f"indexes/{ctx.user.id}/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -141,9 +186,9 @@ async def rename_server_index_command(self, ctx, server_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.guild.id}/{server_index}", - f"indexes/{ctx.guild.id}/{new_name}", + ctx, + f"indexes/{ctx.guild.id}/{server_index}", + f"indexes/{ctx.guild.id}/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -171,9 +216,9 @@ async def rename_search_index_command(self, ctx, search_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.user.id}_search/{search_index}", - f"indexes/{ctx.user.id}_search/{new_name}", + ctx, + f"indexes/{ctx.user.id}_search/{search_index}", + f"indexes/{ctx.user.id}_search/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -190,7 +235,7 @@ async def rename_search_index_command(self, ctx, search_index, new_name): ) async def set_index_link_recurse_command( - self, ctx, link: str = None, depth: int = 1 + self, ctx, link: str = None, depth: int = 1 ): await ctx.defer() """Command handler to set a file as your personal index""" @@ -211,7 +256,7 @@ async def set_index_link_recurse_command( ) async def set_index_command( - self, ctx, file: discord.Attachment = None, link: str = None + self, ctx, file: discord.Attachment = None, link: str = None ): await ctx.defer() """Command handler to set a file as your personal index""" @@ -243,7 +288,7 @@ async def set_index_command( ) async def set_discord_command( - self, ctx, channel: discord.TextChannel = None, message_limit: int = 2500 + self, ctx, channel: discord.TextChannel = None, message_limit: int = 2500 ): """Command handler to set a channel as your personal index""" await ctx.defer() @@ -294,12 +339,12 @@ async def load_index_command(self, ctx, user_index, server_index, search_index): return if ( - user_index - and server_index - or user_index - and search_index - or server_index - and search_index + user_index + and server_index + or user_index + and search_index + or server_index + and search_index ): await ctx.respond( "Please only try to load one type of index. Either a user index, a server index or a search index." 
@@ -328,14 +373,14 @@ async def load_index_command(self, ctx, user_index, server_index, search_index): await self.index_handler.load_index(ctx, index, server, search, user_api_key) async def query_command( - self, - ctx, - query, - nodes, - response_mode, - child_branch_factor, - model, - multistep, + self, + ctx, + query, + nodes, + response_mode, + child_branch_factor, + model, + multistep, ): """Command handler to query your index""" diff --git a/gpt3discord.py b/gpt3discord.py index 6150d263..f64cb7e5 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -94,7 +94,7 @@ # Settings for the bot # activity = discord.Activity( - type=discord.ActivityType.watching, name="for /help /gpt, and more!" + type=discord.ActivityType.custom, name="Undergoing Maintenance" ) bot = discord.Bot(intents=discord.Intents.all(), command_prefix="!", activity=activity) usage_service = UsageService(Path(os.environ.get("DATA_DIR", os.getcwd()))) diff --git a/models/index_model.py b/models/index_model.py index cbeb111b..bb450605 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -25,6 +25,7 @@ from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory from langchain.prompts import MessagesPlaceholder from langchain.schema import SystemMessage +from langchain.tools import Tool from llama_index.callbacks import CallbackManager, TokenCountingHandler from llama_index.node_parser import SimpleNodeParser from llama_index.schema import NodeRelationship @@ -32,7 +33,7 @@ from llama_index.langchain_helpers.agents import ( IndexToolConfig, LlamaToolkit, - create_llama_chat_agent, + create_llama_chat_agent, LlamaIndexTool, ) from llama_index.prompts.chat_prompts import CHAT_REFINE_PROMPT @@ -60,6 +61,7 @@ StorageContext, load_index_from_storage, get_response_synthesizer, + VectorStoreIndex, ) from llama_index.schema import TextNode @@ -67,6 +69,7 @@ from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR from llama_index.composability import ComposableGraph +from llama_index.vector_stores import DocArrayInMemoryVectorStore from models.embed_statics_model import EmbedStatics from models.openai_model import Models @@ -146,6 +149,15 @@ def get_and_query( return response +class IndexChatData: + def __init__(self, llm, agent_chain, memory, thread_id, tools): + self.llm = llm + self.agent_chain = agent_chain + self.memory = memory + self.thread_id = thread_id + self.tools = tools + + class IndexData: def __init__(self): self.queryable_index = None @@ -261,6 +273,7 @@ def __init__(self, bot, usage_service): ) self.EMBED_CUTOFF = 2000 self.index_chat_chains = {} + self.chat_indexes = defaultdict() async def rename_index(self, ctx, original_path, rename_path): """Command handler to rename a user index""" @@ -278,7 +291,7 @@ async def rename_index(self, ctx, original_path, rename_path): return False async def get_is_in_index_chat(self, ctx): - return ctx.channel.id in self.index_chat_chains + return ctx.channel.id in self.index_chat_chains.keys() async def execute_index_chat_message(self, ctx, message): if ctx.channel.id not in self.index_chat_chains: @@ -295,50 +308,141 @@ async def execute_index_chat_message(self, ctx, message): return "Ended chat session." 
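+        # agent_chain.run is a blocking LangChain call, so it is dispatched to the
+        # default thread-pool executor to keep the Discord event loop responsive.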
agent_output = await self.loop.run_in_executor( - None, partial(self.index_chat_chains[ctx.channel.id].run, message) + None, + partial(self.index_chat_chains[ctx.channel.id].agent_chain.run, message), ) return agent_output - async def start_index_chat(self, ctx, search, user, model): - if search: - index_file = EnvService.find_shared_file( - f"indexes/{ctx.user.id}_search/{search}" - ) - elif user: - index_file = EnvService.find_shared_file(f"indexes/{ctx.user.id}/{user}") + async def index_chat_file(self, message: discord.Message, file: discord.Attachment): + type_to_suffix_mappings = { + "text/plain": ".txt", + "text/csv": ".csv", + "application/pdf": ".pdf", + "application/json": ".json", + "image/png": ".png", + "image/": ".jpg", + "ms-powerpoint": ".ppt", + "presentationml.presentation": ".pptx", + "ms-excel": ".xls", + "spreadsheetml.sheet": ".xlsx", + "msword": ".doc", + "wordprocessingml.document": ".docx", + "audio/": ".mp3", + "video/": ".mp4", + "epub": ".epub", + "markdown": ".md", + "html": ".html", + } - assert index_file is not None + # For when content type doesnt get picked up by discord. + secondary_mappings = { + ".epub": ".epub", + } - preparation_message = await ctx.channel.send( - embed=EmbedStatics.get_index_chat_preparation_message() - ) + # First, initially set the suffix to the suffix of the attachment + suffix = None + if file.content_type: + # Apply the suffix mappings to the file + for key, value in type_to_suffix_mappings.items(): + if key in file.content_type: + suffix = value + break - index = await self.loop.run_in_executor( - None, partial(self.index_load_file, index_file) - ) + if not suffix: + await message.reply("This file type is not supported.") + return False, None - summary_response = await self.loop.run_in_executor( - None, - partial( - index.as_query_engine().query, - "What is a summary of this document? This summary will be used to identify the document in the future. Make the summary verbose and broad.", - ), - ) + else: + for key, value in secondary_mappings.items(): + if key in file.filename: + suffix = value + break + if not suffix: + await message.reply( + "Could not determine the file type of the attachment." + ) + return False, None - query_engine = index.as_query_engine(similarity_top_k=4) + async with aiofiles.tempfile.TemporaryDirectory() as temp_path: + async with aiofiles.tempfile.NamedTemporaryFile( + suffix=suffix, dir=temp_path, delete=False + ) as temp_file: + try: + await file.save(temp_file.name) - tool_config = IndexToolConfig( - query_engine=query_engine, - name=f"Data-Answering-Tool", - description=f"A tool for when you want to answer queries about the external data you're connected to. The " - f"input to the tool is a query string that is similar to the type of data that you want to " - f"get back from the storage, in semantic search fashion. The summary of the data you're " - f"connected to is: {summary_response}.", - tool_kwargs={"return_direct": True}, - ) - toolkit = LlamaToolkit( - index_configs=[tool_config], + filename = file.filename + + # Assert that the filename is < 100 characters, if it is greater, truncate to the first 100 characters and keep the original ending + if len(filename) > 100: + filename = filename[:100] + filename[-4:] + + index: VectorStoreIndex = await self.loop.run_in_executor( + None, + partial( + self.index_file, + Path(temp_file.name), + service_context, + ), + ) + + summary = index.as_query_engine().query( + f"What is a summary or general idea of this document? 
Be detailed in your summary but not too verbose. Your summary should be under a hundred words. This summary will be used in a vector index to retrieve information about certain data. So, at a high level, the summary should describe the document in such a way that a retriever would know to select it when asked questions about it. The filename was {filename}. Include the file name in the summary." + ) + + engine = index.as_query_engine() + + # Get rid of all special characters in the filename + filename = "".join( + [c for c in filename if c.isalpha() or c.isdigit()] + ).rstrip() + + tool_config = IndexToolConfig( + query_engine=engine, + name=f"{filename}-index", + description=f"Use this tool if the query seems related to this summary: {summary}", + tool_kwargs={"return_direct": False,}, + max_iterations=5, + ) + + tool = LlamaIndexTool.from_tool_config(tool_config) + + agent_kwargs = { + "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")], + "system_message": SystemMessage( + content="You are a superpowered version of GPT that is able to answer questions about the data you're " + "connected to. Each different tool you have represents a different dataset to interact with." + ), + } + + tools = self.index_chat_chains[message.channel.id].tools + tools.append(tool) + + agent_chain = initialize_agent( + tools=tools, + llm=self.index_chat_chains[message.channel.id].llm, + agent=AgentType.OPENAI_FUNCTIONS, + verbose=True, + agent_kwargs=agent_kwargs, + memory=self.index_chat_chains[message.channel.id].memory, + handle_parsing_errors="Check your output and make sure it conforms!", + ) + + index_chat_data = IndexChatData(self.index_chat_chains[message.channel.id].llm, agent_chain, self.index_chat_chains[message.channel.id].memory, message.channel.id, tools) + + self.index_chat_chains[message.channel.id] = index_chat_data + + return True, summary + except Exception as e: + await message.reply("There was an error indexing your file: "+str(e)) + traceback.print_exc() + return False, None + + + async def start_index_chat(self, ctx, model): + preparation_message = await ctx.channel.send( + embed=EmbedStatics.get_index_chat_preparation_message() ) + llm = ChatOpenAI(model=model, temperature=0) memory = ConversationSummaryBufferMemory( @@ -352,12 +456,20 @@ async def start_index_chat(self, ctx, search, user, model): "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")], "system_message": SystemMessage( content="You are a superpowered version of GPT that is able to answer questions about the data you're " - "connected to." + "connected to. Each different tool you have represents a different dataset to interact with. If you are asked to perform a task that spreads across multiple datasets, use multiple tools for the same prompt." 
), } + tools = [ + Tool( + name="Dummy-Tool-Do-Not-Use", + func=None, + description=f"This is a dummy tool that does nothing, do not ever mention this tool or use this tool.", + ) + ] + agent_chain = initialize_agent( - tools=toolkit.get_tools(), + tools=tools, llm=llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True, @@ -367,15 +479,10 @@ async def start_index_chat(self, ctx, search, user, model): ) embed_title = f"{ctx.user.name}'s data-connected conversation with GPT" - # Get only the last part after the last / of the index_file - try: - index_file_name = str(index_file).split("/")[-1] - except: - index_file_name = index_file message_embed = discord.Embed( title=embed_title, - description=f"The agent is connected to the data index named {index_file_name}\nModel: {model}", + description=f"The agent is able to interact with your documents. Simply drag your documents into discord or give the agent a link from where to download the documents.\nModel: {model}", color=0x00995B, ) message_embed.set_thumbnail(url="https://i.imgur.com/7V6apMT.png") @@ -394,7 +501,9 @@ async def start_index_chat(self, ctx, search, user, model): except: pass - self.index_chat_chains[thread.id] = agent_chain + index_chat_data = IndexChatData(llm, agent_chain, memory, thread.id, tools) + + self.index_chat_chains[thread.id] = index_chat_data async def paginate_embed(self, response_text): """Given a response text make embed pages and return a list of the pages.""" From 89f274793489fbcaaf3949c14e3537c3355080b0 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Nov 2023 04:25:58 -0400 Subject: [PATCH 2/4] Working revamped index chat --- cogs/index_service_cog.py | 166 +++++++++------ gpt3discord.py | 4 +- models/index_model.py | 415 ++++++++++++++++++++++++-------------- 3 files changed, 367 insertions(+), 218 deletions(-) diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index a86f83cf..07563da2 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -24,10 +24,10 @@ class IndexService(discord.Cog, name="IndexService"): """Cog containing gpt-index commands""" def __init__( - self, - bot, - usage_service, - deletion_queue, + self, + bot, + usage_service, + deletion_queue, ): super().__init__() self.bot = bot @@ -35,6 +35,59 @@ def __init__( self.thread_awaiting_responses = [] self.deletion_queue = deletion_queue + async def process_indexing(self, message, index_type, content=None, link=None): + """ + Helper method to process indexing for both files and links. 
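+        Sends progress/result embeds to the channel and returns True on success,
+        False if the content could not be indexed.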
+ - index_type: 'file' or 'link' + - content: The file content if index_type is 'file' + - link: The link if index_type is 'link' + """ + thinking_embed = discord.Embed( + title=f"🤖💬 Indexing {index_type} and saving to agent knowledge", + color=0x808080, + ) + thinking_embed.set_footer(text="This may take a few seconds.") + + try: + thinking_message = await message.reply(embed=thinking_embed) + except: + traceback.print_exc() + + if index_type == "file": + indexing_result, summary = await self.index_handler.index_chat_file( + message, content + ) + else: + indexing_result, summary = await self.index_handler.index_link( + link, summarize=True, index_chat_ctx=message + ) + print("The summary is " + str(summary)) + + try: + await thinking_message.delete() + except: + pass + + if not indexing_result: + failure_embed = discord.Embed( + title="Indexing Error", + description=f"Your {index_type} could not be indexed", + color=discord.Color.red(), + ) + failure_embed.set_thumbnail(url="https://i.imgur.com/hbdBZfG.png") + await message.reply(embed=failure_embed) + self.thread_awaiting_responses.remove(message.channel.id) + return False + + success_embed = discord.Embed( + title=f"{index_type.capitalize()} Interpreted", + description=f"The {index_type} you've uploaded has successfully been interpreted. The summary is below:\n`{summary}`", + color=discord.Color.green(), + ) + success_embed.set_thumbnail(url="https://i.imgur.com/I5dIdg6.png") + await message.reply(embed=success_embed) + return True + @discord.Cog.listener() async def on_message(self, message): # Check for self @@ -85,47 +138,38 @@ async def on_message(self, message): # File operations, allow for user file upload if file: - # We will attempt to upload the file to the execution environment - thinking_embed = discord.Embed( - title=f"🤖💬 Indexing file and saving to agent knowledge", - color=0x808080, + indexing_result = await self.process_indexing( + message, "file", content=file ) - thinking_embed.set_footer(text="This may take a few seconds.") - try: - thinking_message = await message.reply(embed=thinking_embed) - except: - traceback.print_exc() - pass + if not indexing_result: + self.thread_awaiting_responses.remove(message.channel.id) + return + + prompt += ( + "\n{System Message: the user has just uploaded the file " + + str(file.filename) + + "Unless the user asked a specific question, do not use your tools and instead just acknowledge the upload}" + ) - indexing_result, summary = await self.index_handler.index_chat_file(message, file) + # Link operations, allow for user link upload, we connect and download the content at the link. + if "http" in prompt: + # Extract the entire link + link = prompt[prompt.find("http") :] - try: - await thinking_message.delete() - except: - pass + indexing_result = await self.process_indexing( + message, "link", link=link + ) if not indexing_result: - failure_embed = discord.Embed( - title="Indexing Error", - description=f"Your file could not be indexed", - color=discord.Color.red(), - ) - failure_embed.set_thumbnail(url="https://i.imgur.com/hbdBZfG.png") - await message.reply(failure_embed) self.thread_awaiting_responses.remove(message.channel.id) return - prompt += "\n{System Message: the user has just uploaded the file " + str(file.filename) + "}" - - success_embed = discord.Embed( - title="Document Interpreted", - description=f"The file you've uploaded has successfully been interpreted. 
The summary is below:\n`{summary}`", - color=discord.Color.green(), + prompt += ( + "\n{System Message: you have just indexed the link " + + str(link) + + "}" ) - # thumbnail of https://i.imgur.com/I5dIdg6.png - success_embed.set_thumbnail(url="https://i.imgur.com/I5dIdg6.png") - await message.reply(embed=success_embed) chat_result = await self.index_handler.execute_index_chat_message( message, prompt @@ -154,9 +198,9 @@ async def rename_user_index_command(self, ctx, user_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.user.id}/{user_index}", - f"indexes/{ctx.user.id}/{new_name}", + ctx, + f"indexes/{ctx.user.id}/{user_index}", + f"indexes/{ctx.user.id}/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -186,9 +230,9 @@ async def rename_server_index_command(self, ctx, server_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.guild.id}/{server_index}", - f"indexes/{ctx.guild.id}/{new_name}", + ctx, + f"indexes/{ctx.guild.id}/{server_index}", + f"indexes/{ctx.guild.id}/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -216,9 +260,9 @@ async def rename_search_index_command(self, ctx, search_index, new_name): return if await self.index_handler.rename_index( - ctx, - f"indexes/{ctx.user.id}_search/{search_index}", - f"indexes/{ctx.user.id}_search/{new_name}", + ctx, + f"indexes/{ctx.user.id}_search/{search_index}", + f"indexes/{ctx.user.id}_search/{new_name}", ): await ctx.respond( embed=EmbedStatics.get_index_rename_success_embed( @@ -235,7 +279,7 @@ async def rename_search_index_command(self, ctx, search_index, new_name): ) async def set_index_link_recurse_command( - self, ctx, link: str = None, depth: int = 1 + self, ctx, link: str = None, depth: int = 1 ): await ctx.defer() """Command handler to set a file as your personal index""" @@ -256,7 +300,7 @@ async def set_index_link_recurse_command( ) async def set_index_command( - self, ctx, file: discord.Attachment = None, link: str = None + self, ctx, file: discord.Attachment = None, link: str = None ): await ctx.defer() """Command handler to set a file as your personal index""" @@ -288,7 +332,7 @@ async def set_index_command( ) async def set_discord_command( - self, ctx, channel: discord.TextChannel = None, message_limit: int = 2500 + self, ctx, channel: discord.TextChannel = None, message_limit: int = 2500 ): """Command handler to set a channel as your personal index""" await ctx.defer() @@ -339,12 +383,12 @@ async def load_index_command(self, ctx, user_index, server_index, search_index): return if ( - user_index - and server_index - or user_index - and search_index - or server_index - and search_index + user_index + and server_index + or user_index + and search_index + or server_index + and search_index ): await ctx.respond( "Please only try to load one type of index. Either a user index, a server index or a search index." 
@@ -373,14 +417,14 @@ async def load_index_command(self, ctx, user_index, server_index, search_index): await self.index_handler.load_index(ctx, index, server, search, user_api_key) async def query_command( - self, - ctx, - query, - nodes, - response_mode, - child_branch_factor, - model, - multistep, + self, + ctx, + query, + nodes, + response_mode, + child_branch_factor, + model, + multistep, ): """Command handler to query your index""" diff --git a/gpt3discord.py b/gpt3discord.py index f64cb7e5..e23469ef 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -34,7 +34,7 @@ from models.openai_model import Model -__version__ = "11.9.9" +__version__ = "12.0.0" PID_FILE = Path("bot.pid") @@ -94,7 +94,7 @@ # Settings for the bot # activity = discord.Activity( - type=discord.ActivityType.custom, name="Undergoing Maintenance" + type=discord.ActivityType.watching, name="for /help, /gpt, and more!" ) bot = discord.Bot(intents=discord.Intents.all(), command_prefix="!", activity=activity) usage_service = UsageService(Path(os.environ.get("DATA_DIR", os.getcwd()))) diff --git a/models/index_model.py b/models/index_model.py index bb450605..9ecdf3fd 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -12,7 +12,7 @@ import openai import tiktoken from functools import partial -from typing import List, Optional +from typing import List, Optional, cast from pathlib import Path from datetime import date @@ -27,22 +27,34 @@ from langchain.schema import SystemMessage from langchain.tools import Tool from llama_index.callbacks import CallbackManager, TokenCountingHandler +from llama_index.evaluation.guideline import DEFAULT_GUIDELINES, GuidelineEvaluator +from llama_index.indices.query.base import BaseQueryEngine from llama_index.node_parser import SimpleNodeParser +from llama_index.response_synthesizers import ResponseMode from llama_index.schema import NodeRelationship from llama_index.indices.query.query_transform import StepDecomposeQueryTransform from llama_index.langchain_helpers.agents import ( IndexToolConfig, LlamaToolkit, - create_llama_chat_agent, LlamaIndexTool, + create_llama_chat_agent, + LlamaIndexTool, +) +from llama_index.prompts.chat_prompts import ( + CHAT_REFINE_PROMPT, + CHAT_TREE_SUMMARIZE_PROMPT, + TEXT_QA_SYSTEM_PROMPT, ) -from llama_index.prompts.chat_prompts import CHAT_REFINE_PROMPT from llama_index.readers import YoutubeTranscriptReader from llama_index.readers.schema.base import Document from llama_index.langchain_helpers.text_splitter import TokenTextSplitter from llama_index.retrievers import VectorIndexRetriever, TreeSelectLeafRetriever -from llama_index.query_engine import RetrieverQueryEngine, MultiStepQueryEngine +from llama_index.query_engine import ( + RetrieverQueryEngine, + MultiStepQueryEngine, + RetryGuidelineQueryEngine, +) from llama_index import ( GPTVectorStoreIndex, @@ -89,7 +101,7 @@ verbose=False, ) node_parser = SimpleNodeParser.from_defaults( - text_splitter=TokenTextSplitter(chunk_size=256, chunk_overlap=64) + text_splitter=TokenTextSplitter(chunk_size=1024, chunk_overlap=128) ) callback_manager = CallbackManager([token_counter]) service_context = ServiceContext.from_defaults( @@ -150,12 +162,16 @@ def get_and_query( class IndexChatData: - def __init__(self, llm, agent_chain, memory, thread_id, tools): + def __init__( + self, llm, agent_chain, memory, thread_id, tools, agent_kwargs, llm_predictor + ): self.llm = llm self.agent_chain = agent_chain self.memory = memory self.thread_id = thread_id self.tools = tools + self.agent_kwargs = agent_kwargs + 
self.llm_predictor = llm_predictor class IndexData: @@ -254,6 +270,54 @@ class Index_handler: callback_manager=callback_manager, node_parser=node_parser, ) + type_to_suffix_mappings = { + "text/plain": ".txt", + "text/csv": ".csv", + "application/pdf": ".pdf", + "application/json": ".json", + "image/png": ".png", + "image/jpeg": ".jpg", + "image/gif": ".gif", + "image/svg+xml": ".svg", + "image/webp": ".webp", + "application/mspowerpoint": ".ppt", + "application/vnd.ms-powerpoint": ".ppt", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/msexcel": ".xls", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/msword": ".doc", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "audio/mpeg": ".mp3", + "audio/x-wav": ".wav", + "audio/ogg": ".ogg", + "video/mpeg": ".mpeg", + "video/mp4": ".mp4", + "application/epub+zip": ".epub", + "text/markdown": ".md", + "text/html": ".html", + "application/rtf": ".rtf", + "application/x-msdownload": ".exe", + "application/xml": ".xml", + "application/vnd.adobe.photoshop": ".psd", + "application/x-sql": ".sql", + "application/x-latex": ".latex", + "application/x-httpd-php": ".php", + "application/java-archive": ".jar", + "application/x-sh": ".sh", + "application/x-csh": ".csh", + "text/x-c": ".c", + "text/x-c++": ".cpp", + "text/x-java-source": ".java", + "text/x-python": ".py", + "text/x-ruby": ".rb", + "text/x-perl": ".pl", + "text/x-shellscript": ".sh", + } + + # For when content type doesnt get picked up by discord. + secondary_mappings = { + ".epub": ".epub", + } def __init__(self, bot, usage_service): self.bot = bot @@ -314,54 +378,14 @@ async def execute_index_chat_message(self, ctx, message): return agent_output async def index_chat_file(self, message: discord.Message, file: discord.Attachment): - type_to_suffix_mappings = { - "text/plain": ".txt", - "text/csv": ".csv", - "application/pdf": ".pdf", - "application/json": ".json", - "image/png": ".png", - "image/": ".jpg", - "ms-powerpoint": ".ppt", - "presentationml.presentation": ".pptx", - "ms-excel": ".xls", - "spreadsheetml.sheet": ".xlsx", - "msword": ".doc", - "wordprocessingml.document": ".docx", - "audio/": ".mp3", - "video/": ".mp4", - "epub": ".epub", - "markdown": ".md", - "html": ".html", - } - - # For when content type doesnt get picked up by discord. - secondary_mappings = { - ".epub": ".epub", - } - # First, initially set the suffix to the suffix of the attachment - suffix = None - if file.content_type: - # Apply the suffix mappings to the file - for key, value in type_to_suffix_mappings.items(): - if key in file.content_type: - suffix = value - break + suffix = self.get_file_suffix(file.content_type, file.filename) or None - if not suffix: - await message.reply("This file type is not supported.") - return False, None - - else: - for key, value in secondary_mappings.items(): - if key in file.filename: - suffix = value - break - if not suffix: - await message.reply( - "Could not determine the file type of the attachment." - ) - return False, None + if not suffix: + await message.reply( + "The file you uploaded is unable to be indexed. 
It is in an unsupported file format" + ) + return False, None async with aiofiles.tempfile.TemporaryDirectory() as temp_path: async with aiofiles.tempfile.NamedTemporaryFile( @@ -382,14 +406,27 @@ async def index_chat_file(self, message: discord.Message, file: discord.Attachme self.index_file, Path(temp_file.name), service_context, + suffix, ), ) - summary = index.as_query_engine().query( - f"What is a summary or general idea of this document? Be detailed in your summary but not too verbose. Your summary should be under a hundred words. This summary will be used in a vector index to retrieve information about certain data. So, at a high level, the summary should describe the document in such a way that a retriever would know to select it when asked questions about it. The filename was {filename}. Include the file name in the summary." + summary = await index.as_query_engine( + similarity_top_k=10, + child_branch_factor=6, + response_mode="tree_summarize", + ).aquery( + f"What is a summary or general idea of this data? Be detailed in your summary (e.g " + f"extract key names, etc) but not too verbose. Your summary should be under a hundred words. " + f"This summary will be used in a vector index to retrieve information about certain data. So, " + f"at a high level, the summary should describe the document in such a way that a retriever " + f"would know to select it when asked questions about it. The data name was {filename}. Include " + f"the file name in the summary. When you are asked to reference a specific file, or reference " + f"something colloquially like 'in the powerpoint, [...]?', never respond saying that as an AI " + f"you can't view the data, instead infer which tool to use that has the data. Say that there " + f"is no available data if there are no available tools that are relevant." ) - engine = index.as_query_engine() + engine = self.get_query_engine(index, message, summary) # Get rid of all special characters in the filename filename = "".join( @@ -400,20 +437,14 @@ async def index_chat_file(self, message: discord.Message, file: discord.Attachme query_engine=engine, name=f"{filename}-index", description=f"Use this tool if the query seems related to this summary: {summary}", - tool_kwargs={"return_direct": False,}, + tool_kwargs={ + "return_direct": False, + }, max_iterations=5, ) tool = LlamaIndexTool.from_tool_config(tool_config) - agent_kwargs = { - "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")], - "system_message": SystemMessage( - content="You are a superpowered version of GPT that is able to answer questions about the data you're " - "connected to. Each different tool you have represents a different dataset to interact with." 
- ), - } - tools = self.index_chat_chains[message.channel.id].tools tools.append(tool) @@ -422,28 +453,40 @@ async def index_chat_file(self, message: discord.Message, file: discord.Attachme llm=self.index_chat_chains[message.channel.id].llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True, - agent_kwargs=agent_kwargs, + agent_kwargs=self.index_chat_chains[ + message.channel.id + ].agent_kwargs, memory=self.index_chat_chains[message.channel.id].memory, handle_parsing_errors="Check your output and make sure it conforms!", ) - index_chat_data = IndexChatData(self.index_chat_chains[message.channel.id].llm, agent_chain, self.index_chat_chains[message.channel.id].memory, message.channel.id, tools) + index_chat_data = IndexChatData( + self.index_chat_chains[message.channel.id].llm, + agent_chain, + self.index_chat_chains[message.channel.id].memory, + message.channel.id, + tools, + self.index_chat_chains[message.channel.id].agent_kwargs, + self.index_chat_chains[message.channel.id].llm_predictor, + ) self.index_chat_chains[message.channel.id] = index_chat_data return True, summary except Exception as e: - await message.reply("There was an error indexing your file: "+str(e)) + await message.reply( + "There was an error indexing your file: " + str(e) + ) traceback.print_exc() return False, None - async def start_index_chat(self, ctx, model): preparation_message = await ctx.channel.send( embed=EmbedStatics.get_index_chat_preparation_message() ) llm = ChatOpenAI(model=model, temperature=0) + llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=model)) memory = ConversationSummaryBufferMemory( memory_key="memory", @@ -456,7 +499,13 @@ async def start_index_chat(self, ctx, model): "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")], "system_message": SystemMessage( content="You are a superpowered version of GPT that is able to answer questions about the data you're " - "connected to. Each different tool you have represents a different dataset to interact with. If you are asked to perform a task that spreads across multiple datasets, use multiple tools for the same prompt." + "connected to. Each different tool you have represents a different dataset to interact with. " + "If you are asked to perform a task that spreads across multiple datasets, use multiple tools " + "for the same prompt. When the user types links in chat, you will have already been connected " + "to the data at the link by the time you respond. When using tools, the input should be " + "clearly created based on the request of the user. For example, if a user uploads an invoice " + "and asks how many usage hours of X was present in the invoice, a good query is 'X hours'. " + "Avoid using single word queries unless the request is very simple." 
), } @@ -501,7 +550,9 @@ async def start_index_chat(self, ctx, model): except: pass - index_chat_data = IndexChatData(llm, agent_chain, memory, thread.id, tools) + index_chat_data = IndexChatData( + llm, agent_chain, memory, thread.id, tools, agent_kwargs, llm_predictor + ) self.index_chat_chains[thread.id] = index_chat_data @@ -667,6 +718,21 @@ async def index_webpage(self, url, service_context) -> GPTVectorStoreIndex: def reset_indexes(self, user_id): self.index_storage[user_id].reset_indexes(user_id) + def get_file_suffix(self, content_type, filename): + print("The content type is " + content_type) + if content_type: + # Apply the suffix mappings to the file + for key, value in self.type_to_suffix_mappings.items(): + if key in content_type: + return value + + else: + for key, value in self.secondary_mappings.items(): + if key in filename: + return value + + return None + async def set_file_index( self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key ): @@ -676,55 +742,15 @@ async def set_file_index( os.environ["OPENAI_API_KEY"] = user_api_key openai.api_key = os.environ["OPENAI_API_KEY"] - type_to_suffix_mappings = { - "text/plain": ".txt", - "text/csv": ".csv", - "application/pdf": ".pdf", - "application/json": ".json", - "image/png": ".png", - "image/": ".jpg", - "ms-powerpoint": ".ppt", - "presentationml.presentation": ".pptx", - "ms-excel": ".xls", - "spreadsheetml.sheet": ".xlsx", - "msword": ".doc", - "wordprocessingml.document": ".docx", - "audio/": ".mp3", - "video/": ".mp4", - "epub": ".epub", - "markdown": ".md", - "html": ".html", - } - - # For when content type doesnt get picked up by discord. - secondary_mappings = { - ".epub": ".epub", - } - try: # First, initially set the suffix to the suffix of the attachment - suffix = None - if file.content_type: - # Apply the suffix mappings to the file - for key, value in type_to_suffix_mappings.items(): - if key in file.content_type: - suffix = value - break - - if not suffix: - await ctx.send("This file type is not supported.") - return + suffix = self.get_file_suffix(file.content_type, file.filename) or None - else: - for key, value in secondary_mappings.items(): - if key in file.filename: - suffix = value - break - if not suffix: - await ctx.send( - "Could not determine the file type of the attachment, attempting a dirty index.." 
- ) - return + if not suffix: + await ctx.respond( + embed=EmbedStatics.get_index_set_failure_embed("Unsupported file") + ) + return # Send indexing message response = await ctx.respond( @@ -857,42 +883,37 @@ async def set_link_index_recurse( await response.edit(embed=EmbedStatics.get_index_set_success_embed(price)) - async def set_link_index( - self, ctx: discord.ApplicationContext, link: str, user_api_key - ): - if not user_api_key: - os.environ["OPENAI_API_KEY"] = self.openai_key - else: - os.environ["OPENAI_API_KEY"] = user_api_key - openai.api_key = os.environ["OPENAI_API_KEY"] + def get_query_engine(self, index, message, summary): + retriever = VectorIndexRetriever( + index=index, similarity_top_k=10, service_context=service_context + ) - response = await ctx.respond(embed=EmbedStatics.build_index_progress_embed()) - try: - # Pre-emptively connect and get the content-type of the response - try: - async with aiohttp.ClientSession() as session: - async with session.get(link, timeout=2) as _response: - print(_response.status) - if _response.status == 200: - content_type = _response.headers.get("content-type") - else: - await response.edit( - embed=EmbedStatics.get_index_set_failure_embed( - "Invalid URL or could not connect to the provided URL." - ) - ) - return - except Exception as e: - traceback.print_exc() - await response.edit( - embed=EmbedStatics.get_index_set_failure_embed( - "Invalid URL or could not connect to the provided URL. " - + str(e) - ) - ) - return + response_synthesizer = get_response_synthesizer( + response_mode=ResponseMode.COMPACT_ACCUMULATE, + use_async=True, + refine_template=TEXT_QA_SYSTEM_PROMPT, + service_context=service_context, + ) - # Check if the link contains youtube in it + # Guideline eval + guideline_eval = GuidelineEvaluator( + guidelines=DEFAULT_GUIDELINES + + "\nThe response should be verbose and detailed.\n" + "The response should not simply just say that the requested information was found in the context information.\n" + ) # just for example + + engine = RetrieverQueryEngine( + retriever=retriever, response_synthesizer=response_synthesizer + ) + + retry_guideline_query_engine = RetryGuidelineQueryEngine( + engine, guideline_eval, resynthesize_query=True + ) + + return retry_guideline_query_engine + + async def index_link(self, link, summarize=False, index_chat_ctx=None): + try: if await UrlCheck.check_youtube_link(link): index = await self.loop.run_in_executor( None, partial(self.index_youtube_transcript, link, service_context) @@ -903,6 +924,95 @@ async def set_link_index( ) else: index = await self.index_webpage(link, service_context) + except Exception as e: + if index_chat_ctx: + await index_chat_ctx.reply( + "There was an error indexing your link: " + str(e) + ) + return False, None + else: + raise e + + summary = None + if index_chat_ctx: + try: + summary = await index.as_query_engine( + response_mode="tree_summarize" + ).aquery( + "What is a summary or general idea of this document? Be detailed in your summary but not too verbose. Your summary should be under 50 words. This summary will be used in a vector index to retrieve information about certain data. So, at a high level, the summary should describe the document in such a way that a retriever would know to select it when asked questions about it. The link was {link}. Include the an easy identifier derived from the link at the end of the summary." 
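+                    # NOTE: this is a plain string, not an f-string, so the literal
+                    # text "{link}" is sent to the model rather than the actual URL.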
+ ) + + engine = self.get_query_engine(index, index_chat_ctx, summary) + + # Get rid of all special characters in the link, replace periods with _ + link_cleaned = "".join( + [c for c in link if c.isalpha() or c.isdigit() or c == "."] + ).rstrip() + # replace . + link_cleaned = link_cleaned.replace(".", "_") + + tool_config = IndexToolConfig( + query_engine=engine, + name=f"{link_cleaned}-index", + description=f"Use this tool if the query seems related to this summary: {summary}", + tool_kwargs={ + "return_direct": False, + }, + max_iterations=5, + ) + + tool = LlamaIndexTool.from_tool_config(tool_config) + + tools = self.index_chat_chains[index_chat_ctx.channel.id].tools + tools.append(tool) + + agent_chain = initialize_agent( + tools=tools, + llm=self.index_chat_chains[index_chat_ctx.channel.id].llm, + agent=AgentType.OPENAI_FUNCTIONS, + verbose=True, + agent_kwargs=self.index_chat_chains[ + index_chat_ctx.channel.id + ].agent_kwargs, + memory=self.index_chat_chains[index_chat_ctx.channel.id].memory, + handle_parsing_errors="Check your output and make sure it conforms!", + ) + + index_chat_data = IndexChatData( + self.index_chat_chains[index_chat_ctx.channel.id].llm, + agent_chain, + self.index_chat_chains[index_chat_ctx.channel.id].memory, + index_chat_ctx.channel.id, + tools, + self.index_chat_chains[index_chat_ctx.channel.id].agent_kwargs, + self.index_chat_chains[index_chat_ctx.channel.id].llm_predictor, + ) + + self.index_chat_chains[index_chat_ctx.channel.id] = index_chat_data + + return True, summary + except Exception as e: + await index_chat_ctx.reply( + "There was an error indexing your link: " + str(e) + ) + return False, None + + return index, summary + + async def set_link_index( + self, ctx: discord.ApplicationContext, link: str, user_api_key + ): + if not user_api_key: + os.environ["OPENAI_API_KEY"] = self.openai_key + else: + os.environ["OPENAI_API_KEY"] = user_api_key + openai.api_key = os.environ["OPENAI_API_KEY"] + + response = await ctx.respond(embed=EmbedStatics.build_index_progress_embed()) + try: + # Check if the link contains youtube in it + index = await self.index_link(link) + await self.usage_service.update_usage( token_counter.total_embedding_token_count, "embedding" ) @@ -930,11 +1040,6 @@ async def set_link_index( self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name) - except ValueError as e: - await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e))) - traceback.print_exc() - return - except Exception as e: await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e))) traceback.print_exc() From 0675a7599e81c66607a7afdfee1aea58a53ada26 Mon Sep 17 00:00:00 2001 From: Kaveen Kumarasinghe Date: Sat, 4 Nov 2023 04:42:48 -0400 Subject: [PATCH 3/4] some cleanup --- README.md | 9 ++++++-- cogs/code_interpreter_service_cog.py | 27 +----------------------- cogs/index_service_cog.py | 31 +++++++++++++++++++++++++++- cogs/search_service_cog.py | 27 +----------------------- models/embed_statics_model.py | 26 +++++++++++++++++++++++ 5 files changed, 65 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index cff6c0fa..ad1223f0 100644 --- a/README.md +++ b/README.md @@ -49,14 +49,19 @@ Internet-connected chat (Google + Wolfram + Link Crawling)
Code Interpreter / Advanced Data Analysis

Custom indexing and Document Q&A

-# Recent Notable Updates +# Recent Notable Updates - **Code Interpreter / Advanced Data Analysis** - Just like ChatGPT, GPTDiscord now has a fully-fledged code execution environment. You can work with GPT to execute your code in an isolated environment, with the ability to even install Python and system packages, and access the internet from the execution environment. + - **Multi-modality** - GPTDiscord now supports images sent to the bot during a conversation made with `/gpt converse`! + +- **Drag And Drop Document Chat** - Chat with your documents by simply dragging and dropping files, or even links into discord chat! `/index chat` + + - **Internet-connected Chat!** - Chat with an instance of GPT3.5 or GPT-4 that's connected to Google and Wolfram Alpha and can browse and access links that you send it! # Features diff --git a/cogs/code_interpreter_service_cog.py b/cogs/code_interpreter_service_cog.py index cfe22e1b..e424d891 100644 --- a/cogs/code_interpreter_service_cog.py +++ b/cogs/code_interpreter_service_cog.py @@ -94,31 +94,6 @@ def __init__( self.sessions = {} # Make a mapping of all the country codes and their full country names: - async def paginate_chat_embed(self, response_text): - """Given a response text make embed pages and return a list of the pages.""" - - response_text = [ - response_text[i : i + 3500] for i in range(0, len(response_text), 7000) - ] - pages = [] - first = False - # Send each chunk as a message - for count, chunk in enumerate(response_text, start=1): - if not first: - page = discord.Embed( - title=f"{count}", - description=chunk, - ) - first = True - else: - page = discord.Embed( - title=f"{count}", - description=chunk, - ) - pages.append(page) - - return pages - @discord.Cog.listener() async def on_message(self, message): # Check if the message is from a bot. 
@@ -277,7 +252,7 @@ async def on_message(self, message): artifacts_available = len(artifact_names) > 0 if len(response) > 2000: - embed_pages = await self.paginate_chat_embed(response) + embed_pages = await EmbedStatics.paginate_chat_embed(response) paginator = pages.Paginator( pages=embed_pages, timeout=None, diff --git a/cogs/index_service_cog.py b/cogs/index_service_cog.py index 07563da2..7e762d8e 100644 --- a/cogs/index_service_cog.py +++ b/cogs/index_service_cog.py @@ -5,6 +5,8 @@ import discord import os +from discord.ext import pages + from models.embed_statics_model import EmbedStatics from services.deletion_service import Deletion from services.environment_service import EnvService @@ -176,7 +178,34 @@ async def on_message(self, message): ) if chat_result: - await message.reply(chat_result) + if len(chat_result) > 2000: + embed_pages = await EmbedStatics.paginate_chat_embed(chat_result) + paginator = pages.Paginator( + pages=embed_pages, + timeout=None, + author_check=False, + ) + try: + await paginator.respond(message) + except: + chat_result = [ + chat_result[i : i + 1900] + for i in range(0, len(chat_result), 1900) + ] + for count, chunk in enumerate(chat_result, start=1): + await message.channel.send(chunk) + + else: + chat_result = chat_result.replace("\\n", "\n") + # Build a response embed + response_embed = discord.Embed( + title="", + description=chat_result, + color=0x808080, + ) + await message.reply( + embed=response_embed, + ) self.thread_awaiting_responses.remove(message.channel.id) async def index_chat_command(self, ctx, model): diff --git a/cogs/search_service_cog.py b/cogs/search_service_cog.py index 68def6ee..c6cad90c 100644 --- a/cogs/search_service_cog.py +++ b/cogs/search_service_cog.py @@ -311,31 +311,6 @@ async def paginate_embed( return pages - async def paginate_chat_embed(self, response_text): - """Given a response text make embed pages and return a list of the pages.""" - - response_text = [ - response_text[i : i + 3500] for i in range(0, len(response_text), 7000) - ] - pages = [] - first = False - # Send each chunk as a message - for count, chunk in enumerate(response_text, start=1): - if not first: - page = discord.Embed( - title=f"{count}", - description=chunk, - ) - first = True - else: - page = discord.Embed( - title=f"{count}", - description=chunk, - ) - pages.append(page) - - return pages - @discord.Cog.listener() async def on_message(self, message): # Check if the message is from a bot. 
@@ -426,7 +401,7 @@ async def on_message(self, message):
                     return
 
         if len(response) > 2000:
-            embed_pages = await self.paginate_chat_embed(response)
+            embed_pages = await EmbedStatics.paginate_chat_embed(response)
             paginator = pages.Paginator(
                 pages=embed_pages,
                 timeout=None,
diff --git a/models/embed_statics_model.py b/models/embed_statics_model.py
index f544b28b..cedef4ac 100644
--- a/models/embed_statics_model.py
+++ b/models/embed_statics_model.py
@@ -9,6 +9,32 @@ class EmbedStatics:
     def __init__(self):
         pass
 
+    @staticmethod
+    async def paginate_chat_embed(response_text):
+        """Given a response text, make embed pages and return a list of the pages."""
+
+        response_text = [
+            response_text[i : i + 3500] for i in range(0, len(response_text), 3500)
+        ]
+        pages = []
+        first = False
+        # Build one embed page per chunk (the range step must match the slice width)
+        for count, chunk in enumerate(response_text, start=1):
+            if not first:
+                page = discord.Embed(
+                    title=f"{count}",
+                    description=chunk,
+                )
+                first = True
+            else:
+                page = discord.Embed(
+                    title=f"{count}",
+                    description=chunk,
+                )
+            pages.append(page)
+
+        return pages
+
     @staticmethod
     def get_api_timeout_embed():
         embed = discord.Embed(
From 17a8b7682b2a9e4bc25fc871e77494c36aa223fc Mon Sep 17 00:00:00 2001
From: Kaveen Kumarasinghe
Date: Sat, 4 Nov 2023 04:49:21 -0400
Subject: [PATCH 4/4] fix youtube shortlinking

---
 models/index_model.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/models/index_model.py b/models/index_model.py
index 9ecdf3fd..dee68cbf 100644
--- a/models/index_model.py
+++ b/models/index_model.py
@@ -607,7 +607,18 @@ def index_gdoc(self, doc_id, service_context) -> GPTVectorStoreIndex:
 
     def index_youtube_transcript(self, link, service_context):
         try:
-            documents = YoutubeTranscriptReader().load_data(ytlinks=[link])
+            def convert_shortlink_to_full_link(short_link):
+                # Check if the link is a shortened YouTube link
+                if "youtu.be" in short_link:
+                    # Extract the video ID from the link
+                    video_id = short_link.split('/')[-1].split('?')[0]
+                    # Construct the full YouTube desktop link
+                    desktop_link = f"https://www.youtube.com/watch?v={video_id}"
+                    return desktop_link
+                else:
+                    return short_link
+
+            documents = YoutubeTranscriptReader().load_data(ytlinks=[convert_shortlink_to_full_link(link)])
        except Exception as e:
            raise ValueError(f"The youtube transcript couldn't be loaded: {e}")
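A minimal standalone sketch of the shortlink normalization added in PATCH 4,
condensed to direct returns (the sample video ID below is illustrative only):

    def convert_shortlink_to_full_link(short_link):
        # youtu.be shortlinks carry the video ID as the last path segment
        if "youtu.be" in short_link:
            video_id = short_link.split("/")[-1].split("?")[0]
            return f"https://www.youtube.com/watch?v={video_id}"
        return short_link

    assert (
        convert_shortlink_to_full_link("https://youtu.be/abc123?t=42")
        == "https://www.youtube.com/watch?v=abc123"
    )
    # Full desktop links pass through unchanged
    assert (
        convert_shortlink_to_full_link("https://www.youtube.com/watch?v=abc123")
        == "https://www.youtube.com/watch?v=abc123"
    )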