diff --git a/gpt3discord.py b/gpt3discord.py index 9fa9da8e..35dc7327 100644 --- a/gpt3discord.py +++ b/gpt3discord.py @@ -33,7 +33,7 @@ from models.openai_model import Model -__version__ = "11.7.3" +__version__ = "11.8.0" PID_FILE = Path("bot.pid") diff --git a/models/index_model.py b/models/index_model.py index c69a74f8..8e4f2995 100644 --- a/models/index_model.py +++ b/models/index_model.py @@ -23,8 +23,7 @@ from langchain.llms import OpenAIChat from langchain.memory import ConversationBufferMemory from llama_index.callbacks import CallbackManager, TokenCountingHandler -from llama_index.data_structs.data_structs import Node -from llama_index.data_structs.node import DocumentRelationship +from llama_index.schema import NodeRelationship from llama_index.indices.query.query_transform import StepDecomposeQueryTransform from llama_index.langchain_helpers.agents import ( IndexToolConfig, @@ -59,10 +58,12 @@ ResponseSynthesizer, load_index_from_storage, ) + +from llama_index.schema import TextNode +from llama_index.storage.docstore.types import RefDocInfo from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR from llama_index.composability import ComposableGraph -from llama_index.schema import BaseDocument from models.embed_statics_model import EmbedStatics from models.openai_model import Models @@ -874,23 +875,16 @@ async def load_index( async def index_to_docs( self, old_index, chunk_size: int = 4000, chunk_overlap: int = 200 - ) -> List[BaseDocument]: + ) -> List[Document]: documents = [] docstore = old_index.docstore + ref_docs = old_index.ref_doc_info - for doc_id in docstore.docs.keys(): + for document in ref_docs.values(): text = "" - - document = docstore.get_document(doc_id) - if document is not None: - node = docstore.get_node(document.get_doc_id()) - while node is not None: - extra_info = node.extra_info - text += f"{node.text} " - next_node_id = node.relationships.get( - DocumentRelationship.NEXT, None - ) - node = docstore.get_node(next_node_id) if next_node_id else None + for node in document.node_ids: + node = docstore.get_node(node) + text += f"{node.text} " text_splitter = TokenTextSplitter( separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap @@ -898,9 +892,8 @@ async def index_to_docs( text_chunks = text_splitter.split_text(text) for chunk_text in text_chunks: - new_doc = Document(text=chunk_text, extra_info=extra_info) + new_doc = Document(text=chunk_text, extra_info=document.metadata) documents.append(new_doc) - print(new_doc) return documents @@ -1283,7 +1276,7 @@ async def load_data( channel_id, limit=limit, oldest_first=oldest_first ) results.append( - Document(channel_content, extra_info={"channel_name": channel_name}) + Document(text=channel_content, extra_info={"channel_name": channel_name}) ) return results diff --git a/pyproject.toml b/pyproject.toml index 4b94579a..0b6a8b70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,13 +33,13 @@ dependencies = [ "sqlitedict==2.1.0", "backoff==2.2.1", "flask==2.2.3", -"llama-index==0.6.30", +"llama-index==0.6.38", "pypdf==3.11.1", "youtube_transcript_api==0.5.0", "sentencepiece==0.1.99", "protobuf==3.20.2", "python-pptx==0.6.21", -"langchain==0.0.208", +"langchain==0.0.268", "unidecode==1.3.6", "tqdm==4.64.1", "docx2txt==0.8", diff --git a/requirements.txt b/requirements.txt index a50de9e0..dee3728d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,14 +13,14 @@ pinecone-client==2.1.0 sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.3 -llama-index==0.6.30 +llama-index==0.6.38 pypdf==3.11.1 youtube_transcript_api==0.5.0 sentencepiece==0.1.99 protobuf==3.20.2 python-pptx==0.6.21 sentence-transformers==2.2.2 -langchain==0.0.208 +langchain==0.0.268 openai-whisper unidecode==1.3.6 tqdm==4.64.1 diff --git a/requirements_base.txt b/requirements_base.txt index dd0fd4aa..23df5989 100644 --- a/requirements_base.txt +++ b/requirements_base.txt @@ -13,13 +13,13 @@ pinecone-client==2.1.0 sqlitedict==2.1.0 backoff==2.2.1 flask==2.2.3 -llama-index==0.6.30 +llama-index==0.6.38 pypdf==3.11.1 youtube_transcript_api==0.5.0 sentencepiece==0.1.99 protobuf==3.20.2 python-pptx==0.6.21 -langchain==0.0.208 +langchain==0.0.268 unidecode==1.3.6 tqdm==4.64.1 docx2txt==0.8