-
Notifications
You must be signed in to change notification settings - Fork 53
/
ingest.py
61 lines (45 loc) · 1.84 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import warnings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
DirectoryLoader,
PyPDFLoader,
)
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
# Silence all warnings (e.g. LangChain deprecation notices) for cleaner CLI output.
warnings.simplefilter("ignore")
# Absolute directory containing this script; anchors relative paths so the
# script works regardless of the current working directory.
ABS_PATH: str = os.path.dirname(os.path.abspath(__file__))
# On-disk location where the Chroma vector database is persisted.
DB_DIR: str = os.path.join(ABS_PATH, "db")
# Create vector database
# Create vector database
def create_vector_database(
    data_dir: str = "data/",
    model: str = "mistral",
    chunk_size: int = 500,
    chunk_overlap: int = 40,
):
    """
    Create and persist a Chroma vector database from PDF documents.

    Loads every PDF found recursively under *data_dir*, splits the loaded
    documents into overlapping chunks, embeds each chunk with
    OllamaEmbeddings, and persists the resulting index to ``DB_DIR``.

    Args:
        data_dir: Directory scanned recursively for ``*.pdf`` files.
        model: Name of the locally-served Ollama model used for embeddings.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks, so
            context is not lost at chunk boundaries.

    Returns:
        The persisted Chroma vector store (callers may ignore it).
    """
    # Load all PDFs recursively from the data directory.
    pdf_loader = DirectoryLoader(data_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    loaded_documents = pdf_loader.load()

    # Split documents into overlapping chunks sized for embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_documents = text_splitter.split_documents(loaded_documents)

    # Initialize Ollama embeddings (requires a running Ollama server).
    ollama_embeddings = OllamaEmbeddings(model=model)

    # Build the Chroma index from the chunks and write it to disk.
    vector_database = Chroma.from_documents(
        documents=chunked_documents,
        embedding=ollama_embeddings,
        persist_directory=DB_DIR,
    )
    vector_database.persist()
    return vector_database
# Build (or rebuild) the vector database when executed as a script.
if __name__ == "__main__":
    create_vector_database()