-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathingest.py
94 lines (74 loc) · 2.48 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
This file contains the code for ingesting data from the data source.
"""
import os
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
DirectoryLoader,
PyPDFLoader,
TextLoader,
)
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
# env variables
from constants import OPENAI_API_KEY, USE_OPENAI
def create_vector_database(device="cpu"):
    """
    Create the vector database from the documents in the docs folder
    and save it in the db folder.

    Loads all PDF and text files under ``docs/``, splits them into
    overlapping chunks, embeds them (OpenAI or a local HuggingFace
    model depending on ``USE_OPENAI``), and persists a FAISS index
    to ``db/``.

    Parameters
    ----------
    device : str
        Device passed to the HuggingFace embedding model (e.g. "cpu"
        or "cuda"); ignored when USE_OPENAI is set.
    """
    # Ask before clobbering an existing database. Require an explicit "y"
    # to proceed: previously anything other than exactly "n" (e.g. "no",
    # a typo, or an empty line) silently overwrote the database.
    if os.path.exists("db"):
        confirmation = input(
            "Do you want to overwrite the existing database? (y/n): "
        ).lower()
        if confirmation != "y":
            print("Aborting...")
            return
    pdf_loader = DirectoryLoader(
        "docs/",
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    )
    text_loader = DirectoryLoader(
        "docs/",
        glob="**/*.txt",
        loader_cls=TextLoader,
    )
    all_loaders = [pdf_loader, text_loader]
    # loading the documents from all the loaders
    loaded_documents = []
    for loader in all_loaders:
        loaded_documents.extend(loader.load())
    # Fail fast with a clear message instead of letting FAISS raise an
    # opaque error on an empty document set.
    if not loaded_documents:
        print("No documents found in docs/ — nothing to ingest.")
        return
    # splitting the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512, chunk_overlap=24, length_function=len
    )
    chunks = text_splitter.split_documents(loaded_documents)
    # loading the embeddings
    if USE_OPENAI:
        embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    else:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": device},
        )
    # creating the vector store and persisting it to disk
    vector_db = FAISS.from_documents(chunks, embeddings)
    vector_db.save_local("db")
    print("Vector database created successfully!")
def get_vector_database(device="cpu"):
    """
    Load the FAISS vector store previously saved in the db folder.

    Parameters
    ----------
    device : str
        Device for the HuggingFace embedding model (e.g. "cpu" or
        "cuda"); ignored when USE_OPENAI is set.

    Returns
    -------
    FAISS
        The deserialized vector store.
    """
    # NOTE(review): newer langchain releases require
    # allow_dangerous_deserialization=True for load_local — confirm
    # against the pinned langchain version before upgrading.
    if USE_OPENAI:
        return FAISS.load_local("db", OpenAIEmbeddings(api_key=OPENAI_API_KEY))
    hf_embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )
    return FAISS.load_local("db", hf_embedding)
if __name__ == "__main__":
    # Script entry point: (re)build the vector database with the default
    # CPU device. Run directly (`python ingest.py`) to ingest docs/.
    create_vector_database()