-
Notifications
You must be signed in to change notification settings - Fork 0
/
rag.py
83 lines (66 loc) · 3.27 KB
/
rag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from langchain.llms import OpenAI
from langchain.chains import RetrievalAugmentedGeneration
from langchain.retrievers import ChromaDB
from langchain.tokenizers import DefaultTokenizer
import asyncio
from bs4 import BeautifulSoup
import openai
async def index_data_in_chromadb(data, vector_store):
# Assuming 'data' is a list of texts for which you want to create embeddings
for text in data:
# Step 1: Create embeddings using OpenAI's API
# In this hypothetical example, we're assuming OpenAI has an endpoint that allows us to create embeddings
try:
response = openai.Embedding.create(
input=text,
engine="text-embedding-ada-001", # Just an example, use the appropriate embedding engine
api_key="XX_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your actual API key
)
embedding = response['data'] # Assuming the embedding data is returned under a 'data' key
except openai.error.OpenAIError as e:
print(f"An error occurred while creating embeddings: {e}")
continue
# Step 2: Store the embeddings in ChromaDB
# Here 'chromadb_store_embedding' is a placeholder for the actual method you would use to save the embedding
try:
await vector_store.chromadb_store_embedding(text, embedding)
except Exception as e:
print(f"An error occurred while storing embeddings in ChromaDB: {e}")
# Function to parse HTML and extract text
async def parse_html_and_index(file_path, vector_store):
with open(file_path, 'r', encoding='utf-8') as file:
html_content = file.read()
# Use BeautifulSoup or another HTML parser to extract the text
soup = BeautifulSoup(html_content, 'html.parser')
texts = soup.get_text(separator=' ', strip=True)
# Here we assume that texts is a list of string,
# each string should be a piece of text that you want to index.
await index_data_in_chromadb(texts, vector_store)
# Function to initialize and index data to ChromaDB
async def initialize_and_index(file_path):
# Initialize ChromaDB as the vector store for the retriever
vector_store = ChromaDB(api_key="zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz")
# Parse the HTML and index the data
await parse_html_and_index(file_path, vector_store)
html_file_path = 'telegram_group_messages.html'
await initialize_and_index(html_file_path)
# Initialize the OpenAI LLM with your API key
llm = OpenAI(api_key="XX_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
# Initialize the tokenizer
tokenizer = DefaultTokenizer()
# Set up the retriever with ChromaDB
retriever = ChromaDB(vector_store=vector_store, tokenizer=tokenizer)
# Initialize RAG with the LLM and the retriever
rag = RetrievalAugmentedGeneration(llm=llm, retriever=retriever)
# Function to process user questions, search from ChromaDB embeddings, and generate responses
async def answer_question(question):
# Use RAG to generate an answer to the question
answer = await rag.generate(question)
return answer
async def main():
# Example usage
question = "Какую зарядку лучше использовать для Li L9?"
answer = await answer_question(question)
print(answer)
# Run the main function in the event loop
asyncio.run(main())