data_chunk.py
import os
from typing import List, Optional

from langchain import hub
from langchain.chains import create_extraction_chain, create_extraction_chain_pydantic
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnableLambda

from dataloader import load_high
from agentic_chunker import AgenticChunker

# Pydantic data class
class Sentences(BaseModel):
    sentences: List[str]

def get_propositions(text, runnable, extraction_chain):
    # Rewrite the text as standalone propositions via the proposal-indexing prompt,
    # then parse the LLM output into the Sentences schema.
    runnable_output = runnable.invoke({
        "input": text
    }).content
    propositions = extraction_chain.run(runnable_output)[0].sentences
    return propositions

def run_chunk(essay):
    # Pull the proposal-indexing prompt from the LangChain Hub and pair it with the LLM.
    obj = hub.pull("wfh/proposal-indexing")
    llm = ChatOpenAI(model='gpt-4-1106-preview', openai_api_key=os.getenv("OPENAI_API_KEY"))
    runnable = obj | llm

    # Extraction chain that parses the LLM output into the Sentences schema.
    extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)

    # Split the essay into paragraphs and turn each one into propositions.
    paragraphs = essay.split("\n\n")
    essay_propositions = []
    for i, para in enumerate(paragraphs):
        propositions = get_propositions(para, runnable, extraction_chain)
        essay_propositions.extend(propositions)
        print(f"Done with {i}")

    # Group the propositions into agentic chunks and return them as a list of strings.
    ac = AgenticChunker()
    ac.add_propositions(essay_propositions)
    ac.pretty_print_chunks()
    chunks = ac.get_chunks(get_type='list_of_strings')
    print(chunks)
    return chunks
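
# Minimal usage sketch (not part of the original module): this assumes that
# dataloader.load_high() returns the essay text to chunk; adjust the call to
# match your dataloader's actual signature.
if __name__ == "__main__":
    essay_text = load_high()  # hypothetical call; replace with however your essay is loaded
    result_chunks = run_chunk(essay_text)
    print(f"Produced {len(result_chunks)} chunks")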