-
Notifications
You must be signed in to change notification settings - Fork 2
/
acad_bot_utils.py
72 lines (62 loc) · 2.22 KB
/
acad_bot_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
from urllib.parse import urlsplit

import requests

from acad_gpt.environment import FILE_UPLOAD_PATH
from acad_gpt.parsers import DocumentType
from acad_gpt.parsers.base_parser import DocumentStatus
from acad_gpt.parsers.config import ParserConfig
from acad_gpt.parsers.pdf_parser import PDFParser
from acad_gpt.parsers.webpage_parser import WebPageParser
def get_url_type(url):
    """Classify a URL as a PDF document or a generic web page.

    A URL counts as a PDF when either

    * the text after its last ``.`` is ``pdf`` (the original rule, now
      case-insensitive), or
    * its *path component* ends in ``.pdf``, ignoring case, so links such as
      ``paper.pdf?download=1`` or ``paper.PDF#page=2`` are recognised too.

    Args:
        url: The URL (or URL-like string) to classify.

    Returns:
        A ``(document_type, url)`` tuple where ``document_type`` is
        ``DocumentType.pdf`` or ``DocumentType.webpage`` and ``url`` is the
        argument, returned unchanged.
    """
    # Original rule: look at whatever follows the last dot of the raw string.
    raw_suffix = url.rsplit(".", 1)[-1].strip().lower()
    if raw_suffix == "pdf":
        return DocumentType.pdf, url

    try:
        path = urlsplit(url.strip()).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); classification must not raise, so treat as "no path".
        path = ""

    # Query string and fragment are excluded from ``path``, so they can no
    # longer hide a ``.pdf`` extension.
    if path.strip().lower().endswith(".pdf"):
        return DocumentType.pdf, url
    return DocumentType.webpage, url
def _filename_from_url(url):
    """Derive a local file name from the last path segment of *url*."""
    try:
        path = urlsplit(url).path
    except ValueError:
        # Malformed URL: fall back to treating the whole string as a path.
        path = url
    # Drop trailing slashes so ".../paper/" still yields "paper".
    name = path.rstrip("/").split("/")[-1].replace(" ", "_")
    # An empty name, "." or ".." would make the target resolve to a directory
    # instead of a file inside the upload folder.
    if name in ("", ".", ".."):
        name = "download"
    return name


def download(url: str, timeout: float = 30):
    """Download *url* into ``FILE_UPLOAD_PATH`` and report whether it was saved.

    The file name is taken from the last segment of the URL path (query string
    and fragment are ignored, spaces become underscores). The upload folder is
    created if it does not exist yet.

    Args:
        url: Address of the resource to fetch.
        timeout: Value passed to ``requests.get(timeout=...)``. Per the
            Requests documentation this bounds how long to wait for the
            server to respond, not the total download time.

    Returns:
        A ``(is_saved, file_path)`` tuple. ``is_saved`` is ``True`` when the
        server answered with a non-error status and the body was written to
        ``file_path``; otherwise it is ``False`` and ``file_path`` is the
        location the file would have been written to.

    Raises:
        requests.RequestException: Propagated from ``requests`` on connection
            errors or timeouts.
        OSError: Propagated if the folder or file cannot be written.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
    file_path = os.path.join(FILE_UPLOAD_PATH, _filename_from_url(url))

    is_saved = False
    # The context manager closes the streamed response on every path,
    # including when writing the file raises.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.ok:
            print("saving to", os.path.abspath(file_path))
            with open(file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
                # Sync once after the last chunk instead of after every 8 KiB;
                # the file is equally durable on return but far fewer syscalls.
                f.flush()
                os.fsync(f.fileno())
            is_saved = True
        else:  # HTTP status code 4XX/5XX
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))
    return is_saved, file_path
def process_url(
    url,
    type,
    embed_client,
    status=DocumentStatus.todo,
):
    """Parse the resource at *url* and convert it into document(s).

    PDF-like types (``DocumentType.pdf`` and ``DocumentType.paper``) are first
    downloaded with ``download`` and then handled by ``PDFParser``; every other
    type is handed directly to ``WebPageParser``.

    Args:
        url: Address of the resource to process.
        type: Document type used to pick the parser; also forwarded to the
            parser's ``to_documents``.
        embed_client: Forwarded unchanged to the parser's ``to_documents``.
        status: Forwarded unchanged to the parser's ``to_documents``
            (defaults to ``DocumentStatus.todo``).

    Returns:
        Whatever the chosen parser's ``to_documents`` returns, or ``None`` when
        a PDF-like resource could not be downloaded.
    """
    # Anything that is not PDF-like is parsed straight from the URL.
    if type not in (DocumentType.pdf, DocumentType.paper):
        web_parser = WebPageParser()
        web_config = ParserConfig(file_path=url, file_url=url)
        web_contents = web_parser.parse(config=web_config)
        return web_parser.to_documents(
            web_contents=web_contents,
            embed_client=embed_client,
            type=type,
            status=status,
        )

    # PDF-like resources must be available locally before parsing.
    downloaded, local_path = download(url)
    if not downloaded:
        return None

    pdf_parser = PDFParser()
    pdf_config = ParserConfig(file_path=local_path, file_url=url)
    pdf_contents = pdf_parser.parse(config=pdf_config)
    return pdf_parser.to_documents(
        pdf_contents=pdf_contents,
        embed_client=embed_client,
        type=type,
        status=status,
    )