Merge pull request #309 from openchatai/fix/form_data
fixing some scraping issues
codebanesr authored Nov 27, 2023
2 parents 213a53a + d1e8ce9 commit b1b3c9c
Showing 2 changed files with 33 additions and 18 deletions.
llm-server/routes/workflow/utils/router.py (8 changes: 2 additions & 6 deletions)
@@ -34,13 +34,9 @@ def get_relevant_docs(text: str, bot_id: str) -> Optional[str]:
 
     if result and len(result) > 0:
         # Assuming result is a list of objects and each object has a page_content attribute
-        all_page_content = "\n".join([item.page_content for item in result])
+        all_page_content = "\n\n".join([item.page_content for item in result])
 
-        # Replace multiple new lines with a single new line
-        cleaned_page_content = "\n".join(
-            line.strip() for line in all_page_content.splitlines() if line.strip()
-        )
-        return cleaned_page_content
+        return all_page_content
 
     return None

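Note on the router.py change above: the retrieved chunks are now joined with a blank line and returned as-is, where the old code re-joined the text line by line and dropped blank lines, flattening paragraph breaks. A minimal sketch of the difference, using a stand-in Document class in place of the vector-store result objects:

from dataclasses import dataclass


@dataclass
class Document:
    page_content: str  # stand-in for the objects returned by the similarity search


result = [Document("First chunk.\n\nStill the first chunk."), Document("Second chunk.")]

# Old behavior: join with "\n", then strip every blank line.
old = "\n".join(
    line.strip()
    for line in "\n".join(item.page_content for item in result).splitlines()
    if line.strip()
)

# New behavior: join chunks with a blank line and keep their internal formatting.
new = "\n\n".join(item.page_content for item in result)

assert old == "First chunk.\nStill the first chunk.\nSecond chunk."
assert new == "First chunk.\n\nStill the first chunk.\n\nSecond chunk."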
workers/tasks/web_crawl.py (43 changes: 31 additions & 12 deletions)
@@ -10,17 +10,24 @@
 
 from shared.utils.opencopilot_utils import get_embeddings, init_vector_store
 from shared.utils.opencopilot_utils.interfaces import StoreOptions
-from repos.website_data_sources import create_website_data_source, get_website_data_source_by_id, update_website_data_source_status_by_url
+from repos.website_data_sources import (
+    create_website_data_source,
+    get_website_data_source_by_id,
+    update_website_data_source_status_by_url,
+)
 from typing import Set
 from collections import deque
 
 selenium_grid_url = os.getenv("SELENIUM_GRID_URL", "http://localhost:4444/wd/hub")
 
+
 def is_valid_url(url, target_url):
     """Returns True if the URL is valid and the root of both URLs are the same, False otherwise."""
 
     # Regular expression for matching valid URLs.
-    regex = re.compile(r'^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$')
+    regex = re.compile(
+        r"^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$"
+    )
 
     # Check if the URL is valid.
     if regex.match(url) is None:
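For reference, the regex above is only reformatted, not changed: it still accepts absolute http/ftp/https URLs whose host contains at least one dot, and rejects everything else before the url_root comparison (whose derivation is not shown in this diff) runs. A small self-contained check of the pattern, with placeholder URLs:

import re

regex = re.compile(
    r"^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$"
)

assert regex.match("https://docs.example.com/guide?page=2") is not None
assert regex.match("/guide/start") is None  # relative link: no scheme, no host
assert regex.match("mailto:team@example.com") is None  # unsupported scheme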
@@ -33,7 +40,10 @@ def is_valid_url(url, target_url):
     # Check if the root of both URLs are the same.
     return url_root == target_url_root
 
-def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
+
+def scrape_website_in_bfs(
+    url: str, bot_id: str, unique_urls: Set[str], max_pages: int
+) -> int:
     """Scrapes a website in breadth-first order, following all of the linked pages.
     Args:
@@ -54,7 +64,7 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
         url = queue.popleft()
         if url in visited_urls or total_pages_scraped >= max_pages:
             continue
-
+
         create_website_data_source(chatbot_id=bot_id, status="PENDING", url=url)
         visited_urls.add(url)
         unique_urls.add(url)
@@ -74,6 +84,10 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
                     queue.append(next_url)
 
             text = soup.get_text()
+            text = re.sub(
+                r"\s+", " ", text
+            )  # Replace all whitespace with single spaces
+            text = text.strip()  # Trim leading and trailing whitespace
 
             print(text)
             # push to vector db
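The two lines added above normalize the scraped page text before it is chunked and embedded: every run of whitespace (spaces, tabs, newlines) collapses to a single space and the ends are trimmed. A standalone illustration of just that normalization:

import re

text = "  Pricing \n\n\t Plans start at $10/mo.   \n Contact us  "
text = re.sub(r"\s+", " ", text)  # Replace all whitespace with single spaces
text = text.strip()  # Trim leading and trailing whitespace

assert text == "Pricing Plans start at $10/mo. Contact us"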
@@ -83,7 +97,11 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
 
             docs = text_splitter.create_documents([text])
             embeddings = get_embeddings()
-            init_vector_store(docs, embeddings, StoreOptions(namespace="knowledgebase", metadata={"bot_id": bot_id}))
+            init_vector_store(
+                docs,
+                embeddings,
+                StoreOptions(namespace="knowledgebase", metadata={"bot_id": bot_id}),
+            )
             update_website_data_source_status_by_url(url=url, status="SUCCESS")
 
             if driver is not None:
@@ -96,14 +114,15 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
 
     return total_pages_scraped
 
+
 def get_web_driver():
     options = Options()
-    driver = webdriver.Remote(command_executor=selenium_grid_url, options=options)
+    driver = webdriver.Remote(command_executor=selenium_grid_url, options=options)
     driver.set_script_timeout(300)
-    driver.set_page_load_timeout(300)
+    driver.set_page_load_timeout(300)
     return driver
 
 
 @shared_task
 def web_crawl(url, bot_id: str):
     try:
@@ -113,8 +132,8 @@ def web_crawl(url, bot_id: str):
         scrape_website_in_bfs(url, bot_id, unique_urls, 5)
     except Exception as e:
         traceback.print_exc()
 
 
 @shared_task
 def resume_failed_website_scrape(website_data_source_id: str):
     """Resumes a failed website scrape.
@@ -134,4 +153,4 @@
 
     # Scrape the website.
     unique_urls: set = set()
-    scrape_website_in_bfs(url, website_data_source.bot_id, unique_urls, 1)
+    scrape_website_in_bfs(url, website_data_source.bot_id, unique_urls, 5)

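Both entry points are Celery shared tasks, and with this change a resumed scrape uses the same 5-page cap as a fresh crawl (previously it re-scraped only a single page). A usage sketch for callers, assuming the tasks are importable as workers.tasks.web_crawl and using placeholder IDs:

from workers.tasks.web_crawl import resume_failed_website_scrape, web_crawl

# Queue a breadth-first crawl of a site for a bot; the worker stops after max_pages (5) pages.
web_crawl.delay("https://docs.example.com", "bot-123")

# Retry a website data source whose earlier scrape failed.
resume_failed_website_scrape.delay("website-data-source-id")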