Merge pull request #309 from openchatai/fix/form_data
fixing some scraping issues
codebanesr authored Nov 27, 2023
2 parents 213a53a + d1e8ce9 commit b1b3c9c
Showing 2 changed files with 33 additions and 18 deletions.
llm-server/routes/workflow/utils/router.py (8 changes: 2 additions & 6 deletions)
@@ -34,13 +34,9 @@ def get_relevant_docs(text: str, bot_id: str) -> Optional[str]:
 
     if result and len(result) > 0:
         # Assuming result is a list of objects and each object has a page_content attribute
-        all_page_content = "\n".join([item.page_content for item in result])
+        all_page_content = "\n\n".join([item.page_content for item in result])
 
-        # Replace multiple new lines with a single new line
-        cleaned_page_content = "\n".join(
-            line.strip() for line in all_page_content.splitlines() if line.strip()
-        )
-        return cleaned_page_content
+        return all_page_content
 
     return None

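Note on the router.py change above: the retrieved chunks are now joined with a blank line and returned as-is, where the old code re-joined the text line by line and dropped blank lines, flattening paragraph breaks. A minimal sketch of the difference, using a stand-in Document class in place of the vector-store result objects:

from dataclasses import dataclass


@dataclass
class Document:
    page_content: str  # stand-in for the objects returned by the similarity search


result = [Document("First chunk.\n\nStill the first chunk."), Document("Second chunk.")]

# Old behavior: join with "\n", then strip every blank line.
old = "\n".join(
    line.strip()
    for line in "\n".join(item.page_content for item in result).splitlines()
    if line.strip()
)

# New behavior: join chunks with a blank line and keep their internal formatting.
new = "\n\n".join(item.page_content for item in result)

assert old == "First chunk.\nStill the first chunk.\nSecond chunk."
assert new == "First chunk.\n\nStill the first chunk.\n\nSecond chunk."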
workers/tasks/web_crawl.py (43 changes: 31 additions & 12 deletions)
@@ -10,17 +10,24 @@
 
 from shared.utils.opencopilot_utils import get_embeddings, init_vector_store
 from shared.utils.opencopilot_utils.interfaces import StoreOptions
-from repos.website_data_sources import create_website_data_source, get_website_data_source_by_id, update_website_data_source_status_by_url
+from repos.website_data_sources import (
+    create_website_data_source,
+    get_website_data_source_by_id,
+    update_website_data_source_status_by_url,
+)
 from typing import Set
 from collections import deque
 
 selenium_grid_url = os.getenv("SELENIUM_GRID_URL", "http://localhost:4444/wd/hub")
 
+
 def is_valid_url(url, target_url):
     """Returns True if the URL is valid and the root of both URLs are the same, False otherwise."""
 
     # Regular expression for matching valid URLs.
-    regex = re.compile(r'^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$')
+    regex = re.compile(
+        r"^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$"
+    )
 
     # Check if the URL is valid.
     if regex.match(url) is None:
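For reference, the regex above is only reformatted, not changed: it still accepts absolute http/ftp/https URLs whose host contains at least one dot, and rejects everything else before the url_root comparison (whose derivation is not shown in this diff) runs. A small self-contained check of the pattern, with placeholder URLs:

import re

regex = re.compile(
    r"^(?:http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&:/~+#-])$"
)

assert regex.match("https://docs.example.com/guide?page=2") is not None
assert regex.match("/guide/start") is None  # relative link: no scheme, no host
assert regex.match("mailto:team@example.com") is None  # unsupported scheme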
@@ -33,7 +40,10 @@ def is_valid_url(url, target_url):
     # Check if the root of both URLs are the same.
     return url_root == target_url_root
 
-def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
+
+def scrape_website_in_bfs(
+    url: str, bot_id: str, unique_urls: Set[str], max_pages: int
+) -> int:
     """Scrapes a website in breadth-first order, following all of the linked pages.
     Args:
@@ -54,7 +64,7 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
         url = queue.popleft()
         if url in visited_urls or total_pages_scraped >= max_pages:
             continue
-
+
         create_website_data_source(chatbot_id=bot_id, status="PENDING", url=url)
         visited_urls.add(url)
         unique_urls.add(url)
@@ -74,6 +84,10 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
                     queue.append(next_url)
 
             text = soup.get_text()
+            text = re.sub(
+                r"\s+", " ", text
+            )  # Replace all whitespace with single spaces
+            text = text.strip()  # Trim leading and trailing whitespace
 
             print(text)
             # push to vector db
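The two lines added above normalize the scraped page text before it is chunked and embedded: every run of whitespace (spaces, tabs, newlines) collapses to a single space and the ends are trimmed. A standalone illustration of just that normalization:

import re

text = "  Pricing \n\n\t Plans start at $10/mo.   \n Contact us  "
text = re.sub(r"\s+", " ", text)  # Replace all whitespace with single spaces
text = text.strip()  # Trim leading and trailing whitespace

assert text == "Pricing Plans start at $10/mo. Contact us"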
@@ -83,7 +97,11 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
 
             docs = text_splitter.create_documents([text])
             embeddings = get_embeddings()
-            init_vector_store(docs, embeddings, StoreOptions(namespace="knowledgebase", metadata={"bot_id": bot_id}))
+            init_vector_store(
+                docs,
+                embeddings,
+                StoreOptions(namespace="knowledgebase", metadata={"bot_id": bot_id}),
+            )
             update_website_data_source_status_by_url(url=url, status="SUCCESS")
 
             if driver is not None:
@@ -96,14 +114,15 @@ def scrape_website_in_bfs(url: str, bot_id: str, unique_urls: Set[str], max_pages: int) -> int:
 
     return total_pages_scraped
 
+
 def get_web_driver():
     options = Options()
-    driver = webdriver.Remote(command_executor=selenium_grid_url, options=options)
+    driver = webdriver.Remote(command_executor=selenium_grid_url, options=options)
     driver.set_script_timeout(300)
-    driver.set_page_load_timeout(300)
+    driver.set_page_load_timeout(300)
     return driver
 
 
 @shared_task
 def web_crawl(url, bot_id: str):
     try:
@@ -113,8 +132,8 @@ def web_crawl(url, bot_id: str):
         scrape_website_in_bfs(url, bot_id, unique_urls, 5)
     except Exception as e:
         traceback.print_exc()
 
 
 @shared_task
 def resume_failed_website_scrape(website_data_source_id: str):
     """Resumes a failed website scrape.
@@ -134,4 +153,4 @@
 
     # Scrape the website.
     unique_urls: set = set()
-    scrape_website_in_bfs(url, website_data_source.bot_id, unique_urls, 1)
+    scrape_website_in_bfs(url, website_data_source.bot_id, unique_urls, 5)

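Both entry points are Celery shared tasks, and with this change a resumed scrape uses the same 5-page cap as a fresh crawl (previously it re-scraped only a single page). A usage sketch for callers, assuming the tasks are importable as workers.tasks.web_crawl and using placeholder IDs:

from workers.tasks.web_crawl import resume_failed_website_scrape, web_crawl

# Queue a breadth-first crawl of a site for a bot; the worker stops after max_pages (5) pages.
web_crawl.delay("https://docs.example.com", "bot-123")

# Retry a website data source whose earlier scrape failed.
resume_failed_website_scrape.delay("website-data-source-id")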