diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py index c78c7397c6452..faddf4dd205f7 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/knowledge_base/base.py @@ -5,7 +5,8 @@ class KnowledgeBaseWebReader(BaseReader): - """Knowledge base reader. + """ + Knowledge base reader. Crawls and reads articles from a knowledge base/help center with Playwright. Tested on Zendesk and Intercom CMS, may work on others. @@ -36,6 +37,7 @@ def __init__( title_selector: Optional[str] = None, subtitle_selector: Optional[str] = None, body_selector: Optional[str] = None, + max_depth: int = 100, ) -> None: """Initialize with parameters.""" self.root_url = root_url @@ -44,6 +46,7 @@ def __init__( self.title_selector = title_selector self.subtitle_selector = subtitle_selector self.body_selector = body_selector + self.max_depth = max_depth def load_data(self) -> List[Document]: """Load data from the knowledge base.""" @@ -54,9 +57,7 @@ def load_data(self) -> List[Document]: # Crawl article_urls = self.get_article_urls( - browser, - self.root_url, - self.root_url, + browser, self.root_url, self.root_url, self.max_depth ) # Scrape @@ -82,7 +83,8 @@ def scrape_article( browser: Any, url: str, ) -> Dict[str, str]: - """Scrape a single article url. + """ + Scrape a single article url. Args: browser (Any): a Playwright Chromium browser. @@ -125,9 +127,10 @@ def scrape_article( return {"title": title, "subtitle": subtitle, "body": body, "url": url} def get_article_urls( - self, browser: Any, root_url: str, current_url: str + self, browser: Any, root_url: str, current_url: str, max_depth: int = 100 ) -> List[str]: - """Recursively crawl through the knowledge base to find a list of articles. + """ + Recursively crawl through the knowledge base to find a list of articles. Args: browser (Any): a Playwright Chromium browser. @@ -158,7 +161,9 @@ def get_article_urls( for link in links: url = root_url + page.evaluate("(node) => node.getAttribute('href')", link) - article_urls.extend(self.get_article_urls(browser, root_url, url)) + article_urls.extend( + self.get_article_urls(browser, root_url, url, max_depth) + ) page.close() diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index ab4428f473a6b..333f4151f51db 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -45,7 +45,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.3.2" +version = "0.3.3" [tool.poetry.dependencies] python = ">=3.9,<4.0"