fix: prevent infinite recursion in get_article_urls #17360

Merged 2 commits on Dec 24, 2024
Changes from all commits

@@ -5,7 +5,8 @@
 class KnowledgeBaseWebReader(BaseReader):
-    """Knowledge base reader.
+    """
+    Knowledge base reader.
 
     Crawls and reads articles from a knowledge base/help center with Playwright.
     Tested on Zendesk and Intercom CMS, may work on others.
 
@@ -36,6 +37,7 @@ def __init__(
         title_selector: Optional[str] = None,
         subtitle_selector: Optional[str] = None,
         body_selector: Optional[str] = None,
+        max_depth: int = 100,
     ) -> None:
         """Initialize with parameters."""
         self.root_url = root_url
 
@@ -44,6 +46,7 @@ def __init__(
         self.title_selector = title_selector
         self.subtitle_selector = subtitle_selector
         self.body_selector = body_selector
+        self.max_depth = max_depth
 
     def load_data(self) -> List[Document]:
         """Load data from the knowledge base."""
 
@@ -54,9 +57,7 @@
 
         # Crawl
         article_urls = self.get_article_urls(
-            browser,
-            self.root_url,
-            self.root_url,
+            browser, self.root_url, self.root_url, self.max_depth
         )
 
         # Scrape
 
@@ -82,7 +83,8 @@ def scrape_article(
         browser: Any,
         url: str,
     ) -> Dict[str, str]:
-        """Scrape a single article url.
+        """
+        Scrape a single article url.
 
         Args:
             browser (Any): a Playwright Chromium browser.
 
@@ -125,9 +127,10 @@ def scrape_article(
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}
 
     def get_article_urls(
-        self, browser: Any, root_url: str, current_url: str
+        self, browser: Any, root_url: str, current_url: str, max_depth: int = 100

    [Review thread on the new signature]
    jzhao62 (Contributor), Dec 24, 2024:
        should we track the url we visited so we do not visit it again, instead of hardcoding depth?
    Author (Member):
        I needed a quick fix but yeah, that would be a better solution, feel free to open a PR!
    (A sketch of the suggested visited-set approach follows this file's diff.)

     ) -> List[str]:
-        """Recursively crawl through the knowledge base to find a list of articles.
+        """
+        Recursively crawl through the knowledge base to find a list of articles.
 
         Args:
             browser (Any): a Playwright Chromium browser.
 
@@ -158,7 +161,9 @@ def get_article_urls(

         for link in links:
             url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
-            article_urls.extend(self.get_article_urls(browser, root_url, url))
+            article_urls.extend(
+                self.get_article_urls(browser, root_url, url, max_depth)
+            )
 
         page.close()
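
As promised in the review thread above, here is a minimal sketch of the reviewer's suggested alternative: tracking visited URLs instead of capping recursion depth. This is not the merged implementation; the `visited` parameter name is hypothetical, the link-collection selector is a placeholder for the reader's real link selectors, and the body is abridged to the termination logic.

    from typing import Any, List, Optional, Set

    # Sketched as a method of KnowledgeBaseWebReader.
    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str,
        visited: Optional[Set[str]] = None,
    ) -> List[str]:
        # A visited set guarantees each URL is crawled at most once, so the
        # recursion terminates even on cyclic link graphs, with no depth cap.
        if visited is None:
            visited = set()
        if current_url in visited:
            return []
        visited.add(current_url)

        page = browser.new_page()
        page.goto(current_url)

        article_urls: List[str] = []
        # ... collect article links as in the original method; "a" below is a
        # placeholder selector ...
        for link in page.query_selector_all("a"):
            url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
            article_urls.extend(
                self.get_article_urls(browser, root_url, url, visited)
            )

        page.close()
        return article_urls

Passing the same set down every recursive call (rather than a fresh one) is what makes this work: siblings and descendants all share one record of where the crawl has been.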
The second file in the diff bumps the package version (pyproject.toml):

@@ -45,7 +45,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.3.2"
+version = "0.3.3"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
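For callers, the practical effect of this change is a new knob on the reader. A minimal usage sketch, assuming the package's public import path; the URL and selector values are hypothetical, and link_selectors and article_path are existing constructor arguments of the reader that this diff does not touch:

    from llama_index.readers.web import KnowledgeBaseWebReader

    reader = KnowledgeBaseWebReader(
        root_url="https://support.example.com",  # hypothetical help center
        link_selectors=[".article-list a"],      # hypothetical CSS selectors
        article_path="/articles",                # hypothetical article path
        max_depth=10,  # new in 0.3.3: caps crawl recursion (default 100)
    )
    documents = reader.load_data()

Presumably branches deeper than max_depth are simply cut off rather than raising, so a too-small value can silently drop articles; the default of 100 keeps existing behavior except on the link cycles this PR guards against.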