From 17d6da4cc55f6844fb6289659cd1c42d300ad9e3 Mon Sep 17 00:00:00 2001
From: Massimiliano Angelino
Date: Fri, 15 Dec 2023 12:37:33 +0100
Subject: [PATCH] fix(crawler): only process "text/html" Content-Type pages

---
 .../layers/python-sdk/python/genai_core/websites/crawler.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
index 7829ee4e5..9e32539bd 100644
--- a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
+++ b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
@@ -101,6 +101,8 @@ def parse_url(url: str):
     base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}"
 
     response = requests.get(url, timeout=20)
+    if response.headers["Content-Type"] != "text/html":
+        raise Exception(f"Invalid content type {response.headers['Content-Type']}")
     soup = BeautifulSoup(response.content, "html.parser")
     content = soup.text
     content = re.sub(r"[ \n]+", " ", content)