From 1aa3280316d87e1218a20a955d9e14d387d6ebd3 Mon Sep 17 00:00:00 2001 From: Ronald Luitwieler Date: Tue, 23 Jan 2024 15:49:37 +0100 Subject: [PATCH] crawler fails on content-type I have a website that returns the following header: 'Content-Type': 'text/html; charset=utf-8' --- .../layers/python-sdk/python/genai_core/websites/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py index 9e32539bd..e426b7383 100644 --- a/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py +++ b/lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py @@ -101,7 +101,7 @@ def parse_url(url: str): base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}" response = requests.get(url, timeout=20) - if response.headers["Content-Type"] != "text/html": + if "text/html" not in response.headers["Content-Type"]: raise Exception(f"Invalid content type {response.headers['Content-Type']}") soup = BeautifulSoup(response.content, "html.parser") content = soup.text