Add an optional `encoding` argument to the soup helpers.

get_soup, post_soup and submit_form_for_soup accept `encoding` and forward it
to make_soup. When no encoding is given, make_soup asks chardet to detect one
from the raw bytes. chardet reports None when it cannot guess (for example on
an empty body), so make_soup falls back to utf8 in that case instead of
passing None to bytes.decode, which would raise TypeError.

NOTE(review): the soup.py post-image hash on its index line is carried over
from the originally generated patch and does not reflect the utf8 fallback.

diff --git a/lncrawl/core/scraper.py b/lncrawl/core/scraper.py
index 540df8d87..afd9c0c64 100644
--- a/lncrawl/core/scraper.py
+++ b/lncrawl/core/scraper.py
@@ -285,7 +285,7 @@ def submit_form_json(
         )
         return response.json()
 
-    def get_soup(self, url, headers={}, parser=None, **kwargs) -> BeautifulSoup:
+    def get_soup(self, url, headers={}, parser=None, encoding=None, **kwargs) -> BeautifulSoup:
         """Fetch the content and return a BeautifulSoup instance of the page"""
         headers = CaseInsensitiveDict(headers)
         headers.setdefault(
@@ -294,10 +294,10 @@ def get_soup(self, url, headers={}, parser=None, **kwargs) -> BeautifulSoup:
         )
         response = self.get_response(url, **kwargs)
         self.last_soup_url = url
-        return self.make_soup(response, parser)
+        return self.make_soup(response, parser, encoding)
 
     def post_soup(
-        self, url, data={}, headers={}, parser=None, **kwargs
+        self, url, data={}, headers={}, parser=None, encoding=None, **kwargs
     ) -> BeautifulSoup:
         """Make a POST request and return BeautifulSoup instance of the response"""
         headers = CaseInsensitiveDict(headers)
@@ -306,10 +306,10 @@ def post_soup(
             "text/html,application/xhtml+xml,application/xml;q=0.9",
         )
         response = self.post_response(url, data=data, headers=headers, **kwargs)
-        return self.make_soup(response, parser)
+        return self.make_soup(response, parser, encoding)
 
     def submit_form_for_soup(
-        self, url, data={}, headers={}, multipart=False, parser=None, **kwargs
+        self, url, data={}, headers={}, multipart=False, parser=None, encoding=None, **kwargs
     ) -> BeautifulSoup:
         """Simulate submit form request and return a BeautifulSoup instance of the response"""
         headers = CaseInsensitiveDict(headers)
@@ -320,4 +320,4 @@ def submit_form_for_soup(
         response = self.submit_form(
             url, data=data, headers=headers, multipart=multipart, **kwargs
         )
-        return self.make_soup(response, parser)
+        return self.make_soup(response, parser, encoding)
diff --git a/lncrawl/core/soup.py b/lncrawl/core/soup.py
index db5e0dd1d..12afb8f37 100644
--- a/lncrawl/core/soup.py
+++ b/lncrawl/core/soup.py
@@ -7,6 +7,8 @@
 
 from .exeptions import LNException
 
+import chardet
+
 logger = logging.getLogger(__name__)
 
 
@@ -33,11 +35,18 @@ def make_soup(
         self,
         data: Union[Response, bytes, str],
         parser: Optional[str] = None,
+        encoding: Optional[str] = None,
     ) -> BeautifulSoup:
         if isinstance(data, Response):
-            html = data.content.decode("utf8", "ignore")
+            if encoding is None:
+                # chardet reports None when it cannot guess (e.g. empty body)
+                encoding = chardet.detect(data.content)["encoding"] or "utf8"
+            html = data.content.decode(encoding, "ignore")
         elif isinstance(data, bytes):
-            html = data.decode("utf8", "ignore")
+            if encoding is None:
+                # same utf8 fallback for byte strings chardet cannot classify
+                encoding = chardet.detect(data)["encoding"] or "utf8"
+            html = data.decode(encoding, "ignore")
         elif isinstance(data, str):
             html = str(data)
         else: