Skip to content

Commit

Permalink
Merge pull request #2040 from jere344/dev-7
Browse files: browse the repository at this point in the history
Use chardet by default to find the encoding
  • Loading branch information
dipu-bd authored Aug 27, 2023
2 parents 78f5356 + 2510f58 commit 6e820d6
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 8 deletions.
12 changes: 6 additions & 6 deletions lncrawl/core/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def submit_form_json(
)
return response.json()

def get_soup(self, url, headers={}, parser=None, **kwargs) -> BeautifulSoup:
def get_soup(self, url, headers={}, parser=None, encoding=None, **kwargs) -> BeautifulSoup:
"""Fetch the content and return a BeautifulSoup instance of the page"""
headers = CaseInsensitiveDict(headers)
headers.setdefault(
Expand All @@ -294,10 +294,10 @@ def get_soup(self, url, headers={}, parser=None, **kwargs) -> BeautifulSoup:
)
response = self.get_response(url, **kwargs)
self.last_soup_url = url
return self.make_soup(response, parser)
return self.make_soup(response, parser, encoding)

def post_soup(
self, url, data={}, headers={}, parser=None, **kwargs
self, url, data={}, headers={}, parser=None, encoding=None, **kwargs
) -> BeautifulSoup:
"""Make a POST request and return BeautifulSoup instance of the response"""
headers = CaseInsensitiveDict(headers)
Expand All @@ -306,10 +306,10 @@ def post_soup(
"text/html,application/xhtml+xml,application/xml;q=0.9",
)
response = self.post_response(url, data=data, headers=headers, **kwargs)
return self.make_soup(response, parser)
return self.make_soup(response, parser, encoding)

def submit_form_for_soup(
self, url, data={}, headers={}, multipart=False, parser=None, **kwargs
self, url, data={}, headers={}, multipart=False, parser=None, encoding=None, **kwargs
) -> BeautifulSoup:
"""Simulate submit form request and return a BeautifulSoup instance of the response"""
headers = CaseInsensitiveDict(headers)
Expand All @@ -320,4 +320,4 @@ def submit_form_for_soup(
response = self.submit_form(
url, data=data, headers=headers, multipart=multipart, **kwargs
)
return self.make_soup(response, parser)
return self.make_soup(response, parser, encoding)
11 changes: 9 additions & 2 deletions lncrawl/core/soup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from .exeptions import LNException

import chardet

logger = logging.getLogger(__name__)


Expand All @@ -33,11 +35,16 @@ def make_soup(
self,
data: Union[Response, bytes, str],
parser: Optional[str] = None,
encoding: Optional[str] = None,
) -> BeautifulSoup:
if isinstance(data, Response):
html = data.content.decode("utf8", "ignore")
if encoding is None:
encoding = chardet.detect(data.content)["encoding"]
html = data.content.decode(encoding, "ignore")
elif isinstance(data, bytes):
html = data.decode("utf8", "ignore")
if encoding is None:
encoding = chardet.detect(data)["encoding"]
html = data.decode(encoding, "ignore")
elif isinstance(data, str):
html = str(data)
else:
Expand Down

0 comments on commit 6e820d6

Please sign in to comment.