From cbc214da4780050ba01ec5e19fd509a156257188 Mon Sep 17 00:00:00 2001 From: zerty Date: Sun, 2 Jul 2023 11:26:53 +0200 Subject: [PATCH 1/5] Rewrite scribblehub.com with browser issues: #1942 #1896 #1788 --- sources/en/s/scribblehub.py | 179 ++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 89 deletions(-) diff --git a/sources/en/s/scribblehub.py b/sources/en/s/scribblehub.py index 16056fbd8..aab060c80 100644 --- a/sources/en/s/scribblehub.py +++ b/sources/en/s/scribblehub.py @@ -1,22 +1,32 @@ # -*- coding: utf-8 -*- + import logging -from math import ceil -from urllib.parse import quote +import re + +from typing import Generator, Union -from bs4 import Tag +from bs4 import BeautifulSoup, Tag -from lncrawl.core.crawler import Crawler +from lncrawl.models import Chapter, SearchResult, Volume +from lncrawl.templates.browser.searchable import SearchableBrowserTemplate +from lncrawl.core.exeptions import FallbackToBrowser + +from urllib.parse import urljoin, quote_plus logger = logging.getLogger(__name__) -chapter_post_url = "https://www.scribblehub.com/wp-admin/admin-ajax.php" + +digit_regex = re.compile(r"\?toc=(\d+)#content1$") -class ScribbleHubCrawler(Crawler): +class ScribbleHubCrawler(SearchableBrowserTemplate): base_url = [ "https://www.scribblehub.com/", "https://scribblehub.com/", ] + has_manga = False + has_mtl = False + def initialize(self) -> None: self.cleaner.bad_css.update( [ @@ -41,92 +51,83 @@ def initialize(self) -> None: ] ) - def search_novel(self, query): - url = f"{self.home_url}?s={quote(query)}&post_type=fictionposts" - soup = self.get_soup(url) - - results = [] - for novel in soup.select("div.search_body"): - a = novel.select_one(".search_title a") - info = novel.select_one(".search_stats") - if not isinstance(a, Tag): - continue - - results.append( - { - "title": a.text.strip(), - "url": self.absolute_url(a["href"]), - "info": info.text.strip() if isinstance(info, Tag) else "", - } + def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]: + self.visit( + urljoin( + self.home_url, "/?s={}&post_type=fictionposts".format(quote_plus(query)) ) - - return results - - def read_novel_info(self): - soup = self.get_soup(self.novel_url) - - possible_title = soup.select_one("div.fic_title") - assert isinstance(possible_title, Tag) - self.novel_title = str(possible_title["title"]).strip() - logger.info("Novel title: %s", self.novel_title) - - possible_image = soup.find("div", {"class": "fic_image"}) - if isinstance(possible_image, Tag): - possible_image = possible_image.find("img") - if isinstance(possible_image, Tag): - self.novel_cover = self.absolute_url(possible_image["src"]) - logger.info("Novel cover: %s", self.novel_cover) - - possible_author = soup.find("span", {"class": "auth_name_fic"}) - if isinstance(possible_author, Tag): - self.novel_author = possible_author.text.strip() - logger.info("Novel author: %s", self.novel_author) - - chapter_count = soup.find("span", {"class": "cnt_toc"}) - chapter_count = ( - int(chapter_count.text) if isinstance(chapter_count, Tag) else -1 ) - page_count = ceil(chapter_count / 15.0) - logger.info("Chapter list pages: %d" % page_count) - - possible_mypostid = soup.select_one("input#mypostid") - assert isinstance(possible_mypostid, Tag) - mypostid = int(str(possible_mypostid["value"])) - logger.info("#mypostid = %d", mypostid) - - response = self.submit_form( - f"{self.home_url}wp-admin/admin-ajax.php", - { - "action": "wi_getreleases_pagination", - "pagenum": -1, - "mypostid": mypostid, - }, + self.browser.wait(".search") + for a in self.browser.soup.select( + ".fic .search_main_box .search_body .search_title a" + ): + print(a) + for elem in self.browser.soup.select( + ".fic .search_main_box .search_body .search_title a" + ): + yield elem + + def select_search_items(self, query: str) -> Generator[Tag, None, None]: + raise FallbackToBrowser() + + def parse_search_item(self, tag: Tag) -> SearchResult: + return SearchResult( + title=tag.text.strip(), + url=self.absolute_url(tag["href"]), ) - soup = self.make_soup(response) - for chapter in reversed(soup.select(".toc_ol a.toc_a")): - self.chapters.append( - { - "id": len(self.chapters) + 1, - "url": self.absolute_url(str(chapter["href"])), - "title": chapter.text.strip(), - } + def visit_novel_page_in_browser(self) -> BeautifulSoup: + self.visit(self.novel_url) + self.browser.wait(".fictionposts-template-default") + + def parse_title(self, soup: BeautifulSoup) -> str: + tag = soup.select_one(".fic_title") + assert tag + return tag.text.strip() + + def parse_cover(self, soup: BeautifulSoup) -> str: + tag = soup.select_one(".fic_image img") + assert tag + if tag.has_attr("data-src"): + return self.absolute_url(tag["data-src"]) + elif tag.has_attr("src"): + return self.absolute_url(tag["src"]) + + def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]: + for a in soup.select(".nauth_name_fic"): + yield a.text.strip() + + def parse_chapter_list_in_browser( + self, + ) -> Generator[Union[Chapter, Volume], None, None]: + _pages = max( + [ + int(digit_regex.search(a["href"]).group(1)) + for a in self.browser.soup.select(".simple-pagination a") + if digit_regex.search(a["href"]) is not None + ] + ) + if not _pages: + _page = 1 + tags = self.browser.soup.select(".main .toc li a") + for i in range(2, _pages + 1): + self.browser.visit(urljoin(self.novel_url, f"?toc={i}#content1")) + self.browser.wait(".main") + tags += self.browser.soup.select(".main .toc li a") + + for _id, _t in enumerate(reversed(tags)): + yield Chapter( + id=_id, url=self.absolute_url(_t.get("href")), title=_t.text.strip() ) - def download_chapter_body(self, chapter): - soup = self.get_soup(chapter["url"]) - contents = soup.select_one("div#chp_raw") - self.cleaner.clean_contents(contents) - body = str(contents) - body += """ - """ - return body + def parse_chapter_list( + self, soup: BeautifulSoup + ) -> Generator[Union[Chapter, Volume], None, None]: + pass + + def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: + self.visit(chapter.url) + self.browser.wait(".site-content-contain") + + def select_chapter_body(self, soup: BeautifulSoup) -> Tag: + return soup.select_one("div#chp_raw") From b034da0e617a814fbd5ad69395e5ffaafa0fc65a Mon Sep 17 00:00:00 2001 From: zerty Date: Sun, 2 Jul 2023 11:28:54 +0200 Subject: [PATCH 2/5] Update scribblehub.py --- sources/en/s/scribblehub.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sources/en/s/scribblehub.py b/sources/en/s/scribblehub.py index aab060c80..3389ee438 100644 --- a/sources/en/s/scribblehub.py +++ b/sources/en/s/scribblehub.py @@ -58,10 +58,6 @@ def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, Non ) ) self.browser.wait(".search") - for a in self.browser.soup.select( - ".fic .search_main_box .search_body .search_title a" - ): - print(a) for elem in self.browser.soup.select( ".fic .search_main_box .search_body .search_title a" ): @@ -108,7 +104,7 @@ def parse_chapter_list_in_browser( ] ) if not _pages: - _page = 1 + _pages = 1 tags = self.browser.soup.select(".main .toc li a") for i in range(2, _pages + 1): self.browser.visit(urljoin(self.novel_url, f"?toc={i}#content1")) From 1f8be72f83879eccd94b11b1905d5c0c6e5873d7 Mon Sep 17 00:00:00 2001 From: zerty Date: Wed, 5 Jul 2023 19:32:11 +0200 Subject: [PATCH 3/5] Missing implementation in scribblehub --- sources/en/s/scribblehub.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/sources/en/s/scribblehub.py b/sources/en/s/scribblehub.py index 3389ee438..7b67244e3 100644 --- a/sources/en/s/scribblehub.py +++ b/sources/en/s/scribblehub.py @@ -11,6 +11,7 @@ from lncrawl.templates.browser.searchable import SearchableBrowserTemplate from lncrawl.core.exeptions import FallbackToBrowser +from math import ceil from urllib.parse import urljoin, quote_plus logger = logging.getLogger(__name__) @@ -119,7 +120,32 @@ def parse_chapter_list_in_browser( def parse_chapter_list( self, soup: BeautifulSoup ) -> Generator[Union[Chapter, Volume], None, None]: - pass + chapter_count = soup.find("span", {"class": "cnt_toc"}) + chapter_count = ( + int(chapter_count.text) if isinstance(chapter_count, Tag) else -1 + ) + page_count = ceil(chapter_count / 15.0) + + possible_mypostid = soup.select_one("input#mypostid") + assert isinstance(possible_mypostid, Tag) + mypostid = int(str(possible_mypostid["value"])) + logger.info("#mypostid = %d", mypostid) + + response = self.submit_form( + f"{self.home_url}wp-admin/admin-ajax.php", + { + "action": "wi_getreleases_pagination", + "pagenum": -1, + "mypostid": mypostid, + }, + ) + soup = self.make_soup(response) + for chapter in reversed(soup.select(".toc_ol a.toc_a")): + yield Chapter( + id= len(self.chapters) + 1, + url=self.absolute_url(str(chapter["href"])), + title=chapter.text.strip(), + ) def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: self.visit(chapter.url) From 2d0893a1350fbef6555146730f55175e06495e3d Mon Sep 17 00:00:00 2001 From: zerty Date: Wed, 5 Jul 2023 19:36:00 +0200 Subject: [PATCH 4/5] Small fix for SH --- sources/en/s/scribblehub.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sources/en/s/scribblehub.py b/sources/en/s/scribblehub.py index 7b67244e3..c3a0e425a 100644 --- a/sources/en/s/scribblehub.py +++ b/sources/en/s/scribblehub.py @@ -124,7 +124,6 @@ def parse_chapter_list( chapter_count = ( int(chapter_count.text) if isinstance(chapter_count, Tag) else -1 ) - page_count = ceil(chapter_count / 15.0) possible_mypostid = soup.select_one("input#mypostid") assert isinstance(possible_mypostid, Tag) @@ -142,10 +141,10 @@ def parse_chapter_list( soup = self.make_soup(response) for chapter in reversed(soup.select(".toc_ol a.toc_a")): yield Chapter( - id= len(self.chapters) + 1, - url=self.absolute_url(str(chapter["href"])), - title=chapter.text.strip(), - ) + id=len(self.chapters) + 1, + url=self.absolute_url(str(chapter["href"])), + title=chapter.text.strip(), + ) def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: self.visit(chapter.url) From f5f885350f9ae43477e250ff591127a811212c9f Mon Sep 17 00:00:00 2001 From: zerty Date: Wed, 5 Jul 2023 19:51:32 +0200 Subject: [PATCH 5/5] Update scribblehub.py --- sources/en/s/scribblehub.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sources/en/s/scribblehub.py b/sources/en/s/scribblehub.py index c3a0e425a..1b9144e24 100644 --- a/sources/en/s/scribblehub.py +++ b/sources/en/s/scribblehub.py @@ -11,7 +11,6 @@ from lncrawl.templates.browser.searchable import SearchableBrowserTemplate from lncrawl.core.exeptions import FallbackToBrowser -from math import ceil from urllib.parse import urljoin, quote_plus logger = logging.getLogger(__name__)