From 88d4aefa557fe4f34660d2dc22974703cd82aa3f Mon Sep 17 00:00:00 2001 From: needKVAS <43537033+needKVAS@users.noreply.github.com> Date: Tue, 15 Aug 2023 21:42:34 +0300 Subject: [PATCH 1/3] Create ranobelib.py --- sources/ru/ranobelib.py | 94 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 sources/ru/ranobelib.py diff --git a/sources/ru/ranobelib.py b/sources/ru/ranobelib.py new file mode 100644 index 000000000..385fd4a36 --- /dev/null +++ b/sources/ru/ranobelib.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +import logging +import json +import operator + +from lncrawl.core.crawler import Crawler + +logger = logging.getLogger(__name__) + + +class RanobeLibCrawler(Crawler): + base_url = [ + "https://ranobelib.me/", + ] + + def initialize(self): + self.init_executor(1) + + def read_novel_info(self): + clean_url = self.novel_url.split("?")[0].strip("/") + logger.debug("Visiting %s", self.novel_url) + soup = self.get_soup(f"{clean_url}?section=info") + + for script in soup.find_all("script"): + json_var = "window.__DATA__ = " + text = script.text.strip() + if not text or not text.startswith(json_var): + continue + text = text[len(json_var) : text.find("window._SITE_COLOR_")].strip( + ";\t\n " + ) + content = json.loads(text) + + self.novel_title = content["manga"]["rusName"] + logger.info("Novel title: %s", self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one(".media-sidebar__cover > img:nth-child(1)")["src"] + ) + logger.info("Novel cover: %s", self.novel_cover) + + self.novel_author = soup.select_one( + "div.media-info-list__item:nth-child(6) > div:nth-child(2) > a:nth-child(1)" + ).text.strip() + logger.info("Novel author: %s", self.novel_author) + + self.novel_synopsis = self.cleaner.extract_contents( + soup.find("div", {"class": "media-description__text"}) + ) + logger.info("Novel synopsis: %s", self.novel_synopsis) + + for tag in soup.find_all("a", {"class": "media-tag-item"}): + self.novel_tags.append(tag.text) + logger.info("Novel tags: %s", self.novel_tags) + + chapters = content["chapters"]["list"] + chapters.reverse() + chap_id = 0 + volumes_set = set() + + branches = dict() + for chapter in chapters: + key = chapter["branch_id"] + branches[key] = branches.setdefault(key, 0) + 1 + branch = max(branches.items(), key=operator.itemgetter(1))[0] + + for chapter in chapters: + if chapter["branch_id"] != branch: + continue + + chap_id = chap_id + 1 + chap_num = chapter["chapter_number"] + vol_id = chapter["chapter_volume"] + + if vol_id not in volumes_set: + volumes_set.add(vol_id) + self.volumes.append({"id": vol_id}) + + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "url": self.absolute_url( + f"{clean_url}/v{str(vol_id)}/c{chap_num}/" + ), + "title": chapter["chapter_name"] or (f"Глава {chap_num}"), + } + ) + + def download_chapter_body(self, chapter): + soup = self.get_soup(chapter["url"]) + contents = soup.select_one(".reader-container") + self.cleaner.clean_contents(contents) + return str(contents) From 3e978ee732c3c7907c6d66c14591ee1d9a3bc65e Mon Sep 17 00:00:00 2001 From: needKVAS <43537033+needKVAS@users.noreply.github.com> Date: Tue, 15 Aug 2023 21:48:32 +0300 Subject: [PATCH 2/3] Crawler name fix --- sources/ru/ranobelib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/ru/ranobelib.py b/sources/ru/ranobelib.py index 385fd4a36..d2ab67543 100644 --- a/sources/ru/ranobelib.py +++ b/sources/ru/ranobelib.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) -class RanobeLibCrawler(Crawler): +class RanobeLibMeCrawler(Crawler): base_url = [ "https://ranobelib.me/", ] From 538f89e81641657d991b91446266f237ff2c7d08 Mon Sep 17 00:00:00 2001 From: needKVAS <43537033+needKVAS@users.noreply.github.com> Date: Fri, 18 Aug 2023 21:13:51 +0300 Subject: [PATCH 3/3] Novel author fix --- sources/ru/ranobelib.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sources/ru/ranobelib.py b/sources/ru/ranobelib.py index d2ab67543..4aed4f10c 100644 --- a/sources/ru/ranobelib.py +++ b/sources/ru/ranobelib.py @@ -30,18 +30,24 @@ def read_novel_info(self): ";\t\n " ) content = json.loads(text) + break self.novel_title = content["manga"]["rusName"] logger.info("Novel title: %s", self.novel_title) - self.novel_cover = self.absolute_url( - soup.select_one(".media-sidebar__cover > img:nth-child(1)")["src"] - ) + possible_image = soup.select_one(".media-sidebar__cover > img:nth-child(1)") + if possible_image: + self.novel_cover = self.absolute_url(possible_image["src"]) logger.info("Novel cover: %s", self.novel_cover) - self.novel_author = soup.select_one( - "div.media-info-list__item:nth-child(6) > div:nth-child(2) > a:nth-child(1)" - ).text.strip() + for list_value in soup.find_all("div", {"class": "media-info-list__value"}): + possible_author_ref = list_value.find("a") + if not possible_author_ref: + continue + if "ranobelib.me/people" not in possible_author_ref["href"]: + continue + self.novel_author = possible_author_ref.text.strip() + break logger.info("Novel author: %s", self.novel_author) self.novel_synopsis = self.cleaner.extract_contents(