
Merge pull request #2485 from dipu-bd/dev
Dev
dipu-bd authored Oct 14, 2024
2 parents 7bee68e + 1eadb1e commit 09e7f0a
Showing 9 changed files with 726 additions and 458 deletions.
857 changes: 446 additions & 411 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/core/downloader.py
@@ -141,7 +141,7 @@ def _fetch_cover_image(app):
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception("Failed to download cover", e)

    if not cover_file.exists() and cover_file.is_file():
    if not cover_file.is_file():
        generate_cover_image(cover_file.as_posix())

    app.progress += 1
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions sources/en/n/novelrare.py
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
import logging

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)


class NovelrareCrawler(Crawler):
    base_url = "https://novelrare.com/"

    def read_novel_info(self):
        soup = self.get_soup(self.novel_url)

        possible_title = soup.select_one("#manga-title h1")
        if possible_title:
            self.novel_title = possible_title.get_text(strip=True)

        logger.info("Novel title: %s", self.novel_title)

        possible_synopsis = soup.select_one("div[aria-labelledby='manga-info'] p")
        if possible_synopsis:
            self.novel_synopsis = possible_synopsis.get_text()

        img_src = soup.select_one("div.summary_image img")
        if img_src:
            self.novel_cover = self.absolute_url(img_src["src"])

        chapters_table = soup.select_one("div.listing-chapters_wrap")
        for a in reversed(
            chapters_table.find_all("a", class_=lambda x: x != "c-new-tag")
        ):
            chap_id = 1 + (len(self.chapters))

            self.chapters.append(
                {
                    "id": chap_id,
                    "title": a.text.strip(),
                    "url": self.absolute_url(a['href'])
                }
            )

    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter["url"])
        content = soup.select_one("div.text-left")
        return self.cleaner.extract_contents(content)
55 changes: 23 additions & 32 deletions sources/en/t/teanovel.py
@@ -11,7 +11,12 @@


class TeaNovelCrawler(Crawler):
    base_url = ["https://www.teanovel.com/", "https://www.teanovel.net/"]
    base_url = "https://www.teanovel.com"

    def initialize(self):
        self.init_executor(
            workers=4
        )

    def read_novel_info(self):
        soup = self.get_soup(self.novel_url)
@@ -22,40 +27,26 @@ def read_novel_info(self):

        next_data = json.loads(script_tag.get_text())

        build_id = next_data["buildId"]
        novel_data = next_data["props"]["pageProps"]["novelData"]["novel"]
        novel_data = next_data["props"]["pageProps"]["novel"]

        self.novel_title = novel_data["name"]
        self.novel_author = novel_data["author"]

        # img_tag = soup.select_one("main img[src*='_next/']")
        # if isinstance(img_tag, Tag):
        #     self.novel_cover = self.absolute_url(img_tag["src"])

        slug = novel_data["slug"]

        toc_url = f"{self.home_url}api/chapters/{slug}?slug={slug}&orderBy=asc"
        toc_json = self.get_json(toc_url)

        while True:
            for chapter in toc_json["data"]:
                chapter_id = len(self.chapters) + 1
                self.chapters.append(
                    {
                        "id": chapter_id,
                        "title": f"Chapter {chapter_id}: {chapter['title']}",
                        "url": (
                            f"{self.home_url}_next/data/{build_id}/novel/{slug}/{chapter['slug']}.json"
                        ),
                    }
                )
            if "nextId" in toc_json:
                toc_json = self.get_json(toc_url + f"&nextId={toc_json['nextId']}")
            else:
                break
        img_tag = soup.select_one("main img[src*='_next/']")
        if isinstance(img_tag, Tag):
            self.novel_cover = self.absolute_url(img_tag["src"])

        chapters = self.get_soup(self.novel_url + "/chapter-list").select("a.border-b")
        for chapter in chapters:
            chapter_id = len(self.chapters) + 1
            self.chapters.append(
                {
                    "id": chapter_id,
                    "title": chapter.select_one("p").get_text(strip=True),
                    "url": self.absolute_url(chapter["href"]),
                }
            )

    def download_chapter_body(self, chapter):
        chapter_json = self.get_json(chapter["url"])
        chapter_data = chapter_json["pageProps"]["chapterData"]

        return chapter_data["content"].replace("\n", "<br>")
        chapter = self.get_soup(chapter["url"])
        return self.cleaner.extract_contents(chapter.select_one("div.prose"))
129 changes: 129 additions & 0 deletions sources/zh/27k.py
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
import logging
from bs4 import Tag
from lncrawl.core.crawler import Crawler
import urllib.parse

from lncrawl.models import Volume, Chapter

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) Gecko/20100101 Firefox/101.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
    "application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, utf-8",
    "Accept-Language": "en-US,en;q=0.9,de-CH;q=0.8,de;q=0.7",
    "Cache-Control": "no-cache",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "https://so.27k.net/",
    "DNT": "1",
    "Referer": "https://so.27k.net/search",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Opera GX";v="106"',
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
}

logger = logging.getLogger(__name__)
search_url = "https://so.27k.net/search/"


class LeYueDu(Crawler):
    base_url = [
        "https://so.27k.net",
        "https://www.27k.net",
        "https://m.27k.net",
        "https://tw.27k.net",
        "https://www.lreads.com",
    ]

    def initialize(self):
        self.init_parser("html.parser")
        self.init_executor(ratelimit=20)

    def search_novel(self, query):
        query = urllib.parse.quote(query)
        data = f"searchkey={query}&searchtype=all"
        soup = self.post_soup(
            search_url,
            headers=headers,
            data=data,
        )

        results = []
        for novel in soup.select("div.newbox ul li"):
            results.append(
                {
                    "title": novel.select_one("h3 a:not([imgbox])").text.title(),
                    "url": self.absolute_url(novel.select_one("h3 a")["href"]),
                    "info": "Latest: %s" % novel.select_one("div.zxzj p").text.replace("最近章节", "").replace("最近章節", ""),
                }
            )

        return results

    def read_novel_info(self):
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        possible_title = soup.select_one("div.booknav2 h1 a")
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = soup.select_one("div.bookimg2 img")
        if isinstance(possible_image, Tag):
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel cover: %s", self.novel_cover)

        possible_author = soup.select_one('.booknav2 p a[href*="/author/"]')
        if isinstance(possible_author, Tag):
            self.novel_author = possible_author.text.strip()
        logger.info("Novel Author: %s", self.novel_author)

        # Only one category per novel on this website
        possible_tag = soup.select_one('.booknav2 p a[href*="/sort/"]')
        if isinstance(possible_tag, Tag):
            self.novel_tags = [possible_tag.text.strip()]
        logger.info("Novel Tag: %s", self.novel_tags)

        # https://www.27k.net/book/70154.html -> https://www.27k.net/read/70154/
        soup = self.get_soup(self.novel_url.replace("/book/", "/read/").replace(".html", "/"))

        # Synopsis in chapter list page is cleaner than in book info page
        possible_synopsis = soup.select_one("div.newnav .ellipsis_2")
        if isinstance(possible_synopsis, Tag):
            self.novel_synopsis = possible_synopsis.text.strip()
        logger.info("Novel Synopsis: %s", self.novel_synopsis)

        chapter_list = soup.select("div#catalog ul li")

        for item in chapter_list:
            chap_id = len(self.chapters) + 1
            vol_id = len(self.chapters) // 100 + 1
            if len(self.chapters) % 100 == 0:
                self.volumes.append(Volume(vol_id))
            a = item.a["href"]
            self.chapters.append(
                Chapter(
                    chap_id,
                    url=self.absolute_url(a),
                    title=item.a.text.strip(),
                    volume=vol_id,
                )
            )

    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter.url)

        contents = soup.select_one("div.txtnav")
        contents.select_one("h1").decompose()
        contents.select_one("div.txtinfo").decompose()
        contents.select_one("div#txtright").decompose()
        for elem in contents.select("div.baocuo"):
            elem.decompose()

        return self.cleaner.extract_contents(contents)
16 changes: 10 additions & 6 deletions sources/zh/69shuba.cx.py
@@ -28,12 +28,13 @@
}

logger = logging.getLogger(__name__)
search_url = "https://69shuba.cx/modules/article/search.php"
search_url = "%s/modules/article/search.php"


class sixnineshu(Crawler):
    base_url = [
        "https://69shuba.cx"
        "https://69shuba.cx",
        "https://69shu.me",
    ]

    def initialize(self):
@@ -43,8 +44,11 @@ def initialize(self):
    def search_novel(self, query):
        query = urllib.parse.quote(query.encode("gbk"))
        data = f"searchkey={query}&searchtype=all"
        soup = self.get_soup(
            search_url,
        headers["Origin"] = self.home_url
        headers["Referer"] = search_url % self.home_url

        soup = self.post_soup(
            search_url % self.home_url,
            headers=headers,
            data=data,
            encoding="gbk",
@@ -55,8 +59,8 @@ def search_novel(self, query):
            results.append(
                {
                    "title": novel.select_one("h3 a:not([imgbox])").text.title(),
                    "url": self.absolute_url(novel.select_one("h3 a.imgbox")["href"]),
                    "info": "Latest: %s" % novel.select_one("div.zxzj p").text,
                    "url": self.absolute_url(novel.select_one("h3 a")["href"]),
                    "info": "Latest: %s" % novel.select_one("div.zxzj p").text.replace("最近章节", ""),
                }
            )

24 changes: 18 additions & 6 deletions sources/zh/ddxsss.py
@@ -13,13 +13,12 @@
class DdxSss(Crawler):
    base_url = [
        "https://www.ddxss.cc/",
    ]
    # custom banned text as it's all loose and the cleaner deletes the whole chapter if used in bad_text_*
    banned_text = [
        "请收藏本站:https://www.ddxsss.com。顶点小说手机版:https://m.ddxsss.com",
        "https://www.ddtxt8.cc/",
    ]

    def initialize(self):
        self.init_executor(ratelimit=20)

        # the default lxml parser cannot handle the huge gbk encoded sites (fails after 4.3k chapters)
        self.init_parser("html.parser")
        self.cleaner.bad_tags.update(["script", "a"])
@@ -28,6 +27,19 @@ def initialize(self):
            "div.Readpage.pagedown",
        ])

        # p tags should only show up after being parsed and formatted the first time
        self.cleaner.bad_tag_text_pairs["p"] = [
            "请收藏本站:",
            "顶点小说手机版:",
            "您可以在百度里搜索",
            "最新章节地址:",
            "全文阅读地址:",
            "txt下载地址:",
            "手机阅读:",
            '为了方便下次阅读,你可以点击下方的"收藏"记录本次',
            "请向你的朋友(QQ、博客、微信等方式)推荐本书,谢谢您的支持!!",
        ]

    def search_novel(self, query):
        data = self.get_json(
            f"{self.home_url}user/search.html?q={query}",
@@ -105,9 +117,9 @@ def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter.url, encoding="utf-8")
        contents = soup.select_one("div#chaptercontent")
        text = self.cleaner.extract_contents(contents)
        for bad_text in self.banned_text:
            text = text.replace(bad_text, "")
        # chapter title is usually present but without space between chapter X and the title
        text = text.replace(chapter.title, "")
        text = text.replace(chapter.title.replace(" ", ""), "")
        # remove paragraphs with bad text after parsing linebreaks
        text = self.cleaner.extract_contents(self.make_soup(text))
        return text
