Skip to content

Commit

Permalink
use sitemap for darkerprojects
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Mar 25, 2024
1 parent 0eae99e commit acacdee
Showing 1 changed file with 26 additions and 34 deletions.
60 changes: 26 additions & 34 deletions audiobooker/scrappers/darkerprojects.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import requests
from sitemapparser import SiteMapParser

from audiobooker import AudioBook
from audiobooker.scrappers import AudioBookSource

Expand All @@ -9,6 +11,8 @@ class DarkerProjectsAudioBook(AudioBook):
def parse_page(self):
streams = []
for url in self.soup.find_all("a"):
if not url.get("href"):
continue
if url["href"].endswith(".mp3"):
if url["href"] not in streams:
streams.append(url["href"])
Expand All @@ -33,25 +37,6 @@ def __repr__(self):

class DarkerProjects(AudioBookSource):
base_url = "http://darkerprojects.com"
_tag_pages = {'Autumn': 'http://darkerprojects.com/autumn/',
"Batman: No Man's Land": 'http://darkerprojects.com/batman-no-mans-land/',
'Behind The Scenes': 'http://darkerprojects.com/behind-the-scenes/',
'Dark Matter': 'http://darkerprojects.com/dark-matter/',
'Darker Projects: Uncovered': 'http://darkerprojects.com/dp-uncovered/',
'Doctor Who': 'http://darkerprojects.com/doctor-who/',
'Five Minute Fears': 'http://darkerprojects.com/five-minute-fears/',
'He-Man: The Parody': 'http://darkerprojects.com/he-man-the-parody/',
'Madness': 'http://darkerprojects.com/madness/',
'Night Terrors': 'http://darkerprojects.com/night-terrors/',
'Other Voices': 'http://darkerprojects.com/other-voices/',
'Outer Limits': 'http://darkerprojects.com/outer-limits/',
'Quantum Leap': 'http://darkerprojects.com/quantum-leap/',
'Quantum Retribution': 'http://darkerprojects.com/quantum-retribution/',
'Star Trek: Lost Frontier': 'http://darkerprojects.com/lostfrontier/',
'Star Trek: Section 31': 'http://darkerprojects.com/section31/',
'Tales From The Museum': 'http://darkerprojects.com/tales-from-the-museum/',
'Tales From The Museum: The Beginning': 'http://darkerprojects.com/tales-from-the-museum-the-beginning/',
'The Falcon Banner': 'http://darkerprojects.com/the-falcon-banner/'}

@classmethod
def _parse_page(cls, html, limit=-1):
Expand Down Expand Up @@ -89,13 +74,14 @@ def scrap_popular(cls, limit=-1, offset=0):

@classmethod
def scrap_tags(cls):
# Build and return a dict mapping a category (tag) title to its page URL.
# NOTE(review): this listing is a flattened diff view (indentation and +/-
# markers are lost), so two bodies appear back to back. Presumably the
# requests/soup lines are the removed side and the sitemap lines the added
# side - confirm against the repository before relying on either.
#
# First body: scrape the page served at cls.base_url.
html = requests.get(cls.base_url).text
soup = cls._get_soup(html)
# Category links are looked up inside the div with class "widget-area".
collections = soup.find("div", {"class": "widget-area"})
for ul in collections.find_all("li"):
# Despite the name, `ul` is one <li>; take the first <a> inside it.
a = ul.find("a")
# link text -> link target, stored on cls._tag_pages
cls._tag_pages[a.text] = a["href"]
return cls._tag_pages
# Second body: collect the category URLs from a sitemap into a fresh dict.
bucket = {}
sm = SiteMapParser('https://darkerprojects.com/wp-sitemap-taxonomies-category-1.xml') # explicit category-taxonomy sitemap URL is passed here
urls = sm.get_urls() # iterable of URL entries; each one is converted with str() below
for url in urls:
url = str(url)
# Title comes from the last path segment: hyphens become spaces, then
# str.title() capitalises each word.
title = url.strip("/").split("/")[-1].replace("-", " ").title()
bucket[title] = url
return bucket

@classmethod
def scrap_collections(cls, limit=-1, offset=0):
Expand All @@ -112,8 +98,9 @@ def get_collection(cls, collection):
streams += book.streams
streams.reverse()
return DarkerProjectsAudioBook(title=tag,
stream_list=streams,
url=url)
stream_list=streams,
url=url)

@classmethod
def search_audiobooks(cls, since=None, author=None, title=None, tag=None,
limit=25):
Expand All @@ -135,17 +122,22 @@ def get_audiobook(cls, book_id):

@classmethod
def scrap_all_audiobooks(cls, limit=-1, offset=0):
# Produce every audiobook for this source. `limit` and `offset` are not
# referenced by any of the lines visible below.
# NOTE(review): this listing is a flattened diff view (indentation and +/-
# markers are lost). Presumably the `return cls.scrap_collections()` line is
# the removed side and the sitemap loop the added side - confirm against
# the repository.
return cls.scrap_collections()
sm = SiteMapParser('https://darkerprojects.com/wp-sitemap-posts-post-1.xml') # explicit posts sitemap URL is passed here
urls = sm.get_urls() # iterable of URL entries; each one is converted with str() below
for url in urls:
url = str(url)
# Title comes from the last path segment: hyphens become spaces, then
# str.title() capitalises each word.
title = url.strip("/").split("/")[-1].replace("-", " ").title()
book = DarkerProjectsAudioBook(url=url, title=title)
# from_page() is called on each book before it is yielded; presumably it
# loads the post page to fill in details - TODO confirm in AudioBook.
book.from_page()
yield book


# Manual smoke-test entry point: runs only when the module is executed directly.
# NOTE(review): this listing is a flattened diff view (indentation and +/-
# markers are lost), so old and new driver lines are interleaved below -
# confirm the current version against the repository.
if __name__ == "__main__":
from pprint import pprint

# for book in DarkerProjects.search_audiobooks(title="Dark Tower"):
# pprint(book.as_json)

scraper = DarkerProjects()

# Print the tag mapping and stop; presumably the removed side of the diff.
print(scraper.scrap_tags())
exit()
# This loop header has no visible body of its own here; presumably it is the
# removed line that the scrap_all_audiobooks loop below replaces.
for book in scraper.scrap_collections():

# Dump the JSON form of every book yielded by scrap_all_audiobooks().
for book in scraper.scrap_all_audiobooks():
pprint(book.as_json)

0 comments on commit acacdee

Please sign in to comment.