diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml index 72c27368f..b98d8970c 100644 --- a/scrapers/AEBN.yml +++ b/scrapers/AEBN.yml @@ -113,4 +113,24 @@ xPathScrapers: - replace: - regex: ^([^?]+).*$ with: "https:$1" +driver: + cookies: + - CookieURL: "https://vod.aebn.com" + Cookies: + - Name: "ageGated" + Domain: "vod.aebn.com" + Value: "" + Path: "/" + - CookieURL: "https://gay.aebn.com" + Cookies: + - Name: "ageGated" + Domain: "gay.aebn.com" + Value: "" + Path: "/" + - CookieURL: "https://straight.aebn.com" + Cookies: + - Name: "ageGated" + Domain: "straight.aebn.com" + Value: "" + Path: "/" # Last Updated July 06, 2022 diff --git a/scrapers/AmateurAllure.yml b/scrapers/AmateurAllure.yml index 08483d306..0e7ad4a60 100644 --- a/scrapers/AmateurAllure.yml +++ b/scrapers/AmateurAllure.yml @@ -3,13 +3,20 @@ galleryByURL: - action: scrapeXPath url: &urls - amateurallure.com/tour/scenes/ - scraper: galleryScraper + scraper: amateurAllure + - action: scrapeXPath + url: &classicUrls + - amateurallureclassics.com/scenes/ + scraper: amateurAllureClassics sceneByURL: - action: scrapeXPath url: *urls - scraper: sceneScraper + scraper: amateurAllure + - action: scrapeXPath + url: *classicUrls + scraper: amateurAllureClassics xPathScrapers: - galleryScraper: + amateurAllure: common: &commonAttr $sceneinfo: //div[@class="scene-info"] $title: //span[@class='title_bar_hilite'] @@ -30,8 +37,6 @@ xPathScrapers: Studio: &studioAttr Name: fixed: Amateur Allure - sceneScraper: - common: *commonAttr scene: Title: *titleSel Code: *id @@ -59,7 +64,43 @@ xPathScrapers: - replace: - regex: ^([^|]+amateurallure[^|]+)\|.+(/content/contentthumbs/\d+/\d+/[^/]+\.jpg) 1920w with: $1$2 - - regex: 1x + - regex: "[124]x" with: "full" Studio: *studioAttr -# Last Updated May 01, 2024 + amateurAllureClassics: + common: + $scene: //div[contains(@class, "gallery_info")] + $excludeUpdates: not(ancestor::*[contains(@class, "category_listing_block")]) + gallery: + Title: &classicTitle //title + Code: 
&classicCode + selector: //script[contains(., "setid:")] + postProcess: + - replace: + - regex: .*setid:"(\d+).* + with: $1 + Date: &classicDate # Some sites hide their release date in a comment + selector: //*[(contains(@class, "availdate") or contains(@class, "update_date")) and contains(., "/")] + postProcess: + - replace: + - regex: ".*?([0-9]{2}/[0-9]{2}/[0-9]{4}).*" + with: $1 + - parseDate: 01/02/2006 + Details: &classicDetails $scene//span[@class="update_description"] + Performers: &classicPerformers + Name: $scene//span[@class="update_models" and $excludeUpdates]/a + Tags: &classicTags + Name: $scene//span[contains(@class, "update_tags")]/a + Studio: &classicStudio + Name: + fixed: Amateur Allure Classics + scene: + Title: *classicTitle + Code: *classicCode + Date: *classicDate + Details: *classicDetails + Performers: *classicPerformers + Tags: *classicTags + Studio: *classicStudio + Image: //meta[@property="og:image"]/@content +# Last Updated September 27, 2024 diff --git a/scrapers/Freshmen.yml b/scrapers/Freshmen.yml index fe8acbb89..d5f50382d 100644 --- a/scrapers/Freshmen.yml +++ b/scrapers/Freshmen.yml @@ -2,22 +2,32 @@ name: "Freshmen" sceneByURL: - action: scrapeXPath url: - - freshmen.net/content/ + - club.freshmen.net/secure/ scraper: sceneScraper xPathScrapers: sceneScraper: scene: Title: - selector: //h1/span/text() - concat: " " + selector: //h1 + postProcess: + - replace: + - regex: ^(.+)\s\(Issue\s#(\d+).+$ + with: "Issue $2: $1" Details: - selector: //div[@class='contentTab']/div[@class='top']//p + selector: //div[@class='content_detail__first_col__player__more__description']//div/p concat: "\n\n" - Performers: - Name: //div[@class='actor']/div[@class='name'] - Image: - selector: //*[@id="videoPlayer"]/@poster + Date: + selector: //div[@class='content_date']/text() + postProcess: + - parseDate: 01/02/2006 + Image: //div[@class="player"]//img/@src | //div[@class="player"]//video/@poster Studio: Name: fixed: Freshmen -# Last Updated June 26, 
def try_url(url, timeout=10):
    """Return True when a HEAD request to *url* answers with HTTP 200.

    Any request failure (connection error, timeout, malformed URL, ...) is
    reported as "not available" rather than raised, so that a single
    unreachable image host cannot abort the whole scrape.

    Args:
        url: Absolute URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        True if the response status code is exactly 200, otherwise False.
    """
    try:
        return requests.head(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False


def try_img_replacement(imgurl):
    """Return the best available higher-resolution variant of *imgurl*.

    The ``shared/med`` part of the URL is swapped for each known larger
    size in turn and the first candidate that the server confirms (see
    ``try_url``) is returned. If none exists, the ``shared/hi`` variant
    under the subsite's ``/tour/pics`` path is tried as a last resort.

    Args:
        imgurl: Image URL as delivered by the API. May be ``None`` or empty.

    Returns:
        The first candidate URL that answered with HTTP 200, or *imgurl*
        unchanged when there is nothing to upgrade or no candidate exists.
    """
    marker = 'shared/med'
    # Nothing to upgrade: no image at all, or the URL is not in the
    # medium-size layout, so every "replacement" would be the same URL.
    if not imgurl or marker not in imgurl:
        return imgurl

    # Known size variants, tried from largest to smallest:
    #   members/full - 1600x900
    #   bio_big      - 1500x844
    #   shared/hi    - 1280x720
    #   shared/med   - 765x430 (what the API delivers)
    for replacement in ('members/full', 'bio_big', 'shared/hi'):
        newurl = imgurl.replace(marker, replacement)
        if try_url(newurl):
            return newurl

    # Try shared/hi on the /tour url. The subsite name is the fifth
    # "/"-separated piece of the URL; insert "tour/pics" right after that
    # one segment only, so look-alike text elsewhere in the path is left
    # untouched.
    parts = imgurl.split("/")
    if len(parts) > 4 and parts[4]:
        tour_parts = parts[:5] + ['tour', 'pics'] + parts[5:]
        tour_hi = "/".join(tour_parts).replace(marker, 'shared/hi')
        if try_url(tour_hi):
            return tour_hi

    # Fall back to the original image
    return imgurl
{} studioApiName = scene_api_json['site'].get('name') log.debug("Studio API name is '" + studioApiName + "'") @@ -208,16 +236,9 @@ def save_json(api_json, url): # high resolution scene images. SayUncle is a high resoution right # from the scrape. TeamSkeet and MYLF have different mappings between # the scraped value and the higher resolution version. -match scene_url: - case str(x) if 'sayuncle.com' in x: - log.debug("Say Uncle image, using default size") - high_res = scrape['image'] - case str(x) if 'teamskeet.com' in x or 'swappz.com' in x: - log.debug("TeamSkeet image, mapping members/full") - high_res = scene_api_json.get('img').replace('shared/med', 'members/full') - case str(x) if 'mylf.com' in x: - log.debug("Mylf image, mapping bio_big") - high_res = scene_api_json.get('img').replace('shared/med', 'bio_big') + +# try to (and check) higher res images if possible +high_res = try_img_replacement(scene_api_json.get('img')) log.debug(f"Image before: {scrape['image']}") log.debug(f"Image after: {high_res}")