Skip to content

Commit

Permalink
Merge branch 'stashapp:master' into PaperStreetMediaUpdate
Browse files Browse the repository at this point in the history
  • Loading branch information
TheStashMaster authored Oct 1, 2024
2 parents 86c79e5 + 23f4301 commit 0b1f765
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 27 deletions.
20 changes: 20 additions & 0 deletions scrapers/AEBN.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,24 @@ xPathScrapers:
- replace:
- regex: ^([^?]+).*$
with: "https:$1"
driver:
cookies:
- CookieURL: "https://vod.aebn.com"
Cookies:
- Name: "ageGated"
Domain: "vod.aebn.com"
Value: ""
Path: "/"
- CookieURL: "https://gay.aebn.com"
Cookies:
- Name: "ageGated"
Domain: "gay.aebn.com"
Value: ""
Path: "/"
- CookieURL: "https://straight.aebn.com"
Cookies:
- Name: "ageGated"
Domain: "straight.aebn.com"
Value: ""
Path: "/"
# Last Updated July 06, 2022
55 changes: 48 additions & 7 deletions scrapers/AmateurAllure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@ galleryByURL:
- action: scrapeXPath
url: &urls
- amateurallure.com/tour/scenes/
scraper: galleryScraper
scraper: amateurAllure
- action: scrapeXPath
url: &classicUrls
- amateurallureclassics.com/scenes/
scraper: amateurAllureClassics
sceneByURL:
- action: scrapeXPath
url: *urls
scraper: sceneScraper
scraper: amateurAllure
- action: scrapeXPath
url: *classicUrls
scraper: amateurAllureClassics
xPathScrapers:
galleryScraper:
amateurAllure:
common: &commonAttr
$sceneinfo: //div[@class="scene-info"]
$title: //span[@class='title_bar_hilite']
Expand All @@ -30,8 +37,6 @@ xPathScrapers:
Studio: &studioAttr
Name:
fixed: Amateur Allure
sceneScraper:
common: *commonAttr
scene:
Title: *titleSel
Code: *id
Expand Down Expand Up @@ -59,7 +64,43 @@ xPathScrapers:
- replace:
- regex: ^([^|]+amateurallure[^|]+)\|.+(/content/contentthumbs/\d+/\d+/[^/]+\.jpg) 1920w
with: $1$2
- regex: 1x
- regex: "[124]x"
with: "full"
Studio: *studioAttr
# Last Updated May 01, 2024
amateurAllureClassics:
common:
$scene: //div[contains(@class, "gallery_info")]
$excludeUpdates: not(ancestor::*[contains(@class, "category_listing_block")])
gallery:
Title: &classicTitle //title
Code: &classicCode
selector: //script[contains(., "setid:")]
postProcess:
- replace:
- regex: .*setid:"(\d+).*
with: $1
Date: &classicDate # Some sites hide their release date in a comment
selector: //*[(contains(@class, "availdate") or contains(@class, "update_date")) and contains(., "/")]
postProcess:
- replace:
- regex: ".*?([0-9]{2}/[0-9]{2}/[0-9]{4}).*"
with: $1
- parseDate: 01/02/2006
Details: &classicDetails $scene//span[@class="update_description"]
Performers: &classicPerformers
Name: $scene//span[@class="update_models" and $excludeUpdates]/a
Tags: &classicTags
Name: $scene//span[contains(@class, "update_tags")]/a
Studio: &classicStudio
Name:
fixed: Amateur Allure Classics
scene:
Title: *classicTitle
Code: *classicCode
Date: *classicDate
Details: *classicDetails
Performers: *classicPerformers
Tags: *classicTags
Studio: *classicStudio
Image: //meta[@property="og:image"]/@content
# Last Updated September 27, 2024
28 changes: 19 additions & 9 deletions scrapers/Freshmen.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,32 @@ name: "Freshmen"
sceneByURL:
- action: scrapeXPath
url:
- freshmen.net/content/
- club.freshmen.net/secure/
scraper: sceneScraper
xPathScrapers:
sceneScraper:
scene:
Title:
selector: //h1/span/text()
concat: " "
selector: //h1
postProcess:
- replace:
- regex: ^(.+)\s\(Issue\s#(\d+).+$
with: "Issue $2: $1"
Details:
selector: //div[@class='contentTab']/div[@class='top']//p
selector: //div[@class='content_detail__first_col__player__more__description']//div/p
concat: "\n\n"
Performers:
Name: //div[@class='actor']/div[@class='name']
Image:
selector: //*[@id="videoPlayer"]/@poster
Date:
selector: //div[@class='content_date']/text()
postProcess:
- parseDate: 01/02/2006
Image: //div[@class="player"]//img/@src | //div[@class="player"]//video/@poster
Studio:
Name:
fixed: Freshmen
# Last Updated June 26, 2022
Tags:
Name:
selector: //div[@class="wrapper tag_list"]/a/text()
Performers:
Name: //div[@class='actors_list__actor']//h3/text()

# Last Updated October 01, 2024
43 changes: 32 additions & 11 deletions scrapers/Teamskeet/TeamskeetAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,34 @@
print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
sys.exit()
try:
import requests
except ModuleNotFoundError:
print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
sys.exit()

def try_url(url):
    """Return True when a HEAD request to *url* answers with HTTP 200.

    Any request failure (connection error, timeout, malformed URL, ...) is
    reported as False rather than raised, because callers only use this to
    probe whether an optional alternative URL exists.
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # server would block the scraper indefinitely.
        return requests.head(url, timeout=10).status_code == 200
    except requests.RequestException:
        return False

def try_img_replacement(imgurl):
    """Return a higher-resolution variant of *imgurl* if one can be fetched.

    Each candidate is built by swapping the 'shared/med' path segment for a
    larger variant and is probed with try_url(); the first candidate that
    responds successfully is returned. If none does, or if *imgurl* is
    empty/None or does not contain 'shared/med', *imgurl* is returned
    unchanged.
    """
    # Nothing to upgrade: avoids AttributeError on None and pointless
    # requests against a URL that none of the replacements would change.
    if not imgurl or 'shared/med' not in imgurl:
        return imgurl
    # members/full - 1600x900
    # bio_big - 1500x844
    # shared/hi - 1280x720
    # shared/med - 765x430
    for replacement in ('members/full', 'bio_big', 'shared/hi'):
        newurl = imgurl.replace('shared/med', replacement)
        if try_url(newurl):
            return newurl
    # try shared/hi on /tour url
    # get the subsite name (fifth '/'-separated component of the URL);
    # skip this attempt when the URL is too short to contain one
    parts = imgurl.split("/")
    if len(parts) > 4 and parts[4]:
        subsite = parts[4]
        # replace with /tour/pics
        tour_hi = imgurl.replace(f"/{subsite}", f"/{subsite}/tour/pics").replace('shared/med', 'shared/hi')
        if try_url(tour_hi):
            return tour_hi
    # fallback to original image
    return imgurl

def save_json(api_json, url):
try:
Expand Down Expand Up @@ -185,7 +213,7 @@ def save_json(api_json, url):
#fix for TeamSkeet including HTML tags in Description
CLEANR = re.compile('<.*?>')
cleandescription = re.sub(CLEANR,'',scene_api_json.get('description'))
scrape['details'] = cleandescription
scrape['details'] = cleandescription.strip()
scrape['studio'] = {}
studioApiName = scene_api_json['site'].get('name')
log.debug("Studio API name is '" + studioApiName + "'")
Expand All @@ -208,16 +236,9 @@ def save_json(api_json, url):
# high resolution scene images. SayUncle is high resolution right
# from the scrape. TeamSkeet and MYLF have different mappings between
# the scraped value and the higher resolution version.
match scene_url:
case str(x) if 'sayuncle.com' in x:
log.debug("Say Uncle image, using default size")
high_res = scrape['image']
case str(x) if 'teamskeet.com' in x or 'swappz.com' in x:
log.debug("TeamSkeet image, mapping members/full")
high_res = scene_api_json.get('img').replace('shared/med', 'members/full')
case str(x) if 'mylf.com' in x:
log.debug("Mylf image, mapping bio_big")
high_res = scene_api_json.get('img').replace('shared/med', 'bio_big')

# try to (and check) higher res images if possible
high_res = try_img_replacement(scene_api_json.get('img'))

log.debug(f"Image before: {scrape['image']}")
log.debug(f"Image after: {high_res}")
Expand Down

0 comments on commit 0b1f765

Please sign in to comment.