Merge branch 'stashapp:master' into PaperStreetMediaUpdate

stashapp · Oct 1, 2024 · 0b1f765 · 0b1f765
2 parents 86c79e5 + 23f4301
commit 0b1f765
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 27 deletions.
diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml
@@ -113,4 +113,24 @@ xPathScrapers:
           - replace:
               - regex: ^([^?]+).*$
                 with: "https:$1"
+driver:
+  cookies:
+    - CookieURL: "https://vod.aebn.com"
+      Cookies:
+        - Name: "ageGated"
+          Domain: "vod.aebn.com"
+          Value: ""
+          Path: "/"
+    - CookieURL: "https://gay.aebn.com"
+      Cookies:
+        - Name: "ageGated"
+          Domain: "gay.aebn.com"
+          Value: ""
+          Path: "/"
+    - CookieURL: "https://straight.aebn.com"
+      Cookies:
+        - Name: "ageGated"
+          Domain: "straight.aebn.com"
+          Value: ""
+          Path: "/"
 # Last Updated July 06, 2022
diff --git a/scrapers/AmateurAllure.yml b/scrapers/AmateurAllure.yml
@@ -3,13 +3,20 @@ galleryByURL:
   - action: scrapeXPath
     url: &urls
       - amateurallure.com/tour/scenes/
-    scraper: galleryScraper
+    scraper: amateurAllure
+  - action: scrapeXPath
+    url: &classicUrls
+      - amateurallureclassics.com/scenes/
+    scraper: amateurAllureClassics
 sceneByURL:
   - action: scrapeXPath
     url: *urls
-    scraper: sceneScraper
+    scraper: amateurAllure
+  - action: scrapeXPath
+    url: *classicUrls
+    scraper: amateurAllureClassics
 xPathScrapers:
-  galleryScraper:
+  amateurAllure:
     common: &commonAttr
       $sceneinfo: //div[@class="scene-info"]
       $title: //span[@class='title_bar_hilite']
@@ -30,8 +37,6 @@ xPathScrapers:
       Studio: &studioAttr
         Name:
           fixed: Amateur Allure
-  sceneScraper:
-    common: *commonAttr
     scene:
       Title: *titleSel
       Code: *id
@@ -59,7 +64,43 @@ xPathScrapers:
                 - replace:
                     - regex: ^([^|]+amateurallure[^|]+)\|.+(/content/contentthumbs/\d+/\d+/[^/]+\.jpg) 1920w
                       with: $1$2
-                    - regex: 1x
+                    - regex: "[124]x"
                       with: "full"
       Studio: *studioAttr
-# Last Updated May 01, 2024
+  amateurAllureClassics:
+    common:
+      $scene: //div[contains(@class, "gallery_info")]
+      $excludeUpdates: not(ancestor::*[contains(@class, "category_listing_block")])
+    gallery:
+      Title: &classicTitle //title
+      Code: &classicCode
+        selector: //script[contains(., "setid:")]
+        postProcess:
+          - replace:
+              - regex: .*setid:"(\d+).*
+                with: $1
+      Date: &classicDate # Some sites hide their release date in a comment
+        selector: //*[(contains(@class, "availdate") or contains(@class, "update_date")) and contains(., "/")]
+        postProcess:
+          - replace:
+              - regex: ".*?([0-9]{2}/[0-9]{2}/[0-9]{4}).*"
+                with: $1
+          - parseDate: 01/02/2006
+      Details: &classicDetails $scene//span[@class="update_description"]
+      Performers: &classicPerformers
+        Name: $scene//span[@class="update_models" and $excludeUpdates]/a
+      Tags: &classicTags
+        Name: $scene//span[contains(@class, "update_tags")]/a
+      Studio: &classicStudio
+        Name:
+          fixed: Amateur Allure Classics
+    scene:
+      Title: *classicTitle
+      Code: *classicCode
+      Date: *classicDate
+      Details: *classicDetails
+      Performers: *classicPerformers
+      Tags: *classicTags
+      Studio: *classicStudio
+      Image: //meta[@property="og:image"]/@content
+# Last Updated September 27, 2024
diff --git a/scrapers/Freshmen.yml b/scrapers/Freshmen.yml
@@ -2,22 +2,32 @@ name: "Freshmen"
 sceneByURL:
   - action: scrapeXPath
     url:
-      - freshmen.net/content/
+      - club.freshmen.net/secure/
     scraper: sceneScraper
 xPathScrapers:
   sceneScraper:
     scene:
       Title:
-        selector: //h1/span/text()
-        concat: " "
+        selector: //h1
+        postProcess:
+          - replace:
+              - regex: ^(.+)\s\(Issue\s#(\d+).+$
+                with: "Issue $2: $1"
       Details:
-        selector: //div[@class='contentTab']/div[@class='top']//p
+        selector: //div[@class='content_detail__first_col__player__more__description']//div/p
         concat: "\n\n"
-      Performers:
-        Name: //div[@class='actor']/div[@class='name']
-      Image:
-        selector: //*[@id="videoPlayer"]/@poster
+      Date:
+        selector: //div[@class='content_date']/text()
+        postProcess:
+          - parseDate: 01/02/2006
+      Image: //div[@class="player"]//img/@src | //div[@class="player"]//video/@poster
       Studio:
         Name:
           fixed: Freshmen
-# Last Updated June 26, 2022
+      Tags:
+        Name:
+          selector: //div[@class="wrapper tag_list"]/a/text()
+      Performers:
+        Name: //div[@class='actors_list__actor']//h3/text()
+
+# Last Updated October 01, 2024
diff --git a/scrapers/Teamskeet/TeamskeetAPI.py b/scrapers/Teamskeet/TeamskeetAPI.py
@@ -13,6 +13,34 @@
     print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
     print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
     sys.exit()
+try:
+    import requests
+except ModuleNotFoundError:
+    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
+    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
+    sys.exit()
+
+def try_url(url, timeout=10):
+    """Return True when a HEAD request for ``url`` answers with HTTP 200.
+
+    ``timeout`` (seconds) bounds the request so that an unresponsive host
+    cannot hang the scraper.  Any ``requests`` failure (connection error,
+    timeout, malformed URL, ...) is reported as False instead of being
+    raised, so a caller probing several candidate URLs can simply move on
+    to the next one.
+    """
+    try:
+        return requests.head(url, timeout=timeout).status_code == 200
+    except requests.RequestException:
+        return False
+
+def try_img_replacement(imgurl):
+    """Return a higher resolution variant of ``imgurl`` when one is reachable.
+
+    Candidates are built by swapping the 'shared/med' path segment for a
+    larger variant; a candidate is only returned when try_url() accepts it.
+    ``imgurl`` is returned unchanged when it is empty/None or when no
+    candidate is accepted.
+    """
+    # nothing to probe (an empty or missing image URL): hand it back as-is
+    if not imgurl:
+        return imgurl
+    # members/full - 1600x900
+    # bio_big - 1500x844
+    # shared/hi - 1280x720
+    # shared/med - 765x430
+    for replacement in ('members/full', 'bio_big', 'shared/hi'):
+        newurl = imgurl.replace('shared/med', replacement)
+        if try_url(newurl):
+            return newurl
+    # try shared/hi on /tour url
+    # get the subsite name (fifth '/'-separated piece of the URL); skip this
+    # step when the URL is too short to have one or the piece is empty,
+    # since replacing "/" alone would rewrite every separator in the URL
+    parts = imgurl.split("/")
+    if len(parts) > 4 and parts[4]:
+        subsite = parts[4]
+        # replace with /tour/pics
+        tourHi = imgurl.replace(f"/{subsite}", f"/{subsite}/tour/pics").replace('shared/med', 'shared/hi')
+        if try_url(tourHi):
+            return tourHi
+    # fallback to original image
+    return imgurl
 
 def save_json(api_json, url):
     try:
@@ -185,7 +213,7 @@ def save_json(api_json, url):
 # fix for TeamSkeet including HTML tags in Description
 CLEANR = re.compile('<.*?>') 
 cleandescription = re.sub(CLEANR,'',scene_api_json.get('description'))
-scrape['details'] = cleandescription
+scrape['details'] = cleandescription.strip()
 scrape['studio'] = {}
 studioApiName = scene_api_json['site'].get('name')
 log.debug("Studio API name is '" + studioApiName + "'")
@@ -208,16 +236,9 @@ def save_json(api_json, url):
 # high resolution scene images.  SayUncle is a high resolution right
 # from the scrape.  TeamSkeet and MYLF have different mappings between
 # the scraped value and the higher resolution version.
-match scene_url:
-    case str(x) if 'sayuncle.com' in x:
-        log.debug("Say Uncle image, using default size")
-        high_res = scrape['image']
-    case str(x) if 'teamskeet.com' in x or 'swappz.com' in x:
-        log.debug("TeamSkeet image, mapping members/full")
-        high_res = scene_api_json.get('img').replace('shared/med', 'members/full')
-    case str(x) if 'mylf.com' in x:
-        log.debug("Mylf image, mapping bio_big")
-        high_res = scene_api_json.get('img').replace('shared/med', 'bio_big')
+
+# try to (and check) higher res images if possible
+high_res = try_img_replacement(scene_api_json.get('img'))
 
 log.debug(f"Image before: {scrape['image']}")
 log.debug(f"Image after: {high_res}")