Added json support, exampleconf.json, support to convert image-response to pdf and then merge
Armandur · Apr 3, 2022 · 9237d91 · 9237d91
1 parent 377e4ac
commit 9237d91
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 59 deletions.
diff --git a/README.md b/README.md
@@ -1,3 +1,9 @@
 # prenly-dl
 
-Utility to download pdfs from prenlys online viewer.
+Utility to download PDFs from Prenly's online viewer.
+
+## Requirements
+* PyPDF2
+* Pillow
+* img2pdf
+* requests
diff --git a/exampleconf.json b/exampleconf.json
@@ -0,0 +1,18 @@
+{
+    "credentials":
+    {
+        "textalk-auth": "YOUR OWN", #Look in api/v2/ request header (X-Textalk-Content-Client-Authorize), seems to be persistent
+        "auth": "YOUR OWN", #Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent
+        "h": "YOUR OWN" #Look in url of pdf-request to api/v2/, seems to be persistent
+    },
+    "issue":
+    {
+        "title": 4019,  # ID of a paper
+        "uids": [       #UIDs of issues to download from paper, look in request to /api/v2 for issue_uid
+            "517313",
+            "490377"
+        ],
+        "site": "",    # URL of reader webpage
+        "cdn": ""      # Some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN
+    }
+}
diff --git a/prenly-dl.py b/prenly-dl.py
@@ -1,10 +1,14 @@
+import getopt
 import json
 import os
 import sys
 from glob import glob
+from PIL import Image
+import img2pdf
 
 import requests
 from PyPDF2 import PdfFileMerger
+from PyPDF2.utils import PdfReadError
 
 
 def getIssueJSON(session, credentials, issue):
@@ -29,21 +33,21 @@ def getIssueJSON(session, credentials, issue):
         str(issue["title"]) + ',"issue_uid":"' + issue["uid"] + '"},"id":1}'
     req = session.post(url, data=data, headers=headers)
 
-    #TODO Check response for API error
+    # TODO Check response for API error
 
     # DEBUG to print json response
-    #with open("response.json", 'w') as file:
+    # with open("response.json", 'w') as file:
     #    file.write(req.text)
-    
-    #exit(0)
+
+    # exit(0)
 
     return req.text
 
 
 def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"):
-    #Some sites may use another cdn
+    # Some sites may use another cdn
     # What is this h=23bcv... ??? All but first page works without it, without it the first page gets as webp
-    
+
     url = f"{cdn}/api/v2/media/get/{issue['title']}/{hash}?h={h}"
 
     headers = {
@@ -58,7 +62,8 @@ def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"):
         "Sec-Fetch-Site": "cross-site"
     }
 
-    response = session.get(url, headers=headers) #TODO Check for response error, wrong CDN, h, auth etc
+    # TODO Check for response error, wrong CDN, h, auth etc
+    response = session.get(url, headers=headers)
     return response
 
 
@@ -68,73 +73,108 @@ def getHashes(JSON):
     for spread in JSON["result"]["replica_spreads"]:
         for page in spread["pages"]:
             number = int(page["page_no"])
-            page_num = str(number).zfill(3) #pad the numbers to get them in order when globbing pdfs for merging
+            # pad the numbers to get them in order when globbing pdfs for merging
+            page_num = str(number).zfill(3)
             hashes[page_num] = page["media"][0]["checksum"]
 
     return hashes
 
 
 def pdfMerge(title):
     merger = PdfFileMerger()
-    allpdfs = [a for a in glob("*.pdf")]
-    [merger.append(pdf) for pdf in allpdfs]
-
-    with open(title, "wb") as merged:
-        merger.write(merged)
-
+    allpdfs = [a for a in glob(f"{title}*.pdf")]
+    currentpdf = allpdfs[0]
     try:
         for pdf in allpdfs:
-            # TODO Why doesn't this work? WinError 32, can't access file, in use by another process, can't find any process with procexp.exe...
-            os.remove(pdf)
-    except OSError as error:
-        # At least we have som error handling for it...
-        print(repr(error), file=sys.stderr)
-        exit(1)
+            currentpdf = pdf
+            merger.append(pdf)
+    except PdfReadError as error:
+        print(f"{repr(error)} - File: {currentpdf}", file=sys.stderr)
+        print("You can try to merge the files yourself, we won't delete them", file=sys.stderr)
+    else:
+        with open(f"{title}.pdf", "wb") as merged:
+            merger.write(merged)
+
+        try:
+            for pdf in allpdfs:
+                # TODO Why doesn't this work? WinError 32, can't access file, in use by another process, can't find any process with procexp.exe...
+                os.remove(pdf)
+        except OSError as error:
+            # At least we have some error handling for it...
+            print(repr(error), file=sys.stderr)
+            # exit(1)
+
+
+def main(conf):
+    # Support for multiple uids
+    for uid in conf["issue"]["uids"]:
+        issue = {
+            "title": conf["issue"]["title"],
+            "uid": uid,
+            "site": conf["issue"]["site"]
+        }
+
+        session = requests.Session()
+        session.headers.update(
+            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"})
+
+        JSON = json.loads(getIssueJSON(session, conf["credentials"], issue))
+        hashes = getHashes(JSON)  # Extract the hashes for individual pages
+
+        # Get all PDFs and write them to files.
+        for page_num in hashes:
+            if "cdn" in conf["issue"] and conf["issue"]["cdn"] != "":  # Custom CDN supplied
+                req = getPDF(
+                    session, issue, hashes[page_num], conf["credentials"]["h"], conf["issue"]["cdn"])
+            else:
+                req = getPDF(session, issue,
+                             hashes[page_num], conf["credentials"]["h"])
+            name = JSON['result']['name']
+
+            content = req.content
+
+            # Sometimes we might just not get a pdf, convert response to pdf instead.
+            if req.headers["content-type"] in ("image/webp", "image/jpeg", "image/png", "image/gif", "image/svg"):
+                print(f"{name} - page {page_num} is image/, not application/pdf, we convert file to pdf", file=sys.stderr)
+                image = req.content
+                content = img2pdf.convert(image)
+
+            with open(f"{name} - {page_num}.pdf", "wb") as file:
+                file.write(content)
+
+        pdfMerge(name)
 
+    return
 
-def main():
-    session = requests.Session()
-    session.headers.update(
-        {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"})
 
-    credentials = {
-        "textalk-auth": "YOUR OWN", #Look in api request content
-        "auth": "YOUR OWN", #Look in Storage/Local Storage/prenlyreadersessiontoken
-        "h": "YOUR OWN" #Look in pdf-request to api
-    }
+def opts(argv):
+    usage = "test"  # TODO Put a helpful text here
+    if len(argv) == 0:
+        print("No parameters given")
+        print(usage)
+        sys.exit(1)
 
-    # TODO Add support to supply list of issue uids
-    issue = {
-        "title": 4019,      # ID of a paper
-        "uid": "517313",    # uid of a specific issue
-        "site": "https://hemslojd.prenly.com" # URL of reader
-    }
+    config = {}
 
-    # TODO Add getopts
-    # Getopts:
-    #   - Title
-    #   - IssueID
-    #   - Site
-    #   - Auth
-    #   - CDN
-    #   - h
-    #   --json of above
-
-    JSON = json.loads(getIssueJSON(session, credentials, issue))
-    hashes = getHashes(JSON)  # Extract the hashes for individual pages
-
-    # Get all PDFs and write them to files.
-    for page_num in hashes:
-        req = getPDF(session, issue, hashes[page_num], credentials["h"])
-        name = JSON['result']['name']
-        with open(f"{name} - {page_num}.pdf", "wb") as file:
-            file.write(req.content)
-
-    pdfMerge(name+".pdf")
+    try:
+        # title issue site, cdn, textalk, auth, h=23ob...
+        # TODO Custom title?
+        opts, args = getopt.getopt(argv, "t:i:s:c:u:a:h:", [
+                                   "title=", "issue=", "site=", "cdn=", "textalk=", "auth=", "json=", "help"])
+    except getopt.GetoptError as error:
+        print(repr(error), file=sys.stderr)
+        print(usage)
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == "--json":
+            with open(arg, "r") as file:
+                config = json.loads(file.read())
+            break
+        # TODO rest of options, build config with supplied options
 
-    return
+    main(config)
 
 
 if __name__ == '__main__':
-    main()
+    opts(sys.argv[1:])
     sys.exit(0)