diff --git a/README.md b/README.md index ae147e1..02b9cbe 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,9 @@ # prenly-dl -Utility to download pdfs from prenlys online viewer. \ No newline at end of file +Utility to download pdfs from prenlys online viewer. + +## Requirements +* PyPDF2 +* Pillow +* img2pdf +* requests \ No newline at end of file diff --git a/exampleconf.json b/exampleconf.json new file mode 100644 index 0000000..8631acb --- /dev/null +++ b/exampleconf.json @@ -0,0 +1,18 @@ +{ + "credentials": + { + "textalk-auth": "YOUR OWN", #Look in api/v2/ request header (X-Textalk-Content-Client-Authorize), seems to be persistent + "auth": "YOUR OWN", #Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent + "h": "YOUR OWN" #Look in url of pdf-request to api/v2/, seems to be persistent + }, + "issue": + { + "title": 4019, # ID of a paper + "uids": [ #UIDs of issues to download from paper, look in request to /api/v2 for issue_uid + "517313", + "490377" + ], + "site": "", # URL of reader webpage + "cdn": "" # Some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN + } +} \ No newline at end of file diff --git a/prenly-dl.py b/prenly-dl.py index e808481..3faec02 100644 --- a/prenly-dl.py +++ b/prenly-dl.py @@ -1,10 +1,14 @@ +import getopt import json import os import sys from glob import glob +from PIL import Image +import img2pdf import requests from PyPDF2 import PdfFileMerger +from PyPDF2.utils import PdfReadError def getIssueJSON(session, credentials, issue): @@ -29,21 +33,21 @@ def getIssueJSON(session, credentials, issue): str(issue["title"]) + ',"issue_uid":"' + issue["uid"] + '"},"id":1}' req = session.post(url, data=data, headers=headers) - #TODO Check response for API error + # TODO Check response for API error # DEBUG to print json response - #with open("response.json", 'w') as file: + # with open("response.json", 'w') as file: # file.write(req.text) - - #exit(0) + + # exit(0) return req.text def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"): - #Some sites may use another cdn + # Some sites may use another cdn # What is this h=23bcv... ??? All but first page works without it, without it the first page gets as webp - + url = f"{cdn}/api/v2/media/get/{issue['title']}/{hash}?h={h}" headers = { @@ -58,7 +62,8 @@ def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"): "Sec-Fetch-Site": "cross-site" } - response = session.get(url, headers=headers) #TODO Check for response error, wrong CDN, h, auth etc + # TODO Check for response error, wrong CDN, h, auth etc + response = session.get(url, headers=headers) return response @@ -68,7 +73,8 @@ def getHashes(JSON): for spread in JSON["result"]["replica_spreads"]: for page in spread["pages"]: number = int(page["page_no"]) - page_num = str(number).zfill(3) #pad the numbers to get them in order when globbing pdfs for merging + # pad the numbers to get them in order when globbing pdfs for merging + page_num = str(number).zfill(3) hashes[page_num] = page["media"][0]["checksum"] return hashes @@ -76,65 +82,99 @@ def getHashes(JSON): def pdfMerge(title): merger = PdfFileMerger() - allpdfs = [a for a in glob("*.pdf")] - [merger.append(pdf) for pdf in allpdfs] - - with open(title, "wb") as merged: - merger.write(merged) - + allpdfs = [a for a in glob(f"{title}*.pdf")] + currentpdf = allpdfs[0] try: for pdf in allpdfs: - # TODO Why doesn't this work? WinError 32, can't access file, in use by another process, can't find any process with procexp.exe... - os.remove(pdf) - except OSError as error: - # At least we have som error handling for it... - print(repr(error), file=sys.stderr) - exit(1) + currentpdf = pdf + merger.append(pdf) + except PdfReadError as error: + print(f"{repr(error)} - File: {currentpdf}", file=sys.stderr) + print("You can try to merge the files yourself, we won't delete them", file=sys.stderr) + else: + with open(f"{title}.pdf", "wb") as merged: + merger.write(merged) + + try: + for pdf in allpdfs: + # TODO Why doesn't this work? WinError 32, can't access file, in use by another process, can't find any process with procexp.exe... + os.remove(pdf) + except OSError as error: + # At least we have som error handling for it... + print(repr(error), file=sys.stderr) + # exit(1) + + +def main(conf): + # Support for multiple uids + for uid in conf["issue"]["uids"]: + issue = { + "title": conf["issue"]["title"], + "uid": uid, + "site": conf["issue"]["site"] + } + + session = requests.Session() + session.headers.update( + {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}) + + JSON = json.loads(getIssueJSON(session, conf["credentials"], issue)) + hashes = getHashes(JSON) # Extract the hashes for individual pages + + # Get all PDFs and write them to files. + for page_num in hashes: + if "cdn" in conf["issue"] and conf["issue"]["cdn"] != "": # Custom CDN supplied + req = getPDF( + session, issue, hashes[page_num], conf["credentials"]["h"], conf["issue"]["cdn"]) + else: + req = getPDF(session, issue, + hashes[page_num], conf["credentials"]["h"]) + name = JSON['result']['name'] + + content = req.content + + # Sometimes we might just not get a pdf, convert response to pdf instead. + if req.headers["content-type"] in ("image/webp", "image/jpeg", "image/png", "image/gif", "image/svg"): + print(f"{name} - page {page_num} is image/, not application/pdf, we convert file to pdf", file=sys.stderr) + image = req.content + content = img2pdf.convert(image) + + with open(f"{name} - {page_num}.pdf", "wb") as file: + file.write(content) + + pdfMerge(name) + return -def main(): - session = requests.Session() - session.headers.update( - {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}) - credentials = { - "textalk-auth": "YOUR OWN", #Look in api request content - "auth": "YOUR OWN", #Look in Storage/Local Storage/prenlyreadersessiontoken - "h": "YOUR OWN" #Look in pdf-request to api - } +def opts(argv): + usage = "test" # TODO Put a helpful text here + if len(argv) == 0: + print("No parameters given") + print(usage) + sys.exit(1) - # TODO Add support to supply list of issue uids - issue = { - "title": 4019, # ID of a paper - "uid": "517313", # uid of a specific issue - "site": "https://hemslojd.prenly.com" # URL of reader - } + config = {} - # TODO Add getopts - # Getopts: - # - Title - # - IssueID - # - Site - # - Auth - # - CDN - # - h - # --json of above - - JSON = json.loads(getIssueJSON(session, credentials, issue)) - hashes = getHashes(JSON) # Extract the hashes for individual pages - - # Get all PDFs and write them to files. - for page_num in hashes: - req = getPDF(session, issue, hashes[page_num], credentials["h"]) - name = JSON['result']['name'] - with open(f"{name} - {page_num}.pdf", "wb") as file: - file.write(req.content) - - pdfMerge(name+".pdf") + try: + # title issue site, cdn, textalk, auth, h=23ob... + # TODO Custom title? + opts, args = getopt.getopt(argv, "t:i:s:c:u:a:h:", [ + "title=", "issue=", "site=", "cdn=", "textalk=", "auth=", "json=", "help"]) + except getopt.GetoptError as error: + print(repr(error), file=sys.stderr) + print(usage) + sys.exit(2) + for opt, arg in opts: + if opt == "--json": + with open(arg, "r") as file: + config = json.loads(file.read()) + break + # TODO rest of options, build config with supplied options - return + main(config) if __name__ == '__main__': - main() + opts(sys.argv[1:]) sys.exit(0)