Skip to content

Commit

Permalink
Added json support, exampleconf.json, support to convert image-respon…
Browse files Browse the repository at this point in the history
…se to pdf and then merge
  • Loading branch information
Armandur committed Apr 3, 2022
1 parent 377e4ac commit 9237d91
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 59 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# prenly-dl

Utility to download pdfs from prenlys online viewer.
Utility to download PDFs from Prenly's online viewer.

## Requirements
* PyPDF2
* Pillow
* img2pdf
* requests
18 changes: 18 additions & 0 deletions exampleconf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"credentials":
{
"textalk-auth": "YOUR OWN", #Look in api/v2/ request header (X-Textalk-Content-Client-Authorize), seems to be persistent
"auth": "YOUR OWN", #Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent
"h": "YOUR OWN" #Look in url of pdf-request to api/v2/, seems to be persistent
},
"issue":
{
"title": 4019, # ID of a paper
"uids": [ #UIDs of issues to download from paper, look in request to /api/v2 for issue_uid
"517313",
"490377"
],
"site": "", # URL of reader webpage
"cdn": "" # Some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN
}
}
156 changes: 98 additions & 58 deletions prenly-dl.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import getopt
import json
import os
import sys
from glob import glob
from PIL import Image
import img2pdf

import requests
from PyPDF2 import PdfFileMerger
from PyPDF2.utils import PdfReadError


def getIssueJSON(session, credentials, issue):
Expand All @@ -29,21 +33,21 @@ def getIssueJSON(session, credentials, issue):
str(issue["title"]) + ',"issue_uid":"' + issue["uid"] + '"},"id":1}'
req = session.post(url, data=data, headers=headers)

#TODO Check response for API error
# TODO Check response for API error

# DEBUG to print json response
#with open("response.json", 'w') as file:
# with open("response.json", 'w') as file:
# file.write(req.text)
#exit(0)

# exit(0)

return req.text


def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"):
#Some sites may use another cdn
# Some sites may use another cdn
# What is this h=23bcv... parameter? All pages but the first work without it; without it, the first page is returned as webp

url = f"{cdn}/api/v2/media/get/{issue['title']}/{hash}?h={h}"

headers = {
Expand All @@ -58,7 +62,8 @@ def getPDF(session, issue, hash, h, cdn="https://mediacdn.prenly.com"):
"Sec-Fetch-Site": "cross-site"
}

response = session.get(url, headers=headers) #TODO Check for response error, wrong CDN, h, auth etc
# TODO Check for response error, wrong CDN, h, auth etc
response = session.get(url, headers=headers)
return response


Expand All @@ -68,73 +73,108 @@ def getHashes(JSON):
for spread in JSON["result"]["replica_spreads"]:
for page in spread["pages"]:
number = int(page["page_no"])
page_num = str(number).zfill(3) #pad the numbers to get them in order when globbing pdfs for merging
# pad the numbers to get them in order when globbing pdfs for merging
page_num = str(number).zfill(3)
hashes[page_num] = page["media"][0]["checksum"]

return hashes


def pdfMerge(title):
    """Merge the page PDFs of one issue into ``<title>.pdf``.

    Every file in the current directory matching ``<title>*.pdf`` is treated
    as a page, except ``<title>.pdf`` itself, which is the output file and
    may be left over from an earlier run. Pages are appended in sorted
    filename order, so zero-padded page numbers end up in reading order.

    If a page cannot be read, the error is reported on stderr and all page
    files are kept so they can be merged by hand. After a successful merge
    the page files are deleted; a file that cannot be deleted is reported
    on stderr and the remaining files are still attempted.

    Args:
        title: Filename prefix shared by the page PDFs; also the name of the
            merged file, without the ``.pdf`` extension.
    """
    # Local import: the module only imports the glob() function itself.
    from glob import escape as glob_escape

    output = f"{title}.pdf"

    # Escape the title so characters such as "[" in an issue name are matched
    # literally. glob() gives no ordering guarantee, hence the explicit sort.
    allpdfs = sorted(
        pdf for pdf in glob(f"{glob_escape(title)}*.pdf") if pdf != output
    )
    if not allpdfs:
        print(f"No PDF files found for {title}", file=sys.stderr)
        return

    merger = PdfFileMerger()
    currentpdf = allpdfs[0]
    try:
        for pdf in allpdfs:
            # Remember which file is being read so a failure can name it.
            currentpdf = pdf
            merger.append(pdf)
    except PdfReadError as error:
        print(f"{repr(error)} - File: {currentpdf}", file=sys.stderr)
        print("You can try to merge the files yourself, we won't delete them", file=sys.stderr)
        return
    else:
        with open(output, "wb") as merged:
            merger.write(merged)
    finally:
        # NOTE(review): the merger appears to keep every appended file open
        # until close(); that is the likely cause of the WinError 32
        # ("file in use by another process") previously seen when deleting.
        merger.close()

    for pdf in allpdfs:
        try:
            os.remove(pdf)
        except OSError as error:
            # Best effort: report and keep going with the remaining files.
            print(repr(error), file=sys.stderr)


def main(conf):
    """Download every configured issue page by page and merge it into one PDF.

    For each uid in ``conf["issue"]["uids"]`` the issue description is
    fetched with getIssueJSON(), the per-page hashes are extracted with
    getHashes(), each page is downloaded with getPDF() and written to
    ``"<name> - <page>.pdf"``, and finally pdfMerge() is called with the
    issue name. Pages delivered as an image instead of a PDF are converted
    with img2pdf before being written.

    Args:
        conf: Configuration dict with a ``"credentials"`` section (passed to
            getIssueJSON(); its ``"h"`` value is passed to getPDF()) and an
            ``"issue"`` section holding ``"title"``, ``"uids"``, ``"site"``
            and optionally ``"cdn"``.
    """
    credentials = conf["credentials"]
    issue_conf = conf["issue"]
    # A missing, empty or null "cdn" means: let getPDF() use its default CDN.
    cdn = issue_conf.get("cdn")

    image_types = ("image/webp", "image/jpeg", "image/png", "image/gif", "image/svg")

    # Support for multiple uids
    for uid in issue_conf["uids"]:
        issue = {
            "title": issue_conf["title"],
            "uid": uid,
            "site": issue_conf["site"],
        }

        with requests.Session() as session:
            session.headers.update(
                {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"})

            JSON = json.loads(getIssueJSON(session, credentials, issue))
            # Resolve the name before the page loop so it is defined even
            # when the issue turns out to have no pages.
            name = JSON['result']['name']
            hashes = getHashes(JSON)  # Extract the hashes for individual pages

            if not hashes:
                print(f"{name} - no pages found, nothing to download", file=sys.stderr)
                continue

            # Get all PDFs and write them to files.
            for page_num, page_hash in hashes.items():
                if cdn:  # Custom CDN supplied
                    req = getPDF(session, issue, page_hash, credentials["h"], cdn)
                else:
                    req = getPDF(session, issue, page_hash, credentials["h"])

                content = req.content

                # Compare only the bare media type: the header may be absent,
                # differently cased or carry parameters ("image/png; ...").
                media_type = req.headers.get("content-type", "").split(";")[0].strip().lower()

                # Sometimes we might just not get a pdf, convert response to pdf instead.
                if media_type in image_types:
                    print(f"{name} - page {page_num} is image/, not application/pdf, we convert file to pdf", file=sys.stderr)
                    content = img2pdf.convert(content)

                with open(f"{name} - {page_num}.pdf", "wb") as file:
                    file.write(content)

        pdfMerge(name)

def opts(argv):
    """Parse the command line, load the configuration and run main().

    Only ``--json <file>`` and ``--help`` are acted upon so far; the other
    options are accepted by the parser but not yet used to build a
    configuration.

    Exit codes: 0 after ``--help``, 1 when no arguments are given or the
    JSON file cannot be read or parsed (for instance because comments were
    left in it), 2 on an unknown option or when no configuration was
    supplied.

    Args:
        argv: Command-line arguments without the program name.
    """
    usage = "Usage: prenly-dl.py --json <config.json>"
    if not argv:
        print("No parameters given")
        print(usage)
        sys.exit(1)

    # Planned options (only --json is implemented):
    # - Title
    # - IssueID
    # - Site
    # - Auth
    # - CDN
    # - h
    # --json of above

    try:
        # title issue site, cdn, textalk, auth, h=23ob...
        # TODO Custom title?
        # The result is deliberately not named "opts", which would shadow
        # this function.
        parsed, _ = getopt.getopt(argv, "t:i:s:c:u:a:h:", [
            "title=", "issue=", "site=", "cdn=", "textalk=", "auth=", "json=", "help"])
    except getopt.GetoptError as error:
        print(repr(error), file=sys.stderr)
        print(usage)
        sys.exit(2)

    config = {}
    for opt, arg in parsed:
        if opt == "--help":
            print(usage)
            sys.exit(0)
        if opt == "--json":
            try:
                with open(arg, "r", encoding="utf-8") as file:
                    config = json.load(file)
            except (OSError, json.JSONDecodeError) as error:
                print(f"Could not load configuration from {arg}: {repr(error)}", file=sys.stderr)
                sys.exit(1)
            break
        # TODO rest of options, build config with supplied options

    # Without a configuration main() would only fail later with a KeyError.
    if not config:
        print("No configuration given, supply one with --json", file=sys.stderr)
        print(usage)
        sys.exit(2)

    main(config)


if __name__ == '__main__':
    opts(sys.argv[1:])
    sys.exit(0)

0 comments on commit 9237d91

Please sign in to comment.