-
Notifications
You must be signed in to change notification settings - Fork 0
/
sitepreview
executable file
·157 lines (131 loc) · 4.79 KB
/
sitepreview
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
import asyncio
import base64
import json
import os
import subprocess
import sys
import magic
import requests
from playwright.async_api import (
async_playwright,
Error as PlaywrightError,
TimeoutError as PlaywrightTimeoutError,
)
from utils import (
script_directory,
die,
log,
warn,
)
# Upper bound on how many previews are processed concurrently; main() sizes
# its asyncio.Semaphore with this value.
MAX_TASKS = 5
# MIME types accepted as real images. The type reported by `magic` is checked
# against this list for downloaded data and for pre-existing output files.
VALID_TYPES = [
"image/jpeg",
"image/png",
"image/gif",
"image/webp",
]
# Simple globe icon, embedded as base64 text (presumably JPEG data -- TODO
# confirm). make_preview() writes these bytes to the output file when
# Playwright raises an error for a URL.
GENERIC_IMAGE = base64.b64decode(
b"""
/9j/4AAQSkZJRgABAQIAJQAlAAD/2wBDABgQEhUSDxgVExUaGRgcIzsmIyAgI0gzNis7VUtaWFRL
UlFeaodzXmSAZVFSdqB3gIyQl5mXW3GmsqWTsIeUl5L/wAALCABkAGQBAREA/8QAGAABAQEBAQAA
AAAAAAAAAAAAAgABAwX/xAAmEAADAAECBAcBAQAAAAAAAAAAAQIRAxITIVFhIjFBQmJxgTJS/9oA
CAEBAAA/APbIiMdSvNherPcziroxLUl+uBJ58iIiIjKpSssHiv4oqUwvLL7mxHurzDpyqVJ9Sxse
2llejFw8c4eCm+eKWGMiIyqUrLDMunur8Q28LLOcLdW9/h0Bpe77FUqlhh0203L80KpVLDDLcvbX
4xkRypur5LKRu+v8Buqrw7cCV0lhQW+v8Bims4nIt9f4DTrKrbjAuJT9hluqX8McVunuIy3iWzNN
YhdxHOOepT6HQjno+77OhlLMtB0nmMdBgXh1WvRjBq/yl1YyBo/y33GRz0fd9nQjnp/1S7nQGpyq
X3GDW8l9lwvkzOF8mHTjcvNofC+TM4XyYYjdnm1gfC+TLhfJgmM1Sy+Q+F8mGo2uebfM6h1VmH2N
l5lM056XKqR0I56Pu+zoXoDS8m+rGCuerK6cxk1lYOU3szNeguLPcG9cTcvL1HxZ7lxZ7g07U5z6
j4s9zK1E5aWSnUmZS5m8We5afNun6jIGpPuXmhS1SyjLndOPUzTaaw1zQ8Loc9Jf19nTC6HNeO8+
iOmF0OdeOts/rOiWFhERAqXL3R+o2bVdn0MuXL3z+imlSygaXu+yqnT2z+saSiewHTvlHl1HMqVh
GkREGoVffUzFz5PcgPcqypaMW7DST5incliZx3Ylpt87eRpJLCIiIiIiIiIiI//Z
"""
)
def resize_image(path):
    """
    Start an external `convert` process that rewrites the image at `path`

    The command reads `path`, applies `-quality 50%` and `-resize x320`, and
    writes the result back to the same file. The process is launched but not
    waited on, so the rewrite may still be in progress when this returns.
    """
    command = ["convert", "-quality", "50%", "-resize", "x320", path, path]
    subprocess.Popen(command)
async def get_social_image(page):
    """
    Return a bytes object containing purportedly valid image data or None

    The page is probed for common "social preview" tags, in priority order.
    The first tag that yields a non-empty URL is downloaded, and the data is
    returned only if its sniffed MIME type is listed in VALID_TYPES.

    Args:
        page: Playwright page that has already been navigated to the site.

    Returns:
        The downloaded image bytes, or None when no tag was found, the
        download failed, or the data is not an accepted image type.
    """
    from urllib.parse import urljoin

    # Seconds to wait on the image server. requests is synchronous, so this
    # also bounds how long the event loop can be stalled by one download.
    download_timeout = 10

    # (selector, attribute) pairs, tried in order.
    candidates = (
        ('meta[property="og:image"]', "content"),
        ('meta[name="twitter:image"]', "content"),
        ('link[rel="image_src"]', "href"),
        ('link[as="image"]', "href"),
        ('meta[itemprop="image"]', "content"),
        ('meta[itemprop="thumbnailUrl"]', "content"),
    )

    # Find at most ONE image url
    image_url = None
    for sel, attr in candidates:
        try:
            # Pages often repeat a tag (e.g. several og:image entries). Without
            # `.first` Playwright raises a strict-mode error, which is not a
            # timeout and would abort the whole preview in the caller.
            locator = page.locator(f"css={sel}").first
            if image_url := await locator.get_attribute(attr, timeout=100):
                break
        except PlaywrightTimeoutError:
            # No matching element showed up in time; try the next selector.
            continue

    if not image_url:
        return None

    # Download and check the validity of the image
    try:
        # Tags may carry a relative URL; resolve it against the page address.
        # Absolute URLs pass through urljoin unchanged.
        image_url = urljoin(page.url, image_url)
        r = requests.get(image_url, timeout=download_timeout)
    except (requests.RequestException, ValueError):
        # ValueError covers URLs too malformed for urljoin to parse.
        return None
    if magic.from_buffer(r.content, mime=True) in VALID_TYPES:
        return r.content
    return None
async def make_preview(browser, device, url, output_path):
    """
    Create a preview image for `url` at `output_path`

    A social image advertised by the page is preferred; otherwise a JPEG
    screenshot of the page is taken. If Playwright raises an error at any
    point (including a screenshot timeout), GENERIC_IMAGE is written instead.

    Args:
        browser: Playwright browser used to open a fresh context.
        device: Mapping of keyword arguments passed to `browser.new_context`.
        url: Address of the page to preview.
        output_path: File to write. An existing file is only overwritten when
            it already is one of the VALID_TYPES image types.
    """
    log(f"get: {url}")
    # Basic sanity check prevents overwriting non-images
    if os.path.isfile(output_path):
        if magic.from_file(output_path, mime=True) not in VALID_TYPES:
            warn(f"preview: not overwriting {output_path}")
            return

    # Bound before the try so the finally clause never hits an unbound name
    # when new_context() itself fails.
    context = None
    try:
        context = await browser.new_context(**device)
        page = await context.new_page()
        await page.goto(url)

        # First we try downloading a social image
        if image_data := await get_social_image(page):
            with open(output_path, "wb") as f:
                f.write(image_data)
            resize_image(output_path)
            log(f"preview: downloaded preview {output_path}")
            return

        # Otherwise we'll render an image. If this times out we'll use GENERIC_IMAGE
        await page.screenshot(path=output_path, type="jpeg", timeout=10000)
        resize_image(output_path)
        log(f"preview: generated preview {output_path}")
    except PlaywrightError:
        warn(f"preview: couldn't load {url}")
        with open(output_path, "wb") as f:
            f.write(GENERIC_IMAGE)
    finally:
        if context is not None:
            await context.close()
async def main(urls_and_outputs):
    """
    Generate a preview image for every entry of `urls_and_outputs`

    One Chromium instance is shared by all previews, and at most MAX_TASKS
    previews run at the same time.

    Args:
        urls_and_outputs: Mapping of page URL to the output file path for its
            preview image.
    """
    log("playwright: starting up...")
    # The context manager and the try/finally make sure the driver and the
    # browser process are shut down even when a preview task raises.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        try:
            device = playwright.devices["Desktop Chrome"]
            semaphore = asyncio.Semaphore(MAX_TASKS)

            async def sem_locked_coroutine(coroutine):
                # Only start the wrapped coroutine once a slot is free.
                async with semaphore:
                    return await coroutine

            coroutines = (
                make_preview(browser, device, url, output_path)
                for url, output_path in urls_and_outputs.items()
            )
            await asyncio.gather(*(sem_locked_coroutine(c) for c in coroutines))
        finally:
            await browser.close()
    log("preview: finished")
if __name__ == "__main__":
    # Expect exactly one argument: a JSON object mapping each URL to the file
    # its preview should be written to.
    # NOTE(review): die() comes from utils and is assumed to terminate the
    # process -- confirm.
    if len(sys.argv) != 2:
        die(
            f"preview: Usage: {os.path.basename(sys.argv[0])} JSON_MAPPING_OF_URLS:OUTPUTS"
        )
    try:
        urls_and_outputs = json.loads(sys.argv[1])
    except json.decoder.JSONDecodeError:
        die("preview: unable to load json mapping of urls to output files")
    # main() iterates .items(), so valid JSON that is not an object (a list,
    # a string, ...) is rejected here instead of crashing later.
    if not isinstance(urls_and_outputs, dict):
        die("preview: unable to load json mapping of urls to output files")
    with script_directory():
        # asyncio.run creates and closes its own event loop; calling
        # get_event_loop() without a running loop is deprecated.
        asyncio.run(main(urls_and_outputs))