# slideslive-slides-dl.py
import argparse
import os
import re
import time
import xml.etree.ElementTree as et

import pandas as pd
import requests


def parse_xml(xml_file, df_cols):
    """Parse the input XML file and return a pandas DataFrame
    with the given columns.

    Each column is read from the text content of the matching
    sub-element of every slide node.

    Based on:
    https://medium.com/@robertopreste/from-xml-to-pandas-dataframes-9292980b1c1c
    """
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    for node in xroot:
        res = []
        for el in df_cols:
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text)
            else:
                res.append(None)
        rows.append({df_cols[i]: res[i]
                     for i, _ in enumerate(df_cols)})
    out_df = pd.DataFrame(rows, columns=df_cols)
    return out_df
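
# Sketch of the XML layout parse_xml() expects: each child of the root element
# carries the sub-elements named in the column list passed in further down
# (values are placeholders; the child tag name is not checked and the real
# SlidesLive schema may contain more fields):
#   <slide>
#     <orderId>...</orderId>
#     <timeSec>...</timeSec>
#     <time>...</time>
#     <slideName>...</slideName>
#   </slide>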


def get_video_id(video_url):
    # Extract the numeric id and the talk name from a SlidesLive url.
    ids = re.findall(r'https://slideslive\.(com|de)/([0-9]*)/([^/]*)(.*)', video_url)
    if len(ids) < 1:
        print('Error: {0} is not a correct url.'.format(video_url))
        exit(1)
    return ids[0][1], ids[0][2]
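
# Example (id and talk name are made up):
#   get_video_id('https://slideslive.com/38917690/some-talk-name')
#   -> ('38917690', 'some-talk-name')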


def download_save_file(url, save_path, headers, wait_time=0.2):
    # Fetch the resource, write it to disk, then wait briefly to
    # avoid hammering the server.
    r = requests.get(url, headers=headers)
    with open(save_path, 'wb') as f:
        f.write(r.content)
    time.sleep(wait_time)


def download_slides_xml(base_xml_url, video_id, video_name, headers, wait_time):
    # Create the output folder '<id>-<name>' and download the slide
    # metadata XML into it, unless it is already present.
    folder_name = '{0}-{1}'.format(video_id, video_name)
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    if os.path.isfile(folder_name):
        print('Error: {0} is a file, can\'t create a folder with that name'.format(folder_name))
        exit(1)
    file_path = '{0}/{1}.xml'.format(folder_name, video_id)
    if not os.path.exists(file_path):
        xml_url = '{0}{1}/{1}.xml'.format(base_xml_url, video_id)
        print('downloading {}'.format(file_path))
        download_save_file(xml_url, file_path, headers, wait_time)
    return open(file_path, 'r')
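
# For a hypothetical video id 38917690 the metadata would be fetched from
#   <basedataurl>38917690/38917690.xml
# and stored as 38917690-<video_name>/38917690.xml.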


def download_slides(video_id, video_name, df, base_img_url, size, headers, wait_time):
    # Download one JPEG per slide into the '<id>-<name>' folder,
    # named '<time>-<slideName>-<size>.jpg'.
    folder_name = '{0}-{1}'.format(video_id, video_name)
    for index, row in df.iterrows():
        img_url = base_img_url.format(video_id, row['slideName'], size)
        file_path = '{0}/{3}-{1}-{2}.jpg'.format(folder_name, row['slideName'], size, row['time'])
        print('downloading {}'.format(file_path))
        download_save_file(img_url, file_path, headers, wait_time)
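
# With size='big' and an (illustrative) row where time=15 and
# slideName='slide_002', the image would be fetched from
#   <basedataurl><video_id>/slides/big/slide_002.jpg
# and saved as <video_id>-<video_name>/15-slide_002-big.jpg.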


def create_ffmpeg_concat_file(video_id, video_name, df, size):
    # Write a concat-demuxer playlist so ffmpeg can turn the slides into a video.
    folder_name = '{0}-{1}'.format(video_id, video_name)
    ffmpeg_file_path = '{0}/ffmpeg_concat.txt'.format(folder_name)
    if os.path.exists(ffmpeg_file_path):
        return
    with open(ffmpeg_file_path, 'a') as f:
        last_time = 0
        last_file_path = ''
        for index, row in df.iterrows():
            # For every slide after the first, write the duration of the previous one.
            duration = int(row['timeSec']) - last_time
            if index != 0:
                f.write('duration {0}\n'.format(duration))
            file_path = '{3}-{1}-{2}.jpg'.format(folder_name, row['slideName'], size, row['time'])
            f.write("file '{0}'\n".format(file_path))
            last_time = int(row['timeSec'])
            last_file_path = file_path
        # Add some time for the last slide; there is no information on how long it should be shown.
        f.write('duration 30\n')
        # Due to a quirk of the concat demuxer, the last image has to be listed twice,
        # the second time without a duration directive.
        # See: https://trac.ffmpeg.org/wiki/Slideshow
        # Still not entirely bug-free.
        f.write("file '{0}'\n".format(last_file_path))


parser = argparse.ArgumentParser()
parser.add_argument('url', help='SlidesLive presentation url')
parser.add_argument('--size', default='big', help='medium or big')
parser.add_argument('--useragent', default='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/76.0.3809.100 Chrome/76.0.3809.100 Safari/537.36')
parser.add_argument('--basedataurl', default='https://d2ygwrecguqg66.cloudfront.net/data/presentations/')
parser.add_argument('--waittime', default=0.2, type=float, help='seconds to wait after each download')
args = parser.parse_args()

headers = {'User-Agent': args.useragent}
base_img_url = '{0}{1}'.format(args.basedataurl, '{0}/slides/{2}/{1}.jpg')

video_id, video_name = get_video_id(args.url)
xml = download_slides_xml(args.basedataurl, video_id, video_name, headers, args.waittime)
df = parse_xml(xml, ['orderId', 'timeSec', 'time', 'slideName'])
download_slides(video_id, video_name, df, base_img_url, args.size, headers, args.waittime)
create_ffmpeg_concat_file(video_id, video_name, df, args.size)
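
# Example invocation (url is illustrative):
#   python slideslive-slides-dl.py https://slideslive.com/38917690/some-talk-name --size big
# This creates a folder '38917690-some-talk-name' containing the slide JPEGs,
# the 38917690.xml metadata file and ffmpeg_concat.txt.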