This repository has been archived by the owner on Jun 21, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
blackboard_duster.py
executable file
·561 lines (503 loc) · 19.8 KB
/
blackboard_duster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
#!/usr/bin/env python3
"""
Copyright (C) 2020 Taylor Smith
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see https://www.gnu.org/licenses/
~^~
Blackboard Duster
Scrapes course materials from your Blackboard courses, such as
lecture notes and homework
Author: Taylor Smith, Winter 2019
Python Version: 3.7
Notes: Uses Selenium to scrape urls from Blackboard, then urllib to
download files
TODO:
- avoid redundant visit to course home page (just ignore it?)
- dump notes from items/assignments into a .txt : use div.details
- don't abort if navpane is missing, reload or skip
- put a 'download progress' label on progress bar
- use etag instead of last-modified date (note - etag may not always be available)
~*~ """
import argparse
import json
import time
from datetime import datetime
from enum import Enum
from os import get_terminal_size
from pathlib import Path
from urllib.parse import unquote

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# The Last-Modified HTTP header value carries a timezone; once parsed
# into a (naive) datetime object that timezone info is lost, so history
# entries are written back with the save format, which omits %Z.
lastmod_parse_fmt = '%a, %d %b %Y %H:%M:%S %Z'
lastmod_save_fmt = '%a, %d %b %Y %H:%M:%S'
class Link:
    """Bundle of metadata describing one downloadable link.

    Attributes:
        url: url as found on the page; will (probably) get redirected
        name: human-friendly label for the link
        save_path: directory relative to the download root, usually
            named after the page the link came from
        element: the selenium element the url was scraped from
        lastmod: last-modified timestamp (naive datetime); None until
            set_lastmod() is called
        full_path: complete save path, kept around for troubleshooting
    """

    def __init__(self, url, name='', save_path=None, element=None):
        self.url = url
        self.name = name
        self.save_path = save_path
        self.element = element
        self.lastmod = None
        self.full_path = None

    def __repr__(self):
        return f'{self.url}\n\t{self.name}\n\t{self.save_path}'

    def set_lastmod(self, datestr):
        """Parse a Last-Modified header string into a naive datetime."""
        stripped = datestr.strip()
        self.lastmod = datetime.strptime(stripped, lastmod_parse_fmt)

    def json(self):
        """Return a JSON-serializable dict for the history file."""
        return {
            'url': self.url,
            'name': self.name,
            'save_path': self.save_path.as_posix(),
            'lastmod': self.lastmod.strftime(lastmod_save_fmt),
        }
class DLResult(Enum):
    """Outcome codes for a single file download attempt."""
    COLLISION = 0   # a different file already occupies the target path
    DOWNLOADED = 1  # new file saved successfully
    DUPLICATE = 2   # unchanged since the recorded download; skipped
    UPDATED = 3     # re-downloaded because the server copy is newer
def apply_style(driver, element, res_code):
    """Outline *element* in the browser to show its download status.

    driver: a selenium WebDriver
    element: the element to decorate
    res_code: a DLResult value, or anything else (e.g. None) for a
        pending download
    """
    borders = {
        DLResult.COLLISION: '4px dotted red',
        DLResult.DOWNLOADED: '4px solid green',
        DLResult.DUPLICATE: '4px dashed cyan',
        DLResult.UPDATED: '4px solid blue',
    }
    # unknown codes (pending download) fall through to the default
    style = 'border: ' + borders.get(res_code, '1px dotted magenta')
    driver.execute_script(
        'arguments[0].setAttribute("style", arguments[1]);',
        element, style)
def parse_args():
    """Parse command line arguments into a ready-to-use namespace.

    Returns the argparse namespace with:
    - save and historypath converted to Path objects (historypath is
      made relative to save unless it is absolute)
    - webdriver lowercased and stripped
    - ignore merged with the default set of navpane page names
    """
    navpane_ignore = {'Announcements', 'Calendar',
                      'My Grades', 'Blackboard Collaborate'}
    parser = argparse.ArgumentParser(
        description='Scrapes files from Blackboard courses')
    parser.add_argument(
        'bb_url', metavar='BB_base_URL',
        help='URL for your Blackboard instance.')
    parser.add_argument(
        '-s', '--save', metavar='path', default='.',
        help='directory to save your downloads in')
    parser.add_argument(
        '--historypath', '--history', metavar='json',
        default='BlackboardDuster.json',
        # fixed typo: "does not exit" -> "does not exist"
        help='path to blackboard duster history file. Relative to' +
        ' download directory unless path is absolute. The file' +
        ' will be created if it does not exist.')
    parser.add_argument(
        '--delay', metavar='delay_mult', type=int, default=1,
        help='multiplier for sleep/delays')
    parser.add_argument(
        '-w', '--webdriver', '--wd', metavar='name', default='firefox',
        help='browser WebDriver to use - either "firefox" or' +
        ' "chrome". You must have the WebDriver in your system' +
        ' path. Currently, only firefox is supported; that' +
        ' will change in the future')
    parser.add_argument(
        '-a', '--auto', action='store_true',
        help='disable user input. The script will continue after' +
        ' parsing a page')
    parser.add_argument(
        '-b', '--binary', metavar='file', default=None,
        help='Path to the binary you want to use - use if your' +
        ' browser binary is not in the default location')
    parser.add_argument(
        '-i', '--ignore', metavar='name', action='append',
        help='Name of a page in the navpane to ignore; repeat this argument' +
        f' to ignore multiple pages. Defaults are {navpane_ignore}')
    args = parser.parse_args()
    # convert given path string into a Path object
    args.save = Path(args.save)
    # if history path isn't absolute, make it relative to save
    args.historypath = Path(args.historypath)
    if not args.historypath.is_absolute():
        args.historypath = args.save / args.historypath
    # sanitize webdriver name
    args.webdriver = args.webdriver.lower().strip()
    # combine args.ignore with navpane_ignore
    if args.ignore:
        navpane_ignore.update(args.ignore)
    args.ignore = navpane_ignore
    # inform user about auto mode
    print(f'running in {"auto" if args.auto else "manual"} mode')
    return args
# end parse_args()
def wait_on_CSS_selector(driver, selector, delay_mult, delay):
    """Block until *selector* matches an element; True on success.

    Returns False if the element never appeared within
    delay_mult * delay seconds.
    """
    timeout = delay_mult * delay
    condition = EC.presence_of_element_located(
        (By.CSS_SELECTOR, selector))
    try:
        WebDriverWait(driver, timeout).until(condition)
        return True
    except TimeoutException:
        return False
# end wait_on_CSS_selector
def setup_history(path):
    """Load the download-history JSON object from *path*.

    Returns a fresh, empty history when the file does not exist;
    aborts the program when the file exists but will not parse.
    """
    try:
        with path.open('r') as file:
            return json.load(file)
    except json.decoder.JSONDecodeError:
        print('current history file will not parse, aborting')
        exit()
    except IOError:
        print('history file not found, creating new history')
        return {'links': []}
def setup_session(driver):
    """Clone the WebDriver's login cookies into a new requests Session."""
    session = requests.Session()
    jar = session.cookies
    for cookie in driver.get_cookies():
        jar.set(cookie['name'], cookie['value'])
    return session
def manual_login(driver):
    """Let the user sign in manually.

    Blocks until the Blackboard home page title appears; returns
    nothing.
    """
    print('Please log into your university Blackboard account - I will'
          ' wait for you to reach the home page!')
    # looking for "Welcome, #### – Blackboard Learn" (the dash is an
    # en dash, NOT a minus sign!)
    while not (driver.title.startswith('Welcome, ')
               and driver.title.endswith(' – Blackboard Learn')):
        # poll instead of busy-waiting; the old `pass` loop pegged a
        # CPU core for the whole login
        time.sleep(0.5)
def accept_cookies(driver, delay_mult):
    """Click 'accept' on the cookie notice, if one appears."""
    wait = WebDriverWait(driver, delay_mult * 4)
    try:
        button = wait.until(
            EC.presence_of_element_located((By.ID, 'agree_button')))
    except TimeoutException:
        print('I did not see a cookie notice.')
        return
    print('I am accepting the cookie notice, I hope that is ok!')
    button.click()
def get_courses_info(driver, delay_mult, save_root):
    """Collect a Link object for every course on the home page.

    driver: a selenium WebDriver; the homepage must already be loaded
    delay_mult: delay multiplier
    save_root: base directory for downloads
    Aborts the program if the course list never appears.
    """
    # TODO course announcements are included in the list
    if not wait_on_CSS_selector(
            driver, 'div#div_25_1 a', delay_mult, 10):
        print('I did not see your course list! Aborting')
        driver.quit()
        exit()
    # the wait above matches announcement links too, so collect the
    # actual course anchors with a stricter selector
    anchors = driver.find_elements_by_css_selector(
        'div#div_25_1 > div > ul > li > a')
    return [
        Link(
            anchor.get_attribute('href'),
            anchor.text,
            (save_root / anchor.text)
        )
        for anchor in anchors
    ]
def get_navpane_info(driver, course_link, delay_mult):
    """Load a course page and list its navpane entries as Links.

    driver: a selenium WebDriver
    course_link: Link for the course homepage - it will be loaded
    delay_mult: delay multiplier
    Returns a (possibly empty) list of Link objects.
    """
    driver.get(course_link.url)
    if not wait_on_CSS_selector(
            driver, 'ul#courseMenuPalette_contents', delay_mult, 10):
        print('I could not access the navpane! skipping')
        return []
    entries = []
    anchors = driver.find_elements_by_css_selector(
        'ul#courseMenuPalette_contents a')
    for anchor in anchors:
        # the visible page name lives in the span's title attribute
        title = anchor.find_element_by_css_selector(
            'span').get_attribute('title')
        entries.append(Link(
            anchor.get_attribute('href'),
            title,
            (course_link.save_path / title)
        ))
    return entries
def gather_links(page_link, driver, delay_mult=1):
    """Gather and highlight available file urls on the current page.

    The page should already be loaded in the driver.
    page_link: Link object for the page being scraped
    driver: a selenium WebDriver
    delay_mult: delay multiplier
    returns a dictionary:
        links: a list of Link objects for downloadable files
        folders: a list of Links for sub-folders found on the page
    """
    results = {
        'links': [],
        'folders': []
    }
    if not wait_on_CSS_selector(
            driver, 'ul#content_listContainer', delay_mult, 3):
        print('This page does not have a content list.')
        return results
    # get a list of all items in the content list
    page_content = driver.find_elements_by_css_selector(
        'ul#content_listContainer > li')
    for item in page_content:
        i_type = item.find_element_by_css_selector(
            'img').get_attribute('alt')
        # in the header holding the name there is a hidden <span> that
        # gets in the way; ignore it by looking for the style attribute
        try:
            i_name = item.find_element_by_css_selector(
                'span[style]').text
        except NoSuchElementException:
            # was a bare except; only a missing element is expected here
            print('failed to find item name. Skipping... ')
            continue
        if i_type == 'File':
            # files are just a link
            link_element = item.find_element_by_css_selector(
                'a')
            link = Link(
                link_element.get_attribute('href'),
                i_name,
                page_link.save_path,
                link_element
            )
            apply_style(driver, link.element, None)
            results['links'].append(link)
        elif i_type == 'Content Folder':
            # folders contain another page; no need to track the element
            link = Link(
                item.find_element_by_css_selector(
                    'a').get_attribute('href'),
                i_name,
                (page_link.save_path / i_name)
            )
            results['folders'].append(link)
        elif i_type == 'Web Link':
            # TODO dump links into a per-page file (markdown?)
            # TODO ignore webpages but download files
            pass
        elif i_type == 'Item':
            # TODO dump info into a per-page file (markdown?)
            pass
        else:
            # FIXME this is really ugly
            print(f' ** {i_type} is not a supported item',
                  ' type - attachments will still be collected **')
        # find attachments; Items and Assignments usually have some
        i_files = item.find_elements_by_css_selector(
            'ul.attachments > li')
        # if there are multiple attachments on the item, stick them in
        # a new folder named after the item
        save_path = page_link.save_path
        if len(i_files) > 1:
            save_path = save_path / i_name
        for file in i_files:
            link_element = file.find_element_by_css_selector('a')
            link = Link(
                link_element.get_attribute('href'),
                # reuse link_element instead of a second identical lookup
                link_element.text.strip(),
                save_path,
                link_element
            )
            apply_style(driver, link.element, None)
            results['links'].append(link)
    return results
def dowload_file(session, link, history):
    """Download a single file with requests, consulting the history.

    NOTE: the misspelled name ('dowload') is kept because callers in
    this file use it.
    session: requests Session carrying the Blackboard login cookies
    link: Link object; its lastmod/full_path may be updated in place
    history: the JSON download history; updated in place on success
    returns a DLResult code
    """
    res_code = DLResult.DOWNLOADED
    # ask the server for the link's last modified date
    response = session.head(link.url, allow_redirects=True)
    link.set_lastmod(response.headers['last-modified'])
    # look for link in history
    dupe = None
    for hist_link in history['links']:
        if link.url == hist_link['url']:
            dupe = hist_link
            break  # was 'continue'; the first match is enough
    # compare link's last modified date to historical date
    if dupe is not None:
        hist_lastmod = datetime.strptime(
            dupe['lastmod'].strip(), lastmod_save_fmt)
        if link.lastmod <= hist_lastmod:
            return DLResult.DUPLICATE
        res_code = DLResult.UPDATED
    # download the file
    result = session.get(link.url)
    # setup the file's full path and create any needed directories
    link.save_path.mkdir(parents=True, exist_ok=True)
    file_name = unquote(result.url.rsplit('/', 1)[1])
    file_path = link.save_path / file_name
    # overwrite when updating a known file (fixes the old FIXME where
    # updated files always collided); refuse to clobber otherwise
    mode = 'wb' if res_code == DLResult.UPDATED else 'xb'
    try:
        with file_path.open(mode) as file:
            file.write(result.content)
    except FileExistsError:
        # hang onto the full path to report it later
        link.full_path = file_path
        # TODO hash the two files to see if they are the same
        # do NOT record the link in history - the file on disk is not
        # ours (fixes the old FIXME where collisions were forgotten)
        return DLResult.COLLISION
    # add link to history or update lastmod; save with lastmod_save_fmt
    # so it parses back (the old code wrote lastmod_parse_fmt, whose %Z
    # is empty on naive datetimes)
    if dupe is None:
        history['links'].append(link.json())
    else:
        dupe['lastmod'] = link.lastmod.strftime(lastmod_save_fmt)
    return res_code
def download_links(links, driver, session, history):
    """Download each link's file, drawing a console progress bar.

    links: a list of Link objects
    driver: a WebDriver object, used to mark each link's element
    session: a requests Session with Blackboard cookies
    history: a JSON object with previous download history
    returns a list of counters, indexed by DLResult values
    """
    # set up download tracking variables
    counters = [0]*len(DLResult)
    # watch for collisions
    collided = []
    # set progress bar length (terminal width minus the two edges)
    prog_len = get_terminal_size().columns-2
    for count, link in enumerate(links):
        res_code = dowload_file(session, link, history)
        counters[res_code.value] += 1
        # mark link to indicate download result to user
        apply_style(driver, link.element, res_code)
        # if it's a collision, hang onto the link
        if res_code == DLResult.COLLISION:
            collided.append(link)
        # draw progress bar; multiply before floor-dividing so the bar
        # reaches full width on the last file (the old per-link
        # int(prog_len/len) multiplier truncated and left it short)
        progress = (count + 1) * prog_len // len(links)
        print('|{}{}|'.format('#'*progress, '-'*(prog_len-progress)),
              end='\r')
    # erase progress bar using a ansi escape code
    # \033[K' clears the row
    print('\033[K', end='\r')
    # TODO let user know how many items downloaded etc
    # let user know what collided ("beacuse" typo fixed)
    if len(collided) > 0:
        print('Some of the files on this page could not download',
              'because another file was in the way:')
        for link in collided:
            print(f' ~ "{link.full_path}"')
        print('The associated links are marked with a dotted red',
              'outline if you need to manually download these files.')
    return counters
def process_page(page_link, driver, session, history, args):
    """Gather urls and download files from a page, recursing into folders.

    page_link: Link object for the page to process
    driver: a selenium WebDriver
    session: a requests Session, with blackboard cookies
    history: the JSON download history
    args: the parsed arguments object
    returns a list of counters, indexed by DLResult values
    """
    print(f' {page_link.name}')
    driver.get(page_link.url)
    found = gather_links(page_link, driver, args.delay)
    counters = download_links(found['links'], driver, session, history)
    # persist history after every page so an abort loses little work
    try:
        with args.historypath.open('w') as file:
            json.dump(history, file, indent=4)
    except IOError:
        print('failed to save download history! You may want to',
              'investigate before continuing to the next page.')
    if not args.auto:
        # wait for user input
        input('Press enter here once you are ready to move on: ')
        # erase prompt using ansi escape codes since a newline was printed
        # '\033[A' moves cursor up once, '\033[K' clears the row
        print('\033[A\033[K', end='\r')
    for folder_link in found['folders']:
        sub_counters = process_page(
            folder_link, driver, session, history, args)
        counters = [total + extra
                    for total, extra in zip(counters, sub_counters)]
    return counters
def main():
    """Drive the whole scrape: login, walk courses, download files."""
    args = parse_args()
    history = setup_history(args.historypath)
    # pick and launch the WebDriver
    driver_factories = {
        'firefox': webdriver.Firefox,
        'chrome': webdriver.Chrome,
    }
    if args.webdriver not in driver_factories:
        print(
            f'sorry, but {args.webdriver} is not a supported WebDriver. Aborting')
        exit()
    driver = driver_factories[args.webdriver]()
    print("here we go!")
    # choose a nice size - the navpane is invisible at small widths,
    # but selenium can still see its elements
    driver.set_window_size(600, 500)
    driver.get(args.bb_url)
    manual_login(driver)
    session = setup_session(driver)
    print('Alright, I can drive from here.')
    # links are visible behind the cookie notice, but it gets annoying
    # plus, there might be legal implications - so accept and move on
    accept_cookies(driver, args.delay)
    courses = get_courses_info(driver, args.delay, args.save)
    print(f'I found {len(courses)} courses. I will go through each one now!')
    counters = [0]*len(DLResult)
    for course in courses:
        print(f'{course.name}')
        navpane = get_navpane_info(driver, course, args.delay)
        for page in navpane:
            # a few pages have no (downloadable) content, skip them
            if page.name in args.ignore:
                print(f' *SKIPPED* {page.name}')
                continue
            page_counters = process_page(
                page, driver, session, history, args)
            counters = [total + extra
                        for total, extra in zip(counters, page_counters)]
    print('#'*get_terminal_size().columns)
    print('I am all done! Here are the stats:')
    for res_code in DLResult:
        print(f' {res_code.name}: {counters[res_code.value]}')
    driver.quit()
# end main()
# run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()