-
Notifications
You must be signed in to change notification settings - Fork 4
/
eapbl-project.py
240 lines (198 loc) · 13.7 KB
/
eapbl-project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# -*- coding: utf-8 -*-
import mechanize
from bs4 import BeautifulSoup
import os
import csv
import sys
import shutil
import time
import urllib2
import random
'''
Program to download images from the endangered archives collection from http://eap.bl.uk/database/collections.a4d
'''
# Abort early on anything but Python 2: the script relies on Python 2 only
# constructs (print statements, urllib2, str-as-bytes file I/O).
python_major = sys.version_info[0]
if python_major != 2:
    raise Exception("Python 2 is required to run this script.")
# Project result pages to download. Each entry is the EAP database results
# page for one digitisation project; the query-string boilerplate is kept in
# one place and the list is built from the bare project IDs.
_RESULTS_URL = 'http://eap.bl.uk/database/results.a4d?projID='
_project_ids = [
    'EAP023',  # Preserving Marathi manuscripts and making them accessible
    'EAP038',  # Survey, conservation and archiving of pre-1947 Telugu printed materials in India
    'EAP127',  # Archiving 'popular market' Bengali books
    'EAP183',  # Preserving early print literature on the history of Tamilnadu
    'EAP191',  # Strategies for archiving the endangered publications of French India (1800-1953)
    'EAP201',  # Study and collection of Hakku Patras and other documents among folk communities in Andhra Pradesh
    'EAP208',  # Preserving memory: documentation and digitisation of palm leaf manuscripts from northern Kerala, India
    'EAP248',  # Preserving more Marathi manuscripts and making them accessible - major project
    'EAP261',  # Digital archive of early Bengali drama
    'EAP262',  # Retrieval of two major and endangered newspapers: Jugantar and Amrita Bazar Patrika
    'EAP314',  # Rescuing Tamil customary law: locating and copying endangered records of village judicial assemblies (1870-1940)
    'EAP341',  # Rescuing text: retrieval and documentation of printed books and periodicals from public institutions in eastern India published prior to 1950 - major project
    'EAP372',  # Preserving early periodicals and newspapers of Tamilnadu and Pondichery
    'EAP458',  # Constituting a digital archive of Tamil agrarian history during the colonial period
    'EAP584',  # Preserving memory II - documentation and digitisation of palm leaf manuscripts from Kerala, India
    'EAP676',  # Survey of Buddhist Sanskrit manuscripts in the possession of Vajrayana Viharas and Newar Buddhist families in Lalitpur in the Kathmandu valley, Nepal
    'EAP683',  # Ramamala Library manuscript project
    'EAP689',  # Constituting a digital archive of Tamil agrarian history (1650-1950) - phase II
    'EAP692',  # Documentation of endangered temple art of Tamil Nadu
    'EAP759',  # Survey of Bengal puthis discovered in Sundarbans, South 24 Parganas, West Bengal
]
urls = [_RESULTS_URL + pid for pid in _project_ids]
# Pool of real-world User-Agent strings. One is picked at random (see
# random.choice calls below) for each request so the traffic looks less like
# a single automated client, reducing the chance of throttling or blocking.
user_agent_list = ['Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
                   'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
                   'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121202 Firefox/17.0 Iceweasel/17.0.1',
                   'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.2; WOW64; rv:1.8.0.7) Gecko/20110321 MultiZilla/4.33.2.6a SeaMonkey/8.6.55',
                   'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
                   'Opera/12.02 (Android 4.1; Linux; Opera Mobi/ADR-1111101157; U; en-US) Presto/2.9.201 Version/12.02',
                   'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; XH; rv:8.578.498) fr, Gecko/20121021 Camino/8.723+ (Firefox compatible)',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
                   'Mozilla/5.0 (X11; U; Linux x86_64; it-it) AppleWebKit/534.26+ (KHTML, like Gecko) Ubuntu/11.04 Epiphany/2.30.6',
                   'Mozilla/5.0 (X11; U; Linux x86_64; fr-FR) AppleWebKit/534.7 (KHTML, like Gecko) Epiphany/2.30.6 Safari/534.7',
                   'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Epiphany/2.30.6 Safari/534.7',
                   'Mozilla/5.0 (X11; U; Linux i686; sv-se) AppleWebKit/531.2+ (KHTML, like Gecko) Safari/531.2+ Epiphany/2.30.6',
                   'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
                   'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0',
                   'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
                   'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.13) Gecko/20100916 Iceape/2.0.8',
                   'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121201 icecat/17.0.1',
                   'Mozilla/5.0 (X11; Linux i686; rv:7.0.1) Gecko/20111106 IceCat/7.0.1',
                   'Mozilla/5.0 (X11; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1 Iceweasel/15.0.1',
                   'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
                   'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1250.0 Iron/22.0.2150.0 Safari/537.4'
                   ]
# create a directory to work in and cd into it
try:
    os.mkdir(os.path.join(os.getcwd(), 'eapbl-project'))
except OSError:
    # NOTE(review): assumes the OSError means "already exists" — other causes
    # (e.g. permission denied) are silently treated the same way.
    print "Directory 'eapbl-project' already exists! Not creating it again!"
os.chdir('eapbl-project')
# Shared mechanize browser used for all project-page downloads below.
br = mechanize.Browser()
abspath = os.path.abspath('.')  # absolute path of the working directory
# For every project results URL: fetch the page, create a directory named
# after the project heading, and save the raw HTML (page.html) plus the
# originating URL (ref.txt, later used as the Referer header).
for url in urls:
    # TODO: don't download page if already exists
    print "Downloading page: " + url
    html_page = br.open(url).read()
    soup = BeautifulSoup(html_page, 'html.parser')
    results = soup.select('#results')
    heading = results[0].find('th').h3.text  # heading is the name of the project
    # ':' appears in project titles; replace it so the heading is a safe
    # directory name.
    heading = heading.replace(':', '-')
    try:
        os.mkdir(os.path.join(abspath, heading))
    except OSError:
        print "Directory " + heading + " already exists! Not creating it again!"
    # don't write page if already exists
    if not os.path.exists(os.path.join(abspath+'/'+heading, 'page.html')):
        with open(os.path.join(abspath+'/'+heading, 'page.html'), 'w') as f:
            f.write(html_page)
    else:
        print "Not writing 'page.html' - already exists"
    with open(os.path.join(abspath + '/' + heading, 'ref.txt'), 'w') as r:  # store referer link in ref.txt
        r.write(url)
# list all the directories
dirs = [d for d in os.listdir('.') if os.path.isdir(d) and not d.startswith('.')]
for directory in dirs:
print "Switching to directory: " + directory
os.chdir(directory)
# check if pubs.csv file exists - and write accordingly
if not os.path.exists('pubs.csv'):
soup = BeautifulSoup(open('page.html', 'r'), 'html.parser')
results = soup.select("#results")
base_url = 'http://eap.bl.uk/database/'
with open('pubs.csv', 'wb') as f:
writer = csv.writer(f, delimiter='@', quoting=csv.QUOTE_MINIMAL)
for tr in results[0].find_all('tr'):
for x in range(1, len(tr.find_all('td')), 2):
title = tr.find_all('td')[x].a.text.encode('utf-8').strip() # encode to utf-8
link = base_url + tr.find_all('td')[x].a['href'].encode('utf-8').strip() # encode to utf-8
writer.writerow([title, link])
else:
print "Not writing 'pubs.csv' already exists"
# read the pubs.csv, create a folder for every publication
with open('pubs.csv', 'r') as f:
abspath = os.path.abspath('.')
br = mechanize.Browser()
user_agent_string = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
with open('ref.txt', 'r') as r:
referer_string = r.read().strip()
br.addheaders = [('user-Agent', user_agent_string), ('referer', referer_string)]
reader = csv.reader(f, delimiter='@')
for row in reader:
title = row[0].replace('/', '-').replace(':', '-')
link = row[1]
# skip 1 iteration if zip file of title name exists
if os.path.exists(title + '.zip'):
print "Already downloaded and zipped " + title
continue
# don't load link and write if thumbs.html already exists
if not os.path.exists(os.path.join(title, 'thumbs.html')):
# time.sleep(2)
print "Loading publication link: " + link
try:
random_user_agent = random.choice(user_agent_list)
br.addheaders = [('user-agent', random_user_agent)]
page = br.open(link).read()
except urllib2.URLError, e:
print e.reason
# request = br.request
# print request.header_items()
try:
os.mkdir(os.path.join(abspath, title))
except OSError:
print "Directory: " + title + " already exists! Not creating it again. Not downloading again."
with open(os.path.join(abspath + '/' + title, 'thumbs.html'), 'w') as f:
f.write(page)
else:
print "Already exists! folder: " + title + " and thumbs.html"
if not os.path.exists(os.path.join(title, 'ref.txt')): # store a referer link for the images
with open(os.path.join(abspath + '/' + title, 'ref.txt'), 'w') as tref:
tref.write(link)
# get a list of folder names, cd into it, parse the thumbs.html file and store the urls of the image in a file
folders = [d for d in os.listdir('.') if os.path.isdir(d) and not d.startswith('.')]
base_image_url = 'http://eap.bl.uk/'
for f in folders:
os.chdir(f)
print "Switched to folder: " + f + ". Downloading images."
image_soup = BeautifulSoup(open('thumbs.html', 'r'), 'html.parser')
ul = image_soup.find('ul', class_='ad-thumb-list')
# add image referer link
with open('ref.txt', 'r') as image_referer:
img_ref = image_referer.read().strip()
random_user_agent = random.choice(user_agent_list)
br.addheaders = [('user-agent', random_user_agent), ('Referer', img_ref)]
try:
for li in ul.find_all('li'):
image_link = li.a['href']
full_image_link = base_image_url + li.a['href']
image_file_name = image_link.replace('/', '_').strip('_') # strip leading '_' from image_file_name
image_path = os.path.join(os.path.abspath('.'), image_file_name)
# don't download image if already exists
if not os.path.exists(image_file_name):
# download and write the image
time.sleep(2)
random_user_agent = random.choice(user_agent_list)
br.addheaders = [('user-agent', random_user_agent)]
print "Retrieving image: " + full_image_link
br.retrieve(full_image_link, image_path)
else:
print "Already exists! image: " + image_file_name
# sleep for 3 seconds after downloading a set of images
time.sleep(3)
except AttributeError:
print "AttributeError - There may be no images available yet."
pass
os.chdir('..')
# zip up all folders
for pub_folder in os.listdir('.'):
if os.path.isdir(pub_folder):
shutil._make_zipfile(pub_folder, pub_folder) # create a zip archive of the folder
shutil.rmtree(pub_folder) # delete the folder & contents
# return to previous directory
os.chdir('..')
print "DONE!"