-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_latex_pdf_sources.py
52 lines (43 loc) · 1.54 KB
/
get_latex_pdf_sources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import subprocess
from time import sleep, time

import requests
'''Download arXiv hep-th PDFs and convert the matching LaTeX sources to XML.
PDF link for a hep-th paper: https://arxiv.org/pdf/hep-th/<ID>.pdf
<ID> is unique to each paper within each arXiv section.
Run the LaTeX parser as: latexml <ID> --xml --destination=<ID>.xml
Store the results in the loading_dock directory.
'''
# Batch driver: for every paper found under `directory`, fetch its PDF from
# arXiv, convert the local LaTeX source to XML with latexml, then run the
# mining scripts on it.  The code is kept valid for both Python 2 and Python 3.

directory = "hep-th-1992-2003-kddcup"

# Seconds to wait before restarting the scan after any failure.
RETRY_DELAY_SECONDS = 300
# Upper bound for connecting to / reading from arXiv, so a stalled connection
# raises (and triggers a retry) instead of hanging the script forever.
REQUEST_TIMEOUT_SECONDS = 60

# First-level entries of the dump; each one is listed again below to obtain
# the per-paper IDs.
year_dir_list = os.listdir(directory)
# Optional: process the first-level entries in reverse order.
#year_dir_list = sorted(year_dir_list, reverse=True)


def _run_step(args, done_message):
    """Run one external pipeline command, then print its progress message.

    args is an argument list passed to subprocess.call without a shell, so
    names containing spaces or shell metacharacters are passed through
    verbatim.  The pipeline is best-effort: a command that cannot be started
    or that exits with a non-zero status is reported but does not abort the
    remaining steps.
    """
    try:
        status = subprocess.call(args)
    except OSError as err:
        print("WARNING: could not start " + args[0] + ": " + str(err))
    else:
        if status != 0:
            print("WARNING: " + " ".join(args)
                  + " exited with status " + str(status))
    print(done_message)


def _process_paper(year_dir, paper_id):
    """Download the PDF for one paper and run every processing step on it.

    year_dir is the first-level entry of `directory` that contains the paper
    and paper_id is the paper's entry name inside it.  Network errors from
    requests propagate to the caller.
    """
    paper_id = str(paper_id)

    # NOTE(review): the HTTP status is not checked, so an error page returned
    # by arXiv would be saved under the .pdf name - confirm whether the
    # downstream scripts cope with that.
    response = requests.get(
        "https://arxiv.org/pdf/hep-th/" + paper_id + ".pdf",
        stream=True,
        headers={'User-agent': 'Mozilla/5.0'},
        timeout=REQUEST_TIMEOUT_SECONDS
    )
    with open('loading_dock/' + paper_id + '.pdf', 'wb') as pdf_file:
        pdf_file.write(response.content)
    print("GOT PDF")

    _run_step(
        [
            "latexml",
            os.path.join(directory, year_dir, paper_id),
            "--xml",
            "--quiet",
            "--nocomments",
            "--destination=loading_dock/" + paper_id + ".xml",
        ],
        "Finished XML",
    )
    _run_step(["python", "mine_pdf.py", paper_id], "FINISHED MINE PDF")
    _run_step(["python", "mine_xml.py", paper_id], "FINISHED MINE XML")
    _run_step(["python", "locate_me.py", paper_id], "FINISHED LOCATE ME")

    # The img_creation.py step
    # ("python img_creation.py <ID>_ground_truth.json") is currently disabled;
    # only its progress message is still emitted.
    print("FINISHED OCR DB UPDATE")


# Scan the whole dump; on any error wait and start the scan again.  Restarting
# from the top is cheap because papers that already have an "<ID>_data"
# directory in the working directory are skipped (presumably that directory is
# created by one of the mining scripts - verify).
while True:
    try:
        for year_dir in year_dir_list:
            for paper_id in os.listdir(os.path.join(directory, year_dir)):
                if os.path.isdir(str(paper_id) + "_data"):
                    print("ID Exists: " + str(paper_id))
                    continue
                _process_paper(year_dir, paper_id)
    except Exception as err:
        # Exception (not a bare except) so that Ctrl-C / SystemExit can still
        # stop the script while it is running or sleeping.
        print("ERROR: " + str(err) + " - retrying in "
              + str(RETRY_DELAY_SECONDS) + " seconds")
        sleep(RETRY_DELAY_SECONDS)
    else:
        # A full pass completed without errors: nothing left to do.
        break