-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
153 lines (131 loc) · 5.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import tarfile
import gzip
import shutil
import os
import glob
import pathlib
import zipfile
import subprocess as sp
def confirmDir(dir_name):
    '''
    Ensures that the directory dir_name exists, creating it
    (and any missing parent directories) if necessary.

    Raises FileExistsError if dir_name exists but is not a directory.
    '''
    # exist_ok=True replaces the separate isdir() check: there is no window
    # in which another process can create the directory between the check
    # and the makedirs() call and make this fail.
    os.makedirs(dir_name, exist_ok=True)
def get_outpath(inpath):
    '''
    Builds the XML output path that corresponds to a submission path.

    The third component of inpath, with its final extension removed,
    is used as the arXiv ID, and the result is 'xml/<arxiv_id>.xml'.
    The 'xml' directory is created if it does not exist yet.

    NOTE(review): indexing component 2 presumably expects paths shaped
    like 'latex/<tar name>/<submission>' -- confirm against callers.
    '''
    third_component = pathlib.Path(inpath).parts[2]
    stem, _extension = os.path.splitext(third_component)
    confirmDir('xml')
    return 'xml/{}.xml'.format(stem)
def _repack_as_zip(gz_obj, zip_path):
    '''
    Re-packs the (possibly compressed) tar stream gz_obj into a ZIP
    archive at zip_path. Raises tarfile.ReadError if gz_obj is not a tar.
    '''
    # Open the inner tar first: if it is not a tar at all, ReadError is
    # raised before any zip file is created on disk.
    with tarfile.open(fileobj=gz_obj) as inner_tar:
        # mode='w' so that re-extracting a submission replaces the archive
        # instead of appending duplicate members to an existing one.
        with zipfile.ZipFile(file=zip_path, mode='w',
                             compression=zipfile.ZIP_DEFLATED) as zipf:
            for member in inner_tar:
                member_file = inner_tar.extractfile(member)
                # extractfile() returns None for directories and other
                # members that carry no data; there is nothing to copy.
                if member_file is None:
                    continue
                zipf.writestr(member.name, member_file.read())


def _gunzip_to_tex(gz_obj, tex_path):
    '''
    Decompresses the gzip stream gz_obj into the file tex_path.
    '''
    with gzip.open(gz_obj, 'rb') as f_in, open(tex_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


def extract(filepath, identifiers):
    '''
    Extracts astro-ph submissions from given tar filepath.
    Logs which submissions belong to particular tarfile.

    filepath: path to the tar file; if it is not a tar file the function
        returns without doing anything.
    identifiers: object exposing a pandas-style .str.contains(); a
        '.gz' member is extracted only when its ID occurs in it.
        NOTE(review): presumably a pandas Series of arXiv IDs -- confirm.

    Output layout:
      latex/<tar name>.txt          IDs of the selected submissions
      latex/<tar name>/<id>.zip     submissions that were gzipped tars
      latex/<tar name>/<id>.tex     submissions that were a plain gzip
      logs/pdf_submissions.txt      IDs of '.pdf' members (skipped)

    Returns None.
    '''
    # Quit if given file is not tarfile
    if not tarfile.is_tarfile(filepath):
        return

    tar_dir = 'latex/' + os.path.splitext(os.path.basename(filepath))[0]
    confirmDir(tar_dir)
    confirmDir('logs')

    # Both the tarfile (read-only) and the logfile are closed even if
    # extraction of a submission raises.
    with tarfile.open(filepath) as tar, open(tar_dir + '.txt', 'w+') as logfile:
        logfile.write('TARFILE: {}'.format(os.path.basename(filepath)))
        for submission in tar.getmembers():
            submission_id = os.path.splitext(os.path.basename(submission.name))[0]

            # Note if submission is .pdf, we will skip this
            if submission.name.endswith('.pdf'):
                with open('logs/pdf_submissions.txt', 'a+') as pdf_logfile:
                    pdf_logfile.write(submission_id + '\n')
                continue

            if not submission.name.endswith('.gz'):
                continue
            # regex=False: the ID is matched literally, so the '.' in an ID
            # such as '1503.00123' does not act as a regex wildcard.
            if not identifiers.str.contains(submission_id, regex=False).any():
                continue

            logfile.write('\n' + submission_id)
            submission_path = tar_dir + '/' + submission_id
            zip_path = submission_path + '.zip'

            # If it's been converted already, don't bother extracting it.
            # get_outpath() strips one extension, so it is given the path
            # *with* an extension; otherwise a dotted ID like '1503.00123'
            # would be truncated to '1503' and the check would never match.
            if os.path.isfile(get_outpath(zip_path)):
                continue

            try:
                # Usual case: the .gz is a gzipped tar; convert it to .zip
                with tar.extractfile(submission) as gz_obj:
                    _repack_as_zip(gz_obj, zip_path)
            except tarfile.ReadError:
                # Not a tar: treat it as a single gzipped file. It is
                # decompressed straight from the archive, so no temporary
                # directory is written to or removed.
                with tar.extractfile(submission) as gz_obj:
                    _gunzip_to_tex(gz_obj, submission_path + '.tex')
def get_submissions_to_convert(base_path):
    '''
    Returns a list of strings. Each string is a path to a .tex or .zip
    submission directly inside base_path that has not yet been
    converted to XML, or attempted to be converted (as evidenced by
    existence of logfile 'logs/<submission name>.txt').
    '''
    # A glob pattern like '*[.tex|.zip]' is a character class: it matches
    # any name whose LAST CHARACTER is one of ". t e x | z i p", which also
    # picks up unrelated files (e.g. '.txt', '.gz'). Filter on the real
    # suffixes instead.
    submissions = [
        path for path in glob.glob(base_path + '/*')
        if path.endswith(('.tex', '.zip'))
    ]

    submissions_to_convert = []
    for submission_path in submissions:
        outpath = get_outpath(submission_path)
        logfile_path = 'logs/' + os.path.splitext(os.path.basename(submission_path))[0] + '.txt'
        if not os.path.isfile(outpath) and not os.path.isfile(logfile_path):
            submissions_to_convert.append(submission_path)
    return submissions_to_convert
def convert(tar_path):
    '''
    Converts submission into XML, calling
    latexmlc --dest=[output_file] [input_file]
    Latexmlc will be able to extract ZIPs (not Tars unfortunately)
    https://github.com/brucemiller/LaTeXML/issues/1091

    tar_path: directory handed to get_submissions_to_convert(); every
        submission it returns is converted in turn.

    For each submission, latexmlc's stderr is written to
    'logs/<submission_id>.txt'. A submission that exceeds the timeout is
    recorded in 'logs/failed_conversions_log.txt' and the loop continues.
    KeyboardInterrupt and any other exception are re-raised.
    '''
    confirmDir('logs')
    for submission in get_submissions_to_convert(tar_path):
        outpath = get_outpath(submission)
        submission_id = os.path.splitext(os.path.basename(outpath))[0]
        logfile_path = 'logs/' + submission_id + '.txt'
        try:
            with open(logfile_path, 'w+') as logfile:
                sp.call(['latexmlc', '--timeout=240', '--dest=' + outpath, submission], timeout=300, stderr=logfile)
            print('Writing logfile for ' + submission_id)
        except sp.TimeoutExpired:  # prevents hanging, for now
            print(submission_id + ' timed out!')
            # Append: opening with 'w+' would erase the record of every
            # earlier timeout each time a new one is logged.
            with open('logs/failed_conversions_log.txt', 'a+') as failed_logfile:
                failed_logfile.write(submission_id + '\n')
        except KeyboardInterrupt:
            # If I interrupt the conversion, remove the logfile so it can be reattempted
            print('You interrupted convert()!')
            print('Removing ' + logfile_path)
            try:
                os.remove(logfile_path)
            except FileNotFoundError:
                # Interrupted before the logfile was created: nothing to
                # remove, and the interrupt itself must still propagate.
                pass
            raise
        except Exception as e:
            # format() instead of '+': concatenating str and an exception
            # object raises TypeError and hides the original error.
            print('Something went wrong in convert(): {}'.format(e))
            raise