-
Notifications
You must be signed in to change notification settings - Fork 0
/
img_creation.py
94 lines (78 loc) · 2.45 KB
/
img_creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from pyPdf import PdfFileWriter, PdfFileReader
import sys
import json
import os
class pdf_metadata:
    """Shared holder for metadata of the PDF currently being processed."""

    # Overwritten by load_pdf_metadata(); crop_image() subtracts box
    # y-coordinates from this value. Remains 0 until metadata is loaded.
    page_height = 0
def load_pdf_metadata(filename):
    """Load the page height for *filename* into ``pdf_metadata.page_height``.

    Only the first line of
    ``<filename>_data/pdfbox_output/pdf_metadata.txt`` is read; its second
    comma-separated field is parsed as a float. If the file is empty the
    stored page height is left unchanged.

    Raises:
        IOError/OSError: if the metadata file cannot be opened.
        IndexError: if the first line contains no comma.
        ValueError: if the second field is not a valid float.
    """
    metadata_path = filename + "_data/pdfbox_output/pdf_metadata.txt"
    with open(metadata_path) as handle:
        # Everything after the first line is ignored.
        first_line = next(handle, None)
        if first_line is not None:
            fields = first_line.split(",")
            pdf_metadata.page_height = float(fields[1])
def crop_image(box, pdf_page, filename, count):
    """Crop the first page of a PDF to *box* and write it to ``OCR_DATASET/``.

    Args:
        box: Indexable whose first four items are x0, y0, x1, y1 (anything
            ``float()`` accepts). The y values are subtracted from
            ``pdf_metadata.page_height`` before use -- presumably converting
            from a top-left origin to PDF's bottom-left origin; confirm
            against the producer of the boxes.
        pdf_page: Name of the source PDF inside ``<filename>_data/``.
        filename: Document stem used for both the input directory and the
            output file name.
        count: Running index appended to the output file name.

    The result is written to ``OCR_DATASET/<filename>_me_<count>`` (no file
    extension is added). The ``OCR_DATASET`` directory must already exist.
    """
    print("BOX")
    print(box)
    source_path = filename + "_data/" + pdf_page
    with open(source_path, "rb") as in_f:
        reader = PdfFileReader(in_f)
        writer = PdfFileWriter()
        page = reader.getPage(0)

        left = float(box[0])
        top = pdf_metadata.page_height - float(box[1])
        right = float(box[2])
        bottom = pdf_metadata.page_height - float(box[3])
        lower_left = (left, bottom)
        upper_right = (right, top)

        # Trim box and crop box are both set to the same rectangle.
        page.trimBox.lowerLeft = lower_left
        page.trimBox.upperRight = upper_right
        page.cropBox.lowerLeft = lower_left
        page.cropBox.upperRight = upper_right
        writer.addPage(page)

        # The output is written while the input file is still open.
        target_path = "OCR_DATASET/" + filename + "_me_" + str(count)
        with open(target_path, "wb") as out_f:
            writer.write(out_f)
def ocr_fullme_to_latex_dataset(json_filename, ocr_type):
    """Build an image-to-LaTeX dataset from a ground-truth JSON file.

    For every page listed under ``"pdf_me_data"`` whose PDF exists in
    ``<stem>_data/``, each usable bounding box in ``"FULL_BBOX"`` is cropped
    with ``crop_image`` and one line ``<stem>_me_<n>,<latex>`` is appended to
    ``OCR_DATASET/img_to_latex_mapping.txt``, where ``<latex>`` is the entry
    of ``"ME_LATEX"`` at the same position as the box.

    Args:
        json_filename: Name of the JSON file, expected to look like
            ``<stem>_ground...``; the part before ``"_ground"`` is used as
            the document stem. The file is read from
            ``<stem>_data/pdfbox_output/``.
        ocr_type: Currently unused; kept so existing callers keep working.

    Boxes that are empty, equal to ``"FAILED"``, or whose first element is
    ``"FAILED"`` are skipped. Skipped boxes still consume their position in
    ``"ME_LATEX"`` but do not consume an output index.
    """
    output_dir = "OCR_DATASET"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # "<stem>_ground_truth.json" -> "<stem>"
    filename = json_filename.split("_ground")[0]
    load_pdf_metadata(filename)

    json_path = filename + "_data/pdfbox_output/" + json_filename
    with open(json_path) as json_file:
        mined_data = json.load(json_file)["pdf_me_data"]

    mapping_path = "OCR_DATASET/img_to_latex_mapping.txt"
    count = 0  # index of the next output image, across all pages
    # Opened once in append mode instead of once per expression; this also
    # creates the mapping file when no expression ends up being written.
    with open(mapping_path, "a") as ocr_source:
        for page in mined_data:
            if not os.path.isfile(filename + "_data/" + page):
                continue
            latex_list = mined_data[page]["ME_LATEX"]
            bbox_list = mined_data[page]["FULL_BBOX"]
            for me_count, box in enumerate(bbox_list):
                # Test the box itself before indexing so that an empty box
                # is skipped rather than raising IndexError.
                if not box or box == "FAILED" or box[0] == "FAILED":
                    continue
                crop_image(box, page, filename, count)
                ocr_source.write(
                    filename + "_me_" + str(count) + ","
                    + latex_list[me_count] + "\n"
                )
                count += 1
def main():
    """Run the dataset builder on the JSON file named by ``sys.argv[1]``.

    Does nothing when no command-line argument was supplied.
    """
    if len(sys.argv) <= 1:
        return
    # A second CLI argument for the OCR type is not read yet; a fixed value
    # of 1 is passed instead.
    ocr_type = 1
    ocr_fullme_to_latex_dataset(sys.argv[1], ocr_type)
# Run the command-line entry point only when executed as a script, so that
# importing this module does not start processing sys.argv.
if __name__ == "__main__":
    main()
# Example of calling the converter directly instead of via the command line:
#   ocr_fullme_to_latex_dataset("1605.02019_ground_truth.json", 1)