-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
32 lines (26 loc) · 1.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
from src.data import xml_extraction as xmle
def xml_source_dir(xml_paths, search_pattern):
    """Return the directory to report for one group of discovered XML files.

    Args:
        xml_paths: Sized, indexable collection of XML file paths found for the
            group (may be empty).
        search_pattern: Glob pattern that was searched for the group.

    Returns:
        The directory of the first discovered path, or the directory part of
        ``search_pattern`` when nothing was found.
    """
    # Indexing xml_paths[0] unconditionally raises IndexError for an empty
    # group, so fall back to the pattern that was searched.
    first = xml_paths[0] if len(xml_paths) > 0 else search_pattern
    return os.path.dirname(first)


if __name__ == '__main__':
    # Extract catalogue entries volume by volume; range(3, 4) selects only
    # volume 3.
    for i in range(3, 4):
        # os.path.join yields the same strings as the old hard-coded
        # backslash paths on Windows, and valid paths everywhere else.
        two_col_loc = os.path.join("data", "raw", f"BMC_{i}_2", "*")
        four_col_loc = os.path.join("data", "raw", f"BMC_{i}_4", "*")

        xmls_2, xmls_4 = xmle.gen_2_4_col_xml_paths(two_col_loc, four_col_loc)
        if len(xmls_2) + len(xmls_4) == 0:
            # Nothing to process for this volume: say so explicitly instead of
            # failing with an IndexError in the summary below.
            print(f"No xmls found for BMC_{i} in {two_col_loc} or {four_col_loc}; skipping")
            continue
        print(f"{len(xmls_2) + len(xmls_4)} xmls extracted from\n"
              f"2 col [{len(xmls_2):03}]: {xml_source_dir(xmls_2, two_col_loc)}\n"
              f"4 col [{len(xmls_4):03}]: {xml_source_dir(xmls_4, four_col_loc)}")
        vol_xml_trees = xmle.gen_2_4_col_xml_trees(xmls_2, xmls_4)

        print("\nExtracting catalogue entries from xmls")
        lines, xml_track_df = xmle.extract_lines_for_vol(vol_xml_trees)
        title_shelfmarks, title_indices, ordered_lines = xmle.find_headings(lines)
        entry_df = xmle.extract_catalogue_entries(ordered_lines, title_indices, title_shelfmarks, xml_track_df)
        print(f"Extracted {len(entry_df)} entries")

        out_path = os.path.join("data", "processed", f"BMC_{i}")
        print(f"Saving catalogue entries to {out_path}")
        # exist_ok=True replaces the separate exists() check, which could race
        # with another process creating the directory.
        os.makedirs(out_path, exist_ok=True)
        entry_df.to_csv(os.path.join(out_path, "catalogue_entries_bought_in.csv"), index=False)

        # NOTE: the disabled steps below are kept for reference only. They use
        # names that are not defined in this script (current_volume, xmls,
        # outpath, allTitleIndices, allLines, titleRefs) and would need
        # updating before being re-enabled.
        # xmle.save_poorly_scanned_pages(xmle.get_poorly_scanned_pages(current_volume, xmls), out_path)
        # print("Saving raw txt files")
        # # entry_df.groupby(by=["xml", "shelfmark"]).progress_apply(lambda x: xmle.groupby_save(x, outpath))
        # # print("Saving split txt files")
        # # xmle.saveSplitTxt(allTitleIndices, allLines, os.path.join(out_path, "splittextfiles"), titleRefs)
        # xmle.save_xml(lines, title_indices, title_shelfmarks, out_path)