From 95621a61f8e3bd43b129130cd99488db56146bc0 Mon Sep 17 00:00:00 2001
From: Maurice Mengel
Date: Tue, 18 Jun 2024 10:55:20 +0200
Subject: [PATCH] debugging and modernization

---
 test/query767070-chunk1.lido.xml |  1 +
 test/test_lidoTool.py            | 15 +++++
 zml2lido/lidoTool.py             | 95 +++++++++++++++++---------------
 zml2lido/linkChecker.py          | 31 +++++++----
 zml2lido/relWorksCache.py        | 33 ++++++-----
 5 files changed, 108 insertions(+), 67 deletions(-)
 create mode 100644 test/query767070-chunk1.lido.xml

diff --git a/test/query767070-chunk1.lido.xml b/test/query767070-chunk1.lido.xml
new file mode 100644
index 0000000..e16c76d
--- /dev/null
+++ b/test/query767070-chunk1.lido.xml
@@ -0,0 +1 @@
+""
diff --git a/test/test_lidoTool.py b/test/test_lidoTool.py
index cf5cdc3..c2c7f39 100644
--- a/test/test_lidoTool.py
+++ b/test/test_lidoTool.py
@@ -7,6 +7,21 @@ from pathlib import Path
 
 
+def test_firstChunkName() -> None:
+    lt = LidoTool(src="group416397-chunk1.lido.xml")
+    p = Path("query767070-chunk1.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+    p = Path("query767070-chunk10.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+    p = Path("query767070-chunk10.lvl2.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+
 def test_saxon() -> None:
     lt = LidoTool(src="group416397-chunk1.lido.xml")
 
diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py
index a6ff732..94ba9f7 100644
--- a/zml2lido/lidoTool.py
+++ b/zml2lido/lidoTool.py
@@ -76,7 +76,7 @@ def __init__(
         self.chunks = chunks
         self.script_dir = Path(__file__).parents[1]
 
-        self.src = self._sanitize(src=src)
+        self.src = self._sanitize(src=src)  # returns Path
         self.outdir = self._prepareOutdir()
         print(f" outdir {self.outdir}")
         self._initLog()
@@ -92,19 +92,20 @@ def execute(self, job: str) -> None:
             case "ddd":  # debug. Only lvl1 and validate
                 lido_fn = self.zml2lido(src=self.src)
                 self.validate(path=lido_fn)
-                self.splitLido(src=lido_fn)
+                self.split_lido(src=lido_fn)
             case "ohneLit":
                 # use different xslt for lvl1 conversion plus lvl2
                 lido_fn = self.zml2lido(src=self.src, xslt="ohneLit")
                 lvl2_fn = self.to_lvl2(src=lido_fn)
+                logging.info(f"{lvl2_fn} should be lvl2 file")
                 self.validate(path=lvl2_fn)
-                self.splitLido(src=lvl2_fn)
+                self.split_lido(src=lvl2_fn)
             case "mitLit":
                 # regular xslt, lvl2
                 lido_fn = self.zml2lido(src=self.src)
                 lvl2_fn = self.to_lvl2(src=lido_fn)
                 self.validate(path=lvl2_fn)
-                self.splitLido(src=lvl2_fn)
+                self.split_lido(src=lvl2_fn)
             case _:
                 raise SyntaxError("ERROR: Unknown job name!")
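
Note: the three jobs above share one pipeline (zml2lido -> to_lvl2 -> validate ->
split_lido) and differ only in the lvl1 XSLT and whether the lvl2 step runs. A
minimal usage sketch; the file name is borrowed from the new test fixture, and
chunks=True is assumed for illustration:

    from zml2lido.lidoTool import LidoTool

    lt = LidoTool(src="test/query767070-chunk1.lido.xml", chunks=True)
    lt.execute("ohneLit")  # lvl1 conversion, lvl2 rewrite, validation, split
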
+ """ if self.chunks: for chunkFn in self.loopChunks(src=src): print(f"{chunkFn=}") - new_fn = self.to_lvl2Single(src=chunkFn) + new_fn = self.to_lvl2_single(src=chunkFn) return self.firstChunkName(src=new_fn) else: - return self.to_lvl2Single(src=src) + return self.to_lvl2_single(src=src) - def to_lvl2Single(self, *, src: str | Path) -> Path: + def to_lvl2_single(self, *, src: Path) -> Path: """ Using Python rewrite (fix) generic Zetcom xml, mostly working on links (urls) """ - out_fn = self._lvl2_path(src) try: self.lc except AttributeError: # only initalize and load lido files into relWorksCache once + # need src here for path atm self.lc = LinkChecker(src=src, chunks=self.chunks) + out_fn = self._lvl2_path(src) if not out_fn.exists() or self.force: + self.lc.new_src(src=src) # self.lc.relWorks_cache_single(fn=src) self.lc.rmUnpublishedRecords() # remove unpublished records (not on SMB-Digital) self.lc.fixRelatedWorks() @@ -149,17 +155,19 @@ def to_lvl2Single(self, *, src: str | Path) -> Path: print(f" lvl2 already exists: {out_fn}") return out_fn - def splitLido(self, *, src: str | Path) -> str | Path: - # print("SPLITLIDO enter") + def split_lido(self, *, src: Path) -> Path: + # logging.debug(f"WARN: split_lido: {src}") + # print("split_lido enter") if self.chunks: self.force = True # otherwise subsequent chunks are not written for chunkFn in self.loopChunks(src=src): - self.splitLidoSingle(src=chunkFn) + logging.debug(f"WARN: split_lido: XXXXX: {chunkFn}") + self.split_lido_single(src=chunkFn) else: - self.splitLidoSingle(src=src) + self.split_lido_single(src=src) return src # dont act on split files - def splitLidoSingle(self, *, src: str | Path) -> None: + def split_lido_single(self, *, src: Path) -> None: """ Create individual files per lido record """ @@ -168,7 +176,7 @@ def splitLidoSingle(self, *, src: str | Path) -> None: print(f"split's parent: {self.outdir=}") # existance of splitDir is a bad criterion, but cant think of a better one if not splitDir.exists() or self.force: # self.force is True was problematic - print("SPLITLIDO making") + print("split_lido making") os.chdir(self.outdir) self.saxon(src=src, xsl=xsl["splitLido"], output="o.xml") os.chdir(orig) @@ -201,12 +209,9 @@ def splitSachbegriffSingle(self, *, src: str) -> Path: def validate(self, *, path: Path) -> None: """ - It's optionally possible to specify a path for a file that needs validatation. If - path is None, the file that was specified during __init__ will be validated. + Only validates if self.validation is True. If the method validate doesn't die, data validates. - - (Not tested recently for chunks...) """ if not self.validation: return @@ -215,11 +220,11 @@ def validate(self, *, path: Path) -> None: if self.chunks: print(" with chunks") for chunkFn in self.loopChunks(src=path): - self.validateSingle(src=chunkFn) + self.validate_single(src=chunkFn) else: - self.validateSingle(src=path) + self.validate_single(src=path) - def validateSingle(self, *, src: Path) -> Path: + def validate_single(self, *, src: Path) -> Path: """ Why do we return a the path? """ @@ -278,7 +283,7 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path: # more helpers # - def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]: + def loopChunks(self, *, src: Path) -> Iterable[Path]: """ returns generator with path for existing files, counting up as long files exist. 
@@ -278,7 +283,7 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path:
     # more helpers
     #
 
-    def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]:
+    def loopChunks(self, *, src: Path) -> Iterable[Path]:
         """
         returns generator with path for existing files, counting up as long as
         files exist. For this to work, filename has to include
@@ -289,13 +294,13 @@ def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]:
         print(f"chunk src: {src}")
         root, no, tail = self._analyze_chunkFn(src=src)
         chunkFn = src
-        while Path(chunkFn).exists():
+        while chunkFn.exists():
             yield chunkFn
             # print(f"{chunkFn} exists")
             no += 1
-            chunkFn = f"{root}-chunk{no}{tail}"
+            chunkFn = Path(f"{root}-chunk{no}{tail}")
 
-    def firstChunkName(self, *, src: str | Path):
+    def firstChunkName(self, *, src: Path) -> Path:
         """
         returns the chunk with no. 1
@@ -303,19 +308,21 @@ def firstChunkName(self, *, src: str | Path):
         Can we get the first file instead of forcing people to start with chunk1?
         - List glob root* and take the first item?
         """
         root, no, tail = self._analyze_chunkFn(src=src)
-        src = Path(src)
-        parent = src.parent
+        parent_dir = src.parent
+        if not parent_dir.exists():
+            raise Exception("parent dir does not exist")
         folder = {}
-        for each in parent.iterdir():
-            if str(each).startswith(root):
-                root, no, tail = self._analyze_chunkFn(src=each)
-                folder[no] = each
+        for file in parent_dir.iterdir():
+            if str(file).startswith(root):
+                root, no, tail = self._analyze_chunkFn(src=file)
+                folder[no] = file
+        if len(folder) == 0:
+            raise FileNotFoundError(f"No file found in {parent_dir}")
         no = min(folder.keys())
         firstFn = folder[no]
-        # print(f"***firstChunkName {firstFn}")
+        # logging.info(f"firstChunkName: {src} -> {firstFn=}")
         return firstFn
 
     def saxon(
@@ -382,23 +389,23 @@ def _initLog(self) -> None:
         logging.basicConfig(
             datefmt="%Y%m%d %I:%M:%S %p",
             filename=log_fn,
-            filemode="a",  # append now since we're starting a new folder
+            filemode="w",  # w=write; was "a" (append), but each run starts a new folder
             encoding="utf-8",
-            level=logging.INFO,
+            level=logging.DEBUG,
             format="%(asctime)s: %(message)s",
         )
         log = logging.getLogger()
         log.addHandler(logging.StreamHandler(sys.stdout))
 
-    def _lvl2_path(self, p: str | Path) -> Path:
+    def _lvl2_path(self, p: Path) -> Path:
         """
         Given a lvl1 lido path, determine the lvl2 path
         """
-        p = Path(p)
         suffixes = "".join(p.suffixes)
         stem = str(p.name).split(".")[0]  # splits off multiple suffixes
-        new_dir = p.parent  # / "lvl2"
-        # new_dir.mkdir(exist_ok=True)
+        new_dir = p.parent / "lvl2"
+        if not new_dir.exists():
+            new_dir.mkdir()  # exist_ok not needed; guarded by the check above
         new_p = new_dir.joinpath(stem + "-lvl2" + suffixes)
         return new_p
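
Note: lvl2 files now land in a "lvl2" subdirectory instead of next to the lvl1
file. A standalone mirror of the path arithmetic in _lvl2_path (mkdir left out),
useful for eyeballing the mapping:

    from pathlib import Path

    def lvl2_path(p: Path) -> Path:
        suffixes = "".join(p.suffixes)   # e.g. ".lido.xml"
        stem = p.name.split(".")[0]      # e.g. "query767070-chunk1"
        return p.parent / "lvl2" / f"{stem}-lvl2{suffixes}"

    assert lvl2_path(Path("x/query767070-chunk1.lido.xml")) == Path(
        "x/lvl2/query767070-chunk1-lvl2.lido.xml"
    )
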
""" # script_dir = Path(__file__).parents[1] # print(f"SCRIPT_DIR: {script_dir}") diff --git a/zml2lido/linkChecker.py b/zml2lido/linkChecker.py index deb943e..a414c52 100644 --- a/zml2lido/linkChecker.py +++ b/zml2lido/linkChecker.py @@ -30,6 +30,8 @@ # from zml2lido import NSMAP NSMAP = {"l": "http://www.lido-schema.org"} +rescan_lvl1_files_at_init = False + class LinkChecker: def __init__(self, *, src: Path, chunks: bool = False) -> None: @@ -37,7 +39,6 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None: f"STATUS: LinkChecker is working on {src}" ) # not exactly an error # self.chunk = chunk - self.data = etree.parse(str(src)) self.chunks = chunks user, pw, baseURL = get_credentials() self.client = MpApi(baseURL=baseURL, user=user, pw=pw) @@ -45,12 +46,13 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None: self.rwc = RelWorksCache(maxSize=20_000, cache_dir=cache_dir) self.rwc.load_cache_file() # load file if it exists once atb - # run only once to make cache - if self.chunks: - print("prepare relWorks cache (chunks, many)") - self.rwc.lookup_from_lido_chunks(path=Path(src)) - else: - self.rwc.lookup_from_lido_file(path=Path(src)) + if rescan_lvl1_files_at_init: + # run only once to update cache + if self.chunks: + print("prepare relWorks cache (chunks, many)") + self.rwc.lookup_from_lido_chunks(path=Path(src)) + else: + self.rwc.lookup_from_lido_file(path=Path(src)) def fixRelatedWorks(self) -> None: """ @@ -84,8 +86,9 @@ def fixRelatedWorks(self) -> None: mtype = "Literature" case "ISIL/ID": # conceivable that lxml processes some nodes multiple times + # this seems to happen when we change lxml tree without making a deepcopy logging.warning( - "ERROR: 'ISIL/ID' indicates that processing a LIDO file for a second time" + "WARN: 'ISIL/ID' indicates that processing a LIDO file for a second time" ) mtype = "rewritten" # fake case case _: @@ -153,6 +156,9 @@ def rmInternalLinks(self) -> None: resourceSet = link.getparent().getparent() resourceSet.getparent().remove(resourceSet) + def new_src(self, *, src: Path) -> None: + self.data = etree.parse(str(src)) + def rmUnpublishedRecords(self) -> None: """ Remove lido records which are not published on SMB Digital. 
@@ -193,7 +199,12 @@ def _del_relWork(self, *, ID_N: Any) -> None:
         """
         logging.debug(f" removing unpublic relWork {ID_N.text}")
         relWorkSet = ID_N.getparent().getparent().getparent()
-        relWorkSet.getparent().remove(relWorkSet)
+        relWorkWrap = relWorkSet.getparent()
+        relWorkWrap.remove(relWorkSet)
+        resL = relWorkWrap.xpath("l:relatedWorkSet", namespaces=NSMAP)
+        if len(resL) == 0:
+            # logging.info("removing empty relWorkWrap")
+            relWorkWrap.getparent().remove(relWorkWrap)
 
     def _lookup_ISIL(self, *, institution) -> str:
         """
@@ -252,7 +263,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID_N: Any) -> None:
             else:
                 ISIL = self._lookup_ISIL(institution=verwInst.text)
                 objectID_N.text = f"{ISIL}/{str(id_int)}"
-                logging.debug(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
+                # logging.debug(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
             # print(f"_rewrite_relWork {mtype} {id_int} rewrite ok")
         else:
             self._del_relWork(ID_N=objectID_N)  # rm from lido lvl2
diff --git a/zml2lido/relWorksCache.py b/zml2lido/relWorksCache.py
index 68c8a6e..c4f512c 100644
--- a/zml2lido/relWorksCache.py
+++ b/zml2lido/relWorksCache.py
@@ -15,13 +15,13 @@
     rw.item_is_online(mtype="Object", ID=1234)  # true if item in cache indicates it's online
     rw.save()  # save in-memory cache to disk
+    rw.save_if_changed
 
-Currently: we NOT respect max_size?
-
-How do we delete items from cache if the maxSize is reached?
-
-
+Currently: if maxSize is reached, we can't add any more data. Let's just split the mpApi
+data into smaller chunks then.
+
+TODO: How do we delete items from the cache when maxSize is reached? We could drop the
+oldest entry to add the next.
 """
 
 from lxml import etree
@@ -90,7 +90,8 @@ def lookup_relWork(self, *, mtype: str, ID: int) -> None:
         q = self._optimize_query(query=q)
         print(f"{self.length()} looking up relWork {mtype} {ID}")
         relWorkM = self.client.search2(query=q)
-        if relWorkM:  # realistic that query results are empty?
+        # realistic that query results are empty?
+        if relWorkM and self.cache.length() < self.maxSize:
             self.changed = True
             self.cache += relWorkM  # appending them to relWork cache
         # what to do if nothing is found?
@@ -113,6 +114,7 @@ def item_is_online(self, *, mtype: str, ID: int) -> bool:
         Report if, according to info in cache, the item has SMB-Freigabe.
         """
         if not self.item_exists(mtype=mtype, ID=ID):
+            # possible if maxSize was exceeded and the item never got cached
             raise KeyError("ERROR: Item not in Cache")
 
         r = self.cache.xpath(
@@ -197,15 +199,20 @@ def _lido_to_ids_not_in_cache(self, path: Path) -> set[tuple[str, int]]:
         id_cache = set()
         for ID_N in relWorksL:
             src = ID_N.xpath("@l:source", namespaces=NSMAP)[0]
-            if src == "OBJ.ID":
-                mtype = "Object"
-            elif src == "LIT.ID":
-                mtype = "Literature"
-            else:
-                raise ValueError(f"ERROR: Unknown type: {src}")
+            match src:
+                case "OBJ.ID":
+                    mtype = "Object"
+                case "LIT.ID":
+                    mtype = "Literature"
+                case _:
+                    raise ValueError(f"ERROR: Unknown type: {src}")
             id_int = int(ID_N.text)
-            if not self.cache.item_exists(mtype=mtype, ID=id_int):
+            if (
+                not self.cache.item_exists(mtype=mtype, ID=id_int)
+                and self.cache.length() < self.maxSize
+            ):
+                self.changed = True
                 id_cache.add((mtype, id_int))
             # else:
             #     print(f"item {mtype} {id_int} already in relWorks cache")
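
Note: the TODO in the module docstring asks how to evict once maxSize is hit;
with this patch the cache simply stops adding (see lookup_relWork and
_lido_to_ids_not_in_cache above). One possible drop-oldest policy, sketched as a
hypothetical standalone container rather than the cache class this module
actually uses:

    from collections import OrderedDict
    from typing import Any

    class DropOldestCache:
        """FIFO eviction sketch for the maxSize TODO."""

        def __init__(self, max_size: int) -> None:
            self.max_size = max_size
            self._items: OrderedDict[tuple[str, int], Any] = OrderedDict()

        def add(self, mtype: str, ID: int, record: Any) -> None:
            self._items[(mtype, ID)] = record
            if len(self._items) > self.max_size:
                self._items.popitem(last=False)  # evict the oldest entry

        def item_exists(self, mtype: str, ID: int) -> bool:
            return (mtype, ID) in self._items
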