From 4f3041e8793901e08ca892ea8158655a27828624 Mon Sep 17 00:00:00 2001 From: Maurice Mengel Date: Mon, 17 Jun 2024 14:27:52 +0200 Subject: [PATCH] fix relWorksCache: save cache file to disk --- zml2lido/lidoTool.py | 4 ++-- zml2lido/linkChecker.py | 5 ++++- zml2lido/relWorksCache.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py index 775ecbb..5d8a1a2 100644 --- a/zml2lido/lidoTool.py +++ b/zml2lido/lidoTool.py @@ -248,11 +248,11 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path: """ srcP = Path(src) lidoFn = self.outdir.joinpath(srcP.stem + ".lido.xml") - print(f"zml2lidoSingle with {xsl[xslt]}") # with file '{lidoFn}' + # print(f"zml2lidoSingle with {xsl[xslt]}") # with file '{lidoFn}' if self.force is True or not lidoFn.exists(): if srcP.suffix == ".zip": # unzipping temp file - print(" src is zipped") + print(f" src is zipped {srcP}") parent_dir = srcP.parent member = Path(srcP.name).with_suffix(".xml") temp_fn = parent_dir / member diff --git a/zml2lido/linkChecker.py b/zml2lido/linkChecker.py index e4a78fe..bb2614b 100644 --- a/zml2lido/linkChecker.py +++ b/zml2lido/linkChecker.py @@ -69,7 +69,7 @@ def fixRelatedWorks(self) -> None: ) # for //relatedWork in the current LIDO document - for objectID_N in relatedWorksL: + for idx, objectID_N in enumerate(relatedWorksL): # don't _log self._log(f"fixRelatedWorks checking {objectID_N.text}") # assuming that source always exists @@ -98,6 +98,9 @@ def fixRelatedWorks(self) -> None: self.rwc.lookup_relWork(mtype=mtype, ID=id_int) # at this point we can rely on item being in relWorks cache self._rewrite_relWork(mtype=mtype, objectID_N=objectID_N) + if idx % 10: + print("Saving relWorks cache") + self.rwc.save() def linkResource_online_http(self) -> None: """ diff --git a/zml2lido/relWorksCache.py b/zml2lido/relWorksCache.py index 2190099..f3dda77 100644 --- a/zml2lido/relWorksCache.py +++ b/zml2lido/relWorksCache.py @@ -41,6 +41,7 @@ def __init__(self, *, maxSize: int = 20_000, cache_dir: Path) -> None: self.cache = Module() self.maxSize = maxSize self.cache_path = cache_dir / "relWorks_cache.xml" + print(f"{self.cache_path=}") user, pw, baseURL = get_credentials() self.client = MpApi(baseURL=baseURL, user=user, pw=pw) @@ -52,6 +53,12 @@ def lookup_from_lido_chunks(self, *, path: Path) -> None: """ for p in per_chunk(path=path): self.lookup_from_lido_file(path=p) + print(f"Cache size: {len(self.cache)}") + self.save() + if len(self.cache) > self.maxSize: + print("Cache is big enough. Let's go {len(self.cache)}") + self.save() + break def lookup_from_lido_file(self, *, path: Path) -> None: """ @@ -65,6 +72,9 @@ def lookup_from_lido_file(self, *, path: Path) -> None: IDs = self._lido_to_ids(path=path) for mtype, id_int in IDs: self.lookup_relWork(mtype=mtype, ID=id_int) + if len(self.cache) > self.maxSize: + print("relWorksCache has reached maxSize") + break def lookup_relWork(self, *, mtype: str, ID: int) -> None: """ @@ -139,7 +149,12 @@ def save(self) -> Path: """ path: Path = self.cache_path print(f"Saving file cache {self.cache_path}") - self.cache.toFile(path=path) + try: + self.cache.toFile(path=path) + except KeyboardInterrupt: + print( + "Catching keyboard interrupt while saving of relWorksCache; try again..." + ) return path #