From 4f3041e8793901e08ca892ea8158655a27828624 Mon Sep 17 00:00:00 2001
From: Maurice Mengel <mauricemengel@gmail.com>
Date: Mon, 17 Jun 2024 14:27:52 +0200
Subject: [PATCH] fix relWorksCache: save cache file to disk

---
 zml2lido/lidoTool.py      |  4 ++--
 zml2lido/linkChecker.py   |  5 ++++-
 zml2lido/relWorksCache.py | 17 ++++++++++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py
index 775ecbb..5d8a1a2 100644
--- a/zml2lido/lidoTool.py
+++ b/zml2lido/lidoTool.py
@@ -248,11 +248,11 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path:
         """
         srcP = Path(src)
         lidoFn = self.outdir.joinpath(srcP.stem + ".lido.xml")
-        print(f"zml2lidoSingle with {xsl[xslt]}")  # with file '{lidoFn}'
+        # print(f"zml2lidoSingle with {xsl[xslt]}")  # with file '{lidoFn}'
 
         if self.force is True or not lidoFn.exists():
             if srcP.suffix == ".zip":  # unzipping temp file
-                print("   src is zipped")
+                print(f"   src is zipped {srcP}")
                 parent_dir = srcP.parent
                 member = Path(srcP.name).with_suffix(".xml")
                 temp_fn = parent_dir / member
diff --git a/zml2lido/linkChecker.py b/zml2lido/linkChecker.py
index e4a78fe..bb2614b 100644
--- a/zml2lido/linkChecker.py
+++ b/zml2lido/linkChecker.py
@@ -69,7 +69,7 @@ def fixRelatedWorks(self) -> None:
         )
 
         # for //relatedWork in the current LIDO document
-        for objectID_N in relatedWorksL:
+        for idx, objectID_N in enumerate(relatedWorksL):
             # don't _log self._log(f"fixRelatedWorks checking {objectID_N.text}")
 
             # assuming that source always exists
@@ -98,6 +98,9 @@ def fixRelatedWorks(self) -> None:
                         self.rwc.lookup_relWork(mtype=mtype, ID=id_int)
                     # at this point we can rely on item being in relWorks cache
                     self._rewrite_relWork(mtype=mtype, objectID_N=objectID_N)
+            if idx % 10 == 0:  # persist the cache on every 10th relatedWork only
+                print("Saving relWorks cache")
+                self.rwc.save()
 
     def linkResource_online_http(self) -> None:
         """
diff --git a/zml2lido/relWorksCache.py b/zml2lido/relWorksCache.py
index 2190099..f3dda77 100644
--- a/zml2lido/relWorksCache.py
+++ b/zml2lido/relWorksCache.py
@@ -41,6 +41,7 @@ def __init__(self, *, maxSize: int = 20_000, cache_dir: Path) -> None:
         self.cache = Module()
         self.maxSize = maxSize
         self.cache_path = cache_dir / "relWorks_cache.xml"
+        print(f"{self.cache_path=}")
 
         user, pw, baseURL = get_credentials()
         self.client = MpApi(baseURL=baseURL, user=user, pw=pw)
@@ -52,6 +53,12 @@ def lookup_from_lido_chunks(self, *, path: Path) -> None:
         """
         for p in per_chunk(path=path):
             self.lookup_from_lido_file(path=p)
+            print(f"Cache size: {len(self.cache)}")
+            self.save()
+            if len(self.cache) > self.maxSize:
+                print(f"Cache is big enough. Let's go {len(self.cache)}")
+                # cache was saved just above; stop reading further chunks
+                break
 
     def lookup_from_lido_file(self, *, path: Path) -> None:
         """
@@ -65,6 +72,9 @@ def lookup_from_lido_file(self, *, path: Path) -> None:
         IDs = self._lido_to_ids(path=path)
         for mtype, id_int in IDs:
             self.lookup_relWork(mtype=mtype, ID=id_int)
+            if len(self.cache) > self.maxSize:
+                print("relWorksCache has reached maxSize")
+                break
 
     def lookup_relWork(self, *, mtype: str, ID: int) -> None:
         """
@@ -139,7 +149,12 @@ def save(self) -> Path:
         """
         path: Path = self.cache_path
         print(f"Saving file cache {self.cache_path}")
-        self.cache.toFile(path=path)
+        try:
+            self.cache.toFile(path=path)
+        except KeyboardInterrupt:
+            print(
+                "Catching keyboard interrupt while saving of relWorksCache; try again..."
+            )
         return path
 
     #