From 95621a61f8e3bd43b129130cd99488db56146bc0 Mon Sep 17 00:00:00 2001
From: Maurice Mengel
Date: Tue, 18 Jun 2024 10:55:20 +0200
Subject: [PATCH] debugging and modernization

---
 test/query767070-chunk1.lido.xml |  1 +
 test/test_lidoTool.py            | 15 +++++
 zml2lido/lidoTool.py             | 95 +++++++++++++++++---------------
 zml2lido/linkChecker.py          | 31 +++++++----
 zml2lido/relWorksCache.py        | 33 ++++++-----
 5 files changed, 108 insertions(+), 67 deletions(-)
 create mode 100644 test/query767070-chunk1.lido.xml

diff --git a/test/query767070-chunk1.lido.xml b/test/query767070-chunk1.lido.xml
new file mode 100644
index 0000000..e16c76d
--- /dev/null
+++ b/test/query767070-chunk1.lido.xml
@@ -0,0 +1 @@
+""
diff --git a/test/test_lidoTool.py b/test/test_lidoTool.py
index cf5cdc3..c2c7f39 100644
--- a/test/test_lidoTool.py
+++ b/test/test_lidoTool.py
@@ -7,6 +7,21 @@ from pathlib import Path
 
 
+def test_firstChunkName() -> None:
+    lt = LidoTool(src="group416397-chunk1.lido.xml")
+    p = Path("query767070-chunk1.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+    p = Path("query767070-chunk10.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+    p = Path("query767070-chunk10.lvl2.lido.xml")
+    first_chunk = lt.firstChunkName(src=p)
+    assert str(first_chunk) == "query767070-chunk1.lido.xml"
+
+
 def test_saxon() -> None:
     lt = LidoTool(src="group416397-chunk1.lido.xml")
 
diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py
index a6ff732..94ba9f7 100644
--- a/zml2lido/lidoTool.py
+++ b/zml2lido/lidoTool.py
@@ -76,7 +76,7 @@ def __init__(
         self.chunks = chunks
         self.script_dir = Path(__file__).parents[1]
 
-        self.src = self._sanitize(src=src)
+        self.src = self._sanitize(src=src)  # returns Path
         self.outdir = self._prepareOutdir()
         print(f" outdir {self.outdir}")
         self._initLog()
@@ -92,19 +92,20 @@ def execute(self, job: str) -> None:
             case "ddd":  # debug. Only lvl1 and validate
                 lido_fn = self.zml2lido(src=self.src)
                 self.validate(path=lido_fn)
-                self.splitLido(src=lido_fn)
+                self.split_lido(src=lido_fn)
             case "ohneLit":
                 # use different xslt for lvl1 conversion plus lvl2
                 lido_fn = self.zml2lido(src=self.src, xslt="ohneLit")
                 lvl2_fn = self.to_lvl2(src=lido_fn)
+                logging.info(f"{lvl2_fn} should be lvl2 file")
                 self.validate(path=lvl2_fn)
-                self.splitLido(src=lvl2_fn)
+                self.split_lido(src=lvl2_fn)
             case "mitLit":
                 # regular xslt, lvl2
                 lido_fn = self.zml2lido(src=self.src)
                 lvl2_fn = self.to_lvl2(src=lido_fn)
                 self.validate(path=lvl2_fn)
-                self.splitLido(src=lvl2_fn)
+                self.split_lido(src=lvl2_fn)
             case _:
                 raise SyntaxError("ERROR: Unknown job name!")
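
Note: the three jobs above share one pipeline (zml2lido -> to_lvl2 -> validate ->
split_lido) and differ only in the lvl1 XSLT and whether the lvl2 step runs. A
minimal usage sketch; the file name is borrowed from the new test fixture, and
chunks=True is assumed for illustration:

    from zml2lido.lidoTool import LidoTool

    lt = LidoTool(src="test/query767070-chunk1.lido.xml", chunks=True)
    lt.execute("ohneLit")  # lvl1 conversion, lvl2 rewrite, validation, split
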
+ """ if self.chunks: for chunkFn in self.loopChunks(src=src): print(f"{chunkFn=}") - new_fn = self.to_lvl2Single(src=chunkFn) + new_fn = self.to_lvl2_single(src=chunkFn) return self.firstChunkName(src=new_fn) else: - return self.to_lvl2Single(src=src) + return self.to_lvl2_single(src=src) - def to_lvl2Single(self, *, src: str | Path) -> Path: + def to_lvl2_single(self, *, src: Path) -> Path: """ Using Python rewrite (fix) generic Zetcom xml, mostly working on links (urls) """ - out_fn = self._lvl2_path(src) try: self.lc except AttributeError: # only initalize and load lido files into relWorksCache once + # need src here for path atm self.lc = LinkChecker(src=src, chunks=self.chunks) + out_fn = self._lvl2_path(src) if not out_fn.exists() or self.force: + self.lc.new_src(src=src) # self.lc.relWorks_cache_single(fn=src) self.lc.rmUnpublishedRecords() # remove unpublished records (not on SMB-Digital) self.lc.fixRelatedWorks() @@ -149,17 +155,19 @@ def to_lvl2Single(self, *, src: str | Path) -> Path: print(f" lvl2 already exists: {out_fn}") return out_fn - def splitLido(self, *, src: str | Path) -> str | Path: - # print("SPLITLIDO enter") + def split_lido(self, *, src: Path) -> Path: + # logging.debug(f"WARN: split_lido: {src}") + # print("split_lido enter") if self.chunks: self.force = True # otherwise subsequent chunks are not written for chunkFn in self.loopChunks(src=src): - self.splitLidoSingle(src=chunkFn) + logging.debug(f"WARN: split_lido: XXXXX: {chunkFn}") + self.split_lido_single(src=chunkFn) else: - self.splitLidoSingle(src=src) + self.split_lido_single(src=src) return src # dont act on split files - def splitLidoSingle(self, *, src: str | Path) -> None: + def split_lido_single(self, *, src: Path) -> None: """ Create individual files per lido record """ @@ -168,7 +176,7 @@ def splitLidoSingle(self, *, src: str | Path) -> None: print(f"split's parent: {self.outdir=}") # existance of splitDir is a bad criterion, but cant think of a better one if not splitDir.exists() or self.force: # self.force is True was problematic - print("SPLITLIDO making") + print("split_lido making") os.chdir(self.outdir) self.saxon(src=src, xsl=xsl["splitLido"], output="o.xml") os.chdir(orig) @@ -201,12 +209,9 @@ def splitSachbegriffSingle(self, *, src: str) -> Path: def validate(self, *, path: Path) -> None: """ - It's optionally possible to specify a path for a file that needs validatation. If - path is None, the file that was specified during __init__ will be validated. + Only validates if self.validation is True. If the method validate doesn't die, data validates. - - (Not tested recently for chunks...) """ if not self.validation: return @@ -215,11 +220,11 @@ def validate(self, *, path: Path) -> None: if self.chunks: print(" with chunks") for chunkFn in self.loopChunks(src=path): - self.validateSingle(src=chunkFn) + self.validate_single(src=chunkFn) else: - self.validateSingle(src=path) + self.validate_single(src=path) - def validateSingle(self, *, src: Path) -> Path: + def validate_single(self, *, src: Path) -> Path: """ Why do we return a the path? """ @@ -278,7 +283,7 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path: # more helpers # - def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]: + def loopChunks(self, *, src: Path) -> Iterable[Path]: """ returns generator with path for existing files, counting up as long files exist. 
@@ -278,7 +283,7 @@ def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path:
     # more helpers
     #
 
-    def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]:
+    def loopChunks(self, *, src: Path) -> Iterable[Path]:
         """
         returns generator with path for existing files, counting up as long as
         files exist. For this to work, filename has to include
@@ -289,13 +294,13 @@ def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]:
         print(f"chunk src: {src}")
         root, no, tail = self._analyze_chunkFn(src=src)
         chunkFn = src
-        while Path(chunkFn).exists():
+        while chunkFn.exists():
             yield chunkFn
             # print(f"{chunkFn} exists")
             no += 1
-            chunkFn = f"{root}-chunk{no}{tail}"
+            chunkFn = Path(f"{root}-chunk{no}{tail}")
 
-    def firstChunkName(self, *, src: str | Path):
+    def firstChunkName(self, *, src: Path) -> Path:
         """
         returns the chunk with no. 1
@@ -303,19 +308,21 @@ def firstChunkName(self, *, src: str | Path):
         Can we get the first file instead of forcing people to start with chunk1?
         - List glob root* and take the first item?
         """
         root, no, tail = self._analyze_chunkFn(src=src)
-        src = Path(src)
-        parent = src.parent
+        parent_dir = src.parent
+        if not parent_dir.exists():
+            raise Exception("parent dir does not exist")
         folder = {}
-        for each in parent.iterdir():
-            if str(each).startswith(root):
-                root, no, tail = self._analyze_chunkFn(src=each)
-                folder[no] = each
+        for file in parent_dir.iterdir():
+            if str(file).startswith(root):
+                root, no, tail = self._analyze_chunkFn(src=file)
+                folder[no] = file
+        if len(folder) == 0:
+            raise FileNotFoundError(f"No file found in {parent_dir}")
         no = min(folder.keys())
         firstFn = folder[no]
-        # print(f"***firstChunkName {firstFn}")
+        # logging.info(f"firstChunkName: {src} -> {firstFn=}")
         return firstFn
 
     def saxon(
@@ -382,23 +389,23 @@ def _initLog(self) -> None:
         logging.basicConfig(
             datefmt="%Y%m%d %I:%M:%S %p",
             filename=log_fn,
-            filemode="a",  # append now since we're starting a new folder
+            filemode="w",  # w=write; was "a" (append), but each run starts a new folder
             encoding="utf-8",
-            level=logging.INFO,
+            level=logging.DEBUG,
             format="%(asctime)s: %(message)s",
         )
         log = logging.getLogger()
         log.addHandler(logging.StreamHandler(sys.stdout))
 
-    def _lvl2_path(self, p: str | Path) -> Path:
+    def _lvl2_path(self, p: Path) -> Path:
         """
         Given a lvl1 lido path, determine the lvl2 path
         """
-        p = Path(p)
         suffixes = "".join(p.suffixes)
         stem = str(p.name).split(".")[0]  # splits off multiple suffixes
-        new_dir = p.parent  # / "lvl2"
-        # new_dir.mkdir(exist_ok=True)
+        new_dir = p.parent / "lvl2"
+        if not new_dir.exists():
+            new_dir.mkdir()  # exist_ok not needed; guarded by the check above
         new_p = new_dir.joinpath(stem + "-lvl2" + suffixes)
         return new_p
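
Note: lvl2 files now land in a "lvl2" subdirectory instead of next to the lvl1
file. A standalone mirror of the path arithmetic in _lvl2_path (mkdir left out),
useful for eyeballing the mapping:

    from pathlib import Path

    def lvl2_path(p: Path) -> Path:
        suffixes = "".join(p.suffixes)   # e.g. ".lido.xml"
        stem = p.name.split(".")[0]      # e.g. "query767070-chunk1"
        return p.parent / "lvl2" / f"{stem}-lvl2{suffixes}"

    assert lvl2_path(Path("x/query767070-chunk1.lido.xml")) == Path(
        "x/lvl2/query767070-chunk1-lvl2.lido.xml"
    )
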
""" # script_dir = Path(__file__).parents[1] # print(f"SCRIPT_DIR: {script_dir}") diff --git a/zml2lido/linkChecker.py b/zml2lido/linkChecker.py index deb943e..a414c52 100644 --- a/zml2lido/linkChecker.py +++ b/zml2lido/linkChecker.py @@ -30,6 +30,8 @@ # from zml2lido import NSMAP NSMAP = {"l": "http://www.lido-schema.org"} +rescan_lvl1_files_at_init = False + class LinkChecker: def __init__(self, *, src: Path, chunks: bool = False) -> None: @@ -37,7 +39,6 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None: f"STATUS: LinkChecker is working on {src}" ) # not exactly an error # self.chunk = chunk - self.data = etree.parse(str(src)) self.chunks = chunks user, pw, baseURL = get_credentials() self.client = MpApi(baseURL=baseURL, user=user, pw=pw) @@ -45,12 +46,13 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None: self.rwc = RelWorksCache(maxSize=20_000, cache_dir=cache_dir) self.rwc.load_cache_file() # load file if it exists once atb - # run only once to make cache - if self.chunks: - print("prepare relWorks cache (chunks, many)") - self.rwc.lookup_from_lido_chunks(path=Path(src)) - else: - self.rwc.lookup_from_lido_file(path=Path(src)) + if rescan_lvl1_files_at_init: + # run only once to update cache + if self.chunks: + print("prepare relWorks cache (chunks, many)") + self.rwc.lookup_from_lido_chunks(path=Path(src)) + else: + self.rwc.lookup_from_lido_file(path=Path(src)) def fixRelatedWorks(self) -> None: """ @@ -84,8 +86,9 @@ def fixRelatedWorks(self) -> None: mtype = "Literature" case "ISIL/ID": # conceivable that lxml processes some nodes multiple times + # this seems to happen when we change lxml tree without making a deepcopy logging.warning( - "ERROR: 'ISIL/ID' indicates that processing a LIDO file for a second time" + "WARN: 'ISIL/ID' indicates that processing a LIDO file for a second time" ) mtype = "rewritten" # fake case case _: @@ -153,6 +156,9 @@ def rmInternalLinks(self) -> None: resourceSet = link.getparent().getparent() resourceSet.getparent().remove(resourceSet) + def new_src(self, *, src: Path) -> None: + self.data = etree.parse(str(src)) + def rmUnpublishedRecords(self) -> None: """ Remove lido records which are not published on SMB Digital. 
@@ -193,7 +199,12 @@ def _del_relWork(self, *, ID_N: Any) -> None:
         """
         logging.debug(f" removing unpublic relWork {ID_N.text}")
         relWorkSet = ID_N.getparent().getparent().getparent()
-        relWorkSet.getparent().remove(relWorkSet)
+        relWorkWrap = relWorkSet.getparent()
+        relWorkWrap.remove(relWorkSet)
+        resL = relWorkWrap.xpath("l:relatedWorkSet", namespaces=NSMAP)
+        if len(resL) == 0:
+            # logging.info("removing empty relWorkWrap")
+            relWorkWrap.getparent().remove(relWorkWrap)
 
     def _lookup_ISIL(self, *, institution) -> str:
         """
@@ -252,7 +263,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID_N: Any) -> None:
             else:
                 ISIL = self._lookup_ISIL(institution=verwInst.text)
                 objectID_N.text = f"{ISIL}/{str(id_int)}"
-                logging.debug(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
+                # logging.debug(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
             # print(f"_rewrite_relWork {mtype} {id_int} rewrite ok")
         else:
             self._del_relWork(ID_N=objectID_N)  # rm from lido lvl2
diff --git a/zml2lido/relWorksCache.py b/zml2lido/relWorksCache.py
index 68c8a6e..c4f512c 100644
--- a/zml2lido/relWorksCache.py
+++ b/zml2lido/relWorksCache.py
@@ -15,13 +15,13 @@
     rw.item_is_online(mtype="Object", ID=1234)  # true if item in cache indicates it's online
     rw.save()  # save in-memory cache to disk
+    rw.save_if_changed
 
-Currently: we NOT respect max_size?
-
-How do we delete items from cache if the maxSize is reached?
-
-
+Currently: if maxSize is reached, we can't add any more data. Let's just split the mpApi
+data into smaller chunks then.
+
+TODO: How do we delete items from the cache when maxSize is reached? We could drop the
+oldest entry to add the next.
 """
 
 from lxml import etree
@@ -90,7 +90,8 @@ def lookup_relWork(self, *, mtype: str, ID: int) -> None:
         q = self._optimize_query(query=q)
         print(f"{self.length()} looking up relWork {mtype} {ID}")
         relWorkM = self.client.search2(query=q)
-        if relWorkM:  # realistic that query results are empty?
+        # realistic that query results are empty?
+        if relWorkM and self.cache.length() < self.maxSize:
             self.changed = True
             self.cache += relWorkM  # appending them to relWork cache
         # what to do if nothing is found?
@@ -113,6 +114,7 @@ def item_is_online(self, *, mtype: str, ID: int) -> bool:
         Report if, according to info in cache, the item has SMB-Freigabe.
         """
         if not self.item_exists(mtype=mtype, ID=ID):
+            # possible if maxSize was exceeded and the item never got cached
             raise KeyError("ERROR: Item not in Cache")
 
         r = self.cache.xpath(
@@ -197,15 +199,20 @@ def _lido_to_ids_not_in_cache(self, path: Path) -> set[tuple[str, int]]:
         id_cache = set()
         for ID_N in relWorksL:
             src = ID_N.xpath("@l:source", namespaces=NSMAP)[0]
-            if src == "OBJ.ID":
-                mtype = "Object"
-            elif src == "LIT.ID":
-                mtype = "Literature"
-            else:
-                raise ValueError(f"ERROR: Unknown type: {src}")
+            match src:
+                case "OBJ.ID":
+                    mtype = "Object"
+                case "LIT.ID":
+                    mtype = "Literature"
+                case _:
+                    raise ValueError(f"ERROR: Unknown type: {src}")
             id_int = int(ID_N.text)
-            if not self.cache.item_exists(mtype=mtype, ID=id_int):
+            if (
+                not self.cache.item_exists(mtype=mtype, ID=id_int)
+                and self.cache.length() < self.maxSize
+            ):
+                self.changed = True
                 id_cache.add((mtype, id_int))
             # else:
             #     print(f"item {mtype} {id_int} already in relWorks cache")
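
Note: the TODO in the module docstring asks how to evict once maxSize is hit;
with this patch the cache simply stops adding (see lookup_relWork and
_lido_to_ids_not_in_cache above). One possible drop-oldest policy, sketched as a
hypothetical standalone container rather than the cache class this module
actually uses:

    from collections import OrderedDict
    from typing import Any

    class DropOldestCache:
        """FIFO eviction sketch for the maxSize TODO."""

        def __init__(self, max_size: int) -> None:
            self.max_size = max_size
            self._items: OrderedDict[tuple[str, int], Any] = OrderedDict()

        def add(self, mtype: str, ID: int, record: Any) -> None:
            self._items[(mtype, ID)] = record
            if len(self._items) > self.max_size:
                self._items.popitem(last=False)  # evict the oldest entry

        def item_exists(self, mtype: str, ID: int) -> bool:
            return (mtype, ID) in self._items
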