From d8c2964a55a73b2a1f0fa7a90fcbf2014abd2769 Mon Sep 17 00:00:00 2001
From: Maurice Mengel
Date: Tue, 24 Oct 2023 07:29:55 +0200
Subject: [PATCH] refactor python LidoTool, LinkChecker

---
 zml2lido/__init__.py    |   2 +-
 zml2lido/lidoTool.py    | 227 +++++++++++++--------------
 zml2lido/linkChecker.py | 330 ++++++++++++++++++++++------------------
 zml2lido/qc.py          |  19 ++-
 4 files changed, 308 insertions(+), 270 deletions(-)

diff --git a/zml2lido/__init__.py b/zml2lido/__init__.py
index c029877..8290399 100644
--- a/zml2lido/__init__.py
+++ b/zml2lido/__init__.py
@@ -74,7 +74,7 @@ def lido():
     print(f"JOB: {args.job}")

     lt = LidoTool(
-        Input=args.input, force=args.force, validation=args.validate, chunks=args.chunks
+        src=args.input, force=args.force, validation=args.validate, chunks=args.chunks
     )
     lt.execute(args.job)
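Editor's note (not part of the patch): the rename is purely internal; the CLI flag
-i/--input in __init__.py is unchanged, only the constructor keyword moves from
Input= to src=. A minimal sketch of the new call, with an illustrative file path
and job name:

    from zml2lido.lidoTool import LidoTool

    lt = LidoTool(
        src="sdata/3Wege/3Wege.zml.xml",  # hypothetical example path
        force=False,
        validation=False,
        chunks=False,
    )
    lt.execute("mitLit")  # jobs defined in execute(): dd, ddd, ohneLit, mitLit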
diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py
index b690d2f..e4165ae 100644
--- a/zml2lido/lidoTool.py
+++ b/zml2lido/lidoTool.py
@@ -7,7 +7,7 @@
 C:/M3/zml2lido

 You need to specify three parameters
     -j/--job: which flavor (job) of the transformation you want to use
-    -i/--input: where the input xml file is
+    -i/--src: where the src xml file is
     -o/--output: will be used as output directory; in my case
        C:/m3/zml2lido/sdata/{output}

@@ -56,23 +56,23 @@ class LidoTool:
     def __init__(
         self,
         *,
-        Input: str,
+        src: str,
         force: bool = False,
         validation: bool = False,
         chunks: bool = False,
     ) -> None:
         """
-        Input: lido file or first chunk
+        src: lido file or first chunk
         force: overwrites files
         validation: validate lido files?
-        chunks: expect consecutively numbered and zipped lido files as input
+        chunks: expect consecutively numbered and zipped lido files as src
         """

         self.validation = validation
         self.force = force
         self.chunks = chunks
-        self.Input = self._sanitize(Input=Input)
+        self.src = self._sanitize(src=src)
         self.outdir = self._prepareOutdir()
         self._initLog()

@@ -83,20 +83,20 @@ def __init__(
     def execute(self, job: str) -> None:
         if job == "dd":
             # debug. Only lvl1
-            lido_fn = self.zml2lido(Input=self.Input)
+            lido_fn = self.zml2lido(src=self.src)
         elif job == "ddd":
             # debug. Only lvl1
-            lido_fn = self.zml2lido(Input=self.Input)
+            lido_fn = self.zml2lido(src=self.src)
             self._valsplit(lido_fn)
         elif job == "ohneLit":
             # use different xslt for lvl1 conversion plus lvl2
-            lido_fn = self.zml2lido(Input=self.Input, xslt="ohneLit")
-            lvl2_fn = self.to_lvl2(Input=lido_fn)
+            lido_fn = self.zml2lido(src=self.src, xslt="ohneLit")
+            lvl2_fn = self.to_lvl2(src=lido_fn)
             self._valsplit(lvl2_fn)
         elif job == "mitLit":
             # regular xslt, lvl2
-            lido_fn = self.zml2lido(Input=self.Input)
-            lvl2_fn = self.to_lvl2(Input=lido_fn)
+            lido_fn = self.zml2lido(src=self.src)
+            lvl2_fn = self.to_lvl2(src=lido_fn)
             self._valsplit(lvl2_fn)
         else:
             raise SyntaxError("ERROR: Unknown job name!")

@@ -105,53 +105,53 @@ def lfilter(self, *, split: bool = False, Type: str) -> None:
         if not Type in xsl:
             raise TypeError(f"Error: Unknown type '{Type}'")

-        new_fn = self.Input.stem + f"-no{Type}.xml"
+        new_fn = self.src.stem + f"-no{Type}.xml"
         out_fn = self.outdir / new_fn
-        self.saxon(Input=self.Input, xsl=xsl[Type], output=out_fn)
+        self.saxon(src=self.src, xsl=xsl[Type], output=out_fn)

         if split:
             self.force = True
-            self.splitLido(Input=out_fn)
+            self.splitLido(src=out_fn)

-    def to_lvl2(self, *, Input: str) -> Path:
+    def to_lvl2(self, *, src: str) -> Path:
         if self.chunks:
-            for chunkFn in self.loopChunks(Input=Input):
-                new_fn = self.to_lvl2Single(Input=chunkFn)
-            return self.firstChunkName(Input=new_fn)
+            for chunkFn in self.loopChunks(src=src):
+                new_fn = self.to_lvl2Single(src=chunkFn)
+            return self.firstChunkName(src=new_fn)
         else:
-            return self.to_lvl2Single(Input=Input)
+            return self.to_lvl2Single(src=src)

-    def to_lvl2Single(self, *, Input: str) -> Path:
+    def to_lvl2Single(self, *, src: str | Path) -> Path:
         """
-        Using Python rewrite (fix) generic Zetcom xml, mostly working on
-        links (urls)
+        Using Python, rewrite (fix) generic Zetcom xml, mostly working on links (urls).
         """
-        out_fn = self._lvl2_path(Input)
+        out_fn = self._lvl2_path(src)
         # print(f"lvl2: {out_fn}")

-        # init for each chunk required, although we will
-        lc = LinkChecker(Input=Input, chunks=self.chunks)  # reads cache
+        if not hasattr(self, "lc"):  # only load the first time
+            self.lc = LinkChecker(src=src, chunks=self.chunks)  # reads cache

         if not out_fn.exists() or self.force:
-            lc.relWorks_cache_single(fn=Input)
-            lc.rmUnpublishedRecords()  # remove unpublished records (not on SMB-Digital)
-            # lc.rmInternalLinks() # remove resourceSets with internal links
-            lc.fixRelatedWorks()
-            lc.saveTree(out_fn)
+            # self.lc.relWorks_cache_single(fn=src)
+            self.lc.rmUnpublishedRecords()  # remove unpublished records (not on SMB-Digital)
+            self.lc.fixRelatedWorks()
+            self.lc.saveTree(out_fn)
         else:
-            print(f"   rewrite exists already: {out_fn}, no overwrite")
+            print(f"   lvl2 already exists: {out_fn}")
         return out_fn

-    def splitLido(self, *, Input: str) -> str:
+    def splitLido(self, *, src: str | Path) -> str | Path:
         # print("SPLITLIDO enter")
         if self.chunks:
             self.force = True  # otherwise subsequent chunks are not written
-            for chunkFn in self.loopChunks(Input=Input):
-                self.splitLidoSingle(Input=chunkFn)
+            for chunkFn in self.loopChunks(src=src):
+                self.splitLidoSingle(src=chunkFn)
         else:
-            self.splitLidoSingle(Input=Input)
-        return Input  # don't act on split files
+            self.splitLidoSingle(src=src)
+        return src  # don't act on split files

-    def splitLidoSingle(self, *, Input: str) -> None:
+    def splitLidoSingle(self, *, src: str | Path) -> None:
         """
         Create individual files per lido record
         """
@@ -161,21 +161,21 @@ def splitLidoSingle(self, *, Input: str) -> None:
         if not splitDir.exists() or self.force:  # self.force is True was problematic
             print("SPLITLIDO making")
             os.chdir(self.outdir)
-            self.saxon(Input=Input, xsl=xsl["splitLido"], output="o.xml")
+            self.saxon(src=src, xsl=xsl["splitLido"], output="o.xml")
             os.chdir(orig)
         else:
             print(f"   SPLIT DIR exists already: {splitDir}")

-    def splitSachbegriff(self, *, Input: str) -> Path:
+    def splitSachbegriff(self, *, src: str) -> Path:
         print("SPLITSACHBEGRIFF")
         if self.chunks:
-            for chunkFn in self.loopChunks(Input=Input):
-                sachbegriffFn = self.splitSachbegriff(Input=chunkFn)
-            return self.firstChunkName(Input=sachbegriffFn)
+            for chunkFn in self.loopChunks(src=src):
+                sachbegriffFn = self.splitSachbegriffSingle(src=chunkFn)
+            return self.firstChunkName(src=sachbegriffFn)
         else:
-            return self.splitSachbegriffSingle(Input=Input)
+            return self.splitSachbegriffSingle(src=src)

-    def splitSachbegriffSingle(self, *, Input: str) -> Path:
+    def splitSachbegriffSingle(self, *, src: str) -> Path:
         """
         Writes two files to output dir
         ohneSachbegriff.xml is meant for debugging.
@@ -184,13 +184,13 @@ def splitSachbegriffSingle(self, *, Input: str) -> Path:
         os.chdir(self.outdir)
         out = "mitSachbegriff.xml"
         if not Path(out).exists() or self.force is True:
-            self.saxon(Input=Input, xsl=xsl["splitSachbegriff"], output=out)
+            self.saxon(src=src, xsl=xsl["splitSachbegriff"], output=out)
         else:
-            print(f"{out} exist already, no overwrite")
+            print(f"{out} exists already, no overwrite")
         os.chdir(orig)
         return xslDir / out

-    def validate(self, *, path: Optional[str] = None):
+    def validate(self, *, p: str | Path | None = None):
         """
-        It's optionally possible to specify a path for a file that needs
-        validatation. If path is None, the file that was specified during __init__
-        will be validated.
+        Optionally specify a path p for a file that needs validation. If p is
+        None, the file that was specified during __init__ is validated.
@@ -200,74 +200,74 @@

         (Not tested recently for chunks...)
         """

-        if path is None:
-            to_val_fn = self.Input
+        if p is None:
+            to_val_fn = self.src
         else:
-            to_val_fn = path
+            to_val_fn = Path(p)

         print(f"VALIDATING LIDO FILE {to_val_fn}")
         if self.chunks:
             print(" with chunks")
-            for chunkFn in self.loopChunks(Input=to_val_fn):
-                self.validateSingle(Input=chunkFn)
+            for chunkFn in self.loopChunks(src=to_val_fn):
+                self.validateSingle(src=chunkFn)
         else:
-            self.validateSingle(Input=to_val_fn)
+            self.validateSingle(src=to_val_fn)

-    def validateSingle(self, *, Input):
+    def validateSingle(self, *, src):
         if not hasattr(self, "schema"):
             print(f"   loading schema {lidoXSD}")
             schemaDoc = etree.parse(lidoXSD)
             self.schema = etree.XMLSchema(schemaDoc)

-        print(f"   validating {Input}")
-        doc = etree.parse(str(Input))
+        print(f"   validating {src}")
+        doc = etree.parse(str(src))
         self.schema.assert_(doc)  # raises error when not valid
-        return Input
+        return src

-    def zml2lido(self, *, Input, xslt="zml2lido"):
+    def zml2lido(self, *, src, xslt="zml2lido"):
         print(f"ZML2LIDO {xslt}")
         if self.chunks:
             print(" with chunks")
-            for chunkFn in self.loopChunks(Input=self.Input):
-                lidoFn = self.zml2lidoSingle(Input=chunkFn, xslt=xslt)
-            return self.firstChunkName(Input=lidoFn)
+            for chunkFn in self.loopChunks(src=self.src):
+                lidoFn = self.zml2lidoSingle(src=chunkFn, xslt=xslt)
+            return self.firstChunkName(src=lidoFn)
         else:
-            return self.zml2lidoSingle(Input=Input, xslt=xslt)
+            return self.zml2lidoSingle(src=src, xslt=xslt)

-    def zml2lidoSingle(self, *, Input: str | Path, xslt="zml2lido") -> Path:
+    def zml2lidoSingle(self, *, src: str | Path, xslt="zml2lido") -> Path:
         """
         Convert a single file from zml to lido using the specified xslt.
-        Input is a full path.
+        src is a full path.
         """
-        inputP = Path(Input)
-        lidoFn = self.outdir.joinpath(inputP.stem + ".lido.xml")
+        srcP = Path(src)
+        lidoFn = self.outdir.joinpath(srcP.stem + ".lido.xml")
         print(f"zml2lidoSingle with {xsl[xslt]}")  # with file '{lidoFn}'

         if self.force is True or not lidoFn.exists():
-            if inputP.suffix == ".zip":  # unzipping temp file
-                print(" input is zipped")
-                parent_dir = inputP.parent
-                member = Path(inputP.name).with_suffix(".xml")
+            if srcP.suffix == ".zip":  # unzipping temp file
+                print(" src is zipped")
+                parent_dir = srcP.parent
+                member = Path(srcP.name).with_suffix(".xml")
                 temp_fn = parent_dir / member
-                with ZipFile(inputP, "r") as zippy:
+                with ZipFile(srcP, "r") as zippy:
                     zippy.extract(str(member), path=parent_dir)
-                new_input = temp_fn
+                new_src = temp_fn
             else:
-                new_input = inputP
+                new_src = srcP

-            self.saxon(Input=new_input, xsl=xsl[xslt], output=lidoFn)
+            self.saxon(src=new_src, xsl=xsl[xslt], output=lidoFn)

-            if inputP.suffix == ".zip":
+            if srcP.suffix == ".zip":
                 temp_fn.unlink()
         else:
-            print(f"lidoFn exists {lidoFn}")
+            print(f"exists {lidoFn}")
         return lidoFn

     #
     # more helpers
     #

-    def loopChunks(self, *, Input: str) -> Iterable[str]:
+    def loopChunks(self, *, src: str | Path) -> Iterable[str | Path]:
         """
-        returns generator with path for existing files, counting up as long
-        files exist. For this to work, filename has to include
-        chunk123 or similar.
+        Returns a generator with paths for existing files, counting up as long
+        as files exist. For this to work, the filename has to include chunk123
+        or similar.
@@ -275,16 +275,16 @@ def loopChunks(self, *, Input: str) -> Iterable[str]:

-        This might belong in chunker,py to be reusable.
+        This might belong in chunker.py to be reusable.
         """
-        print(f"chunk input: {Input}")
-        root, no, tail = self._analyze_chunkFn(Input=Input)
-        chunkFn = Input
+        print(f"chunk src: {src}")
+        root, no, tail = self._analyze_chunkFn(src=src)
+        chunkFn = src
         while Path(chunkFn).exists():
             yield chunkFn
             # print(f"{chunkFn} exists")
             no += 1
             chunkFn = f"{root}-chunk{no}{tail}"

-    def firstChunkName(self, *, Input: str | Path):
+    def firstChunkName(self, *, src: str | Path):
         """
         returns the chunk with no. 1

@@ -294,30 +294,30 @@ def firstChunkName(self, *, Input: str | Path):
         start with chunk1? List glob root* and take the first item?
         """
-        root, no, tail = self._analyze_chunkFn(Input=Input)
-        Input = Path(Input)
-        parent = Input.parent
+        root, no, tail = self._analyze_chunkFn(src=src)
+        src = Path(src)
+        parent = src.parent
         folder = {}

         for each in parent.iterdir():
             if str(each).startswith(root):
-                root, no, tail = self._analyze_chunkFn(Input=each)
+                root, no, tail = self._analyze_chunkFn(src=each)
                 folder[no] = each
         no = min(folder.keys())
         firstFn = folder[no]
         # print(f"***firstChunkName {firstFn}")
         return firstFn

-    def saxon(self, *, Input: str, output: str, xsl: str) -> None:
+    def saxon(self, *, src: str | Path, output: str | Path, xsl: str | Path) -> None:
         if not Path(saxLib).exists():
             raise SyntaxError(f"ERROR: saxLib {saxLib} does not exist!")

-        if not Path(Input).exists():
-            raise SyntaxError(f"ERROR: input {Input} does not exist!")
+        if not Path(src).exists():
+            raise SyntaxError(f"ERROR: src {src} does not exist!")

         if not Path(xsl).exists():
-            raise SyntaxError(f"ERROR: xsl file does not exist!")
+            raise SyntaxError(f"ERROR: xsl file {xsl} does not exist!")

-        cmd = f"java -Xmx1450m -jar {saxLib} -s:{Input} -xsl:{xsl} -o:{output}"
+        cmd = f"java -Xmx1450m -jar {saxLib} -s:{src} -xsl:{xsl} -o:{output}"
         print(cmd)

         subprocess.run(
@@ -328,18 +328,19 @@ def saxon(self, *, Input: str, output: str, xsl: str) -> None:
     # private helper
     #

-    def _analyze_chunkFn(self, *, Input: str):
+    def _analyze_chunkFn(self, *, src: str | Path) -> tuple[str, int, str]:
         """
-        Input could be Path or str.
+        src could be Path or str.
         This might belong in chunker.py ...
         """
-        # print(f"ENTER ANALYZE WITH {Input}")
-        partsL = str(Input).split("-chunk")
+        # print(f"ENTER ANALYZE WITH {src}")
+        partsL = str(src).split("-chunk")
         root = partsL[0]
         m = re.match(r"(\d+)[\.-]", partsL[1])
-        no = int(m.group(1))
-        tail = str(Input).split("-chunk" + str(no))[1]
+        if m is None:
+            raise ValueError(f"ERROR: not a chunk filename: {src}")
+        no = int(m.group(1))
+        tail = str(src).split("-chunk" + str(no))[1]
         # print(f"_ANALYZE '{root}' '{no}' '{tail}'")
         return root, no, tail

@@ -366,15 +367,15 @@ def _lvl2_path(self, p: str | Path) -> Path:
     def _prepareOutdir(self) -> Path:
         # determine outdir (long or short)
         sdataP = Path("sdata").resolve()  # resolve probably not necessary
-        if re.match(r"\d\d\d\d\d\d", self.Input.parent.name):
-            outdir = sdataP / self.Input.parents[1].name / self.Input.parent.name
-        elif self.Input.parent.name == "sdata":
+        if re.match(r"\d\d\d\d\d\d", self.src.parent.name):
+            outdir = sdataP / self.src.parents[1].name / self.src.parent.name
+        elif self.src.parent.name == "sdata":
             raise SyntaxError(
-                """ERROR: Don't use an input file inside of sdata.
+                """ERROR: Don't use a src file inside of sdata.
                 Use a subdirectory instead!"""
             )
         else:
-            outdir = sdataP / self.Input.parent.name
+            outdir = sdataP / self.src.parent.name

         if not outdir.exists():
             print(f"Making new dir {outdir}")
@@ -382,9 +383,9 @@ def _prepareOutdir(self) -> Path:
         print(f"   outdir {outdir}")
         return outdir

-    def _sanitize(self, *, Input: str) -> Path:
+    def _sanitize(self, *, src: str | Path) -> Path:
         """
-        Input could be Path or str.
+        src could be Path or str.

         Some checks for convenience; mainly for our users, so they get more
-        intelligable error messages.
+        intelligible error messages.
         """
@@ -396,21 +397,23 @@ def _sanitize(self, *, Input: str) -> Path:
             raise SyntaxError(f"ERROR: Call me from directory '{script_dir}', please!")

         if not Path(saxLib).is_file():
-            raise SyntaxError(f"ERROR: Saxon not found, check config file at {conf_fn}")
+            raise SyntaxError(
+                "ERROR: Saxon not found, check environment variable saxLib"
+            )

-        # check Input
-        if Input is None:
-            raise SyntaxError("ERROR: Input can't be None!")
-        Input = Path(Input)  # initial input file, e.g. 3Wege.zml.xml
+        # check src
+        if src is None:
+            raise SyntaxError("ERROR: src can't be None!")
+        src = Path(src)  # initial src file, e.g. 3Wege.zml.xml

-        if Input.is_dir():
-            raise SyntaxError("ERROR: Input is directory!")
-        elif not Input.exists():
-            raise SyntaxError("ERROR: Input does not exist!")
+        if src.is_dir():
+            raise SyntaxError("ERROR: src is a directory!")
+        elif not src.exists():
+            raise SyntaxError("ERROR: src does not exist!")

-        return Input
+        return src

     def _valsplit(self, fn):
         if self.validation:
-            self.validate(path=fn)
-        self.splitLido(Input=fn)
+            self.validate(p=fn)
+        self.splitLido(src=fn)
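Editor's note (not part of the patch): the chunk helpers above (loopChunks,
firstChunkName, _analyze_chunkFn) all assume file names that embed "-chunk<no>".
A self-contained sketch of the parsing logic as fixed above, with a made-up file
name:

    import re

    def analyze_chunkFn(src: str) -> tuple[str, int, str]:
        # mirrors LidoTool._analyze_chunkFn, for illustration only
        root, rest = str(src).split("-chunk", maxsplit=1)
        m = re.match(r"(\d+)[\.-]", rest)
        if m is None:
            raise ValueError(f"Not a chunk filename: {src}")
        no = int(m.group(1))
        tail = str(src).split("-chunk" + str(no))[1]
        return root, no, tail

    assert analyze_chunkFn("export-chunk2.zml.xml") == ("export", 2, ".zml.xml")
    # loopChunks then probes export-chunk3.zml.xml, export-chunk4.zml.xml, ...
    # and stops at the first missing number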
""" - self.log( + self._log( "fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs" ) @@ -77,118 +57,40 @@ def fixRelatedWorks(self): ) # for //relatedWork in the current LIDO document - for ID in relatedWorksL: - # don't log self.log(f"fixRelatedWorks checking {ID.text}") + for objectID in relatedWorksL: + # don't _log self._log(f"fixRelatedWorks checking {objectID.text}") # assuming that source always exists - src = ID.xpath("@l:source", namespaces=NSMAP)[0] + src = objectID.xpath("@l:source", namespaces=NSMAP)[0] if src == "OBJ.ID": modType = "Object" elif src == "LIT.ID": modType = "Literature" elif src == "ISIL/ID": raise ValueError( - "ERROR: @lido:source='ISIL/ID' indicates that an already" + "ERROR: @lido:source='ISIL/ID' indicates that an already " + "processed LIDO file is being processed again" ) modType = "Object" else: raise ValueError(f"ERROR: Unknown type: {src}") - if ID.text is not None: - id_int = int(ID.text) - # only recursive should get us here - # except: - # id_int = int(ID.text.split("/")[-1]) - # print (f"*****{id_str} {modType}") + if objectID.text is not None: + id_int = int(objectID.text) if modType == "Literature": pass # print("WARN: No check for modType 'Literature'") else: # print(f"fixing relatedWork {modType} {id_int}") - try: - # is the work already in the cache? - relWorkN = self.relWorks[(modType, id_int)] - except: # if not, get record and add it to cache - print(f" getting item from online RIA {modType} {id_int}") - # if not, get it now and add to cache - q = Search(module=modType, limit=-1) - q.addCriterion( - operator="equalsField", - field="__id", - value=str(id_int), - ) - q = self._optimize_relWorks_cache(query=q) - # q.toFile(path="sdata/debug.search.xml") - relWork = client.search2(query=q) - if relWork: # realistic that query results are empty? - # appending them to relWork cache - self.relWorks += relWork - # print (" update file cache") - self.relWorks.toFile(path=self.relWorksFn) - else: - # if relWork record is already in cache - relWork = Module() - relWork.addItem(itemN=relWorkN, mtype=modType) - - if self.checkRelWorkOnline(modType=modType, modItemId=id_int): - # rewrite ISIL, should look like this: - # de-MUS-018313/744501 - # self.log(f" looking up ISIL for relWork") - ID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID" - # we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA! - try: - verwInst = relWork.xpath( - """//m:moduleReference[ - @name='ObjOwnerRef' - ]/m:moduleReferenceItem/m:formattedValue""" - )[0] - except: - self.log( - f"WARNING: verwaltendeInstitution empty! {modType} {id_int}" - ) - else: - ISIL = self.ISIL_lookup(institution=verwInst.text) - ID.text = f"{ISIL}/{str(id_int)}" - print(f" relWork {id_int}: {verwInst.text} -> {ISIL}") - else: - self.log(f" removing unpublic relWork") - relWorkSet = ID.getparent().getparent().getparent() - relWorkSet.getparent().remove(relWorkSet) - - def ISIL_lookup(self, *, institution): - """ - Load vocmap.xml and lookup ISIL for name of institution. + if not self.relWorks.item_exists(mtype=modType, ID=id_int): + self._add_to_relWorks_cache(mtype=modType, ID=id_int) + # at this point we can rely on item being in relWorks cache + self._rewrite_relWork(mtype=modType, objectID=objectID) - In the beginning, we die when no ISIL found, but later we might carp more gracefully. 
+    def linkResource_online_http(self) -> None:
         """
-        vm_fn = Path(__file__).parents[1] / "vocmap.xml"
-        if not vm_fn.exists():
-            raise SyntaxError(f"File not found {vm_fn}")
-        vocMap = etree.parse(vm_fn)
-        try:
-            ISIL = vocMap.xpath(
-                f"""/vocmap/voc[
-                    @name='verwaltendeInstitution'
-                ]/concept[
-                    source = '{institution}'
-                ]/target[
-                    @name = 'ISIL'
-                ]"""
-            )[0]
-        except:
-            raise SyntaxError(
-                f"vocMap: verwaltendeInstitution '{institution}' not found"
-            )
-        return ISIL.text
-
-    def log(self, msg):
-        print(msg)
-        logging.info(msg)
-
-    def new_check(self):
-        """
-        For all linkResources, check if url responds ok
+        For all linkResources in self.tree, check if the url responds ok using http.
+        Prints the result (which is a bit awkward).
         """
         linkResourceL = self.tree.xpath(
             "/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
             namespaces=NSMAP,
         )
@@ -212,10 +114,12 @@ def new_check(self):
             else:
                 print("\tsuccess")

-    def relWorks_cache_single(self, *, fn):
+    def relWorks_cache_single(self, *, fn: str | Path) -> None:
         """
-        Extracts IDs from one file (fn), queriess RIA for those IDs and adds new info to
-        self.relWorks
+        Extracts IDs from one file (fn), queries RIA for those IDs and adds new
+        info to self.relWorks.
+
+        This function currently seems to be so slow that it's useless.
         """
         fn = Path(fn)
         ID_cache = set()  # set of relWork ids, no duplicates
         ID_cache = self._file_to_ID_cache(fn, ID_cache)
         print(f"growing relWorks with ids from {fn}")
         self._grow_relWorks_cache(ID_cache)

-    def rmInternalLinks(self):
+    def rmInternalLinks(self) -> None:
         """
-        SEEMS TO BE NO LONGER NEEDED!
-
         Remove resourceSet whose linkResource point to internal links;
-        links are internal if they dont begin with "http", e.g.
+        links are internal if they don't begin with "http", e.g.
+
+        Not currently used.
         """
-        self.log("resourceSet: Removing sets with remaining internal links")
+        self._log("resourceSet: Removing sets with remaining internal links")
         linkResourceL = self.tree.xpath(
             "/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
             namespaces=NSMAP,
         )

         for link in linkResourceL:
...
                 resourceSet = link.getparent().getparent()
                 resourceSet.getparent().remove(resourceSet)

-    def rmUnpublishedRecords(self):
+    def rmUnpublishedRecords(self) -> None:
         """
         Remove lido records which are not published on SMB Digital.

         Assumes that only records which have SMBFreigabe=Ja have objectPublishedID
         """
-        # self.log(
+        # self._log(
         #    "   LinkChecker: Removing lido records that are not published on recherche.smb"
         # )
         recordsL = self.tree.xpath(
             "/l:lidoWrap/l:lido[not(l:objectPublishedID)]", namespaces=NSMAP
         )
         for recordN in recordsL:
             recID = recordN.xpath("l:lidoRecID", namespaces=NSMAP)[0]
-            self.log(f"rm unpublishedRecords: {recID}")
+            self._log(f"rm unpublishedRecords: {recID}")
             recordN.getparent().remove(recordN)
-        self.log("rmUnpublishedRecords: done!")
+        self._log("rmUnpublishedRecords: done!")

-    def saveTree(self, out_fn) -> str:
+    def saveTree(self, out_fn: str | Path) -> str | Path:
         """
         During __init__ we loaded a LIDO file, with this function we write it
         back to the out file location as set during __init__.
         """
-        self.log(f"Writing back to {out_fn}")
+        self._log(f"Writing back to {out_fn}")
         self.tree.write(
             str(out_fn), pretty_print=True, encoding="UTF-8", xml_declaration=True
         )
         return out_fn

     #
-    #
+    # PRIVATE
     #

+    def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
+        """
+        Get item from RIA, add to relWorks cache and write cache to disk.
+
+        Caution: Does not include a check if relWork is already in cache.
+ """ + print(f" getting item from online RIA {modType} {id_int}") + # if not, get it now and add to cache + q = Search(module=mType, limit=-1) + q.addCriterion( + operator="equalsField", + field="__id", + value=str(id_int), + ) + q = self._optimize_relWorks_cache(query=q) + # q.toFile(path="sdata/debug.search.xml") + relWork = client.search2(query=q) + if relWork: # realistic that query results are empty? + # appending them to relWork cache + self.relWorks += relWork + # print (" update file cache") + self.relWorks.toFile(path=self.relWorksFn) + + def _del_relWork(self, *, ID) -> None: + """ + delete a relWork from self.etree. + ID is a lxml node + """ + self._log(f" removing unpublic relWork {ID.text}") + relWorkSet = ID.getparent().getparent().getparent() + relWorkSet.getparent().remove(relWorkSet) def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set: - print(f" data file (may be a chunk) exists {chunk_fn}") - self._init_relWorks_cache() + """ + Given the path to a lido file, scan it for relWorks and produce a set with the + objIds. + + ID is a lxml node + """ + print(f" _file_to_ID_cache exists {chunk_fn}") chunkET = etree.parse(str(chunk_fn)) relWorksL = chunkET.xpath( @@ -285,7 +226,7 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set: namespaces=NSMAP, ) - print(f" chunk has {len(relWorksL)} relWorks") + print(f" _file_to_ID_cache {len(relWorksL)} relWorks") for ID in relWorksL: src = ID.xpath("@l:source", namespaces=NSMAP)[0] @@ -297,12 +238,14 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set: raise ValueError(f"ERROR: Unknown type: {src}") # dont write more than a few thousand items in cache - if len(ID_cache) >= relWorks_maxSize: - break + # if len(ID_cache) >= relWorks_maxSize: + # print("break here") + # break if ID.text is not None and mType == "Object": # only add this to ID_cache if not yet in relWorks cache if not self.relWorks.item_exists(mtype="Object", ID=int(ID.text)): ID_cache.add(int(ID.text)) + print(f" adding {len(ID_cache)} IDs") return ID_cache def _grow_relWorks_cache(self, ID_cache: set) -> None: @@ -310,7 +253,9 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None: Make one query with all the IDs from ID_cache, execute the query and save the results to self.relWorks, also write to disk """ - print(f" Length of ID_cache: {len(ID_cache)}") + print( + f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks)}" + ) client = MpApi(baseURL=baseURL, user=user, pw=pw) if len(ID_cache) > 0: q = Search(module="Object", limit=-1) @@ -336,20 +281,24 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None: self.relWorks = newRelWorksM else: # if relWorks exists already, add to it + print(" adding") self.relWorks += newRelWorksM # save the cache to file after processing every chunk # no max_size limitation self.relWorks.toFile(path=self.relWorksFn) def _init_relWorks_cache(self): + """ + Initializes self.refWorks cache. If cache file exists, load it. May + also initialize empty self.refWorks. 
+ """ if Path(self.relWorksFn).exists(): try: self.relWorks except: # print("Inline cache not loaded yet") - print(f" About to load existing relWorks cache {self.relWorksFn}") + print(f" Loading existing relWorks cache {self.relWorksFn}") self.relWorks = Module(file=self.relWorksFn) - return # if we read relWorks cache from file we dont loop thru data files (chunks) # looking for all the relWorks to fill the cache as best as we can # else: @@ -358,9 +307,39 @@ def _init_relWorks_cache(self): print(f" No relWorks file to load at {self.relWorksFn}") self.relWorks = Module() + def _log(self, msg): + print(msg) + logging.info(msg) + + def _lookup_ISIL(self, *, institution): + """ + Load vocmap.xml and lookup ISIL for name of institution. + + In the beginning, we die when no ISIL found, but later we might carp more gracefully. + """ + vm_fn = Path(__file__).parents[1] / "vocmap.xml" + if not vm_fn.exists(): + raise SyntaxError(f"File not found {vm_fn}") + vocMap = etree.parse(vm_fn) + try: + ISIL = vocMap.xpath( + f"""/vocmap/voc[ + @name='verwaltendeInstitution' + ]/concept[ + source = '{institution}' + ]/target[ + @name = 'ISIL' + ]""" + )[0] + except: + raise SyntaxError( + f"vocMap: verwaltendeInstitution '{institution}' not found" + ) + return ISIL.text + def _nextChunk(self, *, fn: Path): """ - Returns the path/name of the next chunk if it exists or errors if the input + Returns the path/name of the next chunk if it exists or errors if the src is not chunkable or the next chunk does not exist. Expects path/name of lvl 1 lido file that ends in ".lido.xml". @@ -397,6 +376,27 @@ def _optimize_relWorks_cache(self, *, query): query.validate(mode="search") return query + def _relWork_online(self, *, modType: str, modItemId: int): + """ + Checks if a specific relWork is online. No urlrequest, just examins if + SMB-Freigabe = Ja. + + Expects modItemId as int; but str should work as well. 
+ """ + r = self.relWorks.xpath( + f"""/m:application/m:modules/m:module[ + @name = '{modType}']/m:moduleItem[ + @id = {str(modItemId)}]/m:repeatableGroup[ + @name = 'ObjPublicationGrp']/m:repeatableGroupItem[ + m:vocabularyReference[@name='PublicationVoc']/m:vocabularyReferenceItem[@name='Ja'] + and m:vocabularyReference[@name='TypeVoc']/m:vocabularyReferenceItem[@id = 2600647] + ]""" + ) + if len(r) > 0: + return True + else: + return False + def _relWorks_cache_many(self, *, first): """ creates relatedWorksCache from all chunks @@ -416,6 +416,9 @@ def _relWorks_cache_many(self, *, first): """ ID_cache = set() # set of relWork ids, no duplicates chunk_fn = Path(first) + # if the cache is already at max_size, we dont need this step + if len(self.relWorks) >= relWorks_maxSize: + return None while chunk_fn.exists(): ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache) try: @@ -423,8 +426,37 @@ def _relWorks_cache_many(self, *, first): except: # print (" breaking the while") break # break the while if this is the only data file or the last chunk + if len(ID_cache) + len(self.refWorks) >= relWorks_maxSize: + break self._grow_relWorks_cache(ID_cache) + def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None: + """ + if relWork unpublic delete; otherwise rewrite + """ + id_int = int(ID.text) + + if self._relWork_online(modType=modType, modItemId=id_int): + # rewrite ISIL, should look like this: + # de-MUS-018313/744501 + # self._log(f" looking up ISIL for relWork") + objectID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID" + # we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA! + try: + verwInst = relWork.xpath( + """//m:moduleReference[ + @name='ObjOwnerRef' + ]/m:moduleReferenceItem/m:formattedValue""" + )[0] + except: + self._log(f"WARNING: verwaltendeInstitution empty! {modType} {id_int}") + else: + ISIL = self._lookup_ISIL(institution=verwInst.text) + objectID.text = f"{ISIL}/{str(id_int)}" + print(f" relWork {id_int}: {verwInst.text} -> {ISIL}") + else: + self._del_relWork(objectID=objectID) + if __name__ == "__main__": import argparse @@ -432,7 +464,7 @@ def _relWorks_cache_many(self, *, first): parser = argparse.ArgumentParser(description="Simple linkResource checker") parser.add_argument( "-i", - "--input", + "--src", help="point to LIDO file", required=True, ) @@ -441,6 +473,6 @@ def _relWorks_cache_many(self, *, first): args = parser.parse_args() m = LinkChecker( - Input=args.input, + src=args.src, ) - m.new_check() + m.linkResource_online_http() diff --git a/zml2lido/qc.py b/zml2lido/qc.py index 5df6575..99ee000 100644 --- a/zml2lido/qc.py +++ b/zml2lido/qc.py @@ -9,15 +9,18 @@ """ import argparse -import pathlib from Path +from pathlib import Path + parser = argparse.ArgumentParser(description="Quality control for LIDO files") - parser.add_argument( - "-i", - "--input", - help="specify an input file", - required=True, - ) - args = parser.parse_args() +parser.add_argument( + "-i", + "--input", + help="specify an input file", + required=True, +) +args = parser.parse_args() + class QualityControl: def __init__(self, *, input_fn): + pass