
more refactoring...
mokko committed Mar 3, 2024
1 parent 17b2db9 commit 4a59092
Showing 2 changed files with 106 additions and 93 deletions.
14 changes: 7 additions & 7 deletions zml2lido/lidoTool.py
@@ -136,7 +136,7 @@ def to_lvl2Single(self, *, src: str | Path) -> Path:
# self.lc.relWorks_cache_single(fn=src)
self.lc.rmUnpublishedRecords() # remove unpublished records (not on SMB-Digital)
self.lc.fixRelatedWorks()
self.lc.saveTree(out_fn)
self.lc.save(out_fn)
else:
print(f" lvl2 already exists: {out_fn}")
return out_fn
@@ -190,7 +190,7 @@ def splitSachbegriffSingle(self, *, src: str) -> Path:
os.chdir(orig)
return xslDir / out

def validate(self, *, p: str | Path | None = None):
def validate(self, *, path: Path | None = None):
"""
Optionally, specify a path for the file that needs validation. If
path is None, the file that was specified during __init__ is validated.
@@ -200,20 +200,20 @@ def validate(self, *, p: str | Path | None = None):
(Not tested recently for chunks...)
"""

if p is None:
to_val_fn = self.src
if path is None:
to_val_fn = Path(self.src)
else:
to_val_fn: Path = Path(p)
to_val_fn = path

print(f"VALIDATING LIDO FILE {to_val_fn}")
print(f"VALIDATING LIDO FILE '{to_val_fn}'")
if self.chunks:
print(" with chunks")
for chunkFn in self.loopChunks(src=to_val_fn):
self.validateSingle(src=chunkFn)
else:
self.validateSingle(src=to_val_fn)

def validateSingle(self, *, src):
def validateSingle(self, *, src: Path):
if not hasattr(self, "schema"):
print(f" loading schema {lidoXSD}")
schemaDoc = etree.parse(lidoXSD)
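Editor's note (illustrative, not part of the commit): validate() now takes path instead of p, so callers written against the old keyword need updating. A minimal sketch, assuming the enclosing lidoTool class is instantiated as lt (the class name is not visible in this diff) and that the example file name is hypothetical:

    from pathlib import Path

    # before this commit
    lt.validate(p="output/example.lido.xml")

    # after this commit
    lt.validate(path=Path("output/example.lido.xml"))
    lt.validate()  # validates the file passed to __init__ (self.src)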
185 changes: 99 additions & 86 deletions zml2lido/linkChecker.py
@@ -9,13 +9,17 @@
USAGE:
lc = LinkChecker(src="path/to/file.lido.xml")
#related works
lc.relWorks_cache = lc.init_relWorks_cache()
lc.relWorks_cache_single(fn="path/to/file.lido.xml") # parse fn for relWorks and populate cache
lc.relWorks_cache_many(first="path/to/file.lido.xml") # parse all chunks starting at first and populate cache
lc.fixRelatedWorks() # removes dead links in relatedWorks, also adds ISIL
lc.linkResource_online_http() # for all linkResources print online status
lc.relWorks_cache_single(fn="path/to/file.lido.xml") # parse fn for relWorks and populate cache
lc.rmInternalLinks() # remove linkResource with internal links, not used atm
lc.rmUnpublishedRecords() # removes objects without objectPublishedID
lc.saveTree(out_fn="path/to/lido.lvl2.xml")
lc.save(out_fn="path/to/lido.lvl2.xml")
"""

@@ -40,15 +44,17 @@ def __init__(self, *, src: str | Path, chunks: bool = False) -> None:
self._log(f"STATUS: LinkChecker is working on {src}") # not exactly an error
self.src = Path(src)
# self.chunk = chunk
self.relWorksFn = self.src.parent / "relWorks.cache.xml"
self.tree = etree.parse(str(src))
# we used to not prepare the relWorksCache here. Why?
self._init_relWorks_cache()
self.relWorks_fn = self.src.parent / "relWorks.cache.xml"
self.data = etree.parse(str(src))
self.client = MpApi(baseURL=baseURL, user=user, pw=pw)
self.relWorks_cache = self.init_relWorks_cache() # load file if it exists

if chunks:
print("prepare relWorks cache (chunks, many)")
self._relWorks_cache_many(first=src) # run only once to make cache
self.relWorks_cache_many(first=src) # run only once to make cache
# why wouldn't I load the first file into the cache?
else:
self.relWorks_cache_single(fn=src)

def fixRelatedWorks(self) -> None:
"""
@@ -60,7 +66,7 @@ def fixRelatedWorks(self) -> None:
"fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs"
)

relatedWorksL = self.tree.xpath(
relatedWorksL = self.data.xpath(
"""/l:lidoWrap/l:lido/l:descriptiveMetadata/l:objectRelationWrap/
l:relatedWorksWrap/l:relatedWorkSet/l:relatedWork/l:object/l:objectID""",
namespaces=NSMAP,
@@ -92,17 +98,37 @@ def fixRelatedWorks(self) -> None:
# print("WARN: No check for mtype 'Literature'")
else:
# print(f"fixing relatedWork {mtype} {id_int}")
if not self.relWorks.item_exists(mtype=mtype, ID=id_int):
if not self.relWorks_cache.item_exists(mtype=mtype, ID=id_int):
self._add_to_relWorks_cache(mtype=mtype, ID=id_int)
# at this point we can rely on item being in relWorks cache
self._rewrite_relWork(mtype=mtype, objectID=objectID_N)

def init_relWorks_cache(self) -> Module:
"""
Initializes the relWorks cache. If the cache file already exists, load and
return it; otherwise return an empty Module.
"""
if Path(self.relWorks_fn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorks_fn}")
return Module(file=self.relWorks_fn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorks_fn}")
return Module()

def linkResource_online_http(self) -> None:
"""
For all linkResources in self.tree, check if url responds ok using http.
For all linkResources in self.data, check if the URL responds OK via HTTP.
Prints the result (which is a bit awkward).
"""
linkResourceL = self.tree.xpath(
linkResourceL = self.data.xpath(
"/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
namespaces=NSMAP,
)
@@ -124,6 +150,40 @@
else:
print("\tsuccess")

def relWorks_cache_many(self, *, first: str | Path) -> None:
"""
Reads relatedWorks from all chunks into relWorks_cache, but only until the cache is full.
In case we're in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode.
expects
-first: the path to the first chunk
TODO: First collect all relWork IDs into the set and then make one big
query. That should be faster. But before doing that, we need to test
whether the current version works in non-chunk mode.
If the relWorks_cache gets too big (~1GB xml file), split the chunks
into multiple dirs and process separately.
"""
ID_cache = set() # set of relWork ids, no duplicates
chunk_fn = Path(first)
# if the cache is already at max_size, we don't do anything;
# otherwise we keep loading more chunks
if len(self.relWorks_cache) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
try:
chunk_fn = self._nextChunk(fn=chunk_fn)
except (FileNotFoundError, SyntaxError):
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.relWorks_cache) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)
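# --- Editor's note: illustrative sketch, not part of this commit -----------
# Which cache-priming path runs is decided by the chunks flag in __init__:
#   LinkChecker(src=first_chunk_fn, chunks=True) -> relWorks_cache_many(first=src)
#   LinkChecker(src=lido_fn)                     -> relWorks_cache_single(fn=src)
# relWorks_cache_many() keeps collecting relWork IDs from successive chunks
# (via _nextChunk) until relWorks_maxSize is reached, then grows the cache once.
# ----------------------------------------------------------------------------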

def relWorks_cache_single(self, *, fn: str | Path) -> None:
"""
Extracts IDs from one file (fn), queries RIA for those IDs and adds new info to
@@ -145,7 +205,7 @@ def rmInternalLinks(self) -> None:
Not currently used.
"""
self._log("resourceSet: Removing sets with remaining internal links")
linkResourceL = self.tree.xpath(
linkResourceL = self.data.xpath(
"/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
namespaces=NSMAP,
)
@@ -164,7 +224,7 @@ def rmUnpublishedRecords(self) -> None:
# self._log(
# " LinkChecker: Removing lido records that are not published on recherche.smb"
# )
recordsL = self.tree.xpath(
recordsL = self.data.xpath(
"/l:lidoWrap/l:lido[not(l:objectPublishedID)]", namespaces=NSMAP
)
for recordN in recordsL:
@@ -173,13 +233,13 @@
recordN.getparent().remove(recordN)
self._log("rmUnpublishedRecords: done!")

def saveTree(self, out_fn: str | Path) -> str:
def save(self, out_fn: str | Path) -> str:
"""
During __init__ we loaded a LIDO file; this method writes it back to the
location given by out_fn.
"""
self._log(f"Writing back to {out_fn}")
self.tree.write(
self.data.write(
str(out_fn), pretty_print=True, encoding="UTF-8", xml_declaration=True
)
return out_fn
@@ -207,9 +267,9 @@ def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
relWork = self.client.search2(query=q)
if relWork: # realistic that query results are empty?
# appending them to relWork cache
self.relWorks += relWork
self.relWorks_cache += relWork
# print (" update file cache")
self.relWorks.toFile(path=self.relWorksFn)
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _del_relWork(self, *, ID) -> None:
"""
@@ -254,7 +314,7 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
if ID.text is not None and mType == "Object":
# only add this to ID_cache if not yet in relWorks cache
objId = int(ID.text)
if not self.relWorks.item_exists(mtype="Object", ID=objId):
if not self.relWorks_cache.item_exists(mtype="Object", ID=objId):
ID_cache.add(objId)
print(f" adding {len(ID_cache)} IDs")
return ID_cache
@@ -265,7 +325,7 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
to self.relWorks_cache, also write to disk
"""
print(
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks)}"
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks_cache)}"
)
if len(ID_cache) > 0:
q = Search(module="Object", limit=-1)
@@ -295,33 +355,13 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
self.relWorks_cache += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks.toFile(path=self.relWorksFn)
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _init_relWorks_cache(self):
"""
Initializes self.refWorks cache. If cache file already exists, load it. Else
initialize empty self.refWorks.
"""
if Path(self.relWorksFn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorksFn}")
self.relWorks = Module(file=self.relWorksFn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorksFn}")
self.relWorks = Module()

def _log(self, msg):
def _log(self, msg: str) -> None:
print(msg)
logging.info(msg)

def _lookup_ISIL(self, *, institution):
def _lookup_ISIL(self, *, institution) -> str:
"""
Load vocmap.xml and lookup ISIL for name of institution.
Expand All @@ -347,7 +387,7 @@ def _lookup_ISIL(self, *, institution):
)
return ISIL.text

def _nextChunk(self, *, fn: Path):
def _nextChunk(self, *, fn: str | Path) -> Path:
"""
Returns the path/name of the next chunk if it exists or errors if the src
is not chunkable or the next chunk does not exist.
@@ -365,11 +405,11 @@ def _nextChunk(self, *, fn: Path):
if Path(new_path).exists():
return new_path
else:
raise FileNotFoundError("chunk does not exist")
raise FileNotFoundError(f"ERROR: chunk does not exist '{new_path}'")
else:
raise SyntaxError("not chunkable")
raise SyntaxError("ERROR: Filename not chunkable")

def _optimize_relWorks_cache(self, *, query):
def _optimize_relWorks_cache(self, *, query: Search) -> Search:
"""
Let's shrink (optimize) the XML: we only need a couple of fields.
"""
@@ -384,16 +424,16 @@ def _optimize_relWorks_cache(self, *, query):
query.validate(mode="search")
return query

def _relWork_online(self, *, modType: str, modItemId: int):
def _relWork_online(self, *, mtype: str, modItemId: int) -> bool:
"""
Checks if a specific relWork is online. No URL request; just examines whether
SMB-Freigabe = Ja.
Expects modItemId as int; but str should work as well.
"""
r = self.relWorks.xpath(
r = self.relWorks_cache.xpath(
f"""/m:application/m:modules/m:module[
@name = '{modType}']/m:moduleItem[
@name = '{mtype}']/m:moduleItem[
@id = {str(modItemId)}]/m:repeatableGroup[
@name = 'ObjPublicationGrp']/m:repeatableGroupItem[
m:vocabularyReference[@name='PublicationVoc']/m:vocabularyReferenceItem[@name='Ja']
@@ -405,53 +445,26 @@ def _relWork_online(self, *, modType: str, modItemId: int):
else:
return False

def _relWorks_cache_many(self, *, first):
"""
creates relatedWorksCache from all chunks
In case we're in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode.
expects
-first: the path to the first chunk (as str or Path)
TODO: Let's first pass all the relWork.IDs into the set and then make one big
query. That should be faster. But before we do that, we need to test
if current version works non-chunk version.
If the relWorksCache gets too big (~1GB xml file), split the chunks
into multiple dirs and process separately.
"""
ID_cache = set() # set of relWork ids, no duplicates
chunk_fn = Path(first)
# if the cache is already at max_size, we dont need this step
if len(self.relWorks) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
try:
chunk_fn = self._nextChunk(fn=chunk_fn)
except:
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.refWorks) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)

def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
"""
if relWork unpublic delete; otherwise rewrite
if relWork is not public, delete it from the lvl2 LIDO file; otherwise rewrite relWork using its ISIL
"""
id_int = int(objectID.text)

if self._relWork_online(modType=mtype, modItemId=id_int):
# we can rely on item being in cache, says I
try:
relWorkM = self.relWorks_cache[(mtype, id_int)]
except:
print(f"WARNING: no relWork found for {mtype} {id_int}")

if self._relWork_online(mtype=mtype, modItemId=id_int):
# rewrite ISIL, should look like this:
# <lido:objectID lido:type="local" lido:source="ISIL/ID">de-MUS-018313/744501</lido:objectID>
# self._log(f" looking up ISIL for relWork")
objectID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
# we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA!
try:
verwInst = relWork.xpath(
verwInst = relWorkM.xpath(
"""//m:moduleReference[
@name='ObjOwnerRef'
]/m:moduleReferenceItem/m:formattedValue"""
@@ -463,7 +476,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
objectID.text = f"{ISIL}/{str(id_int)}"
print(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
else:
self._del_relWork(ID=objectID)
self._del_relWork(ID=objectID) # rm from lido lvl2


if __name__ == "__main__":
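Editor's note, a minimal post-commit usage sketch (not part of the diff). It mirrors the call sequence in lidoTool.to_lvl2Single above; the import path and the credential setup (baseURL, user, pw) are assumptions:

    from pathlib import Path
    from zml2lido.linkChecker import LinkChecker  # assumed import path

    lc = LinkChecker(src=Path("path/to/file.lido.xml"))  # parses the file into self.data, primes self.relWorks_cache
    lc.rmUnpublishedRecords()                # drop records without objectPublishedID
    lc.fixRelatedWorks()                     # drop dead relatedWorks, rewrite objectIDs with ISIL
    lc.save(out_fn="path/to/file.lvl2.xml")  # was saveTree() before this commit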
