Commit

committing working version before transition to new relWorksCache

mokko committed Mar 8, 2024
1 parent 4a59092 commit 1071953
Showing 7 changed files with 111,430 additions and 51 deletions.
111,093 changes: 111,093 additions & 0 deletions test/group416397-chunk1.lido.xml

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions test/relWorks_cache.xml
@@ -0,0 +1,31 @@
<application xmlns="http://www.zetcom.com/ria/ws/module">
  <modules>
    <module name="Object" totalSize="1">
      <moduleItem hasAttachments="true" id="2268694" uuid="2268694">
        <systemField dataType="Timestamp" name="__lastModified">
          <value>2024-02-20 12:44:39.884</value>
          <formattedValue language="de">20.02.2024 12:44</formattedValue>
        </systemField>
        <repeatableGroup name="ObjPublicationGrp" size="1">
          <repeatableGroupItem id="56748382" uuid="a833945d-a46a-4e5c-9047-d3b76fe0f30f">
            <vocabularyReference name="PublicationVoc" id="62649" instanceName="ObjPublicationVgr">
              <vocabularyReferenceItem id="1810139" name="Ja">
                <formattedValue language="de">Ja</formattedValue>
              </vocabularyReferenceItem>
            </vocabularyReference>
            <vocabularyReference name="TypeVoc" id="62650" instanceName="ObjPublicationTypeVgr">
              <vocabularyReferenceItem id="2600647" name="Daten freigegeben für SMB-digital">
                <formattedValue language="de">Daten freigegeben für SMB-digital</formattedValue>
              </vocabularyReferenceItem>
            </vocabularyReference>
          </repeatableGroupItem>
        </repeatableGroup>
        <moduleReference name="ObjOwnerRef" targetModule="Address" multiplicity="N:1" size="1">
          <moduleReferenceItem moduleItemId="67678" uuid="67678">
            <formattedValue language="de">Ethnologisches Museum, Staatliche Museen zu Berlin</formattedValue>
          </moduleReferenceItem>
        </moduleReference>
      </moduleItem>
    </module>
  </modules>
</application>
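
For orientation: the cached record above can be checked for the SMB-digital release flag with a few lines of lxml. This is a minimal sketch against the sample file, not part of the commit; the XPath values simply mirror the record shown above.

from lxml import etree

NSMAP = {"m": "http://www.zetcom.com/ria/ws/module"}

tree = etree.parse("test/relWorks_cache.xml")
for item in tree.xpath("//m:moduleItem", namespaces=NSMAP):
    published = item.xpath(
        ".//m:vocabularyReference[@name='PublicationVoc']"
        "/m:vocabularyReferenceItem[@name='Ja']",
        namespaces=NSMAP,
    )
    smb_digital = item.xpath(
        ".//m:vocabularyReference[@name='TypeVoc']"
        "/m:vocabularyReferenceItem[@name='Daten freigegeben für SMB-digital']",
        namespaces=NSMAP,
    )
    print(item.get("id"), bool(published and smb_digital))  # 2268694 True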
9 changes: 9 additions & 0 deletions test/test_file.py
@@ -0,0 +1,9 @@
from pathlib import Path
from zml2lido.file import per_chunk, unzip


def test_per_chunk():
    p = Path(r"C:\m3\zml2lido\sdata\GG\20240307\query516069-chunk1.lido.xml")
    if not p.exists():
        raise FileNotFoundError("p not found!")
    assert 2 == len(list(per_chunk(p)))
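
The test raises when the machine-specific fixture path is missing; a skip-based variant (a sketch, not part of this commit) would be:

import pytest
from pathlib import Path
from zml2lido.file import per_chunk


def test_per_chunk_or_skip():
    p = Path(r"C:\m3\zml2lido\sdata\GG\20240307\query516069-chunk1.lido.xml")
    if not p.exists():
        pytest.skip(f"fixture not found: {p}")  # skip instead of failing hard
    assert 2 == len(list(per_chunk(p)))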
54 changes: 54 additions & 0 deletions test/test_relWorksCache.py
@@ -0,0 +1,54 @@
from pathlib import Path
from zml2lido.relWorksCache import RelWorksCache


def test_init():
    rw = RelWorksCache()
    assert rw.maxSize == 20_000
    rw = RelWorksCache(maxSize=40_000)
    assert rw.maxSize == 40_000
    # print (f"{rw.maxSize=}")


def test_add_relWork():
    """
    We test add_relWork and save...
    """
    fn = Path("relWorks_cache.xml")
    if fn.exists():
        fn.unlink()
    rw = RelWorksCache()
    rw.add_relWork(mtype="Object", ID=2268694)
    rw.save(path=fn)
    assert 1 == len(rw.cache)
    rw.add_relWork(mtype="Object", ID=3486950)
    assert 2 == len(rw.cache)
    # print (f"{rw=}")


def test_load_cache_file():
    fn = Path("relWorks_cache.xml")
    if fn.exists():
        fn.unlink()
    rw = RelWorksCache()
    rw.add_relWork(mtype="Object", ID=2268694)
    rw.save(path=fn)

    rw2 = RelWorksCache()
    rw2.load_cache_file(path=fn)
    assert 1 == rw2.length()


def test_lido_to_ids():
    """
    Also tests 'add_from_lido_file'
    """
    rw = RelWorksCache()
    lido_fn = Path("group416397-chunk1.lido.xml")
    ids = rw._lido_to_ids(path=lido_fn)
    assert 171 == len(ids)
    rw.add_from_lido_file(path=lido_fn)
    ids2 = rw._lido_to_ids(path=lido_fn)
    assert 0 == len(ids2)
    assert 171 == rw.length()
    print(f"{rw.length()}")
41 changes: 41 additions & 0 deletions zml2lido/file.py
@@ -0,0 +1,41 @@
"""
WORK IN PROGRESS - File helpers for zml2lido
We're currently only zipping zml files, not lido files automatically
unpacked_path = unzip(Path("group1234-chunk1.zip")
for chunk in per_chunk(chunk_path):
do_something_with(chunk)
"""
from zipfile import ZipFile
from pathlib import Path
import re


def per_chunk(path: Path):
"""
Loop through chunks easily. Not yet used in production.
"""
path2 = path
while path2.exists():
yield path2
stem = str(path2).split(".lido.xml")[0]
m = re.search(r"-chunk(\d+)$", stem)
if m:
no = int(m.group(1))
new_no = no + 1
head = re.sub(r"\d+$", "", stem)
path2 = Path(f"{head}{new_no}.lido.xml")
else:
raise Exception("Not chunkable")


def unzip(path: Path):
parent_dir = path.parent
member = Path(path.name).with_suffix(".xml")
temp_fn = parent_dir / member
with ZipFile(path, "r") as zippy:
zippy.extract(str(member), path=parent_dir)
return temp_fn
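
A short usage sketch with hypothetical file names: unzip() extracts the single .xml member next to the archive, and per_chunk() keeps yielding -chunk2, -chunk3, ... until a file is missing.

from pathlib import Path
from zml2lido.file import per_chunk, unzip

xml_fn = unzip(Path("sdata/group1234.zip"))  # -> sdata/group1234.xml

for chunk_fn in per_chunk(Path("sdata/group1234-chunk1.lido.xml")):
    print(chunk_fn)  # group1234-chunk1.lido.xml, group1234-chunk2.lido.xml, ...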
105 changes: 54 additions & 51 deletions zml2lido/linkChecker.py
@@ -21,6 +21,14 @@
lc.save(out_fn="path/to/lido.lvl2.xml")
composition over inheritance
#relWorksCache is a Module() with limited fields
rw = relWorks(maxSize=20_000) # load cache file or nothing
rw.add(mtype, ID)? # add a single item to cache. Do we respect max_size? was: _add_to_relWorks_cache
rw.add_from_lido_file(fn=path) # grow cache by new items from a single file; respects max_size
rw.add_from_lido_chunks(first=path) # grow cache by new items from all chunks, starting with the first file; respects max_size
rw.exists(mtype="Object", ID=1234) # true if item exists in cache
"""

import logging
@@ -42,17 +50,16 @@
class LinkChecker:
def __init__(self, *, src: str | Path, chunks: bool = False) -> None:
self._log(f"STATUS: LinkChecker is working on {src}") # not exactly an error
self.src = Path(src)
# self.chunk = chunk
self.relWorks_fn = self.src.parent / "relWorks.cache.xml"
self.data = etree.parse(str(src))
self.client = MpApi(baseURL=baseURL, user=user, pw=pw)
self.relWorks_fn = Path(src).parent / "relWorks.cache.xml"
self.relWorks_cache = self.init_relWorks_cache() # load file if it exists

# why wouldnt I load the first file into the cache?
if chunks:
print("prepare relWorks cache (chunks, many)")
self.relWorks_cache_many(first=src) # run only once to make cache
# why wouldnt I load the first file into the cache?
else:
self.relWorks_cache_single(fn=src)

@@ -61,6 +68,8 @@ def fixRelatedWorks(self) -> None:
Frank doesn't want dead links in relatedWorks. So we loop thru them, check
if they are SMB-approved (using MpApi) and, if not, we remove them. We
also include ISILs in the same step.
relWork's objectID is at relWork/object/objectID
"""
self._log(
"fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs"
Expand Down Expand Up @@ -101,24 +110,16 @@ def fixRelatedWorks(self) -> None:
if not self.relWorks_cache.item_exists(mtype=mtype, ID=id_int):
self._add_to_relWorks_cache(mtype=mtype, ID=id_int)
# at this point we can rely on item being in relWorks cache
self._rewrite_relWork(mtype=mtype, objectID=objectID_N)
self._rewrite_relWork(mtype=mtype, objectID_N=objectID_N)

def init_relWorks_cache(self) -> Module:
"""
Initializes the relWorks cache. If the cache file already exists, load it. Else
return an empty Module.
"""
if Path(self.relWorks_fn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorks_fn}")
return Module(file=self.relWorks_fn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorks_fn}")
return Module()
Expand Down Expand Up @@ -171,7 +172,7 @@ def relWorks_cache_many(self, *, first: str | Path) -> None:
chunk_fn = Path(first)
# if the cache is already at max_size, we dont do anything
# else we keep loading more chunks
if len(self.relWorks) >= relWorks_maxSize:
if len(self.relWorks_cache) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
@@ -180,7 +181,7 @@
except:
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.refWorks) >= relWorks_maxSize:
if len(ID_cache) + len(self.relWorks_cache) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)

@@ -191,9 +192,8 @@ def relWorks_cache_single(self, *, fn: str | Path) -> None:
This function currently seems to be so slow that it's useless.
"""
fn = Path(fn)
ID_cache = set() # set of relWork ids, no duplicates
ID_cache = self._file_to_ID_cache(fn, ID_cache)
ID_cache = self._file_to_ID_cache(Path(fn), ID_cache)
print(f"growing relWorks with ids from {fn}")
self._grow_relWorks_cache(ID_cache)

Expand Down Expand Up @@ -271,13 +271,13 @@ def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
# print (" update file cache")
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _del_relWork(self, *, ID) -> None:
def _del_relWork(self, *, ID_N: Any) -> None:
"""
delete a relWork from self.etree.
ID is a lxml node
"""
self._log(f" removing unpublic relWork {ID.text}")
relWorkSet = ID.getparent().getparent().getparent()
self._log(f" removing unpublic relWork {ID_N.text}")
relWorkSet = ID_N.getparent().getparent().getparent()
relWorkSet.getparent().remove(relWorkSet)

def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
@@ -298,8 +298,8 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:

print(f" _file_to_ID_cache {len(relWorksL)} relWorks")

for ID in relWorksL:
src = ID.xpath("@l:source", namespaces=NSMAP)[0]
for ID_N in relWorksL:
src = ID_N.xpath("@l:source", namespaces=NSMAP)[0]
if src == "OBJ.ID":
mType = "Object"
elif src == "LIT.ID":
@@ -311,18 +311,22 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
# if len(ID_cache) >= relWorks_maxSize:
# print("break here")
# break
if ID.text is not None and mType == "Object":
if ID_N.text is not None and mType == "Object":
# only add this to ID_cache if not yet in relWorks cache
objId = int(ID.text)
objId = int(ID_N.text)
if not self.relWorks_cache.item_exists(mtype="Object", ID=objId):
ID_cache.add(objId)
print(f" adding {len(ID_cache)} IDs")
return ID_cache

def _grow_relWorks_cache(self, ID_cache: set) -> None:
"""
Make one query with all the IDs from ID_cache, execute the query and save the results
to self.relWorks, also write to disk
Make a query with the IDs from ID_cache, execute the query and save the results
to self.relWorks_cache, also write to disk.
Do we need a check if IDs are already in relWorks_cache? This should speed up
the procedure. Without it, existing results should not get added to the cache,
but in the new version we don't need to download them.
"""
print(
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks_cache)}"
@@ -332,30 +336,29 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
if len(ID_cache) > 1:
q.OR() # only or if more than 1

for id_ in sorted(ID_cache):
q.addCriterion(
operator="equalsField",
field="__id",
value=str(id_),
changed = False
for id_int in sorted(ID_cache):
if not self.relWorks_cache.item_exists(mtype="Object", ID=id_int):
changed = True
q.addCriterion(
operator="equalsField",
field="__id",
value=str(id_int),
)
if changed:
q = self._optimize_relWorks_cache(query=q)
# q.toFile(path="sdata/debug.search.xml")
print(
f" populating relWorks cache {len(ID_cache)} (max size {relWorks_maxSize})"
)
q = self._optimize_relWorks_cache(query=q)
# q.toFile(path="sdata/debug.search.xml")
print(
f" populating relWorks cache {len(ID_cache)} (max size {relWorks_maxSize})"
)
newRelWorksM = self.client.search2(query=q)
try:
self.relWorks
except:
newRelWorksM = self.client.search2(query=q)
# make a new cache (might be faster than adding to it)
self.relWorks = newRelWorksM
else:
# if relWorks exists already, add to it
self.relWorks_cache = newRelWorksM
print(" adding")
self.relWorks += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks.toFile(path=self.relWorks_fn)
self.relWorks_cache += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _log(self, msg: str) -> None:
print(msg)
Expand Down Expand Up @@ -445,11 +448,11 @@ def _relWork_online(self, *, mtype: str, modItemId: int) -> bool:
else:
return False

def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
def _rewrite_relWork(self, *, mtype: str, objectID_N: Any) -> None:
"""
if relWork unpublic, delete it from lvl2 lido file; otherwise rewrite relWork using ISIL
"""
id_int = int(objectID.text)
id_int = int(objectID_N.text)

# we can rely on item being in cache, says I
try:
@@ -461,7 +464,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
# rewrite ISIL, should look like this:
# <lido:objectID lido:type="local" lido:source="ISIL/ID">de-MUS-018313/744501</lido:objectID>
# self._log(f" looking up ISIL for relWork")
objectID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
objectID_N.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
# we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA!
try:
verwInst = relWorkM.xpath(
Expand All @@ -473,10 +476,10 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
self._log(f"WARNING: verwaltendeInstitution empty! {mtype} {id_int}")
else:
ISIL = self._lookup_ISIL(institution=verwInst.text)
objectID.text = f"{ISIL}/{str(id_int)}"
objectID_N.text = f"{ISIL}/{str(id_int)}"
print(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
else:
self._del_relWork(ID=objectID) # rm from lido lvl2
self._del_relWork(ID_N=objectID_N) # rm from lido lvl2


if __name__ == "__main__":
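
To make the rewrite step above concrete: _rewrite_relWork either drops the whole relatedWorkSet (via _del_relWork) or rewrites the objectID as ISIL/ID. The stand-alone sketch below mirrors that flow on a bare lxml node; the function name and the online/isil arguments are stand-ins, not the module's API.

from lxml import etree

LIDO_NS = "http://www.lido-schema.org"


def rewrite_rel_work(objectID_N: etree._Element, *, online: bool, isil: str) -> None:
    if not online:
        # unpublished relWork: remove the enclosing relatedWorkSet from the tree
        relWorkSet = objectID_N.getparent().getparent().getparent()
        relWorkSet.getparent().remove(relWorkSet)
        return
    # published relWork: mark the source as ISIL/ID and prefix the id with the ISIL,
    # e.g. de-MUS-018313/744501 as in the comment above
    objectID_N.attrib[f"{{{LIDO_NS}}}source"] = "ISIL/ID"
    objectID_N.text = f"{isil}/{objectID_N.text}"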