Commit

committing working version before transition to new relWorksCache

mokko committed Mar 8, 2024
1 parent 4a59092 commit 1071953
Showing 7 changed files with 111,430 additions and 51 deletions.
111,093 changes: 111,093 additions & 0 deletions test/group416397-chunk1.lido.xml

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions test/relWorks_cache.xml
@@ -0,0 +1,31 @@
<application xmlns="http://www.zetcom.com/ria/ws/module">
  <modules>
    <module name="Object" totalSize="1">
      <moduleItem hasAttachments="true" id="2268694" uuid="2268694">
        <systemField dataType="Timestamp" name="__lastModified">
          <value>2024-02-20 12:44:39.884</value>
          <formattedValue language="de">20.02.2024 12:44</formattedValue>
        </systemField>
        <repeatableGroup name="ObjPublicationGrp" size="1">
          <repeatableGroupItem id="56748382" uuid="a833945d-a46a-4e5c-9047-d3b76fe0f30f">
            <vocabularyReference name="PublicationVoc" id="62649" instanceName="ObjPublicationVgr">
              <vocabularyReferenceItem id="1810139" name="Ja">
                <formattedValue language="de">Ja</formattedValue>
              </vocabularyReferenceItem>
            </vocabularyReference>
            <vocabularyReference name="TypeVoc" id="62650" instanceName="ObjPublicationTypeVgr">
              <vocabularyReferenceItem id="2600647" name="Daten freigegeben für SMB-digital">
                <formattedValue language="de">Daten freigegeben für SMB-digital</formattedValue>
              </vocabularyReferenceItem>
            </vocabularyReference>
          </repeatableGroupItem>
        </repeatableGroup>
        <moduleReference name="ObjOwnerRef" targetModule="Address" multiplicity="N:1" size="1">
          <moduleReferenceItem moduleItemId="67678" uuid="67678">
            <formattedValue language="de">Ethnologisches Museum, Staatliche Museen zu Berlin</formattedValue>
          </moduleReferenceItem>
        </moduleReference>
      </moduleItem>
    </module>
  </modules>
</application>
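
For orientation: the cached record above can be checked for the SMB-digital release flag with a few lines of lxml. This is a minimal sketch against the sample file, not part of the commit; the XPath values simply mirror the record shown above.

from lxml import etree

NSMAP = {"m": "http://www.zetcom.com/ria/ws/module"}

tree = etree.parse("test/relWorks_cache.xml")
for item in tree.xpath("//m:moduleItem", namespaces=NSMAP):
    published = item.xpath(
        ".//m:vocabularyReference[@name='PublicationVoc']"
        "/m:vocabularyReferenceItem[@name='Ja']",
        namespaces=NSMAP,
    )
    smb_digital = item.xpath(
        ".//m:vocabularyReference[@name='TypeVoc']"
        "/m:vocabularyReferenceItem[@name='Daten freigegeben für SMB-digital']",
        namespaces=NSMAP,
    )
    print(item.get("id"), bool(published and smb_digital))  # 2268694 True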
9 changes: 9 additions & 0 deletions test/test_file.py
@@ -0,0 +1,9 @@
from pathlib import Path
from zml2lido.file import per_chunk, unzip


def test_per_chunk():
    p = Path(r"C:\m3\zml2lido\sdata\GG\20240307\query516069-chunk1.lido.xml")
    if not p.exists():
        raise FileNotFoundError("p not found!")
    assert 2 == len(list(per_chunk(p)))
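
The test raises when the machine-specific fixture path is missing; a skip-based variant (a sketch, not part of this commit) would be:

import pytest
from pathlib import Path
from zml2lido.file import per_chunk


def test_per_chunk_or_skip():
    p = Path(r"C:\m3\zml2lido\sdata\GG\20240307\query516069-chunk1.lido.xml")
    if not p.exists():
        pytest.skip(f"fixture not found: {p}")  # skip instead of failing hard
    assert 2 == len(list(per_chunk(p)))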
54 changes: 54 additions & 0 deletions test/test_relWorksCache.py
@@ -0,0 +1,54 @@
from pathlib import Path
from zml2lido.relWorksCache import RelWorksCache


def test_init():
    rw = RelWorksCache()
    assert rw.maxSize == 20_000
    rw = RelWorksCache(maxSize=40_000)
    assert rw.maxSize == 40_000
    # print (f"{rw.maxSize=}")


def test_add_relWork():
    """
    We test add_relWork and save...
    """
    fn = Path("relWorks_cache.xml")
    if fn.exists():
        fn.unlink()
    rw = RelWorksCache()
    rw.add_relWork(mtype="Object", ID=2268694)
    rw.save(path=fn)
    assert 1 == len(rw.cache)
    rw.add_relWork(mtype="Object", ID=3486950)
    assert 2 == len(rw.cache)
    # print (f"{rw=}")


def test_load_cache_file():
    fn = Path("relWorks_cache.xml")
    if fn.exists():
        fn.unlink()
    rw = RelWorksCache()
    rw.add_relWork(mtype="Object", ID=2268694)
    rw.save(path=fn)

    rw2 = RelWorksCache()
    rw2.load_cache_file(path=fn)
    assert 1 == rw2.length()


def test_lido_to_ids():
    """
    Also tests 'add_from_lido_file'
    """
    rw = RelWorksCache()
    lido_fn = Path("group416397-chunk1.lido.xml")
    ids = rw._lido_to_ids(path=lido_fn)
    assert 171 == len(ids)
    rw.add_from_lido_file(path=lido_fn)
    ids2 = rw._lido_to_ids(path=lido_fn)
    assert 0 == len(ids2)
    assert 171 == rw.length()
    print(f"{rw.length()}")
41 changes: 41 additions & 0 deletions zml2lido/file.py
@@ -0,0 +1,41 @@
"""
WORK IN PROGRESS - File helpers for zml2lido
We're currently only zipping zml files, not lido files automatically
unpacked_path = unzip(Path("group1234-chunk1.zip")
for chunk in per_chunk(chunk_path):
do_something_with(chunk)
"""
from zipfile import ZipFile
from pathlib import Path
import re


def per_chunk(path: Path):
"""
Loop through chunks easily. Not yet used in production.
"""
path2 = path
while path2.exists():
yield path2
stem = str(path2).split(".lido.xml")[0]
m = re.search(r"-chunk(\d+)$", stem)
if m:
no = int(m.group(1))
new_no = no + 1
head = re.sub(r"\d+$", "", stem)
path2 = Path(f"{head}{new_no}.lido.xml")
else:
raise Exception("Not chunkable")


def unzip(path: Path):
parent_dir = path.parent
member = Path(path.name).with_suffix(".xml")
temp_fn = parent_dir / member
with ZipFile(path, "r") as zippy:
zippy.extract(str(member), path=parent_dir)
return temp_fn
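
A short usage sketch with hypothetical file names: unzip() extracts the single .xml member next to the archive, and per_chunk() keeps yielding -chunk2, -chunk3, ... until a file is missing.

from pathlib import Path
from zml2lido.file import per_chunk, unzip

xml_fn = unzip(Path("sdata/group1234.zip"))  # -> sdata/group1234.xml

for chunk_fn in per_chunk(Path("sdata/group1234-chunk1.lido.xml")):
    print(chunk_fn)  # group1234-chunk1.lido.xml, group1234-chunk2.lido.xml, ...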
105 changes: 54 additions & 51 deletions zml2lido/linkChecker.py
@@ -21,6 +21,14 @@
lc.save(out_fn="path/to/lido.lvl2.xml")
composition over inheritance
#relWorksCache is a Module() with limited fields
rw = relWorks(maxSize=20_000) # load cache file or nothing
rw.add(mtype, ID)? # add a single item to cache. Do we respect max_size? was: _add_to_relWorks_cache
rw.add_from_lido_file(fn=path) # grow cache by new items from a single file; respects max_size
rw.add_from_lido_chunks(first=path) # grow cache by new items from all chunks, starting with the first file; respects max_size
rw.exists(mtype="Object", ID=1234) # true if item exists in cache
"""

import logging
@@ -42,17 +50,16 @@
class LinkChecker:
def __init__(self, *, src: str | Path, chunks: bool = False) -> None:
self._log(f"STATUS: LinkChecker is working on {src}") # not exactly an error
self.src = Path(src)
# self.chunk = chunk
self.relWorks_fn = self.src.parent / "relWorks.cache.xml"
self.data = etree.parse(str(src))
self.client = MpApi(baseURL=baseURL, user=user, pw=pw)
self.relWorks_fn = Path(src).parent / "relWorks.cache.xml"
self.relWorks_cache = self.init_relWorks_cache() # load file if it exists

# why wouldnt I load the first file into the cache?
if chunks:
print("prepare relWorks cache (chunks, many)")
self.relWorks_cache_many(first=src) # run only once to make cache
# why wouldnt I load the first file into the cache?
else:
self.relWorks_cache_single(fn=src)

@@ -61,6 +68,8 @@ def fixRelatedWorks(self) -> None:
Frank doesn't want dead links in relatedWorks. So we loop thru them, check
if they are SMB-approved (using MpApi) and, if not, we remove them. We
also include ISILs in the same step.
relWork's objectID is at relWork/object/objectID
"""
self._log(
"fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs"
Expand Down Expand Up @@ -101,24 +110,16 @@ def fixRelatedWorks(self) -> None:
if not self.relWorks_cache.item_exists(mtype=mtype, ID=id_int):
self._add_to_relWorks_cache(mtype=mtype, ID=id_int)
# at this point we can rely on item being in relWorks cache
self._rewrite_relWork(mtype=mtype, objectID=objectID_N)
self._rewrite_relWork(mtype=mtype, objectID_N=objectID_N)

def init_relWorks_cache(self) -> Module:
"""
Initializes the relWorks cache. If the cache file already exists, load it. Else
return an empty Module.
"""
if Path(self.relWorks_fn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorks_fn}")
return Module(file=self.relWorks_fn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorks_fn}")
return Module()
Expand Down Expand Up @@ -171,7 +172,7 @@ def relWorks_cache_many(self, *, first: str | Path) -> None:
chunk_fn = Path(first)
# if the cache is already at max_size, we dont do anything
# else we keep loading more chunks
if len(self.relWorks) >= relWorks_maxSize:
if len(self.relWorks_cache) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
@@ -180,7 +181,7 @@
except:
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.refWorks) >= relWorks_maxSize:
if len(ID_cache) + len(self.relWorks_cache) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)

@@ -191,9 +192,8 @@ def relWorks_cache_single(self, *, fn: str | Path) -> None:
This function currently seems to be so slow that it's useless.
"""
fn = Path(fn)
ID_cache = set() # set of relWork ids, no duplicates
ID_cache = self._file_to_ID_cache(fn, ID_cache)
ID_cache = self._file_to_ID_cache(Path(fn), ID_cache)
print(f"growing relWorks with ids from {fn}")
self._grow_relWorks_cache(ID_cache)

Expand Down Expand Up @@ -271,13 +271,13 @@ def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
# print (" update file cache")
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _del_relWork(self, *, ID) -> None:
def _del_relWork(self, *, ID_N: Any) -> None:
"""
delete a relWork from self.etree.
ID is a lxml node
"""
self._log(f" removing unpublic relWork {ID.text}")
relWorkSet = ID.getparent().getparent().getparent()
self._log(f" removing unpublic relWork {ID_N.text}")
relWorkSet = ID_N.getparent().getparent().getparent()
relWorkSet.getparent().remove(relWorkSet)

def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
@@ -298,8 +298,8 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:

print(f" _file_to_ID_cache {len(relWorksL)} relWorks")

for ID in relWorksL:
src = ID.xpath("@l:source", namespaces=NSMAP)[0]
for ID_N in relWorksL:
src = ID_N.xpath("@l:source", namespaces=NSMAP)[0]
if src == "OBJ.ID":
mType = "Object"
elif src == "LIT.ID":
@@ -311,18 +311,22 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
# if len(ID_cache) >= relWorks_maxSize:
# print("break here")
# break
if ID.text is not None and mType == "Object":
if ID_N.text is not None and mType == "Object":
# only add this to ID_cache if not yet in relWorks cache
objId = int(ID.text)
objId = int(ID_N.text)
if not self.relWorks_cache.item_exists(mtype="Object", ID=objId):
ID_cache.add(objId)
print(f" adding {len(ID_cache)} IDs")
return ID_cache

def _grow_relWorks_cache(self, ID_cache: set) -> None:
"""
Make one query with all the IDs from ID_cache, execute the query and save the results
to self.relWorks, also write to disk
Make a query with the IDs from ID_cache, execute the query and save the results
to self.relWorks_cache, also write to disk.
Do we need a check if IDs are already in relWorks_cache? This should speed up
the procedure. Without it, existing results should not get added to the cache,
but in the new version we don't need to download them.
"""
print(
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks_cache)}"
@@ -332,30 +336,29 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
if len(ID_cache) > 1:
q.OR() # only or if more than 1

for id_ in sorted(ID_cache):
q.addCriterion(
operator="equalsField",
field="__id",
value=str(id_),
changed = False
for id_int in sorted(ID_cache):
if not self.relWorks_cache.item_exists(mtype="Object", ID=id_int):
changed = True
q.addCriterion(
operator="equalsField",
field="__id",
value=str(id_int),
)
if changed:
q = self._optimize_relWorks_cache(query=q)
# q.toFile(path="sdata/debug.search.xml")
print(
f" populating relWorks cache {len(ID_cache)} (max size {relWorks_maxSize})"
)
q = self._optimize_relWorks_cache(query=q)
# q.toFile(path="sdata/debug.search.xml")
print(
f" populating relWorks cache {len(ID_cache)} (max size {relWorks_maxSize})"
)
newRelWorksM = self.client.search2(query=q)
try:
self.relWorks
except:
newRelWorksM = self.client.search2(query=q)
# make a new cache (might be faster than adding to it)
self.relWorks = newRelWorksM
else:
# if relWorks exists already, add to it
self.relWorks_cache = newRelWorksM
print(" adding")
self.relWorks += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks.toFile(path=self.relWorks_fn)
self.relWorks_cache += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _log(self, msg: str) -> None:
print(msg)
Expand Down Expand Up @@ -445,11 +448,11 @@ def _relWork_online(self, *, mtype: str, modItemId: int) -> bool:
else:
return False

def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
def _rewrite_relWork(self, *, mtype: str, objectID_N: Any) -> None:
"""
if relWork unpublic, delete it from lvl2 lido file; otherwise rewrite relWork using ISIL
"""
id_int = int(objectID.text)
id_int = int(objectID_N.text)

# we can rely on item being in cache, says I
try:
@@ -461,7 +464,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
# rewrite ISIL, should look like this:
# <lido:objectID lido:type="local" lido:source="ISIL/ID">de-MUS-018313/744501</lido:objectID>
# self._log(f" looking up ISIL for relWork")
objectID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
objectID_N.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
# we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA!
try:
verwInst = relWorkM.xpath(
Expand All @@ -473,10 +476,10 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
self._log(f"WARNING: verwaltendeInstitution empty! {mtype} {id_int}")
else:
ISIL = self._lookup_ISIL(institution=verwInst.text)
objectID.text = f"{ISIL}/{str(id_int)}"
objectID_N.text = f"{ISIL}/{str(id_int)}"
print(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
else:
self._del_relWork(ID=objectID) # rm from lido lvl2
self._del_relWork(ID_N=objectID_N) # rm from lido lvl2


if __name__ == "__main__":
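
To make the rewrite step above concrete: _rewrite_relWork either drops the whole relatedWorkSet (via _del_relWork) or rewrites the objectID as ISIL/ID. The stand-alone sketch below mirrors that flow on a bare lxml node; the function name and the online/isil arguments are stand-ins, not the module's API.

from lxml import etree

LIDO_NS = "http://www.lido-schema.org"


def rewrite_rel_work(objectID_N: etree._Element, *, online: bool, isil: str) -> None:
    if not online:
        # unpublished relWork: remove the enclosing relatedWorkSet from the tree
        relWorkSet = objectID_N.getparent().getparent().getparent()
        relWorkSet.getparent().remove(relWorkSet)
        return
    # published relWork: mark the source as ISIL/ID and prefix the id with the ISIL,
    # e.g. de-MUS-018313/744501 as in the comment above
    objectID_N.attrib[f"{{{LIDO_NS}}}source"] = "ISIL/ID"
    objectID_N.text = f"{isil}/{objectID_N.text}"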