
more refactoring...
mokko committed Mar 3, 2024
1 parent 17b2db9 commit 4a59092
Showing 2 changed files with 106 additions and 93 deletions.
14 changes: 7 additions & 7 deletions zml2lido/lidoTool.py
@@ -136,7 +136,7 @@ def to_lvl2Single(self, *, src: str | Path) -> Path:
# self.lc.relWorks_cache_single(fn=src)
self.lc.rmUnpublishedRecords() # remove unpublished records (not on SMB-Digital)
self.lc.fixRelatedWorks()
self.lc.saveTree(out_fn)
self.lc.save(out_fn)
else:
print(f" lvl2 already exists: {out_fn}")
return out_fn
@@ -190,7 +190,7 @@ def splitSachbegriffSingle(self, *, src: str) -> Path:
os.chdir(orig)
return xslDir / out

def validate(self, *, p: str | Path | None = None):
def validate(self, *, path: Path | None = None):
"""
Optionally, specify a path for the file that needs validation. If
path is None, the file that was specified during __init__ is validated.
@@ -200,20 +200,20 @@ def validate(self, *, p: str | Path | None = None):
(Not tested recently for chunks...)
"""

if p is None:
to_val_fn = self.src
if path is None:
to_val_fn = Path(self.src)
else:
to_val_fn: Path = Path(p)
to_val_fn = path

print(f"VALIDATING LIDO FILE {to_val_fn}")
print(f"VALIDATING LIDO FILE '{to_val_fn}'")
if self.chunks:
print(" with chunks")
for chunkFn in self.loopChunks(src=to_val_fn):
self.validateSingle(src=chunkFn)
else:
self.validateSingle(src=to_val_fn)

def validateSingle(self, *, src):
def validateSingle(self, *, src: Path):
if not hasattr(self, "schema"):
print(f" loading schema {lidoXSD}")
schemaDoc = etree.parse(lidoXSD)
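Editor's note (illustrative, not part of the commit): validate() now takes path instead of p, so callers written against the old keyword need updating. A minimal sketch, assuming the enclosing lidoTool class is instantiated as lt (the class name is not visible in this diff) and that the example file name is hypothetical:

    from pathlib import Path

    # before this commit
    lt.validate(p="output/example.lido.xml")

    # after this commit
    lt.validate(path=Path("output/example.lido.xml"))
    lt.validate()  # validates the file passed to __init__ (self.src)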
185 changes: 99 additions & 86 deletions zml2lido/linkChecker.py
@@ -9,13 +9,17 @@
USAGE:
lc = LinkChecker(src="path/to/file.lido.xml")
#related works
lc.relWorks_cache = lc.init_relWorks_cache()
lc.relWorks_cache_single(fn="path/to/file.lido.xml") # parse fn for relWorks and populate cache
lc.relWorks_cache_many(first="path/to/file.lido.xml") # parse all chunks starting at first and populate cache
lc.fixRelatedWorks() # removes dead links in relatedWorks, also adds ISIL
lc.linkResource_online_http() # for all linkResources print online status
lc.relWorks_cache_single(fn="path/to/file.lido.xml") # parse fn for relWorks and populate cache
lc.rmInternalLinks() # remove linkResource with internal links, not used atm
lc.rmUnpublishedRecords() # removes objects without objectPublishedID
lc.saveTree(out_fn="path/to/lido.lvl2.xml")
lc.save(out_fn="path/to/lido.lvl2.xml")
"""

@@ -40,15 +44,17 @@ def __init__(self, *, src: str | Path, chunks: bool = False) -> None:
self._log(f"STATUS: LinkChecker is working on {src}") # not exactly an error
self.src = Path(src)
# self.chunk = chunk
self.relWorksFn = self.src.parent / "relWorks.cache.xml"
self.tree = etree.parse(str(src))
# we used to not prepare the relWorksCache here. Why?
self._init_relWorks_cache()
self.relWorks_fn = self.src.parent / "relWorks.cache.xml"
self.data = etree.parse(str(src))
self.client = MpApi(baseURL=baseURL, user=user, pw=pw)
self.relWorks_cache = self.init_relWorks_cache() # load file if it exists

if chunks:
print("prepare relWorks cache (chunks, many)")
self._relWorks_cache_many(first=src) # run only once to make cache
self.relWorks_cache_many(first=src) # run only once to make cache
# why wouldn't I load the first file into the cache?
else:
self.relWorks_cache_single(fn=src)

def fixRelatedWorks(self) -> None:
"""
@@ -60,7 +66,7 @@ def fixRelatedWorks(self) -> None:
"fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs"
)

relatedWorksL = self.tree.xpath(
relatedWorksL = self.data.xpath(
"""/l:lidoWrap/l:lido/l:descriptiveMetadata/l:objectRelationWrap/
l:relatedWorksWrap/l:relatedWorkSet/l:relatedWork/l:object/l:objectID""",
namespaces=NSMAP,
@@ -92,17 +98,37 @@ def fixRelatedWorks(self) -> None:
# print("WARN: No check for mtype 'Literature'")
else:
# print(f"fixing relatedWork {mtype} {id_int}")
if not self.relWorks.item_exists(mtype=mtype, ID=id_int):
if not self.relWorks_cache.item_exists(mtype=mtype, ID=id_int):
self._add_to_relWorks_cache(mtype=mtype, ID=id_int)
# at this point we can rely on item being in relWorks cache
self._rewrite_relWork(mtype=mtype, objectID=objectID_N)

def init_relWorks_cache(self) -> Module:
"""
Initializes the relWorks cache. If the cache file already exists, load and
return it; otherwise return an empty Module.
"""
if Path(self.relWorks_fn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorks_fn}")
return Module(file=self.relWorks_fn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorks_fn}")
return Module()

def linkResource_online_http(self) -> None:
"""
For all linkResources in self.tree, check if url responds ok using http.
For all linkResources in self.data, check if the URL responds OK via HTTP.
Prints the result (which is a bit awkward).
"""
linkResourceL = self.tree.xpath(
linkResourceL = self.data.xpath(
"/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
namespaces=NSMAP,
)
@@ -124,6 +150,40 @@
else:
print("\tsuccess")

def relWorks_cache_many(self, *, first: str | Path) -> None:
"""
Reads relatedWorks from all chunks into relWorks_cache, but only until the cache is full.
In case we're in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode.
expects
-first: the path to the first chunk
TODO: First collect all relWork IDs into the set and then make one big
query. That should be faster. But before doing that, we need to test
whether the current version works in non-chunk mode.
If the relWorks_cache gets too big (~1GB xml file), split the chunks
into multiple dirs and process separately.
"""
ID_cache = set() # set of relWork ids, no duplicates
chunk_fn = Path(first)
# if the cache is already at max_size, we don't do anything;
# otherwise we keep loading more chunks
if len(self.relWorks_cache) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
try:
chunk_fn = self._nextChunk(fn=chunk_fn)
except (FileNotFoundError, SyntaxError):
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.relWorks_cache) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)
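# --- Editor's note: illustrative sketch, not part of this commit -----------
# Which cache-priming path runs is decided by the chunks flag in __init__:
#   LinkChecker(src=first_chunk_fn, chunks=True) -> relWorks_cache_many(first=src)
#   LinkChecker(src=lido_fn)                     -> relWorks_cache_single(fn=src)
# relWorks_cache_many() keeps collecting relWork IDs from successive chunks
# (via _nextChunk) until relWorks_maxSize is reached, then grows the cache once.
# ----------------------------------------------------------------------------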

def relWorks_cache_single(self, *, fn: str | Path) -> None:
"""
Extracts IDs from one file (fn), queries RIA for those IDs and adds new info to
@@ -145,7 +205,7 @@ def rmInternalLinks(self) -> None:
Not currently used.
"""
self._log("resourceSet: Removing sets with remaining internal links")
linkResourceL = self.tree.xpath(
linkResourceL = self.data.xpath(
"/l:lidoWrap/l:lido/l:administrativeMetadata/l:resourceWrap/l:resourceSet/l:resourceRepresentation/l:linkResource",
namespaces=NSMAP,
)
@@ -164,7 +224,7 @@ def rmUnpublishedRecords(self) -> None:
# self._log(
# " LinkChecker: Removing lido records that are not published on recherche.smb"
# )
recordsL = self.tree.xpath(
recordsL = self.data.xpath(
"/l:lidoWrap/l:lido[not(l:objectPublishedID)]", namespaces=NSMAP
)
for recordN in recordsL:
@@ -173,13 +233,13 @@
recordN.getparent().remove(recordN)
self._log("rmUnpublishedRecords: done!")

def saveTree(self, out_fn: str | Path) -> str:
def save(self, out_fn: str | Path) -> str:
"""
During __init__ we loaded a LIDO file; this method writes it back to the
location given by out_fn.
"""
self._log(f"Writing back to {out_fn}")
self.tree.write(
self.data.write(
str(out_fn), pretty_print=True, encoding="UTF-8", xml_declaration=True
)
return out_fn
@@ -207,9 +267,9 @@ def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
relWork = self.client.search2(query=q)
if relWork: # realistic that query results are empty?
# appending them to relWork cache
self.relWorks += relWork
self.relWorks_cache += relWork
# print (" update file cache")
self.relWorks.toFile(path=self.relWorksFn)
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _del_relWork(self, *, ID) -> None:
"""
@@ -254,7 +314,7 @@ def _file_to_ID_cache(self, chunk_fn: Path, ID_cache: set) -> set:
if ID.text is not None and mType == "Object":
# only add this to ID_cache if not yet in relWorks cache
objId = int(ID.text)
if not self.relWorks.item_exists(mtype="Object", ID=objId):
if not self.relWorks_cache.item_exists(mtype="Object", ID=objId):
ID_cache.add(objId)
print(f" adding {len(ID_cache)} IDs")
return ID_cache
@@ -265,7 +325,7 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
to self.relWorks_cache, also write to disk
"""
print(
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks)}"
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks_cache)}"
)
if len(ID_cache) > 0:
q = Search(module="Object", limit=-1)
@@ -295,33 +355,13 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
self.relWorks_cache += newRelWorksM
# save the cache to file after processing every chunk
# no max_size limitation
self.relWorks.toFile(path=self.relWorksFn)
self.relWorks_cache.toFile(path=self.relWorks_fn)

def _init_relWorks_cache(self):
"""
Initializes self.refWorks cache. If cache file already exists, load it. Else
initialize empty self.refWorks.
"""
if Path(self.relWorksFn).exists():
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorksFn}")
self.relWorks = Module(file=self.relWorksFn)
# else:
# print("Inline cache exists already")
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorksFn}")
self.relWorks = Module()

def _log(self, msg):
def _log(self, msg: str) -> None:
print(msg)
logging.info(msg)

def _lookup_ISIL(self, *, institution):
def _lookup_ISIL(self, *, institution) -> str:
"""
Load vocmap.xml and lookup ISIL for name of institution.
Expand All @@ -347,7 +387,7 @@ def _lookup_ISIL(self, *, institution):
)
return ISIL.text

def _nextChunk(self, *, fn: Path):
def _nextChunk(self, *, fn: str | Path) -> Path:
"""
Returns the path/name of the next chunk if it exists or errors if the src
is not chunkable or the next chunk does not exist.
@@ -365,11 +405,11 @@ def _nextChunk(self, *, fn: Path):
if Path(new_path).exists():
return new_path
else:
raise FileNotFoundError("chunk does not exist")
raise FileNotFoundError(f"ERROR: chunk does not exist '{new_path}'")
else:
raise SyntaxError("not chunkable")
raise SyntaxError("ERROR: Filename not chunkable")

def _optimize_relWorks_cache(self, *, query):
def _optimize_relWorks_cache(self, *, query: Search) -> Search:
"""
Let's shrink (optimize) the XML: we only need a couple of fields.
"""
@@ -384,16 +424,16 @@ def _optimize_relWorks_cache(self, *, query):
query.validate(mode="search")
return query

def _relWork_online(self, *, modType: str, modItemId: int):
def _relWork_online(self, *, mtype: str, modItemId: int) -> bool:
"""
Checks if a specific relWork is online. No URL request; just examines whether
SMB-Freigabe = Ja.
Expects modItemId as int; but str should work as well.
"""
r = self.relWorks.xpath(
r = self.relWorks_cache.xpath(
f"""/m:application/m:modules/m:module[
@name = '{modType}']/m:moduleItem[
@name = '{mtype}']/m:moduleItem[
@id = {str(modItemId)}]/m:repeatableGroup[
@name = 'ObjPublicationGrp']/m:repeatableGroupItem[
m:vocabularyReference[@name='PublicationVoc']/m:vocabularyReferenceItem[@name='Ja']
@@ -405,53 +445,26 @@ def _relWork_online(self, *, modType: str, modItemId: int):
else:
return False

def _relWorks_cache_many(self, *, first):
"""
creates relatedWorksCache from all chunks
In case we're in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode.
expects
-first: the path to the first chunk (as str or Path)
TODO: Let's first pass all the relWork.IDs into the set and then make one big
query. That should be faster. But before we do that, we need to test
if current version works non-chunk version.
If the relWorksCache gets too big (~1GB xml file), split the chunks
into multiple dirs and process separately.
"""
ID_cache = set() # set of relWork ids, no duplicates
chunk_fn = Path(first)
# if the cache is already at max_size, we dont need this step
if len(self.relWorks) >= relWorks_maxSize:
return None
while chunk_fn.exists():
ID_cache = self._file_to_ID_cache(chunk_fn, ID_cache)
try:
chunk_fn = self._nextChunk(fn=chunk_fn)
except:
# print (" breaking the while")
break # break the while if this is the only data file or the last chunk
if len(ID_cache) + len(self.refWorks) >= relWorks_maxSize:
break
self._grow_relWorks_cache(ID_cache)

def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
"""
if relWork unpublic delete; otherwise rewrite
if relWork is not public, delete it from the lvl2 LIDO file; otherwise rewrite relWork using its ISIL
"""
id_int = int(objectID.text)

if self._relWork_online(modType=mtype, modItemId=id_int):
# we can rely on item being in cache, says I
try:
relWorkM = self.relWorks_cache[(mtype, id_int)]
except:
print(f"WARNING: no relWork found for {mtype} {id_int}")

if self._relWork_online(mtype=mtype, modItemId=id_int):
# rewrite ISIL, should look like this:
# <lido:objectID lido:type="local" lido:source="ISIL/ID">de-MUS-018313/744501</lido:objectID>
# self._log(f" looking up ISIL for relWork")
objectID.attrib["{http://www.lido-schema.org}source"] = "ISIL/ID"
# we're assuming there is always a verwaltendeInstitution, but that is not enforced by RIA!
try:
verwInst = relWork.xpath(
verwInst = relWorkM.xpath(
"""//m:moduleReference[
@name='ObjOwnerRef'
]/m:moduleReferenceItem/m:formattedValue"""
@@ -463,7 +476,7 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
objectID.text = f"{ISIL}/{str(id_int)}"
print(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
else:
self._del_relWork(ID=objectID)
self._del_relWork(ID=objectID) # rm from lido lvl2


if __name__ == "__main__":
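Editor's note, a minimal post-commit usage sketch (not part of the diff). It mirrors the call sequence in lidoTool.to_lvl2Single above; the import path and the credential setup (baseURL, user, pw) are assumptions:

    from pathlib import Path
    from zml2lido.linkChecker import LinkChecker  # assumed import path

    lc = LinkChecker(src=Path("path/to/file.lido.xml"))  # parses the file into self.data, primes self.relWorks_cache
    lc.rmUnpublishedRecords()                # drop records without objectPublishedID
    lc.fixRelatedWorks()                     # drop dead relatedWorks, rewrite objectIDs with ISIL
    lc.save(out_fn="path/to/file.lvl2.xml")  # was saveTree() before this commit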
