Skip to content

Commit

Permalink
trying to repair lido
Browse files Browse the repository at this point in the history
  • Loading branch information
mokko committed Mar 1, 2024
1 parent d8c2964 commit 305466e
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 31 deletions.
2 changes: 1 addition & 1 deletion zml2lido/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def saxon():
"-x", "--xsl", help="(xslt) transformation filename", required=True
)
args = parser.parse_args()
m = LidoTool(Input=args.source) # just to run saxon
m = LidoTool(src=args.source) # just to run saxon...

m.saxon(Input=args.source, xsl=args.xsl, output=args.output)

Expand Down
11 changes: 6 additions & 5 deletions zml2lido/lidoTool.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ def to_lvl2Single(self, *, src: str | Path) -> Path:
"""
out_fn = self._lvl2_path(src)
# print(f"lvl2: {out_fn}")
try: # only load the first time
self.lc: LinkChecker
except:
self.lc = LinkChecker(src=src, chunks=self.chunks) # reads cache
# try: # only load the first time
# self.lc: LinkChecker
# except:
self.lc = LinkChecker(src=src, chunks=self.chunks) # reads cache
if not out_fn.exists() or self.force:
# self.lc.relWorks_cache_single(fn=src)
self.lc.rmUnpublishedRecords() # remove unpublished records (not on SMB-Digital)
Expand Down Expand Up @@ -321,7 +321,8 @@ def saxon(self, *, src: str | Path, output: str | Path, xsl: str | Path) -> None
print(cmd)

subprocess.run(
cmd, check=True # , stderr=subprocess.STDOUT
cmd,
check=True, # , stderr=subprocess.STDOUT
) # overwrites output file without saying anything

#
Expand Down
48 changes: 23 additions & 25 deletions zml2lido/linkChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def __init__(self, *, src: str | Path, chunks: bool = False) -> None:
self.tree = etree.parse(str(src))
# we used to not prepare the relWorksCache here. Why?
self._init_relWorks_cache()
self.client = MpApi(baseURL=baseURL, user=user, pw=pw)

if chunks:
print("prepare relWorks cache (chunks, many)")
self._relWorks_cache_many(first=src) # run only once to make cache
Expand All @@ -43,13 +45,10 @@ def fixRelatedWorks(self) -> None:
    if they are SMB-approved (using MpApi) and, if not, we remove them. We
    also include ISILs in the same step.
"""

self._log(
"fixRelatedWorks: Removing relatedWorks that are not online and getting ISILs"
)

client = MpApi(baseURL=baseURL, user=user, pw=pw)

relatedWorksL = self.tree.xpath(
"""/l:lidoWrap/l:lido/l:descriptiveMetadata/l:objectRelationWrap/
l:relatedWorksWrap/l:relatedWorkSet/l:relatedWork/l:object/l:objectID""",
Expand Down Expand Up @@ -184,17 +183,17 @@ def _add_to_relWorks_cache(self, *, mtype: str, ID: int) -> None:
Caution: Does not include a check if relWork is already in cache.
"""
print(f" getting item from online RIA {modType} {id_int}")
print(f" getting item from online RIA {mtype} {ID}")
# if not, get it now and add to cache
q = Search(module=mType, limit=-1)
q = Search(module=mtype, limit=-1)
q.addCriterion(
operator="equalsField",
field="__id",
value=str(id_int),
value=str(ID),
)
q = self._optimize_relWorks_cache(query=q)
# q.toFile(path="sdata/debug.search.xml")
relWork = client.search2(query=q)
relWork = self.client.search2(query=q)
if relWork: # realistic that query results are empty?
# appending them to relWork cache
self.relWorks += relWork
Expand Down Expand Up @@ -256,7 +255,6 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
print(
f" _grow_relWorks_cache: new IDs: {len(ID_cache)} relWorks:{len(self.relWorks)}"
)
client = MpApi(baseURL=baseURL, user=user, pw=pw)
if len(ID_cache) > 0:
q = Search(module="Object", limit=-1)
if len(ID_cache) > 1:
Expand All @@ -273,7 +271,7 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:
print(
f" populating relWorks cache {len(ID_cache)} (max size {relWorks_maxSize})"
)
newRelWorksM = client.search2(query=q)
newRelWorksM = self.client.search2(query=q)
try:
self.relWorks
except:
Expand All @@ -289,20 +287,20 @@ def _grow_relWorks_cache(self, ID_cache: set) -> None:

def _init_relWorks_cache(self):
"""
Initializes self.refWorks cache. If cache file exists, load it. May
also initialize empty self.refWorks.
Initializes self.refWorks cache. If cache file already exists, load it. Else
initialize empty self.refWorks.
"""
if Path(self.relWorksFn).exists():
try:
self.relWorks
except:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorksFn}")
self.relWorks = Module(file=self.relWorksFn)
# if we read relWorks cache from file we dont loop thru data files (chunks)
# looking for all the relWorks to fill the cache as best as we can
# try:
# self.relWorks
# except NameError:
# print("Inline cache not loaded yet")
print(f" Loading existing relWorks cache {self.relWorksFn}")
self.relWorks = Module(file=self.relWorksFn)
# else:
# print("Inline cache exists already")
            # if we read the relWorks cache from file, we don't loop through data files (chunks)
            # looking for all the relWorks to fill the cache as best as we can
else:
print(f" No relWorks file to load at {self.relWorksFn}")
self.relWorks = Module()
Expand Down Expand Up @@ -401,8 +399,8 @@ def _relWorks_cache_many(self, *, first):
"""
creates relatedWorksCache from all chunks
In case, we in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode
In case we're in chunk mode, the normal preparation is inefficient, so let's see
if we can speed things up by offering a separate cache for chunk mode.
expects
-first: the path to the first chunk (as str or Path)
Expand Down Expand Up @@ -434,9 +432,9 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
"""
if relWork unpublic delete; otherwise rewrite
"""
id_int = int(ID.text)
id_int = int(objectID.text)

if self._relWork_online(modType=modType, modItemId=id_int):
if self._relWork_online(modType=mtype, modItemId=id_int):
# rewrite ISIL, should look like this:
# <lido:objectID lido:type="local" lido:source="ISIL/ID">de-MUS-018313/744501</lido:objectID>
# self._log(f" looking up ISIL for relWork")
Expand All @@ -449,13 +447,13 @@ def _rewrite_relWork(self, *, mtype: str, objectID: Any) -> None:
]/m:moduleReferenceItem/m:formattedValue"""
)[0]
except:
self._log(f"WARNING: verwaltendeInstitution empty! {modType} {id_int}")
self._log(f"WARNING: verwaltendeInstitution empty! {mtype} {id_int}")
else:
ISIL = self._lookup_ISIL(institution=verwInst.text)
objectID.text = f"{ISIL}/{str(id_int)}"
print(f" relWork {id_int}: {verwInst.text} -> {ISIL}")
else:
self._del_relWork(objectID=objectID)
self._del_relWork(ID=objectID)


if __name__ == "__main__":
Expand Down

0 comments on commit 305466e

Please sign in to comment.