Skip to content

Commit

Permalink
relWorksCache: save at the right moments
Browse files Browse the repository at this point in the history
  • Loading branch information
mokko committed Jun 17, 2024
1 parent 4f3041e commit 5ac56ee
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
2 changes: 1 addition & 1 deletion zml2lido/linkChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def fixRelatedWorks(self) -> None:
self.rwc.lookup_relWork(mtype=mtype, ID=id_int)
# at this point we can rely on item being in relWorks cache
self._rewrite_relWork(mtype=mtype, objectID_N=objectID_N)
if idx % 10:
if idx % 100:
print("Saving relWorks cache")
self.rwc.save()

Expand Down
19 changes: 11 additions & 8 deletions zml2lido/relWorksCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,8 @@ def lookup_from_lido_chunks(self, *, path: Path) -> None:
for p in per_chunk(path=path):
self.lookup_from_lido_file(path=p)
print(f"Cache size: {len(self.cache)}")
self.save()
if len(self.cache) > self.maxSize:
print("Cache is big enough. Let's go {len(self.cache)}")
self.save()
break
if len(self.cache) >= self.maxSize:
break  # don't continue the loop if the cache is already at maxSize

def lookup_from_lido_file(self, *, path: Path) -> None:
"""
Expand All @@ -69,12 +66,15 @@ def lookup_from_lido_file(self, *, path: Path) -> None:
been processed?
"""
print(f"relWorksCache: lookup_from_lido_file {path}")
IDs = self._lido_to_ids(path=path)
# all ids from a single lido file that are not yet in cache
IDs = self._lido_to_ids_not_in_cache(path=path)
for mtype, id_int in IDs:
self.lookup_relWork(mtype=mtype, ID=id_int)
if len(self.cache) > self.maxSize:
print("relWorksCache has reached maxSize")
self.save()
break
self.save()

def lookup_relWork(self, *, mtype: str, ID: int) -> None:
"""
Expand All @@ -88,6 +88,7 @@ def lookup_relWork(self, *, mtype: str, ID: int) -> None:
value=str(ID),
)
q = self._optimize_query(query=q)
print(f"{len(self.cache)} looking up relWork {mtype} {ID}")
relWorkM = self.client.search2(query=q)
if relWorkM: # realistic that query results are empty?
self.cache += relWorkM # appending them to relWork cache
Expand Down Expand Up @@ -153,15 +154,15 @@ def save(self) -> Path:
self.cache.toFile(path=path)
except KeyboardInterrupt:
print(
"Catching keyboard interrupt while saving of relWorksCache; try again..."
"Catching keyboard interrupt while saving relWorksCache; try again..."
)
return path

#
# private
#

def _lido_to_ids(self, path: Path) -> set[tuple[str, int]]:
def _lido_to_ids_not_in_cache(self, path: Path) -> set[tuple[str, int]]:
"""
Given the path to lido file, we return a (distinct) set of items that
are not yet in relWorks cache.
Expand All @@ -177,6 +178,7 @@ def _lido_to_ids(self, path: Path) -> set[tuple[str, int]]:
l:relatedWorksWrap/l:relatedWorkSet/l:relatedWork/l:object/l:objectID""",
namespaces=NSMAP,
)
print("lido to ids...")

id_cache = set()
for ID_N in relWorksL:
Expand All @@ -193,6 +195,7 @@ def _lido_to_ids(self, path: Path) -> set[tuple[str, int]]:
id_cache.add((mtype, id_int))
# else:
# print(f"item {mtype} {id_int} already in relWorks cache")
print("done")
return id_cache

def _optimize_query(self, *, query: Search) -> Search:
Expand Down

0 comments on commit 5ac56ee

Please sign in to comment.