Skip to content

Commit

Permalink
lido receives new command line option to disable rescanning lvl1 files…
Browse files Browse the repository at this point in the history
… as preparation of relWorksCache
  • Loading branch information
mokko committed Jun 19, 2024
1 parent 8b46606 commit 494748d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 8 deletions.
19 changes: 18 additions & 1 deletion zml2lido/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ def lido():
required=False,
action="store_true",
)
parser.add_argument(
"-d",
"--disablerescan",
action="store_true",
default=False,
help="set to disable rescanning available lvl1 lido files to pre-populate relWorksCache",
)
parser.add_argument("-i", "--input", help="zml input file", required=True)
parser.add_argument("-j", "--job", help="pick job (e.g. smb or dd)", required=True)
parser.add_argument(
Expand All @@ -80,8 +87,18 @@ def lido():

print(f"JOB: {args.job}")

if args.disablerescan is True:
print(f"rescan is off (False)")
rescan = False
else:
rescan = True

lt = LidoTool(
src=args.input, force=args.force, validation=args.validate, chunks=args.chunks
src=args.input,
force=args.force,
validation=args.validate,
chunks=args.chunks,
rescan=rescan,
)
lt.execute(args.job)

Expand Down
7 changes: 5 additions & 2 deletions zml2lido/lidoTool.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
force: bool = False,
validation: bool = False,
chunks: bool = False,
rescan: bool = True,
) -> None:
"""
src: lido file or first chunk
Expand All @@ -75,7 +76,7 @@ def __init__(
self.force = force
self.chunks = chunks
self.script_dir = Path(__file__).parents[1]

self.rescan = rescan
self.src = self._sanitize(src=src) # returns Path
self.outdir = self._prepareOutdir()
print(f" outdir {self.outdir}")
Expand Down Expand Up @@ -143,7 +144,9 @@ def to_lvl2_single(self, *, src: Path) -> Path:
except AttributeError:
# only initialize and load lido files into relWorksCache once
# need src here for path atm
self.lc = LinkChecker(src=src, chunks=self.chunks)
self.lc = LinkChecker(
src=src, chunks=self.chunks, rescan_lvl1_at_init=self.rescan
)
out_fn = self._lvl2_path(src)
if not out_fn.exists() or self.force:
self.lc.load_lvl1(src=src)
Expand Down
10 changes: 5 additions & 5 deletions zml2lido/linkChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@
# from zml2lido import NSMAP
NSMAP = {"l": "http://www.lido-schema.org"}

rescan_lvl1_files_at_init = False


class LinkChecker:
def __init__(self, *, src: Path, chunks: bool = False) -> None:
def __init__(
self, *, src: Path, chunks: bool = False, rescan_lvl1_at_init: bool = True
) -> None:
logging.debug(
f"STATUS: LinkChecker is working on {src}"
) # not exactly an error
Expand All @@ -46,7 +46,7 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None:
self.rwc = RelWorksCache(maxSize=20_000, cache_dir=cache_dir)
self.rwc.load_cache_file() # load file if it exists once atb

if rescan_lvl1_files_at_init:
if rescan_lvl1_at_init:
# run only once to update cache
if self.chunks:
print("prepare relWorks cache (chunks, many)")
Expand Down Expand Up @@ -108,7 +108,7 @@ def fixRelatedWorks(self) -> None:
print(
f"{idx}/{len(relatedWorksL)}{objectID_N.text} already rewritten"
)
if idx % 100:
if idx % 100 == 0:
self.rwc.save_if_changed()

def linkResource_online_http(self) -> None:
Expand Down

0 comments on commit 494748d

Please sign in to comment.