From 494748d49812aee6ab02a55b248343f21bdf4a08 Mon Sep 17 00:00:00 2001 From: Maurice Mengel Date: Wed, 19 Jun 2024 16:02:34 +0200 Subject: [PATCH] lido receives new command line option to disable rescanning lvl1 files as preparation of relWorksCache --- zml2lido/__init__.py | 19 ++++++++++++++++++- zml2lido/lidoTool.py | 7 +++++-- zml2lido/linkChecker.py | 10 +++++----- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/zml2lido/__init__.py b/zml2lido/__init__.py index 6c9251e..52aa80a 100644 --- a/zml2lido/__init__.py +++ b/zml2lido/__init__.py @@ -67,6 +67,13 @@ def lido(): required=False, action="store_true", ) + parser.add_argument( + "-d", + "--disablerescan", + action="store_true", + default=False, + help="set to disable rescanning available lvl1 lido files to pre-populate relWorksCache", + ) parser.add_argument("-i", "--input", help="zml input file", required=True) parser.add_argument("-j", "--job", help="pick job (e.g. smb or dd)", required=True) parser.add_argument( @@ -80,8 +87,18 @@ def lido(): print(f"JOB: {args.job}") + if args.disablerescan is True: + print(f"rescan is off (False)") + rescan = False + else: + rescan = True + lt = LidoTool( - src=args.input, force=args.force, validation=args.validate, chunks=args.chunks + src=args.input, + force=args.force, + validation=args.validate, + chunks=args.chunks, + rescan=rescan, ) lt.execute(args.job) diff --git a/zml2lido/lidoTool.py b/zml2lido/lidoTool.py index 43b998a..c9af6f1 100644 --- a/zml2lido/lidoTool.py +++ b/zml2lido/lidoTool.py @@ -63,6 +63,7 @@ def __init__( force: bool = False, validation: bool = False, chunks: bool = False, + rescan: bool = True, ) -> None: """ src: lido file or first chunk @@ -75,7 +76,7 @@ def __init__( self.force = force self.chunks = chunks self.script_dir = Path(__file__).parents[1] - + self.rescan = rescan self.src = self._sanitize(src=src) # returns Path self.outdir = self._prepareOutdir() print(f" outdir {self.outdir}") @@ -143,7 +144,9 @@ def 
to_lvl2_single(self, *, src: Path) -> Path: except AttributeError: # only initalize and load lido files into relWorksCache once # need src here for path atm - self.lc = LinkChecker(src=src, chunks=self.chunks) + self.lc = LinkChecker( + src=src, chunks=self.chunks, rescan_lvl1_at_init=self.rescan + ) out_fn = self._lvl2_path(src) if not out_fn.exists() or self.force: self.lc.load_lvl1(src=src) diff --git a/zml2lido/linkChecker.py b/zml2lido/linkChecker.py index eef536e..74000c3 100644 --- a/zml2lido/linkChecker.py +++ b/zml2lido/linkChecker.py @@ -30,11 +30,11 @@ # from zml2lido import NSMAP NSMAP = {"l": "http://www.lido-schema.org"} -rescan_lvl1_files_at_init = False - class LinkChecker: - def __init__(self, *, src: Path, chunks: bool = False) -> None: + def __init__( + self, *, src: Path, chunks: bool = False, rescan_lvl1_at_init: bool = True + ) -> None: logging.debug( f"STATUS: LinkChecker is working on {src}" ) # not exactly an error @@ -46,7 +46,7 @@ def __init__(self, *, src: Path, chunks: bool = False) -> None: self.rwc = RelWorksCache(maxSize=20_000, cache_dir=cache_dir) self.rwc.load_cache_file() # load file if it exists once atb - if rescan_lvl1_files_at_init: + if rescan_lvl1_at_init: # run only once to update cache if self.chunks: print("prepare relWorks cache (chunks, many)") @@ -108,7 +108,7 @@ def fixRelatedWorks(self) -> None: print( f"{idx}/{len(relatedWorksL)}{objectID_N.text} already rewritten" ) - if idx % 100: + if idx % 100 == 0: self.rwc.save_if_changed() def linkResource_online_http(self) -> None: