Skip to content

Commit

Permalink
Return and log exception information while loading metadata for organize
Browse files Browse the repository at this point in the history
joblib does not setup automagically some kind of logging for Parallel.
Filed a dedicated issue to possibly see it implemented:

- #1495

For the sake of current use case (e.g. troubleshooting
#1494) it should largely suffice to
return and log information about exception which was raised while loading
metadata.  This is what is done in this PR and while using buggy hdmf we do get
nice logging in the log file at DEBUG level.  No attempt was made to reduce
a possible flood of duplicate log messages, since per-file metadata would have
unique values
  • Loading branch information
yarikoptic committed Sep 5, 2024
1 parent 90946b7 commit 6b579c0
Showing 1 changed file with 29 additions and 13 deletions.
42 changes: 29 additions & 13 deletions dandi/organize.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import os.path as op
from pathlib import Path, PurePosixPath
import re
import traceback
import uuid

import ruamel.yaml
Expand Down Expand Up @@ -841,44 +842,59 @@ def act(func, *args, **kwargs):
# react to those
# Doesn't play nice with Parallel
# with tqdm.tqdm(desc="Files", total=len(paths), unit="file", unit_scale=False) as pbar:
failed = []

def _get_metadata(path):
    """Load NWB metadata for *path*, capturing any exception raised.

    Returns a tuple ``(meta, exc)`` where ``meta`` is the metadata dict
    (always containing at least the ``"path"`` key) and ``exc`` is ``None``
    on success, or a triple ``(exception_class, str(exception),
    traceback.TracebackException)`` on failure.  The triple is used instead
    of the raw exception object so the result stays picklable across the
    joblib.Parallel worker boundary.
    """
    # Avoid heavy import by importing within function:
    from .metadata.nwb import get_metadata

    meta, exc = {}, None
    try:
        meta = get_metadata(path)
    except Exception as e:
        # Capture a serializable summary of the failure; details are logged
        # by the caller at DEBUG level.
        exc = (
            e.__class__,
            str(e),
            traceback.TracebackException.from_exception(e),
        )
    # pbar.update(1)
    meta["path"] = path
    return meta, exc

# Load metadata (and any captured exception) for every path; use joblib
# only when more than one job and more than one path are involved.
if (
    not devel_debug and jobs != 1 and not len(paths) == 1
):  # Do not use joblib at all if number_of_jobs=1
    # Note: It is Python (pynwb) intensive, not IO, so ATM there is little
    # to no benefit from Parallel without using multiproc! But that would
    # complicate progress bar indication... TODO
    metadata_excs = list(
        Parallel(n_jobs=jobs, verbose=10)(
            delayed(_get_metadata)(path) for path in paths
        )
    )
else:
    metadata_excs = list(map(_get_metadata, paths))
# Each element of metadata_excs is (meta, exc); exc is None on success.
exceptions = [e for _, e in metadata_excs if e]
if exceptions:
    # Summarize failures at WARNING; full tracebacks go to DEBUG below.
    lgr.warning(
        "Failed to load metadata for %d out of %d files "
        "due to following types of exceptions: %s. "
        "Details of the exceptions will be shown at DEBUG level",
        len(exceptions),
        len(paths),
        ", ".join(e[0].__name__ for e in exceptions),
    )
    for m, e in metadata_excs:
        if not e:
            continue
        lgr.debug(
            "Loading metadata for path %s resulted in following exception:\n%s",
            m["path"],
            # e[-1] is a traceback.TracebackException; .format() yields lines
            "\n".join(e[-1].format()),
        )

metadata, skip_invalid = filter_invalid_metadata_rows([m for m, e in metadata_excs])
if skip_invalid:
msg = (
"%d out of %d files were found not containing all necessary "
Expand Down

0 comments on commit 6b579c0

Please sign in to comment.