diff --git a/fgpyo/util/metric.py b/fgpyo/util/metric.py index 9a62775e..c64de555 100644 --- a/fgpyo/util/metric.py +++ b/fgpyo/util/metric.py @@ -192,7 +192,12 @@ def _parsers(cls) -> Dict[type, Callable[[str], Any]]: return {} @classmethod - def read(cls, path: Path, ignore_extra_fields: bool = True) -> Iterator["Metric[MetricType]"]: + def read( + cls, + path: Path, + ignore_extra_fields: bool = True, + comment_prefix: str = "#", + ) -> Iterator["Metric[MetricType]"]: """Reads in zero or more metrics from the given path. The metric file must contain a matching header. @@ -203,13 +208,23 @@ def read(cls, path: Path, ignore_extra_fields: bool = True) -> Iterator["Metric[ Args: path: the path to the metrics file. ignore_extra_fields: True to ignore any extra columns, False to raise an exception. + comment_prefix: Any lines beginning with this string will be ignored before parsing the + header. + + Raises: + ValueError: if the file does not contain a header. """ parsers = cls._parsers() with io.to_reader(path) as reader: - header: List[str] = reader.readline().rstrip("\r\n").split("\t") + try: + header = cls._read_header(reader, comment_prefix=comment_prefix) + fieldnames: List[str] = header.fieldnames + except ValueError: + raise ValueError(f"No header found in file: {path}") + # check the header class_fields = set(cls.header()) - file_fields = set(header) + file_fields = set(fieldnames) missing_from_class = file_fields.difference(class_fields) missing_from_file = class_fields.difference(file_fields) @@ -247,15 +262,15 @@ def read(cls, path: Path, ignore_extra_fields: bool = True) -> Iterator["Metric[ values: List[str] = line.rstrip("\r\n").split("\t") # raise an exception if there aren't the same number of values as the header - if len(header) != len(values): + if len(fieldnames) != len(values): raise ValueError( - f"In file: {path}, expected {len(header)} columns, got {len(values)} on " - f"line {lineno}: {line}" + f"In file: {path}, expected {len(fieldnames)} columns, " + f"got {len(values)} on line {lineno}: {line}" ) # build the metric instance: Metric[MetricType] = inspect.attr_from( - cls=cls, kwargs=dict(zip(header, values)), parsers=parsers + cls=cls, kwargs=dict(zip(fieldnames, values)), parsers=parsers ) yield instance diff --git a/fgpyo/util/tests/test_metric.py b/fgpyo/util/tests/test_metric.py index 4b9ca5fc..3f613414 100644 --- a/fgpyo/util/tests/test_metric.py +++ b/fgpyo/util/tests/test_metric.py @@ -590,3 +590,29 @@ def test_read_header_can_read_picard(tmp_path: Path) -> None: header = Metric._read_header(metrics_file, comment_prefix="#") assert header.fieldnames == ["SAMPLE", "FOO", "BAR"] + + +@pytest.mark.parametrize( + "lines", + [ + [], + [""], + ["# comment"], + ["", "# comment"], + ], +) +def test_read_validates_no_header(tmp_path: Path, lines: List[str]) -> None: + """ + Test our handling of a file with no header. + + 1. The helper `Metric._read_header` returns None + 2. `Metric.read` raises `ValueError` + """ + + metrics_path = tmp_path / "bad_metrics" + + with metrics_path.open("w") as metrics_file: + metrics_file.writelines(lines) + + with pytest.raises(ValueError, match="No header found"): + [m for m in dataclasses_data_and_classes.DummyMetric.read(metrics_path)]