diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 2fd39b94e..8c150f0f6 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -5637,6 +5637,12 @@ def diff_via_mappings( type=click.File(mode="r"), help="Path to YAML file corresponding to a list of normalized relation between two columns", ) +@click.option( + "--autolabel/--no-autolabel", + default=False, + show_default=True, + help="Autolabel columns", +) @output_option @click.argument("table_file") def fill_table( @@ -5648,6 +5654,7 @@ def fill_table( allow_missing: bool, relation: tuple, relation_file: str, + autolabel: bool, schema: str, ): """ @@ -5740,6 +5747,11 @@ def fill_table( with open(table_file) as input_file: input_table = table_filler.parse_table(input_file, delimiter=delimiter) + if autolabel and input_table: + hdr = input_table[0] + for col in list(hdr.keys()): + if col.endswith("_id"): + hdr[col[: -len("_id")] + "_label"] = None if schema: metadata = tf.extract_metadata_from_linkml(schema) elif relation or relation_file: diff --git a/src/oaklib/implementations/obograph/obograph_implementation.py b/src/oaklib/implementations/obograph/obograph_implementation.py index 35ddc2cad..285dce3b5 100644 --- a/src/oaklib/implementations/obograph/obograph_implementation.py +++ b/src/oaklib/implementations/obograph/obograph_implementation.py @@ -438,7 +438,7 @@ def as_obograph(self) -> Graph: def logical_definitions( self, - subjects: Iterable[CURIE], + subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, objects: Iterable[CURIE] = None, **kwargs, diff --git a/src/oaklib/utilities/table_filler.py b/src/oaklib/utilities/table_filler.py index 175821fa4..a04b55192 100644 --- a/src/oaklib/utilities/table_filler.py +++ b/src/oaklib/utilities/table_filler.py @@ -27,7 +27,7 @@ @dataclass class ColumnDependency: """ - Models an interdependency between an identifier column and a column with a dependent value + Models an interdependency between an identifier column and a column with a dependent value.
""" primary_key: COLUMN_NAME @@ -160,7 +160,7 @@ def write_table( @dataclass class TableFiller: """ - An engine for filling in missing columns in tables based on metadata about these columns + An engine for filling in missing columns in tables based on metadata about these columns. """ ontology_interface: BasicOntologyInterface = None @@ -186,7 +186,54 @@ def fill_table_file( def fill_table(self, rows: List[ROW], table_metadata: TableMetadata = None): """ - Fills in missing values for a list of rows + Fills in missing values for a list of rows. + + This will populate missing columns based on ontology lookups and a metadata specification. + + If no metadata specification is passed, it will do a best-attempt job to guess your intent + based on conventions and the first row of the table (see :ref:`infer_metadata`). + + For example, with a table with a ``su`` and ``ob`` column we can hint (a) that these are + foreign keys by using the ``_id`` convention, and (b) we want corresponding labels by + including empty columns with the same base name suffixed with ``_label``. + + >>> from oaklib import get_adapter + >>> from oaklib.utilities.table_filler import TableFiller + >>> rows = [{"su_id": "GO:0005634", "su_label": None, "ob_id": "GO:0031965", "ob_label": None}] + >>> filler = TableFiller(get_adapter("sqlite:obo:go")) + >>> filler.fill_table(rows) + >>> for row in rows: + ... print(row) + {'su_id': 'GO:0005634', 'su_label': 'nucleus', 'ob_id': 'GO:0031965', 'ob_label': 'nuclear membrane'} + + For brevity this example only has one row, but this works for any number. + + To provide more explicit control, create a metadata object. 
+ + >>> from oaklib.utilities.table_filler import TableMetadata, ColumnDependency + >>> cd1 = ColumnDependency("su", "label", "su_label") + >>> cd2 = ColumnDependency("ob", "label", "ob_label") + >>> metadata = TableMetadata(dependencies=[cd1, cd2]) + >>> rows = [{"su": "GO:0005634", "ob": "GO:0031965"}] + >>> filler.fill_table(rows, metadata) + >>> for row in rows: + ... print(row) + {'su': 'GO:0005634', 'ob': 'GO:0031965', 'su_label': 'nucleus', 'ob_label': 'nuclear membrane'} + + Note in this case we didn't need to provide any hints in the column names, or provide blank cells. + + However, if we want to control column ordering then the first row should contain blank cells indicating + desired column ordering, as before. + + We can use this for other properties too: + + >>> cd1 = ColumnDependency("term", "IAO:0100001", "replacement") + >>> metadata = TableMetadata(dependencies=[cd1, cd2]) + >>> rows = [{"term": "GO:0000108", "info": "foo"}] + >>> filler.fill_table(rows, metadata) + >>> for row in rows: + ... print(row) + {'term': 'GO:0000108', 'info': 'foo', 'replacement': ['GO:0000109']} :param rows: list of rows, which each row is a dict. Edited in place. 
:param table_metadata: @@ -260,9 +307,6 @@ def fill_table_column(self, rows: List[ROW], dependency: ColumnDependency): if pk_vals: if isinstance(oi, BasicOntologyInterface): for curie in pk_vals: - params = dependency.parameters - if not params: - params = {} mappings = [x for _, x in oi.simple_mappings_by_curie(curie)] fwd_mapping[curie] = mappings else: @@ -270,17 +314,62 @@ def fill_table_column(self, rows: List[ROW], dependency: ColumnDependency): if dc_vals: raise NotImplementedError else: - raise NotImplementedError(f"Rel = {rel}") + if pk_vals: + if isinstance(oi, BasicOntologyInterface): + for curie in pk_vals: + vals = list(oi.entity_metadata_map(curie).get(rel, [])) + fwd_mapping[curie] = vals + else: + raise ValueError(f"{oi} must implement BasicOntologyInterface to look up {rel}") + if dc_vals: + raise NotImplementedError apply_dict(rows, fwd_mapping, pk, dc, dependency) apply_dict(rows, rev_mapping, dc, pk, dependency) def infer_metadata(self, row: ROW) -> TableMetadata: """ - Infers the metadata given a sample row, based entirely on conventions + Infers the metadata given a sample row, based entirely on conventions. + + By convention, ``id`` and ``label`` have special meaning.
+ + >>> from oaklib.utilities.table_filler import TableFiller + >>> filler = TableFiller() + >>> metadata = filler.infer_metadata({"id": "GO:0005634", "label": None}) + >>> dep = metadata.dependencies[0] + >>> dep.primary_key, dep.relation, dep.dependent_column + ('id', 'label', 'label') + + If your table consists of foreign keys, then using the convention of suffixing + with the name of the lookup slot will work: + + >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_label": None}) + >>> dep = metadata.dependencies[0] + >>> dep.primary_key, dep.relation, dep.dependent_column + ('foo_id', 'label', 'foo_label') + + Note that (``_name``) also works: + + >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_name": None}) + >>> dep = metadata.dependencies[0] + >>> dep.primary_key, dep.relation, dep.dependent_column + ('foo_id', 'label', 'foo_name') + + Other conventions are: + + - ``definition`` is the definition of the entity + - ``mappings`` is a list of mappings + - ``ancestors`` is a list of ancestors + + + >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_ancestors": None}) + >>> dep = metadata.dependencies[0] + >>> dep.primary_key, dep.relation, dep.dependent_column + ('foo_id', 'ancestors', 'foo_ancestors') :param row: :return: """ + logging.info(f"Inferring metadata from row: {row}") tm = TableMetadata(dependencies=[]) for k in [LABEL_KEY, DEFINITION_KEY, MAPPINGS_KEY, ANCESTORS_KEY]: if k in row: @@ -303,6 +392,8 @@ def infer_metadata(self, row: ROW) -> TableMetadata: inferred[base_name][LABEL_KEY] = col if base_col == DEFINITION_KEY: inferred[base_name][DEFINITION_KEY] = col + if base_col == ANCESTORS_KEY: + inferred[base_name][ANCESTORS_KEY] = col for v in inferred.values(): if ID_KEY in v: id_col = v[ID_KEY] @@ -312,6 +403,10 @@ def infer_metadata(self, row: ROW) -> TableMetadata: tm.dependencies.append( ColumnDependency(id_col, DEFINITION_KEY, v[DEFINITION_KEY]) ) + if ANCESTORS_KEY in v: + 
tm.dependencies.append( + ColumnDependency(id_col, ANCESTORS_KEY, v[ANCESTORS_KEY]) + ) return tm @@ -319,7 +414,7 @@ def extract_metadata_from_linkml( self, schema: Union[str, SchemaDefinition], class_name: str = None ) -> TableMetadata: """ - Extract dependencies using a LinkML schema + Extract dependencies using a LinkML schema. The primary_key in the dependency is the slot that is designated the identifier @@ -327,7 +422,7 @@ def extract_metadata_from_linkml( For example, with the following schema - .. packages-block :: + .. code-block:: yaml classes: Person: