Adding arbitrary properties to fill-table. Fixes #625

INCATools · Aug 2, 2023 · 57aff57 · 57aff57
1 parent 5eb1f4e
commit 57aff57
Show file tree

Hide file tree

Showing 3 changed files with 118 additions and 11 deletions.
diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py
@@ -5637,6 +5637,12 @@ def diff_via_mappings(
     type=click.File(mode="r"),
     help="Path to YAML file corresponding to a list of normalized relation between two columns",
 )
+@click.option(
+    "--autolabel/--no-autolabel",
+    default=False,
+    show_default=True,
+    help="Autolabel columns",
+)
 @output_option
 @click.argument("table_file")
 def fill_table(
@@ -5648,6 +5654,7 @@ def fill_table(
     allow_missing: bool,
     relation: tuple,
     relation_file: str,
+    autolabel: bool,
     schema: str,
 ):
     """
@@ -5740,6 +5747,11 @@ def fill_table(
 
     with open(table_file) as input_file:
         input_table = table_filler.parse_table(input_file, delimiter=delimiter)
+        if autolabel:
+            hdr = input_table[0]
+            for col in list(hdr.keys()):
+                if col.endswith("_id"):
+                    hdr[col.replace("_id", "_label")] = None
         if schema:
             metadata = tf.extract_metadata_from_linkml(schema)
         elif relation or relation_file:

diff --git a/src/oaklib/implementations/obograph/obograph_implementation.py b/src/oaklib/implementations/obograph/obograph_implementation.py
@@ -438,7 +438,7 @@ def as_obograph(self) -> Graph:
 
     def logical_definitions(
         self,
-        subjects: Iterable[CURIE],
+        subjects: Iterable[CURIE] = None,
         predicates: Iterable[PRED_CURIE] = None,
         objects: Iterable[CURIE] = None,
         **kwargs,

diff --git a/src/oaklib/utilities/table_filler.py b/src/oaklib/utilities/table_filler.py
@@ -27,7 +27,7 @@
 @dataclass
 class ColumnDependency:
     """
-    Models an interdependency between an identifier column and a column with a dependent value
+    Models an interdependency between an identifier column and a column with a dependent value.
     """
 
     primary_key: COLUMN_NAME
@@ -160,7 +160,7 @@ def write_table(
 @dataclass
 class TableFiller:
     """
-    An engine for filling in missing columns in tables based on metadata about these columns
+    An engine for filling in missing columns in tables based on metadata about these columns.
     """
 
     ontology_interface: BasicOntologyInterface = None
@@ -186,7 +186,54 @@ def fill_table_file(
 
     def fill_table(self, rows: List[ROW], table_metadata: TableMetadata = None):
         """
-        Fills in missing values for a list of rows
+        Fills in missing values for a list of rows.
+
+        This will populate missing columns based on ontology lookups and a metadata specification.
+
+        If no metadata specification is passed, it will do a best-attempt job to guess your intent
+        based on conventions and the first row of the table (see :ref:`infer_metadata`).
+
+        For example, with a table with a ``su`` and ``ob`` column we can hint (a) that these are
+        foreign keys by using the ``_id`` convention, and (b) we want corresponding labels by
+        including empty columns with the same base name suffixed with ``_label``.
+
+        >>> from oaklib import get_adapter
+        >>> from oaklib.utilities.table_filler import TableFiller
+        >>> rows = [{"su_id": "GO:0005634", "su_label": None, "ob_id": "GO:0031965",  "ob_label": None}]
+        >>> filler = TableFiller(get_adapter("sqlite:obo:go"))
+        >>> filler.fill_table(rows)
+        >>> for row in rows:
+        ...     print(row)
+        {'su_id': 'GO:0005634', 'su_label': 'nucleus', 'ob_id': 'GO:0031965', 'ob_label': 'nuclear membrane'}
+
+        For brevity this example only has one row, but this works for any number.
+
+        To provide more explicit control, create a metadata object.
+
+        >>> from oaklib.utilities.table_filler import TableMetadata, ColumnDependency
+        >>> cd1 = ColumnDependency("su", "label", "su_label")
+        >>> cd2 = ColumnDependency("ob", "label", "ob_label")
+        >>> metadata = TableMetadata(dependencies=[cd1, cd2])
+        >>> rows = [{"su": "GO:0005634", "ob": "GO:0031965"}]
+        >>> filler.fill_table(rows, metadata)
+        >>> for row in rows:
+        ...     print(row)
+        {'su': 'GO:0005634', 'ob': 'GO:0031965', 'su_label': 'nucleus', 'ob_label': 'nuclear membrane'}
+
+        Note in this case we didn't need to provide any hints in the column names, or provide blank cells.
+
+        However, if we want to control column ordering then the first row should contain blank cells indicating
+        desired column ordering, as before.
+
+        We can use this for other properties too:
+
+        >>> cd1 = ColumnDependency("term", "IAO:0100001", "replacement")
+        >>> metadata = TableMetadata(dependencies=[cd1, cd2])
+        >>> rows = [{"term": "GO:0000108", "info": "foo"}]
+        >>> filler.fill_table(rows, metadata)
+        >>> for row in rows:
+        ...     print(row)
+        {'term': 'GO:0000108', 'info': 'foo', 'replacement': ['GO:0000109']}
 
         :param rows: list of rows, where each row is a dict. Edited in place.
         :param table_metadata:
@@ -260,27 +307,69 @@ def fill_table_column(self, rows: List[ROW], dependency: ColumnDependency):
             if pk_vals:
                 if isinstance(oi, BasicOntologyInterface):
                     for curie in pk_vals:
-                        params = dependency.parameters
-                        if not params:
-                            params = {}
                         mappings = [x for _, x in oi.simple_mappings_by_curie(curie)]
                         fwd_mapping[curie] = mappings
                 else:
                     raise ValueError(f"{oi} must implement OboGraphInterface for ancestors option")
             if dc_vals:
                 raise NotImplementedError
         else:
-            raise NotImplementedError(f"Rel = {rel}")
+            if pk_vals:
+                if isinstance(oi, BasicOntologyInterface):
+                    for curie in pk_vals:
+                        vals = [x for x in oi.entity_metadata_map(curie).get(rel, [])]
+                        fwd_mapping[curie] = vals
+                else:
+                    raise ValueError(f"{oi} must implement OboGraphInterface for ancestors option")
+            if dc_vals:
+                raise NotImplementedError
         apply_dict(rows, fwd_mapping, pk, dc, dependency)
         apply_dict(rows, rev_mapping, dc, pk, dependency)
 
     def infer_metadata(self, row: ROW) -> TableMetadata:
         """
-        Infers the metadata given a sample row, based entirely on conventions
+        Infers the metadata given a sample row, based entirely on conventions.
+
+        By convention, ``id`` and ``label`` have special meaning.
+
+        >>> from oaklib.utilities.table_filler import TableFiller
+        >>> filler = TableFiller()
+        >>> metadata = filler.infer_metadata({"id": "GO:0005634", "label": None})
+        >>> dep = metadata.dependencies[0]
+        >>> dep.primary_key, dep.relation, dep.dependent_column
+        ('id', 'label', 'label')
+
+        If your table consists of foreign keys, then using the convention of suffixing
+        with the name of the lookup slot will work:
+
+        >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_label": None})
+        >>> dep = metadata.dependencies[0]
+        >>> dep.primary_key, dep.relation, dep.dependent_column
+        ('foo_id', 'label', 'foo_label')
+
+        Note that the ``_name`` suffix also works:
+
+        >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_name": None})
+        >>> dep = metadata.dependencies[0]
+        >>> dep.primary_key, dep.relation, dep.dependent_column
+        ('foo_id', 'label', 'foo_name')
+
+        Other conventions are:
+
+        - ``definition`` is the definition of the entity
+        - ``mappings`` is a list of mappings
+        - ``ancestors`` is a list of ancestors
+
+
+        >>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_ancestors": None})
+        >>> dep = metadata.dependencies[0]
+        >>> dep.primary_key, dep.relation, dep.dependent_column
+        ('foo_id', 'ancestors', 'foo_ancestors')
 
         :param row:
         :return:
         """
+        logging.info(f"Inferring metadata from row: {row}")
         tm = TableMetadata(dependencies=[])
         for k in [LABEL_KEY, DEFINITION_KEY, MAPPINGS_KEY, ANCESTORS_KEY]:
             if k in row:
@@ -303,6 +392,8 @@ def infer_metadata(self, row: ROW) -> TableMetadata:
                             inferred[base_name][LABEL_KEY] = col
                         if base_col == DEFINITION_KEY:
                             inferred[base_name][DEFINITION_KEY] = col
+                        if base_col == ANCESTORS_KEY:
+                            inferred[base_name][ANCESTORS_KEY] = col
         for v in inferred.values():
             if ID_KEY in v:
                 id_col = v[ID_KEY]
@@ -312,22 +403,26 @@ def infer_metadata(self, row: ROW) -> TableMetadata:
                     tm.dependencies.append(
                         ColumnDependency(id_col, DEFINITION_KEY, v[DEFINITION_KEY])
                     )
+                if ANCESTORS_KEY in v:
+                    tm.dependencies.append(
+                        ColumnDependency(id_col, ANCESTORS_KEY, v[ANCESTORS_KEY])
+                    )
 
         return tm
 
     def extract_metadata_from_linkml(
         self, schema: Union[str, SchemaDefinition], class_name: str = None
     ) -> TableMetadata:
         """
-        Extract dependencies using a LinkML schema
+        Extract dependencies using a LinkML schema.
 
         The primary_key in the dependency is the slot that is designated the identifier
 
         Labels, definitions, etc are determined from the slot_uri
 
         For example, with the following schema
 
-        .. packages-block ::
+        .. code-block:: yaml
 
             classes:
               Person: