Skip to content

Commit

Permalink
Adding arbitrary properties to fill-table. Fixes #625
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed Aug 2, 2023
1 parent 5eb1f4e commit 57aff57
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 11 deletions.
12 changes: 12 additions & 0 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5637,6 +5637,12 @@ def diff_via_mappings(
type=click.File(mode="r"),
help="Path to YAML file corresponding to a list of normalized relation between two columns",
)
@click.option(
"--autolabel/--no-autolabel",
default=False,
show_default=True,
help="Autolabel columns",
)
@output_option
@click.argument("table_file")
def fill_table(
Expand All @@ -5648,6 +5654,7 @@ def fill_table(
allow_missing: bool,
relation: tuple,
relation_file: str,
autolabel: bool,
schema: str,
):
"""
Expand Down Expand Up @@ -5740,6 +5747,11 @@ def fill_table(

with open(table_file) as input_file:
input_table = table_filler.parse_table(input_file, delimiter=delimiter)
if autolabel:
hdr = input_table[0]
for col in list(hdr.keys()):
if col.endswith("_id"):
hdr[col.replace("_id", "_label")] = None
if schema:
metadata = tf.extract_metadata_from_linkml(schema)
elif relation or relation_file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def as_obograph(self) -> Graph:

def logical_definitions(
self,
subjects: Iterable[CURIE],
subjects: Iterable[CURIE] = None,
predicates: Iterable[PRED_CURIE] = None,
objects: Iterable[CURIE] = None,
**kwargs,
Expand Down
115 changes: 105 additions & 10 deletions src/oaklib/utilities/table_filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
@dataclass
class ColumnDependency:
"""
Models an interdependency between an identifier column and a column with a dependent value
Models an interdependency between an identifier column and a column with a dependent value.
"""

primary_key: COLUMN_NAME
Expand Down Expand Up @@ -160,7 +160,7 @@ def write_table(
@dataclass
class TableFiller:
"""
An engine for filling in missing columns in tables based on metadata about these columns
An engine for filling in missing columns in tables based on metadata about these columns.
"""

ontology_interface: BasicOntologyInterface = None
Expand All @@ -186,7 +186,54 @@ def fill_table_file(

def fill_table(self, rows: List[ROW], table_metadata: TableMetadata = None):
"""
Fills in missing values for a list of rows
Fills in missing values for a list of rows.
This will populate missing columns based on ontology lookups and a metadata specification.
If no metadata specification is passed, it will do a best-attempt job to guess your intent
based on conventions and the first row of the table (see :ref:`infer_metadata`).
For example, with a table with a ``su`` and ``ob`` column we can hint (a) that these are
foreign keys by using the ``_id`` convention, and (b) we want corresponding labels by
including empty columns with the same base name suffixed with ``_label``.
>>> from oaklib import get_adapter
>>> from oaklib.utilities.table_filler import TableFiller
>>> rows = [{"su_id": "GO:0005634", "su_label": None, "ob_id": "GO:0031965", "ob_label": None}]
>>> filler = TableFiller(get_adapter("sqlite:obo:go"))
>>> filler.fill_table(rows)
>>> for row in rows:
... print(row)
{'su_id': 'GO:0005634', 'su_label': 'nucleus', 'ob_id': 'GO:0031965', 'ob_label': 'nuclear membrane'}
For brevity this example only has one row, but this works for any number.
To provide more explicit control, create a metadata object.
>>> from oaklib.utilities.table_filler import TableMetadata, ColumnDependency
>>> cd1 = ColumnDependency("su", "label", "su_label")
>>> cd2 = ColumnDependency("ob", "label", "ob_label")
>>> metadata = TableMetadata(dependencies=[cd1, cd2])
>>> rows = [{"su": "GO:0005634", "ob": "GO:0031965"}]
>>> filler.fill_table(rows, metadata)
>>> for row in rows:
... print(row)
{'su': 'GO:0005634', 'ob': 'GO:0031965', 'su_label': 'nucleus', 'ob_label': 'nuclear membrane'}
Note in this case we didn't need to provide any hints in the column names, or provide blank cells.
However, if we want to control column ordering then the first row should contain blank cells indicating
desired column ordering, as before.
We can use this for other properties too:
>>> cd1 = ColumnDependency("term", "IAO:0100001", "replacement")
>>> metadata = TableMetadata(dependencies=[cd1, cd2])
>>> rows = [{"term": "GO:0000108", "info": "foo"}]
>>> filler.fill_table(rows, metadata)
>>> for row in rows:
... print(row)
{'term': 'GO:0000108', 'info': 'foo', 'replacement': ['GO:0000109']}
:param rows: list of rows, where each row is a dict. Edited in place.
:param table_metadata:
Expand Down Expand Up @@ -260,27 +307,69 @@ def fill_table_column(self, rows: List[ROW], dependency: ColumnDependency):
if pk_vals:
if isinstance(oi, BasicOntologyInterface):
for curie in pk_vals:
params = dependency.parameters
if not params:
params = {}
mappings = [x for _, x in oi.simple_mappings_by_curie(curie)]
fwd_mapping[curie] = mappings
else:
raise ValueError(f"{oi} must implement OboGraphInterface for ancestors option")
if dc_vals:
raise NotImplementedError
else:
raise NotImplementedError(f"Rel = {rel}")
if pk_vals:
if isinstance(oi, BasicOntologyInterface):
for curie in pk_vals:
vals = [x for x in oi.entity_metadata_map(curie).get(rel, [])]
fwd_mapping[curie] = vals
else:
raise ValueError(f"{oi} must implement OboGraphInterface for ancestors option")
if dc_vals:
raise NotImplementedError
apply_dict(rows, fwd_mapping, pk, dc, dependency)
apply_dict(rows, rev_mapping, dc, pk, dependency)

def infer_metadata(self, row: ROW) -> TableMetadata:
"""
Infers the metadata given a sample row, based entirely on conventions
Infers the metadata given a sample row, based entirely on conventions.
By convention, ``id`` and ``label`` have special meaning.
>>> from oaklib.utilities.table_filler import TableFiller
>>> filler = TableFiller()
>>> metadata = filler.infer_metadata({"id": "GO:0005634", "label": None})
>>> dep = metadata.dependencies[0]
>>> dep.primary_key, dep.relation, dep.dependent_column
('id', 'label', 'label')
If your table consists of foreign keys, then using the convention of suffixing
with the name of the lookup slot will work:
>>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_label": None})
>>> dep = metadata.dependencies[0]
>>> dep.primary_key, dep.relation, dep.dependent_column
('foo_id', 'label', 'foo_label')
Note that the ``_name`` suffix also works as a synonym for ``_label``:
>>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_name": None})
>>> dep = metadata.dependencies[0]
>>> dep.primary_key, dep.relation, dep.dependent_column
('foo_id', 'label', 'foo_name')
Other conventions are:
- ``definition`` is the definition of the entity
- ``mappings`` is a list of mappings
- ``ancestors`` is a list of ancestors
>>> metadata = filler.infer_metadata({"foo_id": "GO:0005634", "foo_ancestors": None})
>>> dep = metadata.dependencies[0]
>>> dep.primary_key, dep.relation, dep.dependent_column
('foo_id', 'ancestors', 'foo_ancestors')
:param row:
:return:
"""
logging.info(f"Inferring metadata from row: {row}")
tm = TableMetadata(dependencies=[])
for k in [LABEL_KEY, DEFINITION_KEY, MAPPINGS_KEY, ANCESTORS_KEY]:
if k in row:
Expand All @@ -303,6 +392,8 @@ def infer_metadata(self, row: ROW) -> TableMetadata:
inferred[base_name][LABEL_KEY] = col
if base_col == DEFINITION_KEY:
inferred[base_name][DEFINITION_KEY] = col
if base_col == ANCESTORS_KEY:
inferred[base_name][ANCESTORS_KEY] = col
for v in inferred.values():
if ID_KEY in v:
id_col = v[ID_KEY]
Expand All @@ -312,22 +403,26 @@ def infer_metadata(self, row: ROW) -> TableMetadata:
tm.dependencies.append(
ColumnDependency(id_col, DEFINITION_KEY, v[DEFINITION_KEY])
)
if ANCESTORS_KEY in v:
tm.dependencies.append(
ColumnDependency(id_col, ANCESTORS_KEY, v[ANCESTORS_KEY])
)

return tm

def extract_metadata_from_linkml(
self, schema: Union[str, SchemaDefinition], class_name: str = None
) -> TableMetadata:
"""
Extract dependencies using a LinkML schema
Extract dependencies using a LinkML schema.
The primary_key in the dependency is the slot that is designated the identifier.
Labels, definitions, etc. are determined from the slot_uri.
For example, with the following schema
.. packages-block ::
.. code-block:: yaml
classes:
Person:
Expand Down

0 comments on commit 57aff57

Please sign in to comment.