Skip to content

Commit

Permalink
First pass at interface for association search (#646)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall authored Aug 29, 2023
1 parent 4e68ea7 commit 8d2a124
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 4 deletions.
99 changes: 95 additions & 4 deletions src/oaklib/interfaces/association_provider_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple

from oaklib.datamodels.association import Association, PairwiseCoAssociation
from oaklib.datamodels.similarity import TermSetPairwiseSimilarity
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces.basic_ontology_interface import BasicOntologyInterface
from oaklib.interfaces.obograph_interface import OboGraphInterface
from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
from oaklib.types import CURIE, PRED_CURIE, SUBSET_CURIE
from oaklib.utilities.associations.association_index import AssociationIndex
from oaklib.utilities.iterator_utils import chunk
Expand Down Expand Up @@ -216,12 +218,12 @@ def associations_subjects(self, **kwargs) -> Iterator[CURIE]:
>>> from oaklib import get_adapter
>>> from oaklib.datamodels.vocabulary import IS_A, PART_OF
>>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml")
>>> genes = ["PomBase:SPAC1142.02c", "PomBase:SPAC3H1.05", "PomBase:SPAC1142.06", "PomBase:SPAC4G8.02c"]
>>> for assoc in adapter.associations(genes, object_closure_predicates=[IS_A, PART_OF]):
... print(f"{assoc.object} {adapter.label(assoc.object)}")
>>> preds = [IS_A, PART_OF]
>>> for gene in adapter.associations_subjects(objects=["GO:0045047"], object_closure_predicates=preds):
... print(gene)
<BLANKLINE>
...
GO:0006620 post-translational protein targeting to endoplasmic reticulum membrane
PomBase:SPBC1271.05c
...
:param kwargs: same arguments as for :ref:`associations`
Expand All @@ -236,6 +238,95 @@ def associations_subjects(self, **kwargs) -> Iterator[CURIE]:
yield s
yielded.add(s)

def associations_subject_search(
    self,
    subjects: Optional[Iterable[CURIE]] = None,
    predicates: Optional[Iterable[PRED_CURIE]] = None,
    objects: Optional[Iterable[CURIE]] = None,
    property_filter: Optional[Dict[PRED_CURIE, Any]] = None,
    subject_closure_predicates: Optional[List[PRED_CURIE]] = None,
    predicate_closure_predicates: Optional[List[PRED_CURIE]] = None,
    object_closure_predicates: Optional[List[PRED_CURIE]] = None,
    subject_prefixes: Optional[List[str]] = None,
    include_similarity_object: bool = False,
    method: Optional[str] = None,
    limit: Optional[int] = 10,
    sort_by_similarity: bool = True,
    **kwargs,
) -> Iterator[Tuple[float, Optional[TermSetPairwiseSimilarity], CURIE]]:
    """
    Search over all subjects in the association index.

    For every candidate subject, the set of objects it is associated with is
    compared against the query ``objects`` using
    ``termset_pairwise_similarity``; the ``best_score`` of that comparison is
    the score reported for the subject.

    This relies on the SemanticSimilarityInterface.

    .. note::

        this is currently quite slow, this will be optimized in future

    :param subjects: optional set of subjects (e.g. genes) to search against;
        if empty or None, every subject returned by :meth:`associations` is used
    :param predicates: only use associations with this predicate
    :param objects: this is the query - the asserted objects for all subjects
    :param property_filter: passed to associations query
    :param subject_closure_predicates: passed to the associations query that
        is issued when no explicit subjects are given
    :param predicate_closure_predicates: passed to associations query
    :param object_closure_predicates: closure to use over the ontology
        (forwarded as ``predicates`` to the similarity calculation)
    :param subject_prefixes: only consider subjects with these prefixes
    :param include_similarity_object: include the similarity object in the result
    :param method: similarity method to use (accepted but not used by this
        implementation)
    :param limit: max number of results to return; a falsy value (None or 0)
        disables the limit
    :param sort_by_similarity: if True (default), score every subject first and
        yield results in descending score order; if False, yield each result
        as soon as it is computed, in subject iteration order
    :param kwargs: accepted for forward compatibility; not used by this
        implementation
    :return: iterator over (score, sim, subject) tuples; ``sim`` is None
        unless ``include_similarity_object`` is set
    :raises NotImplementedError: if this adapter does not also implement
        SemanticSimilarityInterface (raised when iteration starts)
    """
    # Capability check first: there is no point fetching every association
    # if the adapter cannot compute similarity at all.
    if not isinstance(self, SemanticSimilarityInterface):
        raise NotImplementedError(
            "associations_subject_search requires SemanticSimilarityInterface"
        )

    # The query term set is reused for every subject, so a one-shot iterator
    # must be materialized or it would be empty after the first comparison.
    if objects is not None:
        objects = list(objects)

    # str.startswith accepts a tuple of alternatives; build it once.
    prefixes = tuple(subject_prefixes) if subject_prefixes else None

    # When no subjects are given, fetch all associations in one query and
    # group them by subject. The same filters are applied here as in the
    # per-subject query below, so both paths see the same associations.
    # A dict keeps first-seen order, which makes the result order
    # reproducible between runs.
    assocs_by_subject: Optional[Dict[CURIE, List[Association]]] = None
    if not subjects:
        assocs_by_subject = {}
        for assoc in self.associations(
            predicates=predicates,
            property_filter=property_filter,
            subject_closure_predicates=subject_closure_predicates,
            predicate_closure_predicates=predicate_closure_predicates,
        ):
            assocs_by_subject.setdefault(assoc.subject, []).append(assoc)
        subjects = list(assocs_by_subject)

    rows = []
    num_yielded = 0
    for subject in subjects:
        if prefixes and not subject.startswith(prefixes):
            continue
        if assocs_by_subject is not None:
            assocs = assocs_by_subject.get(subject, [])
        else:
            assocs = self.associations(
                subjects=[subject],
                predicates=predicates,
                property_filter=property_filter,
                predicate_closure_predicates=predicate_closure_predicates,
            )
        # de-duplicate the subject's terms while preserving order
        terms = list(dict.fromkeys(a.object for a in assocs))
        sim = self.termset_pairwise_similarity(
            objects, terms, predicates=object_closure_predicates, labels=False
        )
        row = (sim.best_score, sim if include_similarity_object else None, subject)
        if sort_by_similarity:
            # ranking needs every score, so defer yielding until the end
            rows.append(row)
            continue
        yield row
        num_yielded += 1
        if limit and num_yielded >= limit:
            return

    # Rank by score, best first (rows is empty in streaming mode).
    ranked = sorted(rows, key=lambda r: r[0], reverse=True)
    for rank, row in enumerate(ranked, start=1):
        yield row
        if limit and rank >= limit:
            break

def association_pairwise_coassociations(
self,
curies1: Iterable[CURIE],
Expand Down
35 changes: 35 additions & 0 deletions tests/test_implementations/test_sqldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,41 @@ def test_store_associations(self):
oi.autosave = True
self.compliance_tester.test_store_associations(oi)

def test_associations(self):
    """
    Check association lookup and subject search on a GO database plus GAF file.

    First verifies that filtering associations by subject or by object
    returns the same rows as filtering the full association list in memory.
    Then, for a handful of genes, searches with the gene's own terms as the
    query and checks that the gene is returned with the top score.
    """
    spec = InputSpecification(
        ontology_resources={"go": {"selector": str(DB)}},
        association_resources={"gaf": {"selector": str(INPUT_GAF)}},
    )
    adapter = get_adapter(spec)
    assocs = list(adapter.associations())
    genes = list({a.subject for a in assocs})
    self.assertGreater(len(assocs), 10)
    self.assertGreater(len(genes), 10)
    assoc0 = assocs[0]
    gene = assoc0.subject
    term = assoc0.object
    assocs2 = list(adapter.associations(subjects=[gene]))
    self.assertCountEqual([a for a in assocs if a.subject == gene], assocs2)
    assocs2 = list(adapter.associations(objects=[term]))
    self.assertCountEqual([a for a in assocs if a.object == term], assocs2)
    # semsim
    for gene in genes[0:5]:
        # subTest reports which gene failed instead of stopping at the first
        with self.subTest(gene=gene):
            terms = list({a.object for a in adapter.associations(subjects=[gene])})
            results = list(
                adapter.associations_subject_search(
                    objects=terms, object_closure_predicates=[IS_A, PART_OF], limit=100
                )
            )
            best_score = None
            found = False
            for score, _, match in results:
                # the first row sets the reference score for this query
                if best_score is None:
                    best_score = score
                if match == gene:
                    found = True
                    # assertAlmostEquals was a deprecated alias and is removed
                    # in Python 3.12; assertAlmostEqual is the supported name
                    self.assertAlmostEqual(best_score, score)
            self.assertTrue(found)

def test_class_enrichment(self):
shutil.copyfile(DB, MUTABLE_DB)
oi = SqlImplementation(OntologyResource(slug=f"sqlite:///{MUTABLE_DB}"))
Expand Down

0 comments on commit 8d2a124

Please sign in to comment.