From 0c277cd7646ad286d8ef9206ba956a5ecf0cc780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 3 Mar 2023 15:26:10 -0500 Subject: [PATCH] Update the documentation (#53) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🧑‍🏫 update portal page * 🧑‍🏫 update portal example * ⚡ bench * 🧑‍🏫 portal use-case * 🧑‍🏫 phylo * 👐 update badges * ⚙️ remove some AbstractTrees methods that are not needed * 👐 update badge --- README.md | 4 +-- docs/make.jl | 2 +- docs/src/namefinding.md | 8 ----- docs/src/phylo.md | 9 +++--- docs/src/portal.md | 53 +++++++++++++++++++-------------- src/interfaces/abstracttrees.jl | 51 ------------------------------- 6 files changed, 39 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index d13e439..fcaea84 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![DOI](https://zenodo.org/badge/312718490.svg)](https://zenodo.org/badge/latestdoi/312718490) -![CI](https://github.com/EcoJulia/NCBITaxonomy.jl/workflows/CI/badge.svg) [![codecov](https://codecov.io/gh/EcoJulia/NCBITaxonomy.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/EcoJulia/NCBITaxonomy.jl) +![CI](https://github.com/PoisotLab/NCBITaxonomy.jl/workflows/CI/badge.svg) [![codecov](https://codecov.io/gh/PoisotLab/NCBITaxonomy.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/PoisotLab/NCBITaxonomy.jl) -![Documentation](https://github.com/EcoJulia/NCBITaxonomy.jl/workflows/Documentation/badge.svg) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://ecojulia.github.io/NCBITaxonomy.jl/stable) [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://ecojulia.github.io/NCBITaxonomy.jl/dev) 
+![Documentation](https://github.com/PoisotLab/NCBITaxonomy.jl/workflows/Documentation/badge.svg) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://ecojulia.github.io/NCBITaxonomy.jl/stable) [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://ecojulia.github.io/NCBITaxonomy.jl/dev) This package provides an interface to the [NCBI Taxonomy][ncbitax]. When installed, it will download the *latest* version of the taxonomy files from the diff --git a/docs/make.jl b/docs/make.jl index 8343e7d..5342d18 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, NCBITaxonomy +using Documenter, NCBITaxonomy, AbstractTrees makedocs( sitename="NCBITaxonomy", diff --git a/docs/src/namefinding.md b/docs/src/namefinding.md index 2abda0f..1c1866c 100644 --- a/docs/src/namefinding.md +++ b/docs/src/namefinding.md @@ -105,14 +105,6 @@ viruses = virusfilter() @time taxon(viruses, "Bumbulu ebolavirus"; strict=false); ``` -A `namefilter` can be built in a number of ways, including by passing a list of -taxa: - -```@example taxon -diplectanids = namefilter(descendants(ncbi"Diplectanidae")) -taxon(diplectanids, "Lamellodiscus") -``` - ## Standard namefilters To save some time, there are namefilters pre-populated with the large-level diff --git a/docs/src/phylo.md b/docs/src/phylo.md index 82a417a..e1e7976 100644 --- a/docs/src/phylo.md +++ b/docs/src/phylo.md @@ -27,20 +27,21 @@ tree_leaves = collect(Leaves(tree_root)) We can double-check that these taxa all have the correct common ancestor: -```@example treee +```@example tree commonancestor(tree_leaves) ``` -At this point, we can start creating our tree object. Before we do this, we will add a few overloads to +At this point, we can start creating our tree object. 
Before we do this, we will +add a few overloads to the `Phylo.jl` functions: -```@example treee +```@example tree Phylo.RootedTree(taxa::Vector{NCBITaxon}) = RootedTree([t.name for t in taxa]) Phylo._hasnode(tr::RootedTree, tax::NCBITaxon) = Phylo._hasnode(tr, tax.name) Phylo._getnode(tr::RootedTree, tax::NCBITaxon) = Phylo._getnode(tr, tax.name) Phylo._createnode!(tr::RootedTree, tax::NCBITaxon) = Phylo._createnode!(tr, tax.name) ``` -```@example treee +```@example tree tree = RootedTree(tree_leaves) ``` diff --git a/docs/src/portal.md b/docs/src/portal.md index 8a0569e..774c9d8 100644 --- a/docs/src/portal.md +++ b/docs/src/portal.md @@ -21,9 +21,16 @@ species = JSON.parsefile(species_file) ## Cleaning up the portal names -There is are two things we want to do at this point: extract the species names -from the file, and then validate that they are spelled correctly, or that they -are the most recent taxonomic name according to NCBI. +There are two things we want to do at this point: extract the species names from +the file, and then validate that they are spelled correctly, or that they are +the most recent taxonomic name according to NCBI. + +The portal data are already identified as belonging to a group of taxa, so we +can get a unique list of them: + +```@example portal +taxo_groups = unique([tax["taxa"] for tax in species]) +``` We will store our results in a data frame: @@ -34,7 +41,9 @@ cleanup = DataFrame( name = String[], rank = Symbol[], order = String[], - taxid = Int[] + taxid = Int[], + same = Bool[], + fuzzy = Bool[] ) ``` @@ -45,10 +54,12 @@ with them: for sp in species portal_name = sp["species"] == "sp." ? 
sp["genus"] : sp["genus"]*" "*sp["species"] local ncbi_tax + local fuzzy = false try ncbi_tax = taxon(portal_name) catch y if isa(y, NameHasNoDirectMatch) + fuzzy = true ncbi_tax = taxon(portal_name; strict=false) else continue @@ -59,7 +70,7 @@ for sp in species ( sp["species_id"], portal_name, ncbi_tax.name, rank(ncbi_tax), first(filter(t -> isequal(:order)(rank(t)), lineage(ncbi_tax))).name, - ncbi_tax.id + ncbi_tax.id, portal_name == ncbi_tax.name, fuzzy ) ) end @@ -77,31 +88,29 @@ vernacular, or spelling issues: filter(r -> r.portal != r.name, cleanup) ``` -Note that these results should *always* be manually curated. For example, -some species have been match to *Hemiptera*, which sounds suspect: +Out of these, some required fuzzy matching to get a proper name, so we +can look at these taxa, as they are likely to require manual curation: ```@example portal -filter(r -> r.order ∈ ["Hemiptera"], cleanup) +filter(r -> r.fuzzy, cleanup) ``` -## Fixing the mis-identified species +Out of these, only `Lizard` has a strange identification as a `Hemiptera`: -Well, the obvious choice here is *manual cleaning*. This is a good solution. -Another thing that `NCBITaxonomy` offers is the ability to build a `namefilter` -from a list of known NCBI taxa. This is good if we know that the names we expect -to find are part of a reference list. +```@example portal +filter(t -> isequal(:class)(rank(t)), lineage(ncbi"Lisarda")) +``` -In this case, we know that the species are going to be vertebrates, so we can use -the `vertebratefinder` function to restrict the search to these groups: +Right. We can dig into this example a little more, because it shows how much +*data entry* can condition the success of name finding. ```@example portal -vert = vertebratefilter(true) # We want taxa that are specific divisions of vertebrates as well -taxon(vert, "Lizard"; strict=false) +similarnames("Lizard"; threshold=0.7) ``` -## Wrapping-up +The *Lisarda* taxon (which is an insect!) 
is the closest match, simply because +"Lizard" is not a classification we can use -- lizards are a paraphyletic +group, containing a handful of different groups. Based on the information +available, the only thing we can say about the taxon identified as +"Lizard" is that it belongs to *Squamata*. -This vignette illustrates how to go through a list of names, and match them -against the NCBI taxonomy. We have seen a number of functions from -`NCBITaxonomy`, including fuzzy string searching, using custom string distances, -and limiting the taxonomic scope of the search. diff --git a/src/interfaces/abstracttrees.jl b/src/interfaces/abstracttrees.jl index 7073387..b50b24a 100644 --- a/src/interfaces/abstracttrees.jl +++ b/src/interfaces/abstracttrees.jl @@ -40,56 +40,5 @@ function AbstractTrees.parent(tax::NCBITaxon) end end -""" - _siblings(tax::NCBITaxon) - -Returns a list of siblings (node descended from the same parent) for the taxon -given as argument. -""" -function _siblings(tax::NCBITaxon) - return (AbstractTrees.children ∘ AbstractTrees.parent)(tax) -end - -""" - AbstractTrees.nextsibling(tax::NCBITaxon) - -Returns the taxon stored immediately after the one given as argument (among the -list of siblings). -""" -function AbstractTrees.nextsibling(tax::NCBITaxon) - it = _siblings(tax) - i = findfirst(isequal(tax), it) - if i < length(it) - return it[i + 1] - else - return nothing - end -end - -""" - AbstractTrees.prevsibling(tax::NCBITaxon) - -Returns the taxon stored immediately before the one given as argument (among the -list of siblings). -""" -function AbstractTrees.prevsibling(tax::NCBITaxon) - it = _siblings(tax) - i = findfirst(isequal(tax), it) - if i > 1 - return it[i - 1] - else - return nothing - end -end - -""" - AbstractTrees.isroot(tax::NCBITaxon) - -Ensure that the node `root:1` is always a root. 
-""" -function AbstractTrees.isroot(tax::NCBITaxon) - return tax.id == 1 -end - Base.IteratorEltype(::Type{<:TreeIterator{NCBITaxon}}) = Base.HasEltype() Base.eltype(::Type{<:TreeIterator{NCBITaxon}}) = NCBITaxon \ No newline at end of file