Skip to content

Commit

Permalink
ingest: Switch to NCBI Datasets CLI to fetch data
Browse files Browse the repository at this point in the history
Replace the config URLs for the NCBI Virus API with the NCBI Datasets
CLI commands that are driven by the `ncbi_taxon_id` present in the
config.

NCBI datasets downloads a virus dataset ZIP file that includes a
metadata JSON Lines file and a sequences FASTA file. To maintain a record
of the single NDJSON file on S3, extract the sequences FASTA file and
format the metadata into a TSV file that are parsed into a single NDJSON
file using `augur curate passthru`. The metadata TSV is created using
the NCBI `dataformat` command so that we do not have to parse the nested
JSON lines files ourselves and header fields are renamed to match the
previous fields we used for NCBI Virus.

The NDJSON file created here no longer includes equivalent fields
for "title" or "publication".
  • Loading branch information
joverlee521 committed Sep 12, 2023
1 parent 4098eb6 commit 166b348
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 24 deletions.
10 changes: 5 additions & 5 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Sources of sequences to include in the ingest run
sources: ['genbank']
conda_environment: "workflow/envs/nextstrain.yaml"
fetch:
genbank_url:
a: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208893%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"
b: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208895%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"
general: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%2811250%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org"

ncbi_taxon_id:
a: "208893"
b: "208895"
general: "11250"

# Params for the transform rulegeneral
transform:
Expand Down
17 changes: 17 additions & 0 deletions ingest/source-data/ncbi-dataset-field-map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
key value
Accession genbank_accession_rev
Source database database
Isolate Lineage strain
Geographic Region region
Geographic Location location
Isolate Collection date collected
Release date submitted
Update date updated
Length length
Host Name host
Isolate Lineage source isolation_source
BioProjects bioproject_accession
BioSample accession biosample_accession
SRA Accessions sra_accession
Submitter Names authors
Submitter Affiliation submitting_organization
119 changes: 100 additions & 19 deletions ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,41 +13,122 @@ Produces final output as
"""

rule fetch_from_genbank:

rule fetch_ncbi_dataset_package:
params:
ncbi_taxon_id=lambda w: config["ncbi_taxon_id"][w.subtype],
output:
dataset_package=temp("data/{subtype}/ncbi_dataset.zip"),
retries: 5 # Requires snakemake 7.7.0 or later
benchmark:
"benchmarks/{subtype}/fetch_ncbi_dataset_package.txt"
shell:
"""
datasets download virus genome taxon {params.ncbi_taxon_id:q} \
--no-progressbar \
--filename {output.dataset_package}
"""


rule extract_ncbi_dataset_sequences:
input:
dataset_package="data/{subtype}/ncbi_dataset.zip",
output:
ncbi_dataset_sequences=temp("data/{subtype}/ncbi_dataset_sequences.fasta"),
benchmark:
"benchmarks/{subtype}/extract_ncbi_dataset_sequences.txt"
shell:
"""
unzip -jp {input.dataset_package} \
ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
"""


def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
"""
Return list of NCBI Dataset report field mnemonics for fields that we want
to parse out of the dataset report. The column names in the output TSV
are different from the mnemonics.
See NCBI Dataset docs for full list of available fields and their column
names in the output:
https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
"""
fields = [
"accession",
"sourcedb",
"isolate-lineage",
"geo-region",
"geo-location",
"isolate-collection-date",
"release-date",
"update-date",
"length",
"host-name",
"isolate-lineage-source",
"bioprojects",
"biosample-acc",
"sra-accs",
"submitter-names",
"submitter-affiliation",
]
return ",".join(fields)


rule format_ncbi_dataset_report:
# Formats the headers to be the same as before we used NCBI Datasets
# The only fields we do not have equivalents for are "title" and "publications"
input:
dataset_package="data/{subtype}/ncbi_dataset.zip",
ncbi_field_map="source-data/ncbi-dataset-field-map.tsv",
output:
csv = "data/genbank.csv"
ncbi_dataset_tsv=temp("data/{subtype}/ncbi_dataset_report.tsv"),
params:
URL_a = config['fetch']['genbank_url']['a'],
URL_b = config['fetch']['genbank_url']['b'],
URL_general = config['fetch']['genbank_url']['general']
fields_to_include=_get_ncbi_dataset_field_mnemonics,
benchmark:
"benchmarks/{subtype}/format_ncbi_dataset_report.txt"
shell:
"""
curl "{params.URL_a}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}
curl "{params.URL_b}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}
curl "{params.URL_general}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}
dataformat tsv virus-genome \
--package {input.dataset_package} \
--fields {params.fields_to_include:q} \
| csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
| csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
| tsv-select -H -f genbank_accession --rest last \
> {output.ncbi_dataset_tsv}
"""

rule csv_to_ndjson:

rule format_ncbi_datasets_ndjson:
input:
csv = rules.fetch_from_genbank.output.csv
ncbi_dataset_sequences="data/{subtype}/ncbi_dataset_sequences.fasta",
ncbi_dataset_tsv="data/{subtype}/ncbi_dataset_report.tsv",
output:
ndjson = "data/genbank.ndjson"
ndjson="data/{subtype}/ncbi.ndjson",
log:
"logs/{subtype}/format_ncbi_datasets_ndjson.txt",
benchmark:
"benchmarks/{subtype}/format_ncbi_datasets_ndjson.txt"
shell:
"""
python bin/csv-to-ndjson.py \
--input {input.csv} \
--output {output.ndjson}
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column genbank_accession_rev \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
2> {log} > {output.ndjson}
"""


rule fetch_all_sequences:
input:
all_sources = "data/genbank.ndjson"
all_sources=expand(
"data/{subtype}/ncbi.ndjson", subtype=list(config["ncbi_taxon_id"].keys())
),
output:
sequences_ndjson = "data/sequences.ndjson"
sequences_ndjson="data/sequences.ndjson",
shell:
"""
cat {input.all_sources} > {output.sequences_ndjson}
Expand Down

0 comments on commit 166b348

Please sign in to comment.