From 166b3486dc4d806d3d396ef9e110bc253dd04ded Mon Sep 17 00:00:00 2001 From: Jover Date: Tue, 12 Sep 2023 16:43:33 -0700 Subject: [PATCH] ingest: Switch to NCBI Datasets CLI to fetch data Replace the config URLs for the NCBI Virus API with the NCBI Datasets CLI commands that are driven by the `ncbi_taxon_id` present in the config. NCBI datasets downloads a virus dataset ZIP file that includes a metadata JSON Lines file and a sequences FASTA file. To maintain a record of the single NDJSON file on S3, extract the sequences FASTA file and format the metadata into a TSV file that are parsed into a single NDJSON file using `augur curate passthru`. The metadata TSV is created using the NCBI `dataformat` command so that we do not have to parse the nested JSON lines files ourselves and header fields are renamed to match the previous fields we used for NCBI Virus. The NDJSON file created here no longer includes equivalent fields for "title" or "publication". --- ingest/config/config.yaml | 10 +- ingest/source-data/ncbi-dataset-field-map.tsv | 17 +++ .../snakemake_rules/fetch_sequences.smk | 119 +++++++++++++++--- 3 files changed, 122 insertions(+), 24 deletions(-) create mode 100644 ingest/source-data/ncbi-dataset-field-map.tsv diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 0e7c6e1..345f4fa 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -1,11 +1,11 @@ # Sources of sequences to include in the ingest run sources: ['genbank'] conda_environment: "workflow/envs/nextstrain.yaml" -fetch: - genbank_url: - a: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208893%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" - b: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208895%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" - general: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%2811250%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" + +ncbi_taxon_id: + a: "208893" + b: "208895" + general: "11250" # Params for the transform rulegeneral transform: diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv new file mode 100644 index 0000000..eb79418 --- /dev/null +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -0,0 +1,17 @@ +key value +Accession genbank_accession_rev +Source database database +Isolate Lineage strain +Geographic Region region +Geographic Location location +Isolate Collection date collected +Release date submitted +Update date updated +Length length +Host Name host +Isolate Lineage source isolation_source +BioProjects bioproject_accession +BioSample accession biosample_accession +SRA Accessions sra_accession +Submitter Names authors +Submitter Affiliation submitting_organization diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index b2d667c..58635b3 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -13,41 +13,122 @@ Produces final output as """ -rule fetch_from_genbank: + +rule fetch_ncbi_dataset_package: + params: + ncbi_taxon_id=lambda w: config["ncbi_taxon_id"][w.subtype], + output: + dataset_package=temp("data/{subtype}/ncbi_dataset.zip"), + retries: 5 # Requires snakemake 7.7.0 or later + benchmark: + "benchmarks/{subtype}/fetch_ncbi_dataset_package.txt" + shell: + """ + datasets download virus genome taxon {params.ncbi_taxon_id:q} \ + --no-progressbar \ + --filename {output.dataset_package} + """ + + +rule extract_ncbi_dataset_sequences: + input: + dataset_package="data/{subtype}/ncbi_dataset.zip", + output: + ncbi_dataset_sequences=temp("data/{subtype}/ncbi_dataset_sequences.fasta"), + benchmark: + "benchmarks/{subtype}/extract_ncbi_dataset_sequences.txt" + shell: + """ + unzip -jp {input.dataset_package} \ + ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} + """ + + +def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: + """ + Return list of NCBI Dataset report field mnemonics for fields that we want + to parse out of the dataset report. The column names in the output TSV + are different from the mnemonics. + + See NCBI Dataset docs for full list of available fields and their column + names in the output: + https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields + """ + fields = [ + "accession", + "sourcedb", + "isolate-lineage", + "geo-region", + "geo-location", + "isolate-collection-date", + "release-date", + "update-date", + "length", + "host-name", + "isolate-lineage-source", + "bioprojects", + "biosample-acc", + "sra-accs", + "submitter-names", + "submitter-affiliation", + ] + return ",".join(fields) + + +rule format_ncbi_dataset_report: + # Formats the headers to be the same as before we used NCBI Datasets + # The only fields we do not have equivalents for are "title" and "publications" + input: + dataset_package="data/{subtype}/ncbi_dataset.zip", + ncbi_field_map="source-data/ncbi-dataset-field-map.tsv", output: - csv = "data/genbank.csv" + ncbi_dataset_tsv=temp("data/{subtype}/ncbi_dataset_report.tsv"), params: - URL_a = config['fetch']['genbank_url']['a'], - URL_b = config['fetch']['genbank_url']['b'], - URL_general = config['fetch']['genbank_url']['general'] + fields_to_include=_get_ncbi_dataset_field_mnemonics, + benchmark: + "benchmarks/{subtype}/format_ncbi_dataset_report.txt" shell: """ - curl "{params.URL_a}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} - curl "{params.URL_b}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} - curl "{params.URL_general}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} + dataformat tsv virus-genome \ + --package {input.dataset_package} \ + --fields {params.fields_to_include:q} \ + | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ + | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \ + | tsv-select -H -f genbank_accession --rest last \ + > {output.ncbi_dataset_tsv} """ -rule csv_to_ndjson: + +rule format_ncbi_datasets_ndjson: input: - csv = rules.fetch_from_genbank.output.csv + ncbi_dataset_sequences="data/{subtype}/ncbi_dataset_sequences.fasta", + ncbi_dataset_tsv="data/{subtype}/ncbi_dataset_report.tsv", output: - ndjson = "data/genbank.ndjson" + ndjson="data/{subtype}/ncbi.ndjson", + log: + "logs/{subtype}/format_ncbi_datasets_ndjson.txt", + benchmark: + "benchmarks/{subtype}/format_ncbi_datasets_ndjson.txt" shell: """ - python bin/csv-to-ndjson.py \ - --input {input.csv} \ - --output {output.ndjson} + augur curate passthru \ + --metadata {input.ncbi_dataset_tsv} \ + --fasta {input.ncbi_dataset_sequences} \ + --seq-id-column genbank_accession_rev \ + --seq-field sequence \ + --unmatched-reporting warn \ + --duplicate-reporting warn \ + 2> {log} > {output.ndjson} """ rule fetch_all_sequences: input: - all_sources = "data/genbank.ndjson" + all_sources=expand( + "data/{subtype}/ncbi.ndjson", subtype=list(config["ncbi_taxon_id"].keys()) + ), output: - sequences_ndjson = "data/sequences.ndjson" + sequences_ndjson="data/sequences.ndjson", shell: """ cat {input.all_sources} > {output.sequences_ndjson}