diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 0e7c6e1..345f4fa 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -1,11 +1,11 @@ # Sources of sequences to include in the ingest run sources: ['genbank'] conda_environment: "workflow/envs/nextstrain.yaml" -fetch: - genbank_url: - a: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208893%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" - b: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28208895%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" - general: "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%2811250%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession%3Aid%2Cgenbank_accession_rev%3AAccVer_s%2Cdatabase%3ASourceDB_s%2Cstrain%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cisolation_source%3AIsolation_csv%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Csequence%3ANucleotide_seq&sort=SourceDB_s+desc%2C+CollectionDate_s+asc%2C+id+asc&email=hello%40nextstrain.org" + +ncbi_taxon_id: + a: "208893" + b: "208895" + general: "11250" # Params for the transform rulegeneral transform: diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv new file mode 100644 index 0000000..eb79418 --- /dev/null +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -0,0 +1,17 @@ +key value +Accession genbank_accession_rev +Source database database +Isolate Lineage strain +Geographic Region region +Geographic Location location +Isolate Collection date collected +Release date submitted +Update date updated +Length length +Host Name host +Isolate Lineage source isolation_source +BioProjects bioproject_accession +BioSample accession biosample_accession +SRA Accessions sra_accession +Submitter Names authors +Submitter Affiliation submitting_organization diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index b2d667c..58635b3 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -13,41 +13,122 @@ Produces final output as """ -rule fetch_from_genbank: + +rule fetch_ncbi_dataset_package: + params: + ncbi_taxon_id=lambda w: config["ncbi_taxon_id"][w.subtype], + output: + dataset_package=temp("data/{subtype}/ncbi_dataset.zip"), + retries: 5 # Requires snakemake 7.7.0 or later + benchmark: + "benchmarks/{subtype}/fetch_ncbi_dataset_package.txt" + shell: + """ + datasets download virus genome taxon {params.ncbi_taxon_id:q} \ + --no-progressbar \ + --filename {output.dataset_package} + """ + + +rule extract_ncbi_dataset_sequences: + input: + dataset_package="data/{subtype}/ncbi_dataset.zip", + output: + ncbi_dataset_sequences=temp("data/{subtype}/ncbi_dataset_sequences.fasta"), + benchmark: + "benchmarks/{subtype}/extract_ncbi_dataset_sequences.txt" + shell: + """ + unzip -jp {input.dataset_package} \ + ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} + """ + + +def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: + """ + Return list of NCBI Dataset report field mnemonics for fields that we want + to parse out of the dataset report. The column names in the output TSV + are different from the mnemonics. + + See NCBI Dataset docs for full list of available fields and their column + names in the output: + https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields + """ + fields = [ + "accession", + "sourcedb", + "isolate-lineage", + "geo-region", + "geo-location", + "isolate-collection-date", + "release-date", + "update-date", + "length", + "host-name", + "isolate-lineage-source", + "bioprojects", + "biosample-acc", + "sra-accs", + "submitter-names", + "submitter-affiliation", + ] + return ",".join(fields) + + +rule format_ncbi_dataset_report: + # Formats the headers to be the same as before we used NCBI Datasets + # The only fields we do not have equivalents for are "title" and "publications" + input: + dataset_package="data/{subtype}/ncbi_dataset.zip", + ncbi_field_map="source-data/ncbi-dataset-field-map.tsv", output: - csv = "data/genbank.csv" + ncbi_dataset_tsv=temp("data/{subtype}/ncbi_dataset_report.tsv"), params: - URL_a = config['fetch']['genbank_url']['a'], - URL_b = config['fetch']['genbank_url']['b'], - URL_general = config['fetch']['genbank_url']['general'] + fields_to_include=_get_ncbi_dataset_field_mnemonics, + benchmark: + "benchmarks/{subtype}/format_ncbi_dataset_report.txt" shell: """ - curl "{params.URL_a}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} - curl "{params.URL_b}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} - curl "{params.URL_general}" --fail --silent --show-error --http1.1 \ - --header 'User-Agent: https://github.com/nextstrain/rsv (hello@nextstrain.org)' >> {output} + dataformat tsv virus-genome \ + --package {input.dataset_package} \ + --fields {params.fields_to_include:q} \ + | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ + | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \ + | tsv-select -H -f genbank_accession --rest last \ + > {output.ncbi_dataset_tsv} """ -rule csv_to_ndjson: + +rule format_ncbi_datasets_ndjson: input: - csv = rules.fetch_from_genbank.output.csv + ncbi_dataset_sequences="data/{subtype}/ncbi_dataset_sequences.fasta", + ncbi_dataset_tsv="data/{subtype}/ncbi_dataset_report.tsv", output: - ndjson = "data/genbank.ndjson" + ndjson="data/{subtype}/ncbi.ndjson", + log: + "logs/{subtype}/format_ncbi_datasets_ndjson.txt", + benchmark: + "benchmarks/{subtype}/format_ncbi_datasets_ndjson.txt" shell: """ - python bin/csv-to-ndjson.py \ - --input {input.csv} \ - --output {output.ndjson} + augur curate passthru \ + --metadata {input.ncbi_dataset_tsv} \ + --fasta {input.ncbi_dataset_sequences} \ + --seq-id-column genbank_accession_rev \ + --seq-field sequence \ + --unmatched-reporting warn \ + --duplicate-reporting warn \ + 2> {log} > {output.ndjson} """ rule fetch_all_sequences: input: - all_sources = "data/genbank.ndjson" + all_sources=expand( + "data/{subtype}/ncbi.ndjson", subtype=list(config["ncbi_taxon_id"].keys()) + ), output: - sequences_ndjson = "data/sequences.ndjson" + sequences_ndjson="data/sequences.ndjson", shell: """ cat {input.all_sources} > {output.sequences_ndjson}