Skip to content

Commit

Permalink
Merge pull request #1 from nextstrain/update_ingest
Browse files Browse the repository at this point in the history
Update ingest to accommodate oropouche segments
  • Loading branch information
miparedes authored Aug 12, 2024
2 parents f80d5f9 + 19de24c commit 8daff24
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 94 deletions.
25 changes: 9 additions & 16 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ workdir: workflow.current_basedir
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

segments = ['L', 'M', 'S']

wildcard_constraints:
segment = "|".join(segments)

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
# Nextstrain-maintained ingest workflows will produce metadata files with the
Expand All @@ -17,8 +22,9 @@ configfile: "defaults/config.yaml"
# TODO: Add link to centralized docs on standard Nextstrain metadata fields
rule all:
input:
"results/sequences.fasta",
"results/metadata.tsv",
sequences=expand("results/{segment}/sequences.fasta", segment=segments),
metadata=expand("results/{segment}/metadata.tsv", segment=segments),
metadata_all="results/all/metadata.tsv",


# Note that only PATHOGEN-level customizations should be added to these
Expand All @@ -28,6 +34,7 @@ rule all:
# by build-specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"


# We are pushing to standardize ingest workflows with Nextclade runs to include
Expand All @@ -41,21 +48,7 @@ include: "rules/curate.smk"
# to the final metadata TSV.
# The nextclade.smk rules are included unconditionally above; their settings
# live under the `nextclade` key of `defaults/config.yaml`.
if "nextclade" in config:

include: "rules/nextclade.smk"

else:

rule create_final_metadata:
input:
metadata="data/subset_metadata.tsv"
output:
metadata="results/metadata.tsv"
shell:
"""
mv {input.metadata} {output.metadata}
"""

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
Expand Down
6 changes: 5 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
entrez_search_term: ""

# Required to fetch from NCBI Datasets
ncbi_taxon_id: ""
ncbi_taxon_id: "118655"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
Expand Down Expand Up @@ -119,3 +119,7 @@ curate:
"abbr_authors",
"institution",
]

nextclade:
segment_reference: "../shared/oropouche_{segment}.fasta"
min_seed_cover: 0.01
8 changes: 4 additions & 4 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ REQUIRED INPUTS:
OUTPUTS:
metadata = results/all/metadata.tsv
sequences = results/sequences.fasta
sequences = results/all/sequences.fasta
"""

Expand Down Expand Up @@ -62,7 +62,7 @@ rule curate:
annotations=config["curate"]["annotations"],
output:
metadata="data/all_metadata.tsv",
sequences="results/sequences.fasta",
sequences="results/all/sequences.fasta",
log:
"logs/curate.txt",
benchmark:
Expand Down Expand Up @@ -121,11 +121,11 @@ rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
output:
subset_metadata="data/subset_metadata.tsv",
metadata="results/all/metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.subset_metadata}
{input.metadata} > {output.metadata}
"""
5 changes: 1 addition & 4 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@ to provide the correct parameter.
- Only returns metadata fields that are available through NCBI Datasets
- Only works for viral genomes
2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
- requires `entrez_search_term` config
- Returns all available data via a GenBank file
- Requires a custom script to parse the necessary fields from the GenBank file
"""

# This ruleorder determines which rule to use to produce the final NCBI NDJSON file.
Expand Down
97 changes: 28 additions & 69 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -1,95 +1,54 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
and sequences to split the sequences into L, M, and S segments.
REQUIRED INPUTS:
metadata = data/subset_metadata.tsv
sequences = results/sequences.fasta
metadata = data/subset_metadata.tsv
all_metadata = results/all/metadata.tsv
sequences = results/all/sequences.fasta
OUTPUTS:
metadata = results/metadata.tsv
nextclade = results/nextclade.tsv
alignment = results/alignment.fasta
translations = results/translations.zip
metadata = results/{segment}/metadata.tsv
sequences = results/{segment}/sequences.fasta
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
"""
DATASET_NAME = config["nextclade"]["dataset_name"]


rule get_nextclade_dataset:
"""Download Nextclade dataset"""
output:
dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
params:
dataset_name=DATASET_NAME
shell:
"""
nextclade3 dataset get \
--name={params.dataset_name:q} \
--output-zip={output.dataset} \
--verbose
"""


rule run_nextclade:
rule run_nextclade_to_identify_segment:
input:
dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
sequences="results/sequences.fasta",
sequences = "results/all/sequences.fasta",
segment_reference = config["nextclade"]["segment_reference"],
output:
nextclade="results/nextclade.tsv",
alignment="results/alignment.fasta",
translations="results/translations.zip",
sequences = "results/{segment}/sequences.fasta",
params:
# The lambda is used to deactivate automatic wildcard expansion.
# https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
translations=lambda w: "results/translations/{cds}.fasta",
min_seed_cover = config["nextclade"]["min_seed_cover"],
shell:
"""
nextclade3 run \
{input.sequences} \
--input-dataset {input.dataset} \
--output-tsv {output.nextclade} \
--output-fasta {output.alignment} \
--output-translations {params.translations}
zip -rj {output.translations} results/translations
nextclade run \
--input-ref {input.segment_reference} \
--output-fasta {output.sequences} \
--min-seed-cover {params.min_seed_cover} \
--silent \
{input.sequences}
"""


rule join_metadata_and_nextclade:
rule subset_metadata_by_segment:
input:
nextclade="results/nextclade.tsv",
metadata="data/subset_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
metadata = "results/all/metadata.tsv",
sequences = "results/{segment}/sequences.fasta",
output:
metadata="results/metadata.tsv",
metadata = "results/{segment}/metadata.tsv",
params:
metadata_id_field=config["curate"]["output_id_field"],
nextclade_id_field=config["nextclade"]["id_field"],
strain_id_field = config["curate"]["output_id_field"],
shell:
"""
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk -tl cut -f $SUBSET_FIELDS \
{input.nextclade} \
| csvtk -tl rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id_field} \
--output-metadata {output.metadata}
"""
100 changes: 100 additions & 0 deletions shared/oropouche_L.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
>NC_005776.1 Oropouche virus segment L, complete genome
AGTAGTGTGCTCCTATTCCGAAACAAACAAAAACAATCTCAAAATGTCACAACTGTTGCTCAACCAATAT
CGGAATAGGATATTGCACTGCCGTGAACCTGAGATAGCAAAGGATATATGGCGAGATCTATTAAATGATC
GACACAATTACTTTTCTCGGGAATTTTGCAGAGCTGCAAATCTTGAGTACAGAAATGATGTTCCTGCTGA
GGATATTTGTGCTGAAGTTCTTGATGGTTATAAAGCAAGGAAAGTTCGCTTTTGTACACCTGATAATTAC
TTACTACATGATGGAAAGATGTATATAATAGACTTCAAAGTGTCTGTAGACGACCGATCTTCTAGAATCA
CAAGGGAGAAATATAATGAGATTTTTGGAGAGGTATTCAATCCAGAAGGTGTAGATTTTGAAATTGTTAT
TATTAGATTAGATCCTTCAAATATGACGATACATGTGGACTCTCGAGATTTCGTGAATACAATTGGGCCG
ATTACATTAAACATTAGTATGCAATGGTTTTTTGATATGAAAGACTTCTTGTTCGGGAAATTTCGGGATG
ATGATAAATTCCATGCTATAATAAGTCAAGGAGAATTCACAATGACATTGCCATGGATTGAAGAAGACAC
CCCAGAATTGCTTACTCATCCTATATACAATGAATTCATGAGTTCAATGCCAGAGGCAGAACAGGCCCTA
TTCAAGGAAGCATTGGAATTCAAATCATTTGGGGCAGAAAAATGGAATATCTTTTTGAAGGGGGTGATGT
CAAAGTATGGTGAATATTATAAAGAATTTACTAAAGGACATGCTCATTCTATATTTCTGACAACAGGGGA
CTACCCCAAGCCAGACAAAGACCAAATTTCAGCAGGTTGGAGAGAAATGGTAAACAGAGTAAGCTCTGAA
CGTGACATGTCAAATGACATAAATCAGGAAAAACCAAGCATGCATTTTATATGGGCAAAGAATGATTCAA
ATAGCAACAATAATATACAAAAGCTAATCAAACTATCTAAATCACTGCAAGCTATGAGCGGGACAGGGAG
CTATGTAAATGCTTTCAAGTCATTAGGGAGATTAATGGATATATCATCAGATGTTAAAAAATATGAATCA
TTTTGTGGGAAATTGAAATCTCTGGCAAGGTCTAGTATAAAAAAACTTGACAGGAAAATAGAGCCAATAC
AAATTGGGACTGCAACTGTCTTATGGGAACAGCAATTTAAACTAGATACAGATGTTATAAAAAGAGAAGA
CAGAATACATTTAATGAAAGATTATCTTGGGATCGGTAAGCACAAATCATTTTCAAAGAAATTAAACAAC
GACATAAATACTGATAAGCCTAAAATATTAAATTTCAACAATGATGATATAGTCAGGAAATGCAAAGATA
AATATAATCAAGTCATACATAACCTATCCCAAATCAATGAATTAGATAAGATTGGAAACTACCTAGAGCA
CTTTTCAGCTAAAATTAGTGCCTGCAGTGTAGAAATGTGGGATTTTATATATAATACAACCAAAACTAAA
TACTGGCAATGCATCAATGACTATTCCACCCTAATGAAAAACATGTTAGCTGTCTCTCAATATAATAGAC
ACAATACGTTTAGAATTGTCTCATGTGCAAACAATAATGTATTTGGTCTAGTAATGCCAAGCTCAGATAT
AAAGACAAAAAAAGCAACTTTAGTCTATGCAATAATGGCTCTCCATAATGAGGAGGCAGAAATAGCAGAA
CTTGGCTCACTCTACTCAACTTTTAAGACAGCAACAGGATATATTTCAATATCAAAGGCTTTTAGGCTGG
ATAAAGAAAGATGCCAACGCATAGTATCCTCTCCAGGCTTGTTCCTCATGACAAGCTGCCTATTATTCAA
CGGTAACAAGAGTTTAGAATTTGATAAATTACTAGGATTTTCATTTTTTACGTCAATATCAATTACGAAA
GCTATGCTCTCCCTTACTGAGCCTTCACGTTATATGATCATGAACTCGTTAGCAGTTTCCAGCCATGTAA
GAGAGTATATATCTGAAAAATTCTCCCCTTATACAAAAACATCATTTTCTGTGGTAATGACAGACTTAAT
CAAGAAGGGTTGCTATTCAGCATATGAACAGAGAAAAAAAGTACAAATAAGAGACATAAAATTAACAGAT
TATGATATAACACAAAAGGGAGTGGATTCCAAAAGAGATCTTAAATCTATTTGGTTCCCAGGAAAGGTAA
ACCTGAAAGAATATTTAAACCAAATTTATCTACCATTTTATTTTAACTCTAAAGGATTACATGAAAAACA
TCATGTCTTGATAGATTTGGCTAAAACAGTACTAGAAATCGAAAAAGAGCAAAGGGAGTCATTACCTGAG
CCATGGTCAGAGATACCTGCTAAGCGACTGTCACTTAATGTTTTAATTTACTCATTGCAGGAACTGAATT
TAGATACTTCAAGACATAATTTTGTAAGAAGCCGGGTGGAAAACGCAAATAATTTCAACAGATCTATAAC
GACAATATCTACTTTTACCAGCTCAAAATCATGCATTAAGATTGGTGATTTTGAAGAAGAAAAAAGAGAA
AAACTAAGAATGATACAAAAGAAACTTGCAAAGGATATTTCTAAATTAACCATAGCCAACCCAGCATTCT
TAGATGAGATCACAAACGAACATGAGATAAGGCATTCAACTTATGAGGACTTAAAACAATCTATCCCAGA
TTACACAGATTATATGTCTGTGAAAGTTTTTGACAGATTGTACGAGAAGATTACTACCAATGAAATAAAT
GATAAGGAAACAGTCAAGCTGATTCTAGAGACCATGAAAAAACATAAAATATTTCATTTTGGATTCTTCA
ATAAAGGACAAAAAACAGCCAAAGATAGAGAAATATTTTTAGGTGAATTTGAAGCAAAAATGTGTCTGTA
CCTTGTCGAAAGAATAGCTAAAGAGAGGTGCAAATTAAACCCTGAAGAAATGATAAGTGAACCAGGCGAC
TCGAAACTAAGGGTATTAGAGAAGCAATCAGAAGACGAAATCAGGTATATTAGCAATACAATAAAGACAT
TAGGGAATGCCATAGAGAACTTGCAATCTGGATCTTTAAATTGGGCAGATATATGCGAAAACAAAGCAAG
AGGACTTAAGATAGAAATAAATGCTGATATGTCCAAATGGAGTGCCCAAGATGTACTTTTTAAATATTTT
TGGTTGATAGTGCTTGATCCCATCTTATATCCTGCTGAGAGGAAAAGGATAATTTATTTCCTCTGTAATT
ATATGCAGAAAAGGCTTATAATGCCCGATGAATTGCTCACTACTATATTGGATCAAAGAGTTCCTTATTC
AAATGACATAATTGGATTAATGACAAACAATTATAGGTCTAATACAGTAGAAATAAAGCGTAACTGGCTT
CAAGGCAACTTAAATTATACAAGCAGTTACTTACACAGCTGTAGTATGTCTGTGTACAAAGATATAATAA
GAGAAGCAGCAATATTATTAGAAGGAGAAGCCCTTGTGAACTCAATGGTACATTCTGATGATAATCAAAC
ATCTATATGTATGGTGCAGAATAAATTACCAGATGACAATATAATTGAATTTTGCATTAAGATATTCGAG
AAGATATGCTTAACTTTTGGCAATCAGGCAAATATGAAGAAGACATATCTAACTAACTTCATCAAAGAGT
TTGTTTCTTTATTTAATATACATGGAGAACCATTTTCTATATATGGGAGATTTCTACTCACAGCAGTAGG
AGACTGTGCCTATCTAGGGCCTTATGAAGATTTAGCAAGTAGGCTATCTGCAACACAAACTGCTATAAAG
CATGGTTGCCCACCATCACTTGCATGGGTATCTATCGCTCTAAATCACTGGATAACCCACACTACATATA
ATATGTTGCCTGGCCAAAATAATGACCCGTTACCATTCTTCCCTACTAACAATAGAAGTGAAATACCAGT
AGAGATGTGCGGAATACTAGAAAGTGATTTATCAACAATTGCACTAACTGGTTTAGAAGCAGGGAATGTC
ACGTTTCTAACAAATATAGCAAGGAAGTTATCATCCCCAATCTTACAAAGAGAAAGTATTCAAGATCAAT
ACAATTCTATAGAAAAGTGGGATCTGAGCAAATTATCACAGATCGACATTCTAAGGCTTAAAATGCTCAG
GTATATATCTCTTGATAGTTCAGTCACATCTGATGATGGTATGGGGGAGACTAGTGAAATGAGATCTCGA
TCACTTTTAACACCTCGTAAATTCACAACAAGTGGGTCACTTAATAGGTTGAAATCATATAAAGACTTTC
AAGATATAATAGCAGATGAGGACAAGACAAACGAACTATTTGAGAATTTCATTAGACACCCAGAGTTACT
GGTTACAAAAGGCGAAACATTTGAAGAATTTGTTAATACGATATTATTTAGGTACAATTCAAAGAAATTC
AAAGAATCTTTGTCAATACAAAACCCAGCACAGCTTTTTATTGAGCAAATATTATTTTCCAATAAACCAG
TAATTGACTACACTAGCATACATGACAAGATTTTTGGATTACAAGACATGCCAGGAATTGAAGAACTAGA
TACAATTATAGGTCGCAAAACATTTGTTGAGAGTTATGTTCAAATCGTAGATGACTTAAGCAATTTAACA
TTGGATATAAACGATGTCAAGACTATATTTGCCTTTTGTCTTATGAATGACCCACTACTGATCACATCTG
CTAACAATATAATAATGTCTGTTAAGGGACATAGTCAAGAAAGAATAGGTCAATCAGCATGCAAAATGCC
AGAGGTCCGAAGTCTAAAACTCATACATTATTCACCAGCAGTTGTTTTGAGAGCCTATGTGAGAGGGCCA
ACAAATGTACCGAATGTAGATATAGATGAACTTGCAAGGGATCTATCTCATTTAGAAGACTTCATACAAA
GTACAAAACTCAGAGAAAATATGAGAGAGAGAATAGAAATAAATGAGAAGCGGCACTTAGGAAGGGATTT
CAAATTTGAAATCAAAGAACTAACTAGATTTTACCAAGTGTGTTATGATTACATAAAGTCTACAGAACAT
AAAGTCAAGGTATTCATATTGCCATACAAAGTTTTCACATCAATAGAATTCTGCGGGGCACTGACAGGTA
ACTTGATAAATGACAAATTATGGTACATAACGCATTATCTGAAAAATATAGTGTCTACTACACATAAGGC
ACAAATTTCTTCTTCACCTGAATTGGAATTGCAAATTGCTGATGAGGCACTAAGACTAGTAGCACATTTT
GCTGATACTTTCTTGGCATCAGAATCAAGAATACAATTTCTGAAGAAAATTATTGAAGAATTCACATACA
AAGGGATACCTGTAAAACATTTATACTCAAAAATAAAGAACTCCAAGTTGAGGGTTAAATTTCTAGGGAT
TCTTTTATGGTTAGATGATCTAACACAGAATGATCTGGATAAATTTGATGCAGATAAATCAGATGAAAAG
ATTATATGGAATAACTGGCAAGTGTCAAGAGATATGAATACTGGACCAATAGACTTAATGATAAGCGGTT
ACTCTAGACAGCTGCGGATCACTGGGGAAGATGACAAATTGATTGCTGCTGAATTGCAGGTTACTAGATT
GTCAGAAGATTTAATTTATAGACACGGTCAGGCAATGTTGAATAAGCCACACGGCTTAAAGCTTGAAAAA
ATGCAACCTGTGACTGAGATGTCTAAACGATTACATTATATCGTTTTCCAGCAAAGATCACGGAAACGAT
ACTTCTATTCTATATTACCCACCCAAGTAATTGAGGACCATAATTCTAGAGTTGAATCATCTAGGCTAAG
CAGAGATTCAAAATGGGTTCCTGTATGCCCTGTTGCAATATCAAAACTCTACCAACAAGGACGGCCTATA
CTTTCCAAAGTTAGAAATCTGAATATGCAGACTCATTCGCTTTCCAGAATACAAGTTAATGTAGATGAAT
ATGCCATCACGAGAAGAGCACATTTTCAGAAAATGCCTTTCTTCGAAGGACCATCAATCCCTTCTGGTGG
TATGGATTTGTCTGAGTTGATGAAATCTACATCCCTATTAAGCTTGAATTATGATAACATAAAAAATGCA
TCCTTATTGGACATGTCTAGGGTATTTAAGTGCAATGGCAGTGGAGATGACCAAATGGCTTTCGAATTTC
TATCGGACGAAATTTTGGAGCAAGATGTAGTTGAAGAAATAGAATGCAACCCTATATTTTCTATTAGTTA
TACAAAAAGAGGAGAATCCAATATGACTTATAAAAATGCTTTCCACAAAGCCTTAATCTCAGAATGTGAC
AAATTTGAAGAAGCATTTGACTTCCTCGACATGGGATTTTGCTCGAATGAAAATCTTAGTATTCTGGAGG
AAATACATTGGATAATCAGTTATTTAAAAACAAATCAATGGTCTACGGAACTAGACAATTGTATTCACAT
GTGCATGTACAGGAATGGATATGATGCAGAATATCATAAATTTGATATACCCTCTAAATTCCTCAAAGAC
CCAATAAACCGAACAATAAATTGGACTGAAGTCATTGAATTTATATTATTAATTGAAGATTTCCAAACAA
AAATTGAGCCATGGTCTAGTATGAAGTCACACTTCTGTTCAAAAGCACACAGTGTAGCACTAGAGTGTAT
GAAAAATGAGAAAAGATCATTGGCAGAATTTGTAGACAAAAGTAAGAAAACTGGCAAATCCAAATTTGAC
TTCTAAGGTATACACATGTAAAAGTAGTGTTTGTTTCTAAATAGGAGCACACTACT

Loading

0 comments on commit 8daff24

Please sign in to comment.