From efb44f0ead1e37239e72d011d78b121d83179f7d Mon Sep 17 00:00:00 2001 From: Miguel Paredes Date: Tue, 30 Jul 2024 15:09:02 -0700 Subject: [PATCH 1/3] modifying the pathogen repo guide to take in oropouche specifically by changing the config.yaml file --- ingest/defaults/config.yaml | 2 +- ingest/rules/fetch_from_ncbi.smk | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index ecf5be6..0a04d06 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -8,7 +8,7 @@ entrez_search_term: "" # Required to fetch from NCBI Datasets -ncbi_taxon_id: "" +ncbi_taxon_id: "118655" # The list of NCBI Datasets fields to include from NCBI Datasets output # These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 447261e..deeebdd 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -20,10 +20,7 @@ to provide the correct parameter. - Only returns metadata fields that are available through NCBI Datasets - Only works for viral genomes -2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/) - - requires `entrez_search_term` config - - Returns all available data via a GenBank file - - Requires a custom script to parse the necessary fields from the GenBank file + """ # This ruleorder determines which rule to use to produce the final NCBI NDJSON file. 
From 07b5990c76c2da82ab0ac1ebe39c3422d454bb6d Mon Sep 17 00:00:00 2001 From: Miguel Paredes Date: Tue, 30 Jul 2024 16:37:55 -0700 Subject: [PATCH 2/3] following the nextstrain lassa fever example, the ingest was changed in order to accommodate the three different segments of oropouche virus --- ingest/Snakefile | 35 +++++++------ ingest/defaults/config.yaml | 4 ++ ingest/rules/curate.smk | 8 +-- ingest/rules/nextclade.smk | 98 +++++++++++------------------------ shared/oropouche_L.fasta | 100 ++++++++++++++++++++++++++++++++++++ shared/oropouche_M.fasta | 65 +++++++++++++++++++++++ shared/oropouche_S.fasta | 13 +++++ 7 files changed, 233 insertions(+), 90 deletions(-) create mode 100644 shared/oropouche_L.fasta create mode 100644 shared/oropouche_M.fasta create mode 100644 shared/oropouche_S.fasta diff --git a/ingest/Snakefile b/ingest/Snakefile index ee2f88f..8474409 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -8,6 +8,11 @@ workdir: workflow.current_basedir # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "defaults/config.yaml" +segments = ['L', 'M', 'S'] + +wildcard_constraints: + segment = "|".join(segments) + # This is the default rule that Snakemake will run when there are no specified targets. # The default output of the ingest workflow is usually the curated metadata and sequences. # Nextstrain-maintained ingest workflows will produce metadata files with the @@ -17,8 +22,9 @@ configfile: "defaults/config.yaml" # TODO: Add link to centralized docs on standard Nextstrain metadata fields rule all: input: - "results/sequences.fasta", - "results/metadata.tsv", + sequences=expand("results/{segment}/sequences.fasta", segment=segments), + metadata=expand("results/{segment}/metadata.tsv", segment=segments), + metadata_all="results/all/metadata.tsv", # Note that only PATHOGEN-level customizations should be added to these @@ -28,6 +34,7 @@ rule all: # by build-specific rules. 
include: "rules/fetch_from_ncbi.smk" include: "rules/curate.smk" +include: "rules/nextclade.smk" # We are pushing to standardize ingest workflows with Nextclade runs to include @@ -41,21 +48,15 @@ include: "rules/curate.smk" # to the final metadata TSV. # To run nextclade.smk rules, include the `defaults/nextclade_config.yaml` # config file with `nextstrain build ingest --configfile defaults/nextclade_config.yaml`. -if "nextclade" in config: - - include: "rules/nextclade.smk" - -else: - - rule create_final_metadata: - input: - metadata="data/subset_metadata.tsv" - output: - metadata="results/metadata.tsv" - shell: - """ - mv {input.metadata} {output.metadata} - """ +rule create_final_metadata: + input: + metadata="data/subset_metadata.tsv" + output: + metadata="results/all/metadata.tsv" + shell: + """ + cp {input.metadata} {output.metadata} + """ # Allow users to import custom rules provided via the config. # This allows users to run custom rules that can extend or override the workflow. 
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 0a04d06..f51469c 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -119,3 +119,7 @@ curate: "abbr_authors", "institution", ] + +nextclade: + segment_reference: "../shared/oropouche_{segment}.fasta" + min_seed_cover: 0.01 \ No newline at end of file diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 71ffa18..44855e3 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -8,7 +8,7 @@ REQUIRED INPUTS: OUTPUTS: metadata = data/subset_metadata.tsv - sequences = results/sequences.fasta + sequences = results/all/sequences.fasta """ @@ -62,7 +62,7 @@ rule curate: annotations=config["curate"]["annotations"], output: metadata="data/all_metadata.tsv", - sequences="results/sequences.fasta", + sequences="results/all/sequences.fasta", log: "logs/curate.txt", benchmark: @@ -121,11 +121,11 @@ rule subset_metadata: input: metadata="data/all_metadata.tsv", output: - subset_metadata="data/subset_metadata.tsv", + metadata="data/subset_metadata.tsv", params: metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: """ tsv-select -H -f {params.metadata_fields} \ - {input.metadata} > {output.subset_metadata} + {input.metadata} > {output.metadata} """ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 8bda056..a6a3e17 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -1,95 +1,55 @@ """ This part of the workflow handles running Nextclade on the curated metadata -and sequences. +and sequences to split the sequences into L, M, and S segments. 
REQUIRED INPUTS: - metadata = data/subset_metadata.tsv - sequences = results/sequences.fasta + metadata = data/subset_metadata.tsv + all_metadata = results/all/metadata.tsv + sequences = results/all/sequences.fasta OUTPUTS: - metadata = results/metadata.tsv - nextclade = results/nextclade.tsv - alignment = results/alignment.fasta - translations = results/translations.zip + metadata = results/{segment}/metadata.tsv + sequences = results/{segment}/sequences.fasta See Nextclade docs for more details on usage, inputs, and outputs if you would like to customize the rules: https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html """ -DATASET_NAME = config["nextclade"]["dataset_name"] - -rule get_nextclade_dataset: - """Download Nextclade dataset""" - output: - dataset=f"data/nextclade_data/{DATASET_NAME}.zip", - params: - dataset_name=DATASET_NAME - shell: - """ - nextclade3 dataset get \ - --name={params.dataset_name:q} \ - --output-zip={output.dataset} \ - --verbose - """ - - -rule run_nextclade: +rule run_nextclade_to_identify_segment: input: - dataset=f"data/nextclade_data/{DATASET_NAME}.zip", - sequences="results/sequences.fasta", + metadata = "data/subset_metadata.tsv", + sequences = "results/all/sequences.fasta", + segment_reference = config["nextclade"]["segment_reference"], output: - nextclade="results/nextclade.tsv", - alignment="results/alignment.fasta", - translations="results/translations.zip", + sequences = "results/{segment}/sequences.fasta", params: - # The lambda is used to deactivate automatic wildcard expansion. 
- # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 - translations=lambda w: "results/translations/{cds}.fasta", + min_seed_cover = config["nextclade"]["min_seed_cover"], shell: """ - nextclade3 run \ - {input.sequences} \ - --input-dataset {input.dataset} \ - --output-tsv {output.nextclade} \ - --output-fasta {output.alignment} \ - --output-translations {params.translations} - - zip -rj {output.translations} results/translations + nextclade run \ + --input-ref {input.segment_reference} \ + --output-fasta {output.sequences} \ + --min-seed-cover {params.min_seed_cover} \ + --silent \ + {input.sequences} """ - -rule join_metadata_and_nextclade: +rule subset_metadata_by_segment: input: - nextclade="results/nextclade.tsv", - metadata="data/subset_metadata.tsv", - nextclade_field_map=config["nextclade"]["field_map"], + metadata = "results/all/metadata.tsv", + sequences = "results/{segment}/sequences.fasta", output: - metadata="results/metadata.tsv", + metadata = "results/{segment}/metadata.tsv", params: - metadata_id_field=config["curate"]["output_id_field"], - nextclade_id_field=config["nextclade"]["id_field"], + strain_id_field = config["curate"]["output_id_field"], shell: """ - export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'` - - csvtk -tl cut -f $SUBSET_FIELDS \ - {input.nextclade} \ - | csvtk -tl rename2 \ - -F \ - -f '*' \ - -p '(.+)' \ - -r '{{kv}}' \ - -k {input.nextclade_field_map} \ - | tsv-join -H \ - --filter-file - \ - --key-fields {params.nextclade_id_field} \ - --data-fields {params.metadata_id_field} \ - --append-fields '*' \ - --write-all ? 
\ - {input.metadata} \ - | tsv-select -H --exclude {params.nextclade_id_field} \ - > {output.metadata} - """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id_field} \ + --output-metadata {output.metadata} + """ \ No newline at end of file diff --git a/shared/oropouche_L.fasta b/shared/oropouche_L.fasta new file mode 100644 index 0000000..cfa7b5a --- /dev/null +++ b/shared/oropouche_L.fasta @@ -0,0 +1,100 @@ +>NC_005776.1 Oropouche virus segment L, complete genome +AGTAGTGTGCTCCTATTCCGAAACAAACAAAAACAATCTCAAAATGTCACAACTGTTGCTCAACCAATAT +CGGAATAGGATATTGCACTGCCGTGAACCTGAGATAGCAAAGGATATATGGCGAGATCTATTAAATGATC +GACACAATTACTTTTCTCGGGAATTTTGCAGAGCTGCAAATCTTGAGTACAGAAATGATGTTCCTGCTGA +GGATATTTGTGCTGAAGTTCTTGATGGTTATAAAGCAAGGAAAGTTCGCTTTTGTACACCTGATAATTAC +TTACTACATGATGGAAAGATGTATATAATAGACTTCAAAGTGTCTGTAGACGACCGATCTTCTAGAATCA +CAAGGGAGAAATATAATGAGATTTTTGGAGAGGTATTCAATCCAGAAGGTGTAGATTTTGAAATTGTTAT +TATTAGATTAGATCCTTCAAATATGACGATACATGTGGACTCTCGAGATTTCGTGAATACAATTGGGCCG +ATTACATTAAACATTAGTATGCAATGGTTTTTTGATATGAAAGACTTCTTGTTCGGGAAATTTCGGGATG +ATGATAAATTCCATGCTATAATAAGTCAAGGAGAATTCACAATGACATTGCCATGGATTGAAGAAGACAC +CCCAGAATTGCTTACTCATCCTATATACAATGAATTCATGAGTTCAATGCCAGAGGCAGAACAGGCCCTA +TTCAAGGAAGCATTGGAATTCAAATCATTTGGGGCAGAAAAATGGAATATCTTTTTGAAGGGGGTGATGT +CAAAGTATGGTGAATATTATAAAGAATTTACTAAAGGACATGCTCATTCTATATTTCTGACAACAGGGGA +CTACCCCAAGCCAGACAAAGACCAAATTTCAGCAGGTTGGAGAGAAATGGTAAACAGAGTAAGCTCTGAA +CGTGACATGTCAAATGACATAAATCAGGAAAAACCAAGCATGCATTTTATATGGGCAAAGAATGATTCAA +ATAGCAACAATAATATACAAAAGCTAATCAAACTATCTAAATCACTGCAAGCTATGAGCGGGACAGGGAG +CTATGTAAATGCTTTCAAGTCATTAGGGAGATTAATGGATATATCATCAGATGTTAAAAAATATGAATCA +TTTTGTGGGAAATTGAAATCTCTGGCAAGGTCTAGTATAAAAAAACTTGACAGGAAAATAGAGCCAATAC +AAATTGGGACTGCAACTGTCTTATGGGAACAGCAATTTAAACTAGATACAGATGTTATAAAAAGAGAAGA +CAGAATACATTTAATGAAAGATTATCTTGGGATCGGTAAGCACAAATCATTTTCAAAGAAATTAAACAAC +GACATAAATACTGATAAGCCTAAAATATTAAATTTCAACAATGATGATATAGTCAGGAAATGCAAAGATA 
+AATATAATCAAGTCATACATAACCTATCCCAAATCAATGAATTAGATAAGATTGGAAACTACCTAGAGCA +CTTTTCAGCTAAAATTAGTGCCTGCAGTGTAGAAATGTGGGATTTTATATATAATACAACCAAAACTAAA +TACTGGCAATGCATCAATGACTATTCCACCCTAATGAAAAACATGTTAGCTGTCTCTCAATATAATAGAC +ACAATACGTTTAGAATTGTCTCATGTGCAAACAATAATGTATTTGGTCTAGTAATGCCAAGCTCAGATAT +AAAGACAAAAAAAGCAACTTTAGTCTATGCAATAATGGCTCTCCATAATGAGGAGGCAGAAATAGCAGAA +CTTGGCTCACTCTACTCAACTTTTAAGACAGCAACAGGATATATTTCAATATCAAAGGCTTTTAGGCTGG +ATAAAGAAAGATGCCAACGCATAGTATCCTCTCCAGGCTTGTTCCTCATGACAAGCTGCCTATTATTCAA +CGGTAACAAGAGTTTAGAATTTGATAAATTACTAGGATTTTCATTTTTTACGTCAATATCAATTACGAAA +GCTATGCTCTCCCTTACTGAGCCTTCACGTTATATGATCATGAACTCGTTAGCAGTTTCCAGCCATGTAA +GAGAGTATATATCTGAAAAATTCTCCCCTTATACAAAAACATCATTTTCTGTGGTAATGACAGACTTAAT +CAAGAAGGGTTGCTATTCAGCATATGAACAGAGAAAAAAAGTACAAATAAGAGACATAAAATTAACAGAT +TATGATATAACACAAAAGGGAGTGGATTCCAAAAGAGATCTTAAATCTATTTGGTTCCCAGGAAAGGTAA +ACCTGAAAGAATATTTAAACCAAATTTATCTACCATTTTATTTTAACTCTAAAGGATTACATGAAAAACA +TCATGTCTTGATAGATTTGGCTAAAACAGTACTAGAAATCGAAAAAGAGCAAAGGGAGTCATTACCTGAG +CCATGGTCAGAGATACCTGCTAAGCGACTGTCACTTAATGTTTTAATTTACTCATTGCAGGAACTGAATT +TAGATACTTCAAGACATAATTTTGTAAGAAGCCGGGTGGAAAACGCAAATAATTTCAACAGATCTATAAC +GACAATATCTACTTTTACCAGCTCAAAATCATGCATTAAGATTGGTGATTTTGAAGAAGAAAAAAGAGAA +AAACTAAGAATGATACAAAAGAAACTTGCAAAGGATATTTCTAAATTAACCATAGCCAACCCAGCATTCT +TAGATGAGATCACAAACGAACATGAGATAAGGCATTCAACTTATGAGGACTTAAAACAATCTATCCCAGA +TTACACAGATTATATGTCTGTGAAAGTTTTTGACAGATTGTACGAGAAGATTACTACCAATGAAATAAAT +GATAAGGAAACAGTCAAGCTGATTCTAGAGACCATGAAAAAACATAAAATATTTCATTTTGGATTCTTCA +ATAAAGGACAAAAAACAGCCAAAGATAGAGAAATATTTTTAGGTGAATTTGAAGCAAAAATGTGTCTGTA +CCTTGTCGAAAGAATAGCTAAAGAGAGGTGCAAATTAAACCCTGAAGAAATGATAAGTGAACCAGGCGAC +TCGAAACTAAGGGTATTAGAGAAGCAATCAGAAGACGAAATCAGGTATATTAGCAATACAATAAAGACAT +TAGGGAATGCCATAGAGAACTTGCAATCTGGATCTTTAAATTGGGCAGATATATGCGAAAACAAAGCAAG +AGGACTTAAGATAGAAATAAATGCTGATATGTCCAAATGGAGTGCCCAAGATGTACTTTTTAAATATTTT +TGGTTGATAGTGCTTGATCCCATCTTATATCCTGCTGAGAGGAAAAGGATAATTTATTTCCTCTGTAATT 
+ATATGCAGAAAAGGCTTATAATGCCCGATGAATTGCTCACTACTATATTGGATCAAAGAGTTCCTTATTC +AAATGACATAATTGGATTAATGACAAACAATTATAGGTCTAATACAGTAGAAATAAAGCGTAACTGGCTT +CAAGGCAACTTAAATTATACAAGCAGTTACTTACACAGCTGTAGTATGTCTGTGTACAAAGATATAATAA +GAGAAGCAGCAATATTATTAGAAGGAGAAGCCCTTGTGAACTCAATGGTACATTCTGATGATAATCAAAC +ATCTATATGTATGGTGCAGAATAAATTACCAGATGACAATATAATTGAATTTTGCATTAAGATATTCGAG +AAGATATGCTTAACTTTTGGCAATCAGGCAAATATGAAGAAGACATATCTAACTAACTTCATCAAAGAGT +TTGTTTCTTTATTTAATATACATGGAGAACCATTTTCTATATATGGGAGATTTCTACTCACAGCAGTAGG +AGACTGTGCCTATCTAGGGCCTTATGAAGATTTAGCAAGTAGGCTATCTGCAACACAAACTGCTATAAAG +CATGGTTGCCCACCATCACTTGCATGGGTATCTATCGCTCTAAATCACTGGATAACCCACACTACATATA +ATATGTTGCCTGGCCAAAATAATGACCCGTTACCATTCTTCCCTACTAACAATAGAAGTGAAATACCAGT +AGAGATGTGCGGAATACTAGAAAGTGATTTATCAACAATTGCACTAACTGGTTTAGAAGCAGGGAATGTC +ACGTTTCTAACAAATATAGCAAGGAAGTTATCATCCCCAATCTTACAAAGAGAAAGTATTCAAGATCAAT +ACAATTCTATAGAAAAGTGGGATCTGAGCAAATTATCACAGATCGACATTCTAAGGCTTAAAATGCTCAG +GTATATATCTCTTGATAGTTCAGTCACATCTGATGATGGTATGGGGGAGACTAGTGAAATGAGATCTCGA +TCACTTTTAACACCTCGTAAATTCACAACAAGTGGGTCACTTAATAGGTTGAAATCATATAAAGACTTTC +AAGATATAATAGCAGATGAGGACAAGACAAACGAACTATTTGAGAATTTCATTAGACACCCAGAGTTACT +GGTTACAAAAGGCGAAACATTTGAAGAATTTGTTAATACGATATTATTTAGGTACAATTCAAAGAAATTC +AAAGAATCTTTGTCAATACAAAACCCAGCACAGCTTTTTATTGAGCAAATATTATTTTCCAATAAACCAG +TAATTGACTACACTAGCATACATGACAAGATTTTTGGATTACAAGACATGCCAGGAATTGAAGAACTAGA +TACAATTATAGGTCGCAAAACATTTGTTGAGAGTTATGTTCAAATCGTAGATGACTTAAGCAATTTAACA +TTGGATATAAACGATGTCAAGACTATATTTGCCTTTTGTCTTATGAATGACCCACTACTGATCACATCTG +CTAACAATATAATAATGTCTGTTAAGGGACATAGTCAAGAAAGAATAGGTCAATCAGCATGCAAAATGCC +AGAGGTCCGAAGTCTAAAACTCATACATTATTCACCAGCAGTTGTTTTGAGAGCCTATGTGAGAGGGCCA +ACAAATGTACCGAATGTAGATATAGATGAACTTGCAAGGGATCTATCTCATTTAGAAGACTTCATACAAA +GTACAAAACTCAGAGAAAATATGAGAGAGAGAATAGAAATAAATGAGAAGCGGCACTTAGGAAGGGATTT +CAAATTTGAAATCAAAGAACTAACTAGATTTTACCAAGTGTGTTATGATTACATAAAGTCTACAGAACAT +AAAGTCAAGGTATTCATATTGCCATACAAAGTTTTCACATCAATAGAATTCTGCGGGGCACTGACAGGTA 
+ACTTGATAAATGACAAATTATGGTACATAACGCATTATCTGAAAAATATAGTGTCTACTACACATAAGGC +ACAAATTTCTTCTTCACCTGAATTGGAATTGCAAATTGCTGATGAGGCACTAAGACTAGTAGCACATTTT +GCTGATACTTTCTTGGCATCAGAATCAAGAATACAATTTCTGAAGAAAATTATTGAAGAATTCACATACA +AAGGGATACCTGTAAAACATTTATACTCAAAAATAAAGAACTCCAAGTTGAGGGTTAAATTTCTAGGGAT +TCTTTTATGGTTAGATGATCTAACACAGAATGATCTGGATAAATTTGATGCAGATAAATCAGATGAAAAG +ATTATATGGAATAACTGGCAAGTGTCAAGAGATATGAATACTGGACCAATAGACTTAATGATAAGCGGTT +ACTCTAGACAGCTGCGGATCACTGGGGAAGATGACAAATTGATTGCTGCTGAATTGCAGGTTACTAGATT +GTCAGAAGATTTAATTTATAGACACGGTCAGGCAATGTTGAATAAGCCACACGGCTTAAAGCTTGAAAAA +ATGCAACCTGTGACTGAGATGTCTAAACGATTACATTATATCGTTTTCCAGCAAAGATCACGGAAACGAT +ACTTCTATTCTATATTACCCACCCAAGTAATTGAGGACCATAATTCTAGAGTTGAATCATCTAGGCTAAG +CAGAGATTCAAAATGGGTTCCTGTATGCCCTGTTGCAATATCAAAACTCTACCAACAAGGACGGCCTATA +CTTTCCAAAGTTAGAAATCTGAATATGCAGACTCATTCGCTTTCCAGAATACAAGTTAATGTAGATGAAT +ATGCCATCACGAGAAGAGCACATTTTCAGAAAATGCCTTTCTTCGAAGGACCATCAATCCCTTCTGGTGG +TATGGATTTGTCTGAGTTGATGAAATCTACATCCCTATTAAGCTTGAATTATGATAACATAAAAAATGCA +TCCTTATTGGACATGTCTAGGGTATTTAAGTGCAATGGCAGTGGAGATGACCAAATGGCTTTCGAATTTC +TATCGGACGAAATTTTGGAGCAAGATGTAGTTGAAGAAATAGAATGCAACCCTATATTTTCTATTAGTTA +TACAAAAAGAGGAGAATCCAATATGACTTATAAAAATGCTTTCCACAAAGCCTTAATCTCAGAATGTGAC +AAATTTGAAGAAGCATTTGACTTCCTCGACATGGGATTTTGCTCGAATGAAAATCTTAGTATTCTGGAGG +AAATACATTGGATAATCAGTTATTTAAAAACAAATCAATGGTCTACGGAACTAGACAATTGTATTCACAT +GTGCATGTACAGGAATGGATATGATGCAGAATATCATAAATTTGATATACCCTCTAAATTCCTCAAAGAC +CCAATAAACCGAACAATAAATTGGACTGAAGTCATTGAATTTATATTATTAATTGAAGATTTCCAAACAA +AAATTGAGCCATGGTCTAGTATGAAGTCACACTTCTGTTCAAAAGCACACAGTGTAGCACTAGAGTGTAT +GAAAAATGAGAAAAGATCATTGGCAGAATTTGTAGACAAAAGTAAGAAAACTGGCAAATCCAAATTTGAC +TTCTAAGGTATACACATGTAAAAGTAGTGTTTGTTTCTAAATAGGAGCACACTACT + diff --git a/shared/oropouche_M.fasta b/shared/oropouche_M.fasta new file mode 100644 index 0000000..098af32 --- /dev/null +++ b/shared/oropouche_M.fasta @@ -0,0 +1,65 @@ +>NC_005775.1 Oropouche virus segment M, complete sequence 
+AGTAGTGTGCTACCGGCAACAAACAGTGACAATGGCGAATTTAATAATTATTTCAATGGTTCTGGGCGTT +GCCTATGGGCATCCGCTTTCAACAAGTCAAATTGGTGACCGCTGTTTTGCTGGTGGTAACCTCTTCAAGG +AGATGAACTTGAGTGTAGGACTTGGCGAAATATGTGTTAAAGATGACATATCTATTGTTAAGAGTACTAC +TGTTTTTAGCAAGAATAAACCAGCTCTTGAGGCAACTACAAAATTTTATAGATCTTTTATTGTGAAAGAT +TGGTCTGAGTGCAATCCAGTGCTAGACAAATTTGGAAATTTTATGGTTTTAAGTGTTGATGATAATGGCC +ATATAATACCTAAAATGTATACATGTAGAGCAGCATGTGATATTAGATTAAACAAAGATAATGCCGAAAT +AATATTATCGTCAACAAAACTCAACCACTTTGAAATTGTCGGGACAACTTCGACATCAGGCTGGTTCAAG +AACACAATAACAAACAACTTAGAACATACATGCGAGCATGTAACCGTAAATTGTGGTCAGAAATCAGTAA +AATTCCATGCATGCTTTAGACAACATAGGGGTTGTATAAGATTCTTTAAAGGAACATATATGCCATACTC +AATGATTGAAGCAATGTGCGTTAACATAGAGCTAATAATTCTGACATTATATATCTTTGCTGCAATTATA +TTTGCATTAATTATAACAAAGAGTTATGTGGCATACTTGCTCCTGCCGTTATTTTATCCAGTGACGTGGT +TTTACGGAAAGGTTTATAAAAAGATTAATTCTTGCCCAAATTGTCTACTTGCATCTCACCCTTTCACCAG +CTGCCCTAAATTCTGTATATGCGGTTCTAGGTTTAGTTGCACAGAAGCTCTTAAGGTACATAGAATGGGA +AAGGACTGCTTAGGTTATAAGTCTCTAAGCAAAGCTAGACAGATGTGTAAATCGAAAAGCTGGTCCTTCA +CAGCAGCTATATTAACTGGATTAATATTGATGGAATTCATCTCACCAATTGCAGGAGAGAGAATGTATAA +GCTTGAGGAGCTGGCTGATGAGTTTATAAAGCTAACAGAGCAGGTAAATATTTTAGAAAGAGAAATGGAA +GTCTTGAAGCAGTCAATAATTGTTATGTTTGCAATCATTTTGGTATTGTTGCTTTTTGAAAATATGATAT +TCAACCGGCTTTTCCGCATTTTTTATCGTTCTTGCTCCATGTGCGGCTTGATACATTATAGACCAGGACT +CAAGATCGATTTAACCAAAACTAACAGATGTGGTAGTTGCATCTGTGGATTCGACGAGCAGCAATCATCT +GGTTTTGAATATGAAATATTTCTAAAGGATATGCATGTCCAGAAAGAATCATGCAAATTTAGTCCAAGAG +TAAACCACTTTAGGAATATTAAAGCTTTATTATTTGCACTTGTAATCTGTGCTACTATATATACAGTTTA +TGCAGATGAAGATTGCTTATCTAAAGATATTAAAATAACATATCAAGAGTTGCACAATTGCATAGGTCCA +AAGATAATGGGCGATACTTGTGTATCAAAGAGTGAATTGTATTCTGATTTATTCTCAAAAAATCTAGTCA +CAGAATATGATAAAAAATATTTTGAACCCGATACTGTAAATGATCAATTCAACAAGATAGAGTTTGCCCA +AGATGCACATAGAATGATACTATTAGAAAGAATACTTTACAAGACAGAATGTGAAATGCTAAGCTTAAAA +AAGAACAGTGGACCTTATAATGTTGCCTGGCGCACATATTTAAAAAATCATAATATAGACTTATGCAGTA +GGCACAACTATAAAATGATATGTCAGTGTATTAATACACACTCTATGTGTAATAATACAGATATAGATTA 
+TAACAAAGAGATCGAAACATATTATAAAAGCAATGCTGCAGCCTATAGAGCTGACCTGAACACCATAATG +GACACATTAAAAACAGCATTCAGAGGGCTTACAAAGGTCCTAATTGAAAATTATATAGAAAAAGATGACT +CTGATGCTCTTAAGGCCTTATTTAGTAATATTACTGATTCTGTTCAGGACAATTATCAGATGATTGGAAT +ATTGAAATTTGCCAGTAAGCTATTAGATATTAACCTTGGTAGATCCACCCGCTCAGCTCATCATTCTATA +ATGACAAACGAAATTCCCAAATCTAATCCTTTTACTGATTACAGTTATTCTAATCTGAATATAAAGGAGT +GCATGTCACCAGAGAGCTTGAAATGTTTTAAGAAGAGAGGCAGTACACCGCATACAAACCACCTGCTTTG +CAAGATAGACAACAAATATAAAGCATTTGACTGGCCAGAGATAGAAACAATTCAAAAAGGTCAAAAATTG +TGCCTTGGTGACAGTCATTGCAACTTGGAATTTACAGCCATAACTGCAGACAAGATTATGTCATTAACTA +ACTGTTATAAAGAATCTTTCACTGCGCAACCAGCAGATATGCAGGCTGGAATTAAAAAATGCTCAGCTGA +TGAAATTGGAGAATGTACAACACTTGAAGACAAGACATGGCCAATTATATTCTGTGGTGGCAAATATTAC +TACTCTGATAGCAAGGAGCATGCCAAGGACGGTTCTATCAACAATTATTGTTTAACCAACAAATGCTCTG +AACAACGTTTCCCAATACATGAAAATTGGTTCAAAAAATGTAATTGGGACAAGACCCACAAAGAATTTAG +CACTATGAGACAAATAAACTATAATGATATAACTTCTTATAGAAAGGCAATAGAATCTGAGATAGGAACA +GATTTGATGACACACCATTATAAGCCCACAAAGAACCTTCCACATGTTGTACCAAGGTATCATAGTATAG +ATGTACAAGGTACAGAATCAACAGAGGGTATTATCAATGGTTTCATACAAAATACAATACCTGCTATTAG +TGGTCTAGGTGTCGGTTATCATCTGAATTTTCAAAGCAATCAGTTATTTGACATAGTTATTTTTGTGAAA +AAAGCTGTATATAAAGCACAGTACCAGAAAGCATACACCACAGGTCCTAGCATATCCATAAATATTGAAC +ACAACGAAAGATGCACAGGCCACTGCCCCGAAAAGATTCCTGCAAAAGAAGGTTGGTTAACATTTTCTAA +AGAGCATACTAGTTCTTGGGGATGTGAAGAATATGGCTGTCTTGCTATAGATACAGGCTGTCTATATGGC +TCTTGTCAAGATGTCATACGCCCAGAATTAGATGTTTATAAGAAGATTGGCAGTGAGGTATCACTGATAG +AAATATGTATAACTTTGCCTCATGAAACTTATTGCAATGATATGGACATATTAGAACCGATTATAGGAGA +TAAATTAAGTGCATCCTTTCAGAATACACAGACTAATCAACTTCCGACCCTTATAGCTTATAAAAAAGGG +AAGATATACACAGGTCAAATCAACGATATTGGAAACACAGCATTGCAGTGCGGCTCAATCCAAGTAATCA +ATGGATCAACTATCGGAACTGGCAGCCCCAAATTTGATTATATATGCCATGCTATGAGAAGGAAAGATGT +TATTGTAAGGAAGTGTTTCAATGACAATTATCAGTCGTGCACTAGGCTAGAAAAACGGAACGACTTGATC +CCTTATAGAAAAGGCGATGTCATCGAAGTATCAAAAACAGGATCGAACATGGGCCAAATGACATTTAAAA +TAGAACTAGGTGATATCAACTATAAAATATTTACTAAATCAATAGATCTTCAAATGTCTGGCATTTGTGC 
+TGGGTGCATTGACTGTGCTGAAGGCATTTCATGCTCTATAAATGCTGAAGTGCCAGCCGAAACAGTCTGC +CATTGCAAAACAAACTGTGAAGATTTTATAAACAATATAGTGTTTTCACCGCAAATCAAGAATTACAATA +TAAAAGTCCACTGTAAATCTAAAGTTGAAAAAATAACAGCACATATCTGTGGGAGAGACATTGATTTACA +GTTAACAATTAAGCCATACAATCAGAAGATAGACTTGTCGCAGTTAGATGAATCAAATTACATAAGGGAA +GAGGACCTCCAGTGTGGAACATGGTTATGTAAAGTGCAAAAAGAGGGGATTGATATTATATTTAAAGGAT +TATTTTCAGGCTTAGGGAGATATTGGACCATATTAATTTATTCAATTATAGGTGTGGTAATAATTGTGAT +ATTGGTTTATATCCTGTTGCCCATAGGTCGGCTTTTAAAAGCATTCCTAATCAGACATGAAATAGAATAT +GCCATGGAGCAGAAAATCAAGTAGATTTGGCTAAAAAGGGTAGGCAGGTCTAAAATCAGGTATAAATAAA +ATTCATATAAATAAAGTCAAAAATTGTTGTCGGTAGCACACTACT + diff --git a/shared/oropouche_S.fasta b/shared/oropouche_S.fasta new file mode 100644 index 0000000..4dd2ab0 --- /dev/null +++ b/shared/oropouche_S.fasta @@ -0,0 +1,13 @@ +>NC_005777.1 Oropouche virus segment S, complete genome +AGTAGTGTGCTCCACAATTCAAAACATAAAAAGAAATTCCAATAATGTCAGAGTTCATTTTCAACGATGT +ACCACAACGGACTACATCTACATTTGATCCGGAGGCAGCATATGTGGCATTTGAAGCTAGATACGGACAA +GTGCTCAATGCTGGTGTTGTTAGAGTCTTCTTCCTCAACCAAAAGAAGGCCAAAGATGTCTTACGTAAGA +CATCGAGGCCCATGGTTGACCTTACTTTTGGTGGGGTCCAATTTGCAATGGTTAATAACCATTTCCCACA +GTTCCAGTCGAATCCAGTGCCGGACAACGGTCTTACCCTGCACCGTCTGTCAGGATACCTAGCGCGCTGG +GCCTTTACCCAGATGCGATCACCAATTAAGCAAGCTGAGTTCAGAGCCACTGTAGTAGTGCCTTTGGCTG +AGGTAAAGGGCTGTACTTGGAATGATGGTGACGCAATGTACCTGGGGTTTGCTGCTGGTGCTGAGATGTT +CCTGCAAACATTCACTTTCTTCCCTTTGGTGATTGAGATGCATAGGGTTCTCAAGGATGGCATGGATGTC +AACTTTATGAAGAAAGTCCTCCGGCAGAGGTATGGTCAAAAGACTGCCGAGCAATGGATGCGTGAAGAAA +TAGTTGCTGTAAGAGCAGCTTTTGAAGCTGTAGGCACTCTGGCCTGGGCCAGAACTGGATTCTCCCCAGC +AGCAAGAGACTTCTTGCGCCAATTCGGAATTGACATATAGTGGAGCACACTACT + From 19de24c393938ad5acdeced9f8466a085b43b8dc Mon Sep 17 00:00:00 2001 From: Miguel Paredes Date: Mon, 12 Aug 2024 14:55:01 -0700 Subject: [PATCH 3/3] cleaning up the ingest rules based on nextstrain team feedback --- ingest/Snakefile | 10 +--------- ingest/rules/curate.smk | 2 +- ingest/rules/nextclade.smk | 1 - 3 files changed, 2 insertions(+), 11 deletions(-) 
diff --git a/ingest/Snakefile b/ingest/Snakefile index 8474409..2550fae 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -48,15 +48,7 @@ include: "rules/nextclade.smk" # to the final metadata TSV. # To run nextclade.smk rules, include the `defaults/nextclade_config.yaml` # config file with `nextstrain build ingest --configfile defaults/nextclade_config.yaml`. -rule create_final_metadata: - input: - metadata="data/subset_metadata.tsv" - output: - metadata="results/all/metadata.tsv" - shell: - """ - cp {input.metadata} {output.metadata} - """ + # Allow users to import custom rules provided via the config. # This allows users to run custom rules that can extend or override the workflow. diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 44855e3..c3b020d 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -121,7 +121,7 @@ rule subset_metadata: input: metadata="data/all_metadata.tsv", output: - metadata="data/subset_metadata.tsv", + metadata="results/all/metadata.tsv", params: metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index a6a3e17..316fdce 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -20,7 +20,6 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html rule run_nextclade_to_identify_segment: input: - metadata = "data/subset_metadata.tsv", sequences = "results/all/sequences.fasta", segment_reference = config["nextclade"]["segment_reference"], output: