Skip to content

Commit

Permalink
Split by serotype using NCBI virus_tax_id #20
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Feb 14, 2024
2 parents 813fd31 + 6c206bc commit 9f54ad5
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 5 deletions.
5 changes: 4 additions & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ if not config:

send_slack_notifications = config.get("send_slack_notifications", False)

serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']

def _get_all_targets(wildcards):
# Default targets are the metadata TSV and sequences FASTA files
all_targets = ["results/sequences.fasta", "results/metadata.tsv"]
all_targets = expand(["results/sequences_{serotype}.fasta", "results/metadata_{serotype}.tsv"], serotype=serotypes)


# Add additional targets based on upload config
upload_config = config.get("upload", {})
Expand Down Expand Up @@ -57,6 +59,7 @@ rule all:

include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/split_serotypes.smk"


if config.get("upload", False):
Expand Down
52 changes: 52 additions & 0 deletions ingest/bin/infer-dengue-serotype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#! /usr/bin/env python3

import argparse
import json
from sys import stdin, stdout

def parse_args():
parser = argparse.ArgumentParser(
description="Dengue specific processing of metadata, infer serotype from virus_tax_id",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--virus-tax-id",
type=str,
default="virus_tax_id",
help="Column name containing the NCBI taxon id of the virus serotype.",
)
parser.add_argument(
"--out-col",
type=str,
default="ncbi_serotype",
help="Column name to store the inferred serotype.",
)
return parser.parse_args()


def _get_dengue_serotype(record, col="virus_tax_id"):
"""Set dengue serotype from virus_tax_id"""
dengue_types = {
"11053": "denv1",
"11060": "denv2",
"11069": "denv3",
"11070": "denv4",
"31634": "denv2", # Dengue virus 2 Thailand/16681/84
}

taxon_id = record[col]

return dengue_types.get(taxon_id, "")


def main():
args = parse_args()

for record in stdin:
record = json.loads(record)
record[args.out_col] = _get_dengue_serotype(record, col=args.virus_tax_id)
stdout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ ncbi_datasets_fields:
- isolate-collection-date
- release-date
- update-date
- virus-tax-id
- virus-name
- length
- host-name
- isolate-lineage-source
Expand All @@ -37,6 +39,8 @@ transform:
'isolate-collection-date=date',
'release-date=release_date',
'update-date=update_date',
'virus-tax-id=virus_tax_id',
'virus-name=virus_name',
'sra-accs=sra_accessions',
'submitter-names=authors',
'submitter-affiliation=institution',
Expand Down Expand Up @@ -96,6 +100,7 @@ transform:
'host',
'release_date',
'update_date',
'ncbi_serotype', # inferred from virus_tax_id
'sra_accessions',
'abbr_authors',
'authors',
Expand Down
37 changes: 37 additions & 0 deletions ingest/workflow/snakemake_rules/split_serotypes.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
This part of the workflow handles splitting the data by serotype either based on the
NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate.
metadata = "results/metadata_all.tsv"
sequences = "results/sequences_all.fasta"
This will produce output files as
metadata_{serotype} = "results/metadata_{serotype}.tsv"
sequences_{serotype} = "results/sequences_{serotype}.fasta"
Parameters are expected to be defined in `config.transform`.
"""

rule split_by_ncbi_serotype:
"""
Split the data by serotype based on the NCBI metadata.
"""
input:
metadata = "results/metadata_all.tsv",
sequences = "results/sequences_all.fasta"
output:
metadata = "results/metadata_{serotype}.tsv",
sequences = "results/sequences_{serotype}.fasta"
params:
id_field = config["transform"]["id_field"]
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.id_field} \
--query "ncbi_serotype=='{wildcards.serotype}'" \
--output-sequences {output.sequences} \
--output-metadata {output.metadata}
"""
9 changes: 5 additions & 4 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ formats and expects input file
This will produce output files as
metadata = "results/metadata.tsv"
sequences = "results/sequences.fasta"
metadata = "results/metadata_all.tsv"
sequences = "results/sequences_all.fasta"
Parameters are expected to be defined in `config.transform`.
"""
Expand Down Expand Up @@ -42,8 +42,8 @@ rule transform:
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["transform"]["annotations"],
output:
metadata="results/metadata.tsv",
sequences="results/sequences.fasta",
metadata="results/metadata_all.tsv",
sequences="results/sequences_all.fasta",
log:
"logs/transform.txt",
params:
Expand Down Expand Up @@ -85,6 +85,7 @@ rule transform:
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/infer-dengue-serotype.py \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down

0 comments on commit 9f54ad5

Please sign in to comment.