Update usage of "accession" as the ID column #29

Merged (4 commits) on Sep 14, 2023

Snakefile: 2 changes (1 addition, 1 deletion)

Member Author commented:
d123d60's PR-triggered CI run results (pr29/d123d60/docker/rsv/a/F, etc.) should be comparable to those from latest master (master/4ecf498/docker/rsv/a/F, etc.).

@@ -25,7 +25,7 @@ include: "workflow/snakemake_rules/glycosylation.smk"
 include: "workflow/snakemake_rules/clades.smk"


-if config.get("deploy_url"):
+if "deploy_url" in config:
     include: "workflow/snakemake_rules/nextstrain_automation.smk"

 rule clean:
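
Note that the two conditions in this hunk are not strictly equivalent: "deploy_url" in config is true whenever the key exists, even with an empty value, whereas config.get("deploy_url") also requires the value to be truthy. A minimal illustration, using hypothetical config dicts that are not taken from the workflow itself:

# Hypothetical config dicts, only to show the difference between the two checks.
for config in ({}, {"deploy_url": ""}, {"deploy_url": "https://example.org/deploy"}):
    truthy_value = bool(config.get("deploy_url"))  # old check: needs a non-empty value
    key_present = "deploy_url" in config           # new check: key presence is enough
    print(config, truthy_value, key_present)
# The middle case ({"deploy_url": ""}) is where they differ:
# config.get(...) is falsy while "deploy_url" in config is True.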

config/configfile.yaml: 2 changes (1 addition, 1 deletion)

@@ -9,7 +9,7 @@ exclude: "config/outliers.txt"
 description: "config/description.md"

 strain_id_field: "accession"
-display_strain_field: "strain_original"
+display_strain_field: "strain"

 subtypes: ['a', 'b']

ingest/Snakefile: 2 changes (1 addition, 1 deletion)

@@ -25,7 +25,7 @@ def _get_all_targets(wildcards):
            )
        elif len(remote_file_names) != len(set(remote_file_names)):
            print(f"Skipping file upload for {target!r} because there are duplicate remote file names.")
-       elif not config.get("s3_dst"):
+       elif "s3_dst" not in config:
            print(f"Skipping file upload for {target!r} because the destintion was not defined.")
        else:
            all_targets.extend(

scripts/set_final_strain_name.py: 6 changes (4 additions, 2 deletions)

@@ -1,5 +1,6 @@
 import pandas as pd
 import json, argparse
+from augur.io import read_metadata

 def replace_name_recursive(node, lookup):
     if node["name"] in lookup:
@@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

 parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
 parser.add_argument('--metadata', type=str, required=True, help="input data")
+parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
 parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
 parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
 args = parser.parse_args()

-metadata = pd.read_csv(args.metadata, sep='\t')
+metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
 name_lookup = {}
 for ri, row in metadata.iterrows():
-    strain_id = row['strain']
+    strain_id = row.name
     name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

 with open(args.input_auspice_json, 'r') as fh:
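
For context on this script change: augur.io.read_metadata indexes the returned DataFrame by the first column from id_columns that is present in the file, which is why the lookup can switch from row['strain'] to row.name. A minimal sketch of the same pattern, assuming augur is installed; the metadata rows below are invented for illustration:

import pandas as pd
from augur.io import read_metadata

# Invented example metadata; real files come from the ingest workflow.
with open("metadata_example.tsv", "w") as fh:
    fh.write("accession\tstrain\tdate\n")
    fh.write("EX000001\thRSV/A/example/0001/2022\t2022-10-01\n")
    fh.write("EX000002\thRSV/B/example/0002/2022\t2022-11-15\n")

# id_columns is a priority list; "accession" exists in the file, so it becomes the index.
metadata = read_metadata("metadata_example.tsv", id_columns=["accession", "strain"])

name_lookup = {}
for _, row in metadata.iterrows():
    # row.name is the index value, i.e. the accession for this row.
    name_lookup[row.name] = "strain" if pd.isna(row["strain"]) else row["strain"]

print(name_lookup)
# {'EX000001': 'hRSV/A/example/0001/2022', 'EX000002': 'hRSV/B/example/0002/2022'}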

scripts/wrangle_metadata.py: 25 changes (0 additions, 25 deletions)

This file was deleted.

workflow/envs/nextstrain.yaml: 2 changes (1 addition, 1 deletion)

@@ -19,6 +19,6 @@ dependencies:
   - pip
   - pip:
     - awscli==1.18.45
-    - nextstrain-augur==11.3.0
+    - nextstrain-augur==22.2.0
     - nextstrain-cli==1.16.5
     - rethinkdb==2.3.0.post6

workflow/snakemake_rules/core.smk: 28 changes (10 additions, 18 deletions)

@@ -4,20 +4,6 @@ This part of the workflow expects input files
 metadata = "data/metadata.tsv"
 '''

-rule wrangle_metadata:
-    input:
-        metadata="data/{a_or_b}/metadata.tsv",
-    output:
-        metadata="data/{a_or_b}/metadata_by_accession.tsv",
-    params:
-        strain_id=lambda w: config.get("strain_id_field", "strain"),
-    shell:
-        """
-        python3 scripts/wrangle_metadata.py --metadata {input.metadata} \
-            --strain-id {params.strain_id} \
-            --output {output.metadata}
-        """
-
 rule index_sequences:
     message:
         """
@@ -64,21 +50,23 @@ rule filter:
     input:
         sequences = "data/{a_or_b}/sequences.fasta",
         reference = "config/{a_or_b}reference.gbk",
-        metadata = "data/{a_or_b}/metadata_by_accession.tsv",
+        metadata = "data/{a_or_b}/metadata.tsv",
         sequence_index = rules.index_sequences.output,
         exclude = config['exclude']
     output:
         sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
     params:
         group_by = config["filter"]["group_by"],
         min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
-        subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000)
+        subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur filter \
            --sequences {input.sequences} \
            --sequence-index {input.sequence_index} \
            --metadata {input.metadata} \
+           --metadata-id-columns {params.strain_id} \
            --exclude {input.exclude} \
            --output {output.sequences} \
            --group-by {params.group_by} \
@@ -196,13 +184,15 @@ rule refine:
     params:
         coalescent = config["refine"]["coalescent"],
         clock_filter_iqd = config["refine"]["clock_filter_iqd"],
-        date_inference = config["refine"]["date_inference"]
+        date_inference = config["refine"]["date_inference"],
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur refine \
            --tree {input.tree} \
            --alignment {input.alignment} \
            --metadata {input.metadata} \
+           --metadata-id-columns {params.strain_id} \
            --output-tree {output.tree} \
            --output-node-data {output.node_data} \
            --coalescent {params.coalescent} \
@@ -264,12 +254,14 @@ rule traits:
     log:
         "logs/{a_or_b}/traits_{build_name}_rsv.txt"
     params:
-        columns = config["traits"]["columns"]
+        columns = config["traits"]["columns"],
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur traits \
            --tree {input.tree} \
            --metadata {input.metadata} \
+           --metadata-id-columns {params.strain_id} \
            --output {output.node_data} \
            --columns {params.columns} \
            --confidence
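
One small consequence of dropping wrangle_metadata: the deleted rule defaulted the ID column with config.get("strain_id_field", "strain"), while the new params use config["strain_id_field"] directly, so the key now has to be set (as it is in config/configfile.yaml above). A toy illustration, with a plain dict standing in for the Snakemake config:

# Plain dict standing in for the workflow config normally loaded from configfile.yaml.
config = {}  # imagine a build whose config forgot to define strain_id_field

# Old pattern (deleted wrangle_metadata rule): silently falls back to "strain".
print(config.get("strain_id_field", "strain"))  # -> strain

# New pattern (filter/refine/traits/export params): fails loudly instead.
try:
    print(config["strain_id_field"])
except KeyError:
    print("strain_id_field must be defined in the config")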

workflow/snakemake_rules/export.smk: 8 changes (6 additions, 2 deletions)

@@ -40,12 +40,14 @@ rule export:
         auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json",
         root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json"
     params:
-        title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny"
+        title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
+        strain_id=config["strain_id_field"],
     shell:
         """
         augur export v2 \
            --tree {input.tree} \
            --metadata {input.metadata} \
+           --metadata-id-columns {params.strain_id} \
            --node-data {input.node_data} \
            --title {params.title:q} \
            --description {input.description} \
@@ -62,10 +64,12 @@ rule final_strain_name:
     output:
         auspice_json=build_dir + "/{a_or_b}/{build_name}/tree_renamed.json"
     params:
-        display_strain_field=lambda w: config.get("display_strain_field", "strain"),
+        strain_id=config["strain_id_field"],
+        display_strain_field=config.get("display_strain_field", "strain"),
     shell:
         """
        python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
            --input-auspice-json {input.auspice_json} \
            --display-strain-name {params.display_strain_field} \
            --output {output.auspice_json}