diff --git a/phylogenetic/defaults/exclude.txt b/phylogenetic/defaults/exclude.txt index e69de29..e99cbe9 100644 --- a/phylogenetic/defaults/exclude.txt +++ b/phylogenetic/defaults/exclude.txt @@ -0,0 +1,20 @@ +AY993909 #duplicated +AY993910 #duplicated +AY993911 #duplicated +AF164532 #duplicated +AY993909 #duplicated +AY993910 #duplicated +AY993911 #duplicated +AY117135 #duplicated +HM470140 #duplicated +AY704563 #duplicated +AY704568 #duplicated +AY704566 #duplicated +PP952117 #duplicated +EF467371 #duplicated +EF467372 #duplicated +EF467370 # duplicated +EF467369 # duplicated +AF164531 # duplicated +NC_005775 # duplicated +AF441119 # duplicated diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index dc03911..579b1bc 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -53,7 +53,7 @@ rule export: # description = config['export']['description'], auspice_config = config['export']['auspice_config'], output: - auspice ="auspice/oropouche_{segment}.json", + auspice ="results/{segment}/oropouche.json", params: strain_id_field = config["strain_id_field"], shell: @@ -69,21 +69,21 @@ rule export: --include-root-sequence-inline """ -# rule final_strain_name: -# input: -# auspice_json="results/{segment}/oropouche.json", -# metadata="data/{segment}/metadata.tsv", -# output: -# auspice_json="auspice/oropouche_{segment}.json", -# params: -# strain_id_field=config["strain_id_field"], -# display_strain_field=config["display_strain_field"], -# shell: -# """ -# python3 scripts/set_final_strain_name.py \ -# --metadata {input.metadata} \ -# --metadata-id-columns {params.strain_id_field} \ -# --input-auspice-json {input.auspice_json} \ -# --display-strain-name {params.display_strain_field} \ -# --output {output.auspice_json} -# """ +rule final_strain_name: + input: + auspice_json="results/{segment}/oropouche.json", + metadata="data/{segment}/metadata.tsv", + output: + auspice_json="auspice/oropouche_{segment}.json", + params: + 
def replace_name_recursive(node, lookup, saveoldcolumn):
    """Rename every node of an Auspice tree whose name is a key of *lookup*.

    For each node in the subtree rooted at *node* (the node itself and all
    of its descendants reachable through ``"children"``), if the node's
    current ``"name"`` is a key of *lookup*, the old name is stored under
    ``node["node_attrs"][saveoldcolumn]`` and ``"name"`` is replaced with
    ``lookup[old_name]``. Nodes whose name is not in *lookup* are left
    untouched. The tree is modified in place.

    Args:
        node: Root of the (sub)tree, a dict with a ``"name"`` key and
            optional ``"node_attrs"`` and ``"children"`` keys.
        lookup: Mapping from current node name to the new node name.
        saveoldcolumn: Key under ``node_attrs`` that receives the old name.
            When it is ``"genbank_accession"`` the old name is wrapped as
            ``{"value": old_name}``; for any other key it is stored as a
            plain string.

    Returns:
        None.

    Raises:
        KeyError: If a visited node has no ``"name"`` key.
    """
    # An explicit stack replaces recursion so that very deep (ladder-like)
    # trees cannot exceed Python's recursion limit. Visit order does not
    # matter because every node is updated independently of the others.
    pending = [node]
    while pending:
        current = pending.pop()
        old_name = current["name"]

        if old_name in lookup:
            # Create node_attrs on demand instead of failing on nodes that
            # do not carry any attributes yet.
            attrs = current.setdefault("node_attrs", {})
            if saveoldcolumn == "genbank_accession":
                # NOTE(review): presumably mirrors how Auspice expects
                # generic node attributes ({"value": ...}) -- confirm
                # against the Auspice JSON schema.
                attrs[saveoldcolumn] = {"value": old_name}
            else:
                attrs[saveoldcolumn] = old_name
            current["name"] = lookup[old_name]

        pending.extend(current.get("children", ()))
def build_name_lookup(metadata, display_field):
    """Map each metadata row ID to the name that should be displayed for it.

    Args:
        metadata: pandas DataFrame whose index holds the strain IDs and
            which has a column named *display_field*.
        display_field: Name of the column holding the display strain name.

    Returns:
        dict mapping every index value to the value of *display_field* in
        that row. Rows where that value is missing (NaN/None) map to their
        own ID, so the node keeps a unique name instead of every such node
        being renamed to the literal column name.

    Raises:
        KeyError: If *display_field* is not a column of *metadata*.
    """
    return {
        strain_id: strain_id if pd.isna(display_name) else display_name
        for strain_id, display_name in metadata[display_field].items()
    }


def main():
    """Rewrite the node names of an Auspice JSON using a metadata column.

    Parses the command line, reads the metadata table, builds the
    ID -> display-name lookup, renames the nodes of the input tree (saving
    the old name under ``genbank_accession``) and writes the result to the
    output path.
    """
    parser = argparse.ArgumentParser(
        description="Swaps out the strain names in the Auspice JSON with the final strain name",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
    parser.add_argument('--metadata', type=str, required=True, help="input data")
    parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
    parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
    args = parser.parse_args()

    # --metadata-id-columns is optional; only forward it when it was given
    # so that read_metadata can apply its own default instead of receiving
    # None. NOTE(review): assumes read_metadata has a usable default for
    # id_columns -- confirm against the installed Augur version.
    read_kwargs = {}
    if args.metadata_id_columns is not None:
        read_kwargs["id_columns"] = args.metadata_id_columns
    metadata = read_metadata(args.metadata, **read_kwargs)

    name_lookup = build_name_lookup(metadata, args.display_strain_name)

    with open(args.input_auspice_json, 'r', encoding="utf-8") as fh:
        data = json.load(fh)

    replace_name_recursive(data['tree'], name_lookup, "genbank_accession")

    with open(args.output, 'w', encoding="utf-8") as fh:
        json.dump(data, fh)


if __name__ == "__main__":
    main()