diff --git a/bin/date_filter.py b/bin/date_filter.py
index 39a8e03..bc5ff02 100755
--- a/bin/date_filter.py
+++ b/bin/date_filter.py
@@ -58,7 +58,7 @@ def filter_by_date(in_metadata, out_metadata, todays_date, time_window, filter_c
             continue

         if (todays_date - window) > date:
-            row[filter_column] = "sample_date older than %s days" %window
+            row[filter_column] = "sample_date older than %s days" %time_window
             if not restrict:
                 writer.writerow(row)
                 continue
diff --git a/bin/downsample.py b/bin/downsample.py
index cf66c5f..de9d264 100755
--- a/bin/downsample.py
+++ b/bin/downsample.py
@@ -119,7 +119,10 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
         open(out_metadata, 'w', newline = '') as csv_out:

         reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        fieldnames = list(reader.fieldnames)  # copy so appending "note" does not also mutate reader.fieldnames
+        if "note" not in fieldnames:
+            fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
         writer.writeheader()

         if "nucleotide_variants" in reader.fieldnames:
@@ -134,8 +137,10 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
             very_most_frequent = get_by_frequency(count_dict, num_samples, band=[0.5,1.0])

         for row in reader:
+            row["note"] = ""
             fasta_header = row[FASTA_HEADER]
             if fasta_header not in indexed_fasta:
+                writer.writerow(row)
                 continue
             if original_num_seqs % 1000 == 0:
                 now = datetime.datetime.now()
@@ -146,13 +151,14 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
                               downsample_lineage_size,lineage_dict):
                 if fasta_header in outgroups:
                     row["why_excluded"]=""
-                writer.writerow(row)
                 if row["why_excluded"] in [None, "None", ""] and fasta_header in indexed_fasta:
                     seq_rec = indexed_fasta[fasta_header]
                     fa_out.write(">" + seq_rec.id + "\n")
                     fa_out.write(str(seq_rec.seq) + "\n")
+                    row["note"] = "included in downsample"
                 else:
                     print(row["why_excluded"], fasta_header, (fasta_header in indexed_fasta))
+                writer.writerow(row)
                 continue
             muts = row[var_column].split("|")
             if len(muts) < max_diff:
@@ -192,11 +198,12 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
                     else:
                         var_dict[mut].append(fasta_header)
                 row["why_excluded"] = ""
-                writer.writerow(row)
                 if fasta_header in indexed_fasta:
                     seq_rec = indexed_fasta[fasta_header]
                     fa_out.write(">" + seq_rec.id + "\n")
                     fa_out.write(str(seq_rec.seq) + "\n")
+                    row["note"] = "included in downsample"
+            writer.writerow(row)

         now = datetime.datetime.now()
         print("%s Downsampled from %i seqs to %i seqs" %(str(now), original_num_seqs, len(sample_dict)))
diff --git a/bin/filter_by_ambiguous_sites.py b/bin/filter_by_ambiguous_sites.py
index e0a8b65..77bf356 100755
--- a/bin/filter_by_ambiguous_sites.py
+++ b/bin/filter_by_ambiguous_sites.py
@@ -44,6 +44,8 @@ def apply_filter(in_fasta, out_fasta, sites_file):
         if keep:
             fasta_out.write('>' + ID + '\n')
             fasta_out.write(seq + '\n')
+        else:
+            print(ID)  # filtered IDs go to stdout so the caller can redirect them to a log

 def main():
     args = parse_args()
diff --git a/bin/parse_usher_log.py b/bin/parse_usher_log.py
index 0f3d015..4ca9476 100755
--- a/bin/parse_usher_log.py
+++ b/bin/parse_usher_log.py
@@ -15,6 +15,11 @@ def parse_log(in_file, out_file):
     """
     Usher STDOUT > CSV
     """
+    ignored = set()
+    with open(in_file, 'r') as in_handle:
+        for line in in_handle:
+            if "Ignoring sample" in line:
+                ignored.add(line.split("Ignoring sample ")[1].split(".")[0])
     with open(in_file, 'r') as in_handle, \
         open(out_file, 'w') as out_handle:
         out_handle.write("sequence_name,parsimony_score,num_parsimony_optimal_placements,is_unreliable_in_tree\n")
@@ -26,6 +31,8 @@ def parse_log(in_file, out_file):
                 print(fields)
             assert len(fields) > 3
             row = {"sequence_name": fields[1].split(": ")[1], "parsimony_score": fields[2].split(": ")[1], "num_parsimony_optimal_placements": fields[3].split(": ")[1]}
+            if row["sequence_name"] in ignored:
+                continue
             if int(row["num_parsimony_optimal_placements"]) > 1:
                 row["is_unreliable_in_tree"] = "Y"
             else:
diff --git a/modules/extract_protected_sequences.nf b/modules/extract_protected_sequences.nf
index 3c22f9b..6e8a303 100644
--- a/modules/extract_protected_sequences.nf
+++ b/modules/extract_protected_sequences.nf
@@ -40,6 +40,51 @@ process filter_on_sample_date_for_recent {
     """
 }

+process annotate_metadata {
+    /**
+    * Appends to the note column info about sequences excluded because they are not recent
+    * @input metadata, fasta
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path fasta
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    seqs = set()
+    with open("${fasta}", 'r', newline = '') as fasta_in:
+        for line in fasta_in:
+            if line.startswith(">"):
+                seqs.add(line.rstrip()[1:])
+
+    print("Fasta of recent had %d sequences" %len(seqs))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = list(reader.fieldnames)
+        if "note" not in new_fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+        for row in reader:
+            note = []
+            if row["sequence_name"] not in seqs:
+                note.append("sample not recent")
+            if row.get("note"):
+                note.append(row["note"])
+            row["note"] = "|".join(note)
+            writer.writerow(row)
+    """
+}

 process fetch_recent {
     /**
@@ -146,8 +191,10 @@ workflow extract_protected_sequences {
         fetch_outgroups(fasta)
         fetch_outgroups.out.concat( recent_ch, designations_ch ).set { protected_ch }
         protected_ch.collectFile(name: "force.fa").set { output_ch }
+        annotate_metadata(metadata, output_ch)
     emit:
-        output_ch
+        fasta = output_ch
+        metadata = annotate_metadata.out
 }
diff --git a/modules/preprocess.nf b/modules/preprocess.nf
index ae6a026..05b28ae 100644
--- a/modules/preprocess.nf
+++ b/modules/preprocess.nf
@@ -188,6 +188,73 @@ process prune_tree_with_metadata {
     """
 }

+process get_tree_tips {
+    /**
+    * Gets the list of tree tip names
+    * @input tree
+    */
+
+    input:
+    path tree
+
+    output:
+    path "${tree.baseName}.tips.txt"
+
+    script:
+    """
+    gotree stats tips \
+        -i ${tree} | cut -f4 > "${tree.baseName}.tips.txt"
+    """
+}
+
+
+process annotate_metadata {
+    /**
+    * Adds a note column flagging sequences which are tips of the input tree
+    * @input metadata, tips
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path tips
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    tips = set()
+    with open("${tips}", 'r', newline = '') as tips_in:
+        for line in tips_in:
+            tip = line.rstrip().replace("'","")
+            tips.add(tip)
+
+    print("Tree contains %d tips" %len(tips))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
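+        # annotate each row with whether its sequence_name survived pruning as a tree tip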
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = list(reader.fieldnames)
+        if "note" not in new_fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+        for row in reader:
+            note = []
+            if row["sequence_name"] in tips:
+                note.append("tip in input tree")
+            if row.get("note"):
+                note.append(row["note"])
+            row["note"] = "|".join(note)
+            writer.writerow(row)
+    """
+}
+

 process announce_summary {

     /**
@@ -300,10 +366,12 @@ workflow clean_fasta_and_metadata_and_tree {
         clean_metadata(metadata, clean_fasta_headers_with_tree.out.map)
         get_keep_tips(clean_metadata.out)
         prune_tree_with_metadata(clean_fasta_headers_with_tree.out.tree, get_keep_tips.out)
+        get_tree_tips(prune_tree_with_metadata.out)
+        annotate_metadata(clean_metadata.out, get_tree_tips.out)
         announce_metadata_pruned_tree(tree, metadata, prune_tree_with_metadata.out)
     emit:
         fasta = clean_fasta_headers_with_tree.out.fasta
-        metadata = clean_metadata.out
+        metadata = annotate_metadata.out
         tree = prune_tree_with_metadata.out
 }
diff --git a/modules/subsample_for_tree.nf b/modules/subsample_for_tree.nf
index 7980dd8..4a0afde 100644
--- a/modules/subsample_for_tree.nf
+++ b/modules/subsample_for_tree.nf
@@ -79,7 +79,9 @@ process filter_on_ambiguous_sites {
     path fasta

     output:
-    path "${fasta.baseName}.site_filtered.fa"
+    path "${fasta.baseName}.site_filtered.fa", emit: fasta
+    path "ids_with_ambiguous_sites.log", emit: ambiguous_site_ids
+

     script:
     if ( params.downsample )
@@ -87,7 +89,7 @@ process filter_on_ambiguous_sites {
         """
         $project_dir/../bin/filter_by_ambiguous_sites.py \
             --in-alignment ${fasta} \
             --out-alignment "${fasta.baseName}.site_filtered.fa" \
-            --sites ${ambiguous_sites}
+            --sites ${ambiguous_sites} > "ids_with_ambiguous_sites.log"
         """
     else
         """
@@ -129,6 +131,65 @@ process downsample {
     """
 }

+process annotate_metadata {
+    /**
+    * Adds a note column with info about sequences hashed, filtered and downsampled
+    * @input metadata, hashmap, ambiguous_site_log
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path hashmap
+    path ambiguous_site_log
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    tips = set()
+    hashmap = {}
+    with open("${hashmap}", 'r', newline = '') as hashmap_in:
+        for line in hashmap_in:
+            tip, redundant = line.rstrip().split(",")
+            tips.add(tip)
+            for id in redundant.split("|"):
+                hashmap[id] = tip
+    print("hashmap contains %d tips and %d redundants" %(len(tips), len(hashmap)))
+
+    ambig = set()
+    with open("${ambiguous_site_log}", 'r', newline = '') as ambiguous_site_log_in:
+        for line in ambiguous_site_log_in:
+            ambig.add(line.rstrip())
+    print("%d had ambiguous bases" %len(ambig))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = list(reader.fieldnames)
+        if "note" not in new_fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+        for row in reader:
+            note = []
+            if row["sequence_name"] in hashmap:
+                note.append("hashed to tip %s" %hashmap[row["sequence_name"]])
+            if row["sequence_name"] in ambig:
+                note.append("filtered due to ambiguous base")
+            if row["date_filter"]:
+                if "downsample" not in row.get("note", ""):
+                    note.append("date filtered")
+            if row.get("note"):
+                note.append(row["note"])
+            row["note"] = "|".join(note)
+            writer.writerow(row)
+    """
+}

 process announce_summary {

     /**
@@ -188,11 +249,12 @@ workflow subsample_for_tree {
         hash_non_unique_seqs(fasta, metadata)
         filter_on_ambiguous_sites(hash_non_unique_seqs.out.fasta)
         filter_on_sample_date(metadata)
-        downsample(filter_on_ambiguous_sites.out, filter_on_sample_date.out)
-        announce_summary(fasta, hash_non_unique_seqs.out.fasta, filter_on_ambiguous_sites.out, filter_on_sample_date.out, downsample.out.fasta)
+        downsample(filter_on_ambiguous_sites.out.fasta, filter_on_sample_date.out)
+        annotate_metadata(downsample.out.metadata, hash_non_unique_seqs.out.hashmap, filter_on_ambiguous_sites.out.ambiguous_site_ids)
+        announce_summary(fasta, hash_non_unique_seqs.out.fasta, filter_on_ambiguous_sites.out.fasta, filter_on_sample_date.out, downsample.out.fasta)
     emit:
         fasta = downsample.out.fasta // subset of unique
-        metadata = downsample.out.metadata
+        metadata = annotate_metadata.out
         hashmap = hash_non_unique_seqs.out.hashmap
 }
diff --git a/modules/usher_expand_tree.nf b/modules/usher_expand_tree.nf
index 2d5c54f..01ea7fe 100644
--- a/modules/usher_expand_tree.nf
+++ b/modules/usher_expand_tree.nf
@@ -207,6 +207,54 @@ process usher_start_tree {
     """
 }

+
+process optimize_start_tree {
+    /**
+    * Runs matOptimize to improve the tree
+    * @input protobuf
+    */
+
+    publishDir "${publish_dev}/trees", pattern: "*.pb", mode: 'copy', saveAs: { "cog_global.${params.date}.pb" }, overwrite: true
+
+    input:
+    path protobuf
+
+    output:
+    path "${protobuf.baseName}.optimized.pb"
+
+    script:
+    """
+    matOptimize -i ${protobuf} \
+        -o "${protobuf.baseName}.optimized.pb" \
+        -r 100 \
+        -T ${params.max_cpus} \
+        -s 259200
+    """
+}
+
+process protobuf_to_tree {
+    /**
+    * Runs matUtils extract to convert the protobuf to a newick tree
+    * @input protobuf
+    */
+
+    publishDir "${publish_dev}/trees", pattern: "*.tree", mode: 'copy', saveAs: { "cog_global.${params.date}.tree" }, overwrite: true
+
+    input:
+    path protobuf
+
+    output:
+    path "${protobuf.baseName}.tree"
+
+    script:
+    """
+    matUtils extract \
+        -i ${protobuf} \
+        -t "${protobuf.baseName}.tree"
+    """
+}
+
+
 process usher_update_tree {
     /**
      * Makes usher mutation annotated tree
@@ -361,6 +409,53 @@ process add_usher_metadata {
     """
 }

+process annotate_metadata {
+    /**
+    * Appends to the note column info about sequences passed to usher for placement
+    * @input metadata, fasta
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path fasta
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    seqs = set()
+    with open("${fasta}", 'r', newline = '') as fasta_in:
+        for line in fasta_in:
+            if line.startswith(">"):
+                seqs.add(line.rstrip()[1:])
+
+    print("Fasta to add had %d sequences" %len(seqs))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = list(reader.fieldnames)
+        if "note" not in new_fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+        for row in reader:
+            note = []
+            if row["sequence_name"] in seqs:
+                note.append("tried to add with usher")
+            if row.get("note"):
+                note.append(row["note"])
+            row["note"] = "|".join(note)
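+            # note now holds this step's annotation plus any note carried over from earlier steps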
"|".join(note) + writer.writerow(row) + """ +} + process root_tree { /** * Roots tree with WH04 @@ -488,7 +582,7 @@ workflow prepare_fasta { add_reference_to_fasta(mask_fasta.out, reference) fasta_to_vcf(add_reference_to_fasta.out) emit: - fasta = fasta_to_vcf.out + vcf = fasta_to_vcf.out } @@ -503,8 +597,10 @@ workflow iteratively_update_tree { prepare_fasta(fasta_chunks) prepare_fasta.out.collect().set{ vcf_list } usher_update_tree(vcf_list, protobuf) - add_usher_metadata(usher_update_tree.out.usher_log.last(),metadata) + annotate_metadata(metadata, fasta) + add_usher_metadata(usher_update_tree.out.usher_log.last(),annotate_metadata.out) prune_tree_of_long_branches(usher_update_tree.out.protobuf.last()) + final_tree = prune_tree_of_long_branches.out.tree final_protobuf = prune_tree_of_long_branches.out.protobuf emit: @@ -523,7 +619,8 @@ workflow iteratively_force_update_tree { prepare_fasta(fasta_chunks) prepare_fasta.out.collect().set{ vcf_list } usher_force_update_tree(vcf_list, protobuf) - add_usher_metadata(usher_force_update_tree.out.usher_log.last(),metadata) + annotate_metadata(metadata, fasta) + add_usher_metadata(usher_force_update_tree.out.usher_log.last(),annotate_metadata.out) prune_tree_of_long_branches(usher_force_update_tree.out.protobuf.last()) final_tree = prune_tree_of_long_branches.out.tree final_protobuf = prune_tree_of_long_branches.out.protobuf @@ -554,12 +651,14 @@ workflow usher_expand_tree { out_tree = usher_start_tree.out.tree out_metadata = metadata } - root_tree(out_tree) + optimize_start_tree(out_pb) + protobuf_to_tree(optimize_start_tree.out) + root_tree(protobuf_to_tree.out) rescale_branch_lengths(root_tree.out) announce_tree_complete(rescale_branch_lengths.out, "initial") emit: tree = rescale_branch_lengths.out - protobuf = out_pb + protobuf = optimize_start_tree.out metadata = out_metadata } diff --git a/workflows/process_tree_building.nf b/workflows/process_tree_building.nf index 6b35271..ed676ed 100644 --- a/workflows/process_tree_building.nf +++ b/workflows/process_tree_building.nf @@ -39,42 +39,50 @@ workflow { ch_clean_metadata = clean_fasta_and_metadata.out.metadata } - ch_protected = extract_protected_sequences(ch_clean_fasta, ch_clean_metadata) + extract_protected_sequences(ch_clean_fasta, ch_clean_metadata) + ch_protected = extract_protected_sequences.out.fasta + ch_protected_metadata = extract_protected_sequences.out.metadata if ( ! params.protobuf ) { if ( ! params.newick_tree ) { - subsample_for_tree(ch_clean_fasta, ch_clean_metadata) + subsample_for_tree(ch_clean_fasta, ch_protected_metadata) build_split_grafted_tree(subsample_for_tree.out.fasta, subsample_for_tree.out.metadata, subsample_for_tree.out.hashmap) - if ( params.skip_usher ){ - ch_full_tree = build_split_grafted_tree.out.tree - ch_full_metadata = ch_clean_metadata - } else { - ch_clean_tree = build_split_grafted_tree.out.tree - } + ch_clean_tree = build_split_grafted_tree.out.tree + ch_fasttree_metadata = subsample_for_tree.out.metadata + } else { + ch_fasttree_metadata = ch_protected_metadata } + if ( ! 
params.skip_usher ){ - usher_expand_tree(ch_clean_fasta, ch_clean_tree, ch_clean_metadata) + usher_expand_tree(ch_clean_fasta, ch_clean_tree, ch_fasttree_metadata) ch_protobuf = usher_expand_tree.out.protobuf ch_expanded_tree = usher_expand_tree.out.tree ch_expanded_metadata = usher_expand_tree.out.metadata + } else { + ch_expanded_tree = ch_clean_tree + ch_expanded_metadata = ch_fasttree_metadata } } else if ( params.newick_tree && params.update_protobuf ) { ch_protobuf_raw = Channel.fromPath(params.protobuf) - soft_update_usher_tree(ch_clean_fasta, ch_clean_tree, ch_protobuf_raw, ch_clean_metadata) + soft_update_usher_tree(ch_clean_fasta, ch_clean_tree, ch_protobuf_raw, ch_protected_metadata) ch_protobuf = soft_update_usher_tree.out.protobuf ch_expanded_tree = soft_update_usher_tree.out.tree ch_expanded_metadata = soft_update_usher_tree.out.metadata + } else if ( params.newick_tree ) { ch_protobuf = Channel.fromPath(params.protobuf) ch_expanded_tree = ch_clean_tree - ch_expanded_metadata = ch_clean_metadata + ch_expanded_metadata = ch_protected_metadata } if (! params.skip_usher ) { hard_update_usher_tree(ch_protected, ch_expanded_tree, ch_protobuf, ch_expanded_metadata) ch_full_tree = hard_update_usher_tree.out.tree ch_full_metadata = hard_update_usher_tree.out.metadata + } else if ( params.protobuf ){ + ch_full_tree = ch_expanded_tree + ch_full_metadata = ch_expanded_metadata } post_process_tree(ch_full_tree, ch_full_metadata)