add notes to master metadata to trace samples through pipeline #19 #18
rmcolq committed Jun 16, 2021
1 parent f428802 commit f772aba
Showing 9 changed files with 327 additions and 27 deletions.
bin/date_filter.py (2 changes: 1 addition & 1 deletion)
@@ -58,7 +58,7 @@ def filter_by_date(in_metadata, out_metadata, todays_date, time_window, filter_c
             continue

         if (todays_date - window) > date:
-            row[filter_column] = "sample_date older than %s days" %window
+            row[filter_column] = "sample_date older than %s days" %time_window
             if not restrict:
                 writer.writerow(row)
                 continue
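
Why this one-word fix matters: window is presumably the datetime.timedelta built from time_window, so interpolating it rendered the full timedelta string inside the note. A minimal sketch of the before/after behaviour, assuming that construction:

import datetime

time_window = 30                               # days, as passed to filter_by_date
window = datetime.timedelta(days=time_window)  # assumed: how date_filter.py builds the window

# before the fix: the whole timedelta object is interpolated
print("sample_date older than %s days" % window)       # sample_date older than 30 days, 0:00:00 days
# after the fix: just the intended day count
print("sample_date older than %s days" % time_window)  # sample_date older than 30 days
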
bin/downsample.py (13 changes: 10 additions & 3 deletions)
@@ -119,7 +119,10 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
         open(out_metadata, 'w', newline = '') as csv_out:

         reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
-        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        fieldnames = reader.fieldnames
+        if "note" not in reader.fieldnames:
+            fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
         writer.writeheader()

         if "nucleotide_variants" in reader.fieldnames:
@@ -134,8 +137,10 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
         very_most_frequent = get_by_frequency(count_dict, num_samples, band=[0.5,1.0])

         for row in reader:
+            row["note"] = ""
             fasta_header = row[FASTA_HEADER]
             if fasta_header not in indexed_fasta:
+                writer.writerow(row)
                 continue
             if original_num_seqs % 1000 == 0:
                 now = datetime.datetime.now()
@@ -146,13 +151,14 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
                              downsample_lineage_size,lineage_dict):
                 if fasta_header in outgroups:
                     row["why_excluded"]=""
-                writer.writerow(row)
                 if row["why_excluded"] in [None, "None", ""] and fasta_header in indexed_fasta:
                     seq_rec = indexed_fasta[fasta_header]
                     fa_out.write(">" + seq_rec.id + "\n")
                     fa_out.write(str(seq_rec.seq) + "\n")
+                    row["note"] = "included in downsample"
                 else:
                     print(row["why_excluded"], fasta_header, (fasta_header in indexed_fasta))
+                writer.writerow(row)
                 continue
             muts = row[var_column].split("|")
             if len(muts) < max_diff:
@@ -192,11 +198,12 @@ def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff, outgrou
                 else:
                     var_dict[mut].append(fasta_header)
                 row["why_excluded"] = ""
-                writer.writerow(row)
                 if fasta_header in indexed_fasta:
                     seq_rec = indexed_fasta[fasta_header]
                     fa_out.write(">" + seq_rec.id + "\n")
                     fa_out.write(str(seq_rec.seq) + "\n")
+                    row["note"] = "included in downsample"
+                writer.writerow(row)

         now = datetime.datetime.now()
         print("%s Downsampled from %i seqs to %i seqs" %(str(now), original_num_seqs, len(sample_dict)))
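
The net effect of these hunks is a single write point per row: every metadata row is written exactly once, whether or not its sequence made the downsampled FASTA, with the new note column recording the outcome. A stripped-down sketch of the pattern (the file names and the keep() predicate are stand-ins, not the script's real interface):

import csv

def keep(row):
    # stand-in for the real per-row downsampling decision
    return row.get("why_excluded") in (None, "None", "")

with open("in_metadata.csv", newline='') as csv_in, \
        open("out_metadata.csv", 'w', newline='') as csv_out:
    reader = csv.DictReader(csv_in)
    fieldnames = list(reader.fieldnames)
    if "note" not in fieldnames:    # add the tracing column once, keeping existing order
        fieldnames.append("note")
    writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    writer.writeheader()
    for row in reader:
        row["note"] = "included in downsample" if keep(row) else ""
        writer.writerow(row)        # exactly one write per input row
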
bin/filter_by_ambiguous_sites.py (2 changes: 2 additions & 0 deletions)
@@ -44,6 +44,8 @@ def apply_filter(in_fasta, out_fasta, sites_file):
         if keep:
             fasta_out.write('>' + ID + '\n')
             fasta_out.write(seq + '\n')
+        else:
+            print(ID)

 def main():
     args = parse_args()
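
The bare print(ID) relies on the caller capturing stdout: the subsample_for_tree.nf change below redirects this script's output into ids_with_ambiguous_sites.log, one excluded header per line. A sketch of both sides of that contract (the ID is invented):

# producer side, inside apply_filter: excluded headers go to stdout
print("hCoV-19/sample-1/2021")    # hypothetical ID

# consumer side, after the '> ids_with_ambiguous_sites.log' redirection:
ambig = set()
with open("ids_with_ambiguous_sites.log") as log_in:
    for line in log_in:
        ambig.add(line.rstrip())
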
bin/parse_usher_log.py (7 changes: 7 additions & 0 deletions)
@@ -15,6 +15,11 @@ def parse_log(in_file, out_file):
     """
     Usher STDOUT > CSV
     """
+    ignored = set()
+    with open(in_file, 'r') as in_handle:
+        for line in in_handle:
+            if "Ignoring sample" in line:
+                ignored.add(line.split("Ignoring sample ")[1].split(".")[0])
     with open(in_file, 'r') as in_handle, \
         open(out_file, 'w') as out_handle:
         out_handle.write("sequence_name,parsimony_score,num_parsimony_optimal_placements,is_unreliable_in_tree\n")
@@ -26,6 +31,8 @@ def parse_log(in_file, out_file):
             print(fields)
             assert len(fields) > 3
             row = {"sequence_name": fields[1].split(": ")[1], "parsimony_score": fields[2].split(": ")[1], "num_parsimony_optimal_placements": fields[3].split(": ")[1]}
+            if row["sequence_name"] in ignored:
+                continue
             if int(row["num_parsimony_optimal_placements"]) > 1:
                 row["is_unreliable_in_tree"] = "Y"
             else:
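
The pre-scan builds a set of samples that UShER declined to place, and the main pass then drops their rows before they reach the CSV. A self-contained sketch of the mechanism (the "Ignoring sample" wording matches the diff; the log lines themselves are invented):

lines = [
    "Ignoring sample seqA. Some reason from UShER.",   # hypothetical log lines
    "Sample name: seqB; Parsimony score: 0",
]

ignored = set()
for line in lines:
    if "Ignoring sample" in line:
        # the sample name sits between the marker and the first full stop
        ignored.add(line.split("Ignoring sample ")[1].split(".")[0])

print(ignored)   # {'seqA'}: rows for seqA are skipped in the second pass
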
modules/extract_protected_sequences.nf (49 changes: 48 additions & 1 deletion)
@@ -40,6 +40,51 @@ process filter_on_sample_date_for_recent {
     """
 }

+process annotate_metadata {
+    /**
+    * Adds to note column with info about sequences excluded by date
+    * @input metadata
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path fasta
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    seqs = set()
+    with open("${fasta}", 'r', newline = '') as fasta_in:
+        for line in fasta_in:
+            if line.startswith(">"):
+                seqs.add(line.rstrip()[1:])
+    print("Fasta of recent had %d sequences" %len(seqs))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = reader.fieldnames
+        if "note" not in reader.fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+
+        for row in reader:
+            note = []
+            if row["sequence_name"] not in seqs:
+                note.append("sample not recent")
+            if row["note"]:
+                note.append(row["note"])
+            row["note"] = "|".join(note)
+            writer.writerow(row)
+    """
+}

 process fetch_recent {
     /**
@@ -146,8 +191,10 @@ workflow extract_protected_sequences {
         fetch_outgroups(fasta)
         fetch_outgroups.out.concat( recent_ch, designations_ch ).set { protected_ch }
         protected_ch.collectFile(name: "force.fa").set { output_ch }
+        annotate_metadata(metadata,output_ch)
     emit:
-        output_ch
+        fasta = output_ch
+        metadata = annotate_metadata.out
 }


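
The new annotate_metadata process treats the collected force.fa as a membership set of FASTA headers and flags metadata rows whose sequence is absent. Its note-merging convention, shared by all three annotate_metadata processes in this commit, prepends the new flag and keeps any note from an earlier step, pipe-separated. A standalone sketch of that merge (the row is invented; row.get guards inputs that had no note column):

def merge_note(row, flag):
    # prepend the new flag, preserving any earlier pipeline note
    note = []
    if flag:
        note.append(flag)
    if row.get("note"):
        note.append(row["note"])
    row["note"] = "|".join(note)
    return row

row = {"sequence_name": "seqX", "note": "included in downsample"}   # hypothetical row
print(merge_note(row, "sample not recent")["note"])   # sample not recent|included in downsample
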
modules/preprocess.nf (70 changes: 69 additions & 1 deletion)
@@ -188,6 +188,72 @@ process prune_tree_with_metadata {
     """
 }

+process get_tree_tips {
+    /**
+    * Gets list of tree tips
+    * @input tree
+    */
+
+    input:
+    path tree
+
+    output:
+    path "${tree.baseName}.tips.txt"
+
+    script:
+    """
+    gotree stats tips \
+        -i ${tree} | cut -f4 > "${tree.baseName}.tips.txt"
+    """
+}
+
+
+process annotate_metadata {
+    /**
+    * Adds note column with info about sequences in input
+    * @input metadata
+    * @output metadata
+    */
+
+    input:
+    path metadata
+    path tips
+
+    output:
+    path "${metadata.baseName}.annotated.csv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import csv
+
+    tips = set()
+    with open("${tips}", 'r', newline = '') as tips_in:
+        for line in tips_in:
+            tip = line.rstrip().replace("'","")
+            tips.add(tip)
+    print("Tree contains %d tips" %len(tips))
+
+    with open("${metadata}", 'r', newline = '') as csv_in, \
+        open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        new_fieldnames = reader.fieldnames
+        if "note" not in reader.fieldnames:
+            new_fieldnames.append("note")
+        writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+
+        for row in reader:
+            note = []
+            if row["sequence_name"] in tips:
+                note.append("tip in input tree")
+            if row["note"]:
+                note.append(row["note"])
+            row["note"] = "|".join(note)
+            writer.writerow(row)
+    """
+}
+

 process announce_summary {
     /**
@@ -300,10 +366,12 @@ workflow clean_fasta_and_metadata_and_tree {
         clean_metadata(metadata, clean_fasta_headers_with_tree.out.map)
         get_keep_tips(clean_metadata.out)
         prune_tree_with_metadata(clean_fasta_headers_with_tree.out.tree, get_keep_tips.out)
+        get_tree_tips(prune_tree_with_metadata.out)
+        annotate_metadata(clean_metadata.out, get_tree_tips.out)
         announce_metadata_pruned_tree(tree, metadata, prune_tree_with_metadata.out)
     emit:
         fasta = clean_fasta_headers_with_tree.out.fasta
-        metadata = clean_metadata.out
+        metadata = annotate_metadata.out
         tree = prune_tree_with_metadata.out
 }

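
get_tree_tips leans on gotree's tabular output: stats tips prints one row per tip, and cut -f4 keeps just the name column, one label per line. The annotate step then strips single quotes so quoted Newick labels compare equal to the bare sequence_name values in the metadata. A sketch of that normalisation (file and row contents invented):

tips = set()
with open("tree.tips.txt") as tips_in:    # output of: gotree stats tips -i tree | cut -f4
    for line in tips_in:
        # Newick labels may come back quoted; drop the quotes to match metadata names
        tips.add(line.rstrip().replace("'", ""))

row = {"sequence_name": "England/SAMP-1/2021", "note": ""}   # hypothetical row
if row["sequence_name"] in tips:
    row["note"] = "tip in input tree"
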
modules/subsample_for_tree.nf (72 changes: 67 additions & 5 deletions)
@@ -79,15 +79,17 @@ process filter_on_ambiguous_sites {
     path fasta

     output:
-    path "${fasta.baseName}.site_filtered.fa"
+    path "${fasta.baseName}.site_filtered.fa", emit: fasta
+    path "ids_with_ambiguous_sites.log", emit: ambiguous_site_ids
+

     script:
     if ( params.downsample )
         """
         $project_dir/../bin/filter_by_ambiguous_sites.py \
             --in-alignment ${fasta} \
             --out-alignment "${fasta.baseName}.site_filtered.fa" \
-            --sites ${ambiguous_sites}
+            --sites ${ambiguous_sites} > "ids_with_ambiguous_sites.log"
         """
     else
         """
@@ -129,6 +131,65 @@ process downsample {
"""
}

process annotate_metadata {
/**
* Adds note column with info about sequences hashed, filtered and downsampled
* @input metadata
* @output metadata
*/

input:
path metadata
path hashmap
path ambiguous_site_log

output:
path "${metadata.baseName}.annotated.csv"

script:
"""
#!/usr/bin/env python3
import csv
tips = set()
hashmap = {}
with open("${hashmap}", 'r', newline = '') as hashmap_in:
for line in hashmap_in:
tip, redundant = line.rstrip().split(",")
tips.add(tip)
for id in redundant.split("|"):
hashmap[id] = tip
print("hashmap contains %d tips and %d redundants" %(len(tips), len(hashmap)))
ambig = set()
with open("${ambiguous_site_log}", 'r', newline = '') as ambiguous_site_log_in:
for line in ambiguous_site_log_in:
ambig.add(line.rstrip())
print("%d had ambiguous bases" %len(ambig))
with open("${metadata}", 'r', newline = '') as csv_in, \
open("${metadata.baseName}.annotated.csv", 'w', newline = '') as csv_out:
reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
new_fieldnames = reader.fieldnames
if "note" not in reader.fieldnames:
new_fieldnames.append("note")
writer = csv.DictWriter(csv_out, fieldnames = new_fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
writer.writeheader()
for row in reader:
note = []
if row["sequence_name"] in hashmap:
note.append("hashed to tip %s" %hashmap[row["sequence_name"]])
if row["sequence_name"] in ambig:
note.append("filtered due to ambiguous base")
if row["date_filter"]:
if row["note"] and "downsample" not in row["note"]:
note.append("date filtered")
if row["note"]:
note.append(row["note"])
row["note"] = "|".join(note)
writer.writerow(row)
"""
}

process announce_summary {
/**
@@ -188,11 +249,12 @@ workflow subsample_for_tree {
         hash_non_unique_seqs(fasta, metadata)
         filter_on_ambiguous_sites(hash_non_unique_seqs.out.fasta)
         filter_on_sample_date(metadata)
-        downsample(filter_on_ambiguous_sites.out, filter_on_sample_date.out)
-        announce_summary(fasta, hash_non_unique_seqs.out.fasta, filter_on_ambiguous_sites.out, filter_on_sample_date.out, downsample.out.fasta)
+        downsample(filter_on_ambiguous_sites.out.fasta, filter_on_sample_date.out)
+        annotate_metadata(downsample.out.metadata, hash_non_unique_seqs.out.hashmap, filter_on_ambiguous_sites.out.ambiguous_site_ids)
+        announce_summary(fasta, hash_non_unique_seqs.out.fasta, filter_on_ambiguous_sites.out.fasta, filter_on_sample_date.out, downsample.out.fasta)
     emit:
         fasta = downsample.out.fasta // subset of unique
-        metadata = downsample.out.metadata
+        metadata = annotate_metadata.out
         hashmap = hash_non_unique_seqs.out.hashmap
 }

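
The hashmap consumed by this annotate_metadata is read as tip,id1|id2|... lines: the first field is the representative sequence kept in the FASTA, and the pipe-separated remainder are the identical sequences it stands in for, each of which gets a "hashed to tip" note. A sketch of the parse with invented IDs (the line format is inferred from the split calls in the diff):

tips = set()
hashmap = {}
lines = ["seqA,seqB|seqC", "seqD,seqE"]    # hypothetical hashmap lines
for line in lines:
    tip, redundant = line.rstrip().split(",")
    tips.add(tip)                          # the representative kept in the FASTA
    for dup in redundant.split("|"):
        hashmap[dup] = tip                 # each duplicate maps back to its representative

print(hashmap)   # {'seqB': 'seqA', 'seqC': 'seqA', 'seqE': 'seqD'}
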
[2 of the 9 changed files are not shown]
