From de1bd23737b4cf55d0563e5871704110cfbf2ab5 Mon Sep 17 00:00:00 2001
From: nycticebus <huw.ogilvie@anu.edu.au>
Date: Sat, 25 Oct 2014 21:06:48 +1100
Subject: [PATCH] Deriving trees from conditional clades is fixed. Support
 values are working now too.

---
 libscculs.py | 223 ++++++++++++++++++++++++++++-----------------------
 scculs.py    |  67 ++++++++++------
 2 files changed, 164 insertions(+), 126 deletions(-)

diff --git a/libscculs.py b/libscculs.py
index 81d6307..6b6d829 100644
--- a/libscculs.py
+++ b/libscculs.py
@@ -49,7 +49,7 @@ def recurse_node_properties(self, node, topology_values):
 			self.recurse_node_properties(child1, topology_values)
 			self.recurse_node_properties(child2, topology_values)
 
-			parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order)
+			parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order)
 			topology_values.append((parent_id, split_id))
 
 class UltrametricSample(TopologySample):
@@ -105,7 +105,7 @@ def recurse_node_properties(self, node, node_height, tree_values):
 			self.recurse_node_properties(child1, child1_height, tree_values)
 			self.recurse_node_properties(child2, child2_height, tree_values)
 
-			parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order)
+			parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order)
 			tree_values.append((parent_id, split_id, node_height))
 
 class DiscreteProbabilities():
@@ -116,34 +116,40 @@ def __init__(self, data):
 
 		sorted_data = [data[feature_hash] for feature_hash in self.hashes_array]
 		self.data_array = numpy.array(sorted_data)
+
 		self.probabilities = {}
-		self.probabilities_array = numpy.zeros(self.n_features, dtype = numpy.float64)
+		for feature_hash in sorted_hashes:
+			self.probabilities[feature_hash] = 0.0
 
-	def probabilities_from_counts(self, counts):
-		sorted_counts = []
-		for feature_hash in self.hashes_array:
-			if feature_hash in counts:
-				sorted_counts.append(counts[feature_hash])
-			else:
-				sorted_counts.append(0)
+		self.convert_probabilities()
+
+	def add_probabilities(self, probabilities):
+		for feature in self.hashes_array:
+			feature_hash = feature.tostring()
+			self.probabilities[feature_hash] = probabilities[feature_hash]
 
-		counts_array = numpy.array(sorted_counts, dtype = numpy.uint64)
+		self.convert_probabilities()
 
+	def probabilities_from_counts(self, counts):
 		log_counts = {}
-		for i in range(self.n_features):
-			feature_hash  = self.hashes_array[i]
-			feature_count = counts_array[i]
+		all_log_counts = []
 
-			if feature_counts == 0:
-				self.probabilities[feature_hash] = 0.0
-			else:
-				log_counts[feature_hash] = math.log(feature_count)
+		for count_hash, count in counts.items():
+			log_count = math.log(count)
+			log_counts[count_hash] = log_count
+			all_log_counts.append(log_count)
+
+		if len(all_log_counts) > 0:
+			log_sum_of_counts = numpy.logaddexp.reduce(all_log_counts)
+		else:
+			log_sum_of_counts = None
 
-		if len(log_counts) > 0:
-			log_sum_of_counts = numpy.logaddexp.reduce(log_counts.values())
-			for feature_hash in log_counts:
+		for feature_hash in self.hashes_array:
+			if feature_hash in log_counts:
 				normalized_probability = math.exp(log_counts[feature_hash] - log_sum_of_counts)
 				self.probabilities[feature_hash] = normalized_probability
+			else:
+				self.probabilities[feature_hash] = 0.0
 
 		self.convert_probabilities()
 
@@ -161,7 +167,7 @@ def cull_probabilities(self, max_features, max_probability):
 		cull_indices = []
 		for i in topology_descending_order:
 			if (posterior_features >= max_features) or (posterior_probability >= max_probability):
-				cull_hash = self.hashes_array[i]
+				cull_hash = self.hashes_array[i].tostring()
 				self.probabilities.pop(cull_hash)
 				cull_indices.append(i)
 
@@ -196,12 +202,81 @@ def probabilities_from_ccs(self, cc_sets):
 
 		self.convert_probabilities()
 
-	def add_clade_support(self, clade_set):
-		pass
+	def add_clade_support(self, clade_set, taxon_order):
+		topologies_with_support = []
+		for topology_newick in self.data_array:
+			root_node = ete2.Tree(topology_newick)
+			for node in root_node.get_descendants():
+				if not node.is_leaf():
+					child1, child2 = node.get_children()
+					child1_clade = set(child1.get_leaf_names())
+					child2_clade = set(child2.get_leaf_names())
+
+					clade_hash, split_hash = calculate_node_hashes(child1_clade, child2_clade, taxon_order)
+					clade_hash = clade_hash.rstrip("\x00")
+					split_hash = clade_hash.rstrip("\x00")
+
+					clade_probability = clade_set.probabilities[clade_hash]
+					node.support = clade_probability
+
+			newick_with_support = root_node.write(format = 2)
+			topologies_with_support.append(newick_with_support)
+
+		self.data_array = numpy.array(topologies_with_support)
 
 	def add_consensus_heights(self):
 		pass
 
+class CladeProbabilities(DiscreteProbabilities):
+	def derive_clade_probabilities(self, cc_sets, n_taxa):
+		reverse_ccp = reverse_cc_probabilities(cc_sets)
+		root_hash = calculate_root_hash(n_taxa)
+		self.probabilities[root_hash] = 1.0
+
+		clades_by_size = []
+		for i in range(n_taxa - 1):
+			clades_by_size.append(set())
+
+		for i in range(self.n_features):
+			clade_hash = self.hashes_array[i]
+			n_clade_taxa = self.data_array[i]
+			clades_by_size[n_taxa - n_clade_taxa].add(clade_hash)
+
+		# iterate through clades from largest to smallest (in number of taxa)
+		for i in range(1, n_taxa - 1): # skip the root hash
+			for clade_hash in clades_by_size[i]:
+				# there may be multiple paths from any clade to the root, so the sum of path probabilities is required
+				# as clades can only be children of larger parents, by calculating probabilities of larger clades first,
+				# the conditional probability of the clade of interest may be multiplied by the parent clade probability
+				# which is the sum of path probabilities from the parent to the root
+				clade_probability = 0.0
+				conditional_parents = reverse_ccp[clade_hash]
+				for parent_hash in conditional_parents:
+					# the product of conditional clade probabilities which link a clade to the root of the tree
+					path_probability = conditional_parents[parent_hash] * self.probabilities[parent_hash]
+					clade_probability += path_probability
+
+				self.probabilities[clade_hash] = clade_probability
+
+		self.convert_probabilities()
+
+	def melt_clade_probabilities(self, topology_set, n_taxa):
+		root_hash = calculate_root_hash(n_taxa)
+		n_bytes = len(root_hash) # maximum number of bytes required to store clade hashes
+		clade_dtype = "a" + str(n_bytes)
+
+		clade_probability_cache = {}
+		for i in range(topology_set.n_features):
+			topology_hash = topology_set.hashes_array[i]
+			topology_probability = topology_set.probabilities_array[i]
+
+			topology_char_array = numpy.array(list(topology_hash))
+			clades_array = topology_char_array.view(clade_dtype)
+			for clade_hash in clades_array:
+				self.probabilities[clade_hash] += topology_probability
+
+		self.convert_probabilities()
+
 # read a nexus or newick format file containing phylogenetic trees
 # if the file does not begin with a nexus header, assumes it is a newick file
 # returns a list of newick strings, in the same order as the input file
@@ -224,7 +299,7 @@ def trees_from_path(trees_filepath):
 	newick_strings = newick_blob.strip().split("\n")
 	return newick_strings
 
-def calculate_node_ids(children_a, children_b, taxon_order):
+def calculate_node_hashes(children_a, children_b, taxon_order):
 	n_taxa = len(taxon_order)
 	children = set.union(children_a, children_b)
 
@@ -268,6 +343,7 @@ def calculate_topology_probabilities(ts):
 	topology_data = {}
 	cc_counts = {}
 	cc_data = {}
+	clade_sizes = {}
 
 	for i in range(ts.n_trees):
 		tree_array = ts.tree_arrays[i]
@@ -288,6 +364,7 @@ def calculate_topology_probabilities(ts):
 			split_hash = node[1].tostring() # the hash for the bifurcation
 
 			n_node_taxa = clade_size(parent_hash)
+			clade_sizes[parent_hash] = n_node_taxa
 			if n_node_taxa >= 3: # record conditional clade
 				if parent_hash not in cc_counts:
 					cc_data[parent_hash] = {split_hash: node}
@@ -298,13 +375,14 @@ def calculate_topology_probabilities(ts):
 				else:
 					cc_counts[parent_hash][split_hash] += 1
 
+	clades_set = CladeProbabilities(clade_sizes)
 	topology_set = TopologyProbabilities(topology_data)
 
 	cc_sets = {}
 	for parent_hash, splits_data in cc_data.items():
 		cc_sets[parent_hash] = DiscreteProbabilities(splits_data)
 
-	return topology_set, topology_counts, cc_sets, cc_counts
+	return topology_set, topology_counts, cc_sets, cc_counts, clades_set
 
 def derive_best_topologies(cc_sets, taxon_order, topologies_threshold, probability_threshold):
 	cherry_hash = "\x80"
@@ -481,8 +559,7 @@ def clade_taxon_names(clade_hash, taxon_order):
 
 	return taxon_names
 
-def nonzero_derived_topologies(cc_sets, taxon_order):
-	n_taxa = len(taxon_order)
+def n_derived_topologies(cc_sets, n_taxa, include_zero_probability = False):
 	reverse_ccp = reverse_cc_probabilities(cc_sets)
 	clades_by_size = []
 	n_subtrees = {}
@@ -490,97 +567,39 @@ def nonzero_derived_topologies(cc_sets, taxon_order):
 	for i in range(n_taxa - 2):
 		clades_by_size.append(set())
 
-	for parent_id in ccp:
-		parent_size = clade_size(parent_id)
-		if parent_size > 2:
-			clades_by_size[parent_size - 3].add(parent_id)
+	for clade_hash in cc_sets:
+		n_clade_taxa = clade_size(clade_hash)
+		if n_clade_taxa >= 3:
+			clades_by_size[n_clade_taxa - 3].add(clade_hash)
 
 	for i in range(n_taxa - 2):
-		for parent_id in clades_by_size[i]:
+		for parent_hash in clades_by_size[i]:
+			parent_splits = cc_sets[parent_hash]
 			n_parent_subtrees = 0
-			for split_id in ccp[parent_id]:
-				if ccp[parent_id][split_id] > 0.0:
-					child1_id, child2_id = elucidate_cc_split(parent_id, split_id)
+			for j in range(parent_splits.n_features):
+				split_hash = parent_splits.hashes_array[j]
+				split_probability = parent_splits.probabilities_array[j]
+				if include_zero_probability or (split_probability > 0.0):
+					child1_hash, child2_hash = elucidate_cc_split(parent_hash, split_hash)
 
 					n_split_subtrees = 1
-					child1_size = clade_size(child1_id)
+					child1_size = clade_size(child1_hash)
 					if child1_size > 2:
-						n_split_subtrees = n_split_subtrees * n_subtrees[child1_id]
+						n_split_subtrees = n_split_subtrees * n_subtrees[child1_hash]
 
-					child2_size = clade_size(child2_id)
+					child2_size = clade_size(child2_hash)
 					if child2_size > 2:
-						n_split_subtrees = n_split_subtrees * n_subtrees[child2_id]
+						n_split_subtrees = n_split_subtrees * n_subtrees[child2_hash]
 
 					n_parent_subtrees += n_split_subtrees
 
-			n_subtrees[parent_id] = n_parent_subtrees
+			n_subtrees[parent_hash] = n_parent_subtrees
 
 	root_id = calculate_root_hash(n_taxa)
 	n_root_topologies = n_subtrees[root_id]
 
 	return n_root_topologies
 
-def derive_clade_probabilities(cc_sets):
-	clade_probability_cache = {}
-
-	n_taxa = len(taxon_order)
-	reverse_ccp = reverse_cc_probabilities(cc_sets)
-
-	new_newick_strings = []
-	for i in range(self.n_features):
-		topology_newick = self.data_array[i].tostring()
-		root_node = ete2.Tree(topology_newick)
-		for node in root_node.get_descendants():
-			if not node.is_leaf():
-				child1, child2 = node.get_children()
-				child1_clade = set(child1.get_leaf_names())
-				child2_clade = set(child2.get_leaf_names())
-
-				parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, taxon_order)
-
-				if parent_id not in clade_probability_cache:
-					clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp)
-					clade_probability_cache[parent_id] = {split_id: clade_probability}
-				elif split_id not in clade_probability_cache[parent_id]:
-					clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp)
-					clade_probability_cache[parent_id][split_id] = clade_probability
-
-def derive_single_clade_probability(clade_id, n_taxa, reverse_ccp):
-	n_bytes, remainder = divmod(n_taxa, 8)
-	if remainder > 0:
-		n_bytes += 1
-
-	derived_struct_format = "a%d,f8" % (n_bytes)
-
-	root_hash = calculate_root_hash(n_taxa)
-
-	target_clade = numpy.array([(clade_id, 1.0)], dtype=derived_struct_format)
-	paths_to_root = [target_clade]
-
-	target_clade_probability = 0.0
-	while len(paths_to_root) > 0:
-		incomplete_path = paths_to_root.pop(0)
-		incomplete_path_head = incomplete_path[0]
-		incomplete_path_hash = incomplete_path_head["f0"].tostring()
-		possible_parents = reverse_ccp[incomplete_path_hash]
-
-		for parent_hash in possible_parents:
-			split_probability = possible_parents[parent_hash]
-			if split_probability > 0.0:
-				parent_row = numpy.array([(parent_hash, split_probability)], dtype=derived_struct_format)
-				new_path = numpy.concatenate((parent_row, incomplete_path))
-
-				if parent_hash == root_hash:
-					path_probability = numpy.prod(new_path["f1"])
-					target_clade_probability += path_probability
-				else:
-					paths_to_root.append(new_path)
-
-	return target_clade_probability
-
-def melt_clade_probabilities(topology_set):
-	pass
-
 def reverse_cc_probabilities(cc_sets):
 	reverse_ccp = {}
 	for parent_id in cc_sets:
diff --git a/scculs.py b/scculs.py
index dfc4bd3..aee4bf8 100644
--- a/scculs.py
+++ b/scculs.py
@@ -18,20 +18,20 @@ def safe_open(file_path, overwrite):
 
 	return safe_open_file
 
-arg_parser = argparse.ArgumentParser(description = "SCCULS: Scalable Conditional-Clade Ultrametric Summary trees. Produces an annotate summary tree from an MCMC sample, using the highest tree probability calculated from conditional clade frequencies, or from topology frequencies.")
+arg_parser = argparse.ArgumentParser(description = "SCCULS: Scalable Conditional-Clade Ultrametric Summary trees. Distills a summary tree from an MCMC sample, using the highest tree probability calculated from conditional clade frequencies, or from topology frequencies.")
 arg_parser.add_argument("-v", "--version", action = "version", version = PROGRAM_VERSION)
 
 defaults_group = arg_parser.add_argument_group("program defaults")
 defaults_group.add_argument("-c", "--candidate-method", type = str, default = "derived", choices = ["derived", "sampled"], help = "Only consider topologies in the MCMC sample, or derive the most probable topology or topologies using conditional clades. Default: derived.")
-#defaults_group.add_argument("-n", "--node-heights", type = str, choices = ["median", "mean"], help = "Specify the method used to calculate node heights. Without this option, node heights will not be calculated, and trees of equal branch lengths will be returned.")
-defaults_group.add_argument("-p", "--probability-method", type = str, choices = ["conditional-clade", "tree-topology"], help = "Infer tree topology probabilities using either tree topology frequencies or conditional clade frequencies. When -c/--candidate-method is 'sampled', default is tree-topology. When -c/--candidate-method is 'derived', default is conditional-clade.")
-defaults_group.add_argument("-s", "--support-values", type = str, choices = ["conditional-clade", "tree-topology"], help = "Add clade monophyly support values to output trees, and infer them using either tree topology probabilities or conditional clade frequencies.")
-defaults_group.add_argument("-r", "--overwrite", action = "store_true", help = "If output file paths are alreading existing files, overwrite them with new output.")
+defaults_group.add_argument("-g", "--node-heights", type = str, choices = ["median", "mean"], help = "Specify the method used to calculate node heights. Without this option, node heights will not be calculated, and trees of equal branch lengths will be returned.")
+defaults_group.add_argument("-p", "--probability-method", type = str, choices = ["conditional-clade", "tree-topology"], help = "Infer tree topology probabilities using either tree topology probabilities or conditional clade probabilities. When -c/--candidate-method is 'derived', default is conditional-clade. When -c/--candidate-method is 'sampled', default is tree-topology.")
+defaults_group.add_argument("-s", "--support-values", type = str, choices = ["conditional-clade", "tree-topology"], help = "Add clade monophyly support values to output trees, and infer them using either tree topology frequencies or conditional clade frequencies.")
 
 output_group = arg_parser.add_argument_group('output files')
 output_group.add_argument("-i", "--info-output", metavar = "INFO_OUTPUT_PATH", type = str, help = "Calculate whole-sample statistics and output them to a text format file.")
+output_group.add_argument("-n", "--newick-output", metavar = "NEWICK_OUTPUT_PATH", type = str, help = "Output the summary tree(s) to newick format file(s). When -l/--max-topologies is greater than 1, more than one tree may be returned, so an identifying number will be appended to the end of each filename.")
 output_group.add_argument("-o", "--csv-output", metavar = "CSV_OUTPUT_PATH", type = str, help = "Calculate statistics for each returned tree topology, and output them to CSV format file.")
-output_group.add_argument("-w", "--newick-output", metavar = "NEWICK_OUTPUT_PATH", type = str, help = "Output the summary tree(s) to newick format file(s). When -l/--max-topologies is greater than 1, more than one tree may be returned, so an identifying number will be appended to the end of each filename.")
+output_group.add_argument("-w", "--overwrite", action = "store_true", help = "If output file paths point to existing files, overwrite the existing files.")
 
 limits_group = arg_parser.add_argument_group('output limits')
 limits_group.add_argument("-l", "--max-topologies", type = int, default = 1, help = "The size of the credible set in the number of unique topologies to output. The number of topologies returned will still be limited by -m/--max-probability. Default: 1.")
@@ -53,13 +53,13 @@ def safe_open(file_path, overwrite):
 elif not os.path.isfile(args.sample_path):
 	arg_parser.error("argument MCMC_SAMPLE_PATH: not a file path")
 
-# set default probability method
-if args.probability_method is None:
+# set probability method
+if args.probability_method is None: # defaults if not supplied
 	if args.candidate_method == "derived":
 		probability_method = "conditional-clade"
 	else:
 		probability_method = "tree-topology"
-else:
+else: # user-supplied method
 	probability_method = args.probability_method
 
 calibration_taxon = args.calibration_taxon
@@ -75,9 +75,10 @@ def safe_open(file_path, overwrite):
 mcmc_post = mcmc_sample[sample_burn_in:] # discard burn-in
 ultrametric_sample = libscculs.UltrametricSample(mcmc_post, calibration_taxon, calibration_date)
 taxon_order = ultrametric_sample.taxon_order
+n_taxa = len(taxon_order)
 
 print("Counting topologies and conditional clades...")
-topology_set, topology_counts, cc_sets, cc_counts = libscculs.calculate_topology_probabilities(ultrametric_sample)
+topology_set, topology_counts, cc_sets, cc_counts, clade_set = libscculs.calculate_topology_probabilities(ultrametric_sample)
 n_unique_topologies = topology_set.n_features
 
 # all circumstances where conditional clade probabilities are required
@@ -87,20 +88,19 @@ def safe_open(file_path, overwrite):
 	for parent_hash, split_counts in cc_counts.items():
 		cc_sets[parent_hash].probabilities_from_counts(split_counts)
 
-if args.support_values is not None:
-	if args.support_values == "tree-topology": # infer clade support values using tree topology probabilities
-		topology_set.probabilities_from_counts(topology_counts)
-		clade_set = melt_clade_probabilities(topology_set)
-	else: # infer clade support values using conditional clade probabilities
-		clade_set = derive_clade_probabilities(cc_sets)
-
-	output_topology_set.add_clade_support(clade_set)
+# adding tree-topology based support values needs to be done before other steps, in case the topology set is modified later
+if args.support_values == "conditional-clade":
+	print("Calculating clade probabilities from conditional clade probabilities...")
+	clade_set.derive_clade_probabilities(cc_sets, n_taxa)
+elif args.support_values == "tree-topology":
+	print("Calculating topology and clade probabilities from MCMC sample...")
+	topology_set.probabilities_from_counts(topology_counts)
+	clade_set.melt_clade_probabilities(topology_set, n_taxa)
 
-if args.candidate_method == "derived": # derive credible topologies based on conditional clade probabilities
+if args.candidate_method == "derived": # derive credible topologies from conditional clades
 	print("Deriving probable topologies from conditional clades...")
 	output_topology_set = libscculs.derive_best_topologies(cc_sets, taxon_order, max_tree_topologies, max_probability)
-else: # credible probable topologies based on topology frequencies
-	topology_set.cull_probabilities(max_tree_topologies, max_probability)
+else: # base credible topologies on frequency in MCMC sample
 	output_topology_set = topology_set
 
 if probability_method == "conditional-clade":
@@ -110,18 +110,36 @@ def safe_open(file_path, overwrite):
 	print("Calculating topology probabilities...")
 	output_topology_set.probabilities_from_counts(topology_counts)
 
+# once probabilities have been calculated for each topology in the sampled set
+# then topologies that exceed maximum topology/probability limits can be removed
+if args.candidate_method == "sampled":
+	print("Limiting output topologies to credible set...")
+	output_topology_set.cull_probabilities(max_tree_topologies, max_probability)
+
+if args.support_values is not None:
+	print("Adding clade support values to tree topologies...")
+	output_topology_set.add_clade_support(clade_set, taxon_order)
+
 if args.info_output is not None:
+	print("Writing MCMC sample statistics file...")
 	info_output_path = args.info_output
 	info_output_file = safe_open(info_output_path, overwrite)
-	info_output_file.write("Number of unique tree topologies in MCMC sample: " + str(n_unique_topologies))
+	info_output_file.write("Number of taxa in each tree: %i\n" % (n_taxa))
+	info_output_file.write("Number of unique tree topologies in MCMC sample: %i\n" % (n_unique_topologies))
 
 	if args.candidate_method == "derived": # calculate summary statistics for topologies
-		n_derived_topologies = libscculs.nonzero_derived_topologies(cc_sets, taxon_order)
-		info_output_file.write("Number of topologies with non-zero derived probabilities: " + str(n_derived_topologies))
+		n_nonzero_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa)
+		info_output_file.write("Number of topologies derived from conditional clades: %i\n" % (n_nonzero_topologies))
+
+		#n_derived_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa, include_zero_probability = True)
+		#n_nonzero_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa)
+		#info_output_file.write("Number of topologies derived from conditional clades: %i\n" % (n_derived_topologies))
+		#info_output_file.write("Number of topologies derived from conditional clades (with non-zero probabilities): %i\n" % (n_nonzero_topologies))
 
 	info_output_file.close()
 
 if args.newick_output is not None:
+	print("Writing tree topology files...")
 	newick_path_prefix = args.newick_output
 	for i in range(output_topology_set.n_features):
 		newick_string = output_topology_set.data_array[i]
@@ -131,6 +149,7 @@ def safe_open(file_path, overwrite):
 		newick_output_file.close()
 
 if args.csv_output is not None:
+	print("Writing tree statistics file...")
 	csv_output_path = args.csv_output
 	csv_output_file = safe_open(csv_output_path, overwrite)
 	csv_writer = csv.writer(csv_output_file)