From de1bd23737b4cf55d0563e5871704110cfbf2ab5 Mon Sep 17 00:00:00 2001 From: nycticebus Date: Sat, 25 Oct 2014 21:06:48 +1100 Subject: [PATCH] Deriving trees from conditional clades is fixed. Support values are working now too. --- libscculs.py | 223 ++++++++++++++++++++++++++++----------------------- scculs.py | 67 ++++++++++------ 2 files changed, 164 insertions(+), 126 deletions(-) diff --git a/libscculs.py b/libscculs.py index 81d6307..6b6d829 100644 --- a/libscculs.py +++ b/libscculs.py @@ -49,7 +49,7 @@ def recurse_node_properties(self, node, topology_values): self.recurse_node_properties(child1, topology_values) self.recurse_node_properties(child2, topology_values) - parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order) + parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order) topology_values.append((parent_id, split_id)) class UltrametricSample(TopologySample): @@ -105,7 +105,7 @@ def recurse_node_properties(self, node, node_height, tree_values): self.recurse_node_properties(child1, child1_height, tree_values) self.recurse_node_properties(child2, child2_height, tree_values) - parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order) + parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order) tree_values.append((parent_id, split_id, node_height)) class DiscreteProbabilities(): @@ -116,34 +116,40 @@ def __init__(self, data): sorted_data = [data[feature_hash] for feature_hash in self.hashes_array] self.data_array = numpy.array(sorted_data) + self.probabilities = {} - self.probabilities_array = numpy.zeros(self.n_features, dtype = numpy.float64) + for feature_hash in sorted_hashes: + self.probabilities[feature_hash] = 0.0 - def probabilities_from_counts(self, counts): - sorted_counts = [] - for feature_hash in self.hashes_array: - if feature_hash in counts: - sorted_counts.append(counts[feature_hash]) - else: - sorted_counts.append(0) + self.convert_probabilities() + + def add_probabilities(self, probabilities): + for feature in self.hashes_array: + feature_hash = feature.tostring() + self.probabilities[feature_hash] = probabilities[feature_hash] - counts_array = numpy.array(sorted_counts, dtype = numpy.uint64) + self.convert_probabilities() + def probabilities_from_counts(self, counts): log_counts = {} - for i in range(self.n_features): - feature_hash = self.hashes_array[i] - feature_count = counts_array[i] + all_log_counts = [] - if feature_counts == 0: - self.probabilities[feature_hash] = 0.0 - else: - log_counts[feature_hash] = math.log(feature_count) + for count_hash, count in counts.items(): + log_count = math.log(count) + log_counts[count_hash] = log_count + all_log_counts.append(log_count) + + if len(all_log_counts) > 0: + log_sum_of_counts = numpy.logaddexp.reduce(all_log_counts) + else: + log_sum_of_counts = None - if len(log_counts) > 0: - log_sum_of_counts = numpy.logaddexp.reduce(log_counts.values()) - for feature_hash in log_counts: + for feature_hash in self.hashes_array: + if feature_hash in log_counts: normalized_probability = math.exp(log_counts[feature_hash] - log_sum_of_counts) self.probabilities[feature_hash] = normalized_probability + else: + self.probabilities[feature_hash] = 0.0 self.convert_probabilities() @@ -161,7 +167,7 @@ def cull_probabilities(self, max_features, max_probability): cull_indices = [] for i in topology_descending_order: if (posterior_features >= max_features) or (posterior_probability >= max_probability): - cull_hash = self.hashes_array[i] + cull_hash = self.hashes_array[i].tostring() self.probabilities.pop(cull_hash) cull_indices.append(i) @@ -196,12 +202,81 @@ def probabilities_from_ccs(self, cc_sets): self.convert_probabilities() - def add_clade_support(self, clade_set): - pass + def add_clade_support(self, clade_set, taxon_order): + topologies_with_support = [] + for topology_newick in self.data_array: + root_node = ete2.Tree(topology_newick) + for node in root_node.get_descendants(): + if not node.is_leaf(): + child1, child2 = node.get_children() + child1_clade = set(child1.get_leaf_names()) + child2_clade = set(child2.get_leaf_names()) + + clade_hash, split_hash = calculate_node_hashes(child1_clade, child2_clade, taxon_order) + clade_hash = clade_hash.rstrip("\x00") + split_hash = clade_hash.rstrip("\x00") + + clade_probability = clade_set.probabilities[clade_hash] + node.support = clade_probability + + newick_with_support = root_node.write(format = 2) + topologies_with_support.append(newick_with_support) + + self.data_array = numpy.array(topologies_with_support) def add_consensus_heights(self): pass +class CladeProbabilities(DiscreteProbabilities): + def derive_clade_probabilities(self, cc_sets, n_taxa): + reverse_ccp = reverse_cc_probabilities(cc_sets) + root_hash = calculate_root_hash(n_taxa) + self.probabilities[root_hash] = 1.0 + + clades_by_size = [] + for i in range(n_taxa - 1): + clades_by_size.append(set()) + + for i in range(self.n_features): + clade_hash = self.hashes_array[i] + n_clade_taxa = self.data_array[i] + clades_by_size[n_taxa - n_clade_taxa].add(clade_hash) + + # iterate through clades from largest to smallest (in number of taxa) + for i in range(1, n_taxa - 1): # skip the root hash + for clade_hash in clades_by_size[i]: + # there may be multiple paths from any clade to the root, so the sum of path probabilities is required + # as clades can only be children of larger parents, by calculating probabilities of larger clades first, + # the conditional probability of the clade of interest may be multiplied by the parent clade probability + # which is the sum of path probabilities from the parent to the root + clade_probability = 0.0 + conditional_parents = reverse_ccp[clade_hash] + for parent_hash in conditional_parents: + # the product of conditional clade probabilities which link a clade to the root of the tree + path_probability = conditional_parents[parent_hash] * self.probabilities[parent_hash] + clade_probability += path_probability + + self.probabilities[clade_hash] = clade_probability + + self.convert_probabilities() + + def melt_clade_probabilities(self, topology_set, n_taxa): + root_hash = calculate_root_hash(n_taxa) + n_bytes = len(root_hash) # maximum number of bytes required to store clade hashes + clade_dtype = "a" + str(n_bytes) + + clade_probability_cache = {} + for i in range(topology_set.n_features): + topology_hash = topology_set.hashes_array[i] + topology_probability = topology_set.probabilities_array[i] + + topology_char_array = numpy.array(list(topology_hash)) + clades_array = topology_char_array.view(clade_dtype) + for clade_hash in clades_array: + self.probabilities[clade_hash] += topology_probability + + self.convert_probabilities() + # read a nexus or newick format file containing phylogenetic trees # if the file does not begin with a nexus header, assumes it is a newick file # returns a list of newick strings, in the same order as the input file @@ -224,7 +299,7 @@ def trees_from_path(trees_filepath): newick_strings = newick_blob.strip().split("\n") return newick_strings -def calculate_node_ids(children_a, children_b, taxon_order): +def calculate_node_hashes(children_a, children_b, taxon_order): n_taxa = len(taxon_order) children = set.union(children_a, children_b) @@ -268,6 +343,7 @@ def calculate_topology_probabilities(ts): topology_data = {} cc_counts = {} cc_data = {} + clade_sizes = {} for i in range(ts.n_trees): tree_array = ts.tree_arrays[i] @@ -288,6 +364,7 @@ def calculate_topology_probabilities(ts): split_hash = node[1].tostring() # the hash for the bifurcation n_node_taxa = clade_size(parent_hash) + clade_sizes[parent_hash] = n_node_taxa if n_node_taxa >= 3: # record conditional clade if parent_hash not in cc_counts: cc_data[parent_hash] = {split_hash: node} @@ -298,13 +375,14 @@ def calculate_topology_probabilities(ts): else: cc_counts[parent_hash][split_hash] += 1 + clades_set = CladeProbabilities(clade_sizes) topology_set = TopologyProbabilities(topology_data) cc_sets = {} for parent_hash, splits_data in cc_data.items(): cc_sets[parent_hash] = DiscreteProbabilities(splits_data) - return topology_set, topology_counts, cc_sets, cc_counts + return topology_set, topology_counts, cc_sets, cc_counts, clades_set def derive_best_topologies(cc_sets, taxon_order, topologies_threshold, probability_threshold): cherry_hash = "\x80" @@ -481,8 +559,7 @@ def clade_taxon_names(clade_hash, taxon_order): return taxon_names -def nonzero_derived_topologies(cc_sets, taxon_order): - n_taxa = len(taxon_order) +def n_derived_topologies(cc_sets, n_taxa, include_zero_probability = False): reverse_ccp = reverse_cc_probabilities(cc_sets) clades_by_size = [] n_subtrees = {} @@ -490,97 +567,39 @@ def nonzero_derived_topologies(cc_sets, taxon_order): for i in range(n_taxa - 2): clades_by_size.append(set()) - for parent_id in ccp: - parent_size = clade_size(parent_id) - if parent_size > 2: - clades_by_size[parent_size - 3].add(parent_id) + for clade_hash in cc_sets: + n_clade_taxa = clade_size(clade_hash) + if n_clade_taxa >= 3: + clades_by_size[n_clade_taxa - 3].add(clade_hash) for i in range(n_taxa - 2): - for parent_id in clades_by_size[i]: + for parent_hash in clades_by_size[i]: + parent_splits = cc_sets[parent_hash] n_parent_subtrees = 0 - for split_id in ccp[parent_id]: - if ccp[parent_id][split_id] > 0.0: - child1_id, child2_id = elucidate_cc_split(parent_id, split_id) + for j in range(parent_splits.n_features): + split_hash = parent_splits.hashes_array[j] + split_probability = parent_splits.probabilities_array[j] + if include_zero_probability or (split_probability > 0.0): + child1_hash, child2_hash = elucidate_cc_split(parent_hash, split_hash) n_split_subtrees = 1 - child1_size = clade_size(child1_id) + child1_size = clade_size(child1_hash) if child1_size > 2: - n_split_subtrees = n_split_subtrees * n_subtrees[child1_id] + n_split_subtrees = n_split_subtrees * n_subtrees[child1_hash] - child2_size = clade_size(child2_id) + child2_size = clade_size(child2_hash) if child2_size > 2: - n_split_subtrees = n_split_subtrees * n_subtrees[child2_id] + n_split_subtrees = n_split_subtrees * n_subtrees[child2_hash] n_parent_subtrees += n_split_subtrees - n_subtrees[parent_id] = n_parent_subtrees + n_subtrees[parent_hash] = n_parent_subtrees root_id = calculate_root_hash(n_taxa) n_root_topologies = n_subtrees[root_id] return n_root_topologies -def derive_clade_probabilities(cc_sets): - clade_probability_cache = {} - - n_taxa = len(taxon_order) - reverse_ccp = reverse_cc_probabilities(cc_sets) - - new_newick_strings = [] - for i in range(self.n_features): - topology_newick = self.data_array[i].tostring() - root_node = ete2.Tree(topology_newick) - for node in root_node.get_descendants(): - if not node.is_leaf(): - child1, child2 = node.get_children() - child1_clade = set(child1.get_leaf_names()) - child2_clade = set(child2.get_leaf_names()) - - parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, taxon_order) - - if parent_id not in clade_probability_cache: - clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp) - clade_probability_cache[parent_id] = {split_id: clade_probability} - elif split_id not in clade_probability_cache[parent_id]: - clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp) - clade_probability_cache[parent_id][split_id] = clade_probability - -def derive_single_clade_probability(clade_id, n_taxa, reverse_ccp): - n_bytes, remainder = divmod(n_taxa, 8) - if remainder > 0: - n_bytes += 1 - - derived_struct_format = "a%d,f8" % (n_bytes) - - root_hash = calculate_root_hash(n_taxa) - - target_clade = numpy.array([(clade_id, 1.0)], dtype=derived_struct_format) - paths_to_root = [target_clade] - - target_clade_probability = 0.0 - while len(paths_to_root) > 0: - incomplete_path = paths_to_root.pop(0) - incomplete_path_head = incomplete_path[0] - incomplete_path_hash = incomplete_path_head["f0"].tostring() - possible_parents = reverse_ccp[incomplete_path_hash] - - for parent_hash in possible_parents: - split_probability = possible_parents[parent_hash] - if split_probability > 0.0: - parent_row = numpy.array([(parent_hash, split_probability)], dtype=derived_struct_format) - new_path = numpy.concatenate((parent_row, incomplete_path)) - - if parent_hash == root_hash: - path_probability = numpy.prod(new_path["f1"]) - target_clade_probability += path_probability - else: - paths_to_root.append(new_path) - - return target_clade_probability - -def melt_clade_probabilities(topology_set): - pass - def reverse_cc_probabilities(cc_sets): reverse_ccp = {} for parent_id in cc_sets: diff --git a/scculs.py b/scculs.py index dfc4bd3..aee4bf8 100644 --- a/scculs.py +++ b/scculs.py @@ -18,20 +18,20 @@ def safe_open(file_path, overwrite): return safe_open_file -arg_parser = argparse.ArgumentParser(description = "SCCULS: Scalable Conditional-Clade Ultrametric Summary trees. Produces an annotate summary tree from an MCMC sample, using the highest tree probability calculated from conditional clade frequencies, or from topology frequencies.") +arg_parser = argparse.ArgumentParser(description = "SCCULS: Scalable Conditional-Clade Ultrametric Summary trees. Distills a summary tree from an MCMC sample, using the highest tree probability calculated from conditional clade frequencies, or from topology frequencies.") arg_parser.add_argument("-v", "--version", action = "version", version = PROGRAM_VERSION) defaults_group = arg_parser.add_argument_group("program defaults") defaults_group.add_argument("-c", "--candidate-method", type = str, default = "derived", choices = ["derived", "sampled"], help = "Only consider topologies in the MCMC sample, or derive the most probable topology or topologies using conditional clades. Default: derived.") -#defaults_group.add_argument("-n", "--node-heights", type = str, choices = ["median", "mean"], help = "Specify the method used to calculate node heights. Without this option, node heights will not be calculated, and trees of equal branch lengths will be returned.") -defaults_group.add_argument("-p", "--probability-method", type = str, choices = ["conditional-clade", "tree-topology"], help = "Infer tree topology probabilities using either tree topology frequencies or conditional clade frequencies. When -c/--candidate-method is 'sampled', default is tree-topology. When -c/--candidate-method is 'derived', default is conditional-clade.") -defaults_group.add_argument("-s", "--support-values", type = str, choices = ["conditional-clade", "tree-topology"], help = "Add clade monophyly support values to output trees, and infer them using either tree topology probabilities or conditional clade frequencies.") -defaults_group.add_argument("-r", "--overwrite", action = "store_true", help = "If output file paths are alreading existing files, overwrite them with new output.") +defaults_group.add_argument("-g", "--node-heights", type = str, choices = ["median", "mean"], help = "Specify the method used to calculate node heights. Without this option, node heights will not be calculated, and trees of equal branch lengths will be returned.") +defaults_group.add_argument("-p", "--probability-method", type = str, choices = ["conditional-clade", "tree-topology"], help = "Infer tree topology probabilities using either tree topology probabilities or conditional clade probabilities. When -c/--candidate-method is 'derived', default is conditional-clade. When -c/--candidate-method is 'sampled', default is tree-topology.") +defaults_group.add_argument("-s", "--support-values", type = str, choices = ["conditional-clade", "tree-topology"], help = "Add clade monophyly support values to output trees, and infer them using either tree topology frequencies or conditional clade frequencies.") output_group = arg_parser.add_argument_group('output files') output_group.add_argument("-i", "--info-output", metavar = "INFO_OUTPUT_PATH", type = str, help = "Calculate whole-sample statistics and output them to a text format file.") +output_group.add_argument("-n", "--newick-output", metavar = "NEWICK_OUTPUT_PATH", type = str, help = "Output the summary tree(s) to newick format file(s). When -l/--max-topologies is greater than 1, more than one tree may be returned, so an identifying number will be appended to the end of each filename.") output_group.add_argument("-o", "--csv-output", metavar = "CSV_OUTPUT_PATH", type = str, help = "Calculate statistics for each returned tree topology, and output them to CSV format file.") -output_group.add_argument("-w", "--newick-output", metavar = "NEWICK_OUTPUT_PATH", type = str, help = "Output the summary tree(s) to newick format file(s). When -l/--max-topologies is greater than 1, more than one tree may be returned, so an identifying number will be appended to the end of each filename.") +output_group.add_argument("-w", "--overwrite", action = "store_true", help = "If output file paths point to existing files, overwrite the existing files.") limits_group = arg_parser.add_argument_group('output limits') limits_group.add_argument("-l", "--max-topologies", type = int, default = 1, help = "The size of the credible set in the number of unique topologies to output. The number of topologies returned will still be limited by -m/--max-probability. Default: 1.") @@ -53,13 +53,13 @@ def safe_open(file_path, overwrite): elif not os.path.isfile(args.sample_path): arg_parser.error("argument MCMC_SAMPLE_PATH: not a file path") -# set default probability method -if args.probability_method is None: +# set probability method +if args.probability_method is None: # defaults if not supplied if args.candidate_method == "derived": probability_method = "conditional-clade" else: probability_method = "tree-topology" -else: +else: # user-supplied method probability_method = args.probability_method calibration_taxon = args.calibration_taxon @@ -75,9 +75,10 @@ def safe_open(file_path, overwrite): mcmc_post = mcmc_sample[sample_burn_in:] # discard burn-in ultrametric_sample = libscculs.UltrametricSample(mcmc_post, calibration_taxon, calibration_date) taxon_order = ultrametric_sample.taxon_order +n_taxa = len(taxon_order) print("Counting topologies and conditional clades...") -topology_set, topology_counts, cc_sets, cc_counts = libscculs.calculate_topology_probabilities(ultrametric_sample) +topology_set, topology_counts, cc_sets, cc_counts, clade_set = libscculs.calculate_topology_probabilities(ultrametric_sample) n_unique_topologies = topology_set.n_features # all circumstances where conditional clade probabilities are required @@ -87,20 +88,19 @@ def safe_open(file_path, overwrite): for parent_hash, split_counts in cc_counts.items(): cc_sets[parent_hash].probabilities_from_counts(split_counts) -if args.support_values is not None: - if args.support_values == "tree-topology": # infer clade support values using tree topology probabilities - topology_set.probabilities_from_counts(topology_counts) - clade_set = melt_clade_probabilities(topology_set) - else: # infer clade support values using conditional clade probabilities - clade_set = derive_clade_probabilities(cc_sets) - - output_topology_set.add_clade_support(clade_set) +# adding tree-topology based support values needs to be done before other steps, in case the topology set is modified later +if args.support_values == "conditional-clade": + print("Calculating clade probabilities from conditional clade probabilities...") + clade_set.derive_clade_probabilities(cc_sets, n_taxa) +elif args.support_values == "tree-topology": + print("Calculating topology and clade probabilities from MCMC sample...") + topology_set.probabilities_from_counts(topology_counts) + clade_set.melt_clade_probabilities(topology_set, n_taxa) -if args.candidate_method == "derived": # derive credible topologies based on conditional clade probabilities +if args.candidate_method == "derived": # derive credible topologies from conditional clades print("Deriving probable topologies from conditional clades...") output_topology_set = libscculs.derive_best_topologies(cc_sets, taxon_order, max_tree_topologies, max_probability) -else: # credible probable topologies based on topology frequencies - topology_set.cull_probabilities(max_tree_topologies, max_probability) +else: # base credible topologies on frequency in MCMC sample output_topology_set = topology_set if probability_method == "conditional-clade": @@ -110,18 +110,36 @@ def safe_open(file_path, overwrite): print("Calculating topology probabilities...") output_topology_set.probabilities_from_counts(topology_counts) +# once probabilities have been calculated for each topology in the sampled set +# then topologies that exceed maximum topology/probability limits can be removed +if args.candidate_method == "sampled": + print("Limiting output topologies to credible set...") + output_topology_set.cull_probabilities(max_tree_topologies, max_probability) + +if args.support_values is not None: + print("Adding clade support values to tree topologies...") + output_topology_set.add_clade_support(clade_set, taxon_order) + if args.info_output is not None: + print("Writing MCMC sample statistics file...") info_output_path = args.info_output info_output_file = safe_open(info_output_path, overwrite) - info_output_file.write("Number of unique tree topologies in MCMC sample: " + str(n_unique_topologies)) + info_output_file.write("Number of taxa in each tree: %i\n" % (n_taxa)) + info_output_file.write("Number of unique tree topologies in MCMC sample: %i\n" % (n_unique_topologies)) if args.candidate_method == "derived": # calculate summary statistics for topologies - n_derived_topologies = libscculs.nonzero_derived_topologies(cc_sets, taxon_order) - info_output_file.write("Number of topologies with non-zero derived probabilities: " + str(n_derived_topologies)) + n_nonzero_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa) + info_output_file.write("Number of topologies derived from conditional clades: %i\n" % (n_nonzero_topologies)) + + #n_derived_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa, include_zero_probability = True) + #n_nonzero_topologies = libscculs.n_derived_topologies(cc_sets, n_taxa) + #info_output_file.write("Number of topologies derived from conditional clades: %i\n" % (n_derived_topologies)) + #info_output_file.write("Number of topologies derived from conditional clades (with non-zero probabilities): %i\n" % (n_nonzero_topologies)) info_output_file.close() if args.newick_output is not None: + print("Writing tree topology files...") newick_path_prefix = args.newick_output for i in range(output_topology_set.n_features): newick_string = output_topology_set.data_array[i] @@ -131,6 +149,7 @@ def safe_open(file_path, overwrite): newick_output_file.close() if args.csv_output is not None: + print("Writing tree statistics file...") csv_output_path = args.csv_output csv_output_file = safe_open(csv_output_path, overwrite) csv_writer = csv.writer(csv_output_file)