Skip to content

Commit

Permalink
Deriving trees from conditional clades is fixed. Support values are w…
Browse files Browse the repository at this point in the history
…orking now too.
  • Loading branch information
nycticebus committed Oct 25, 2014
1 parent 7267e2c commit de1bd23
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 126 deletions.
223 changes: 121 additions & 102 deletions libscculs.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def recurse_node_properties(self, node, topology_values):
self.recurse_node_properties(child1, topology_values)
self.recurse_node_properties(child2, topology_values)

parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order)
parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order)
topology_values.append((parent_id, split_id))

class UltrametricSample(TopologySample):
Expand Down Expand Up @@ -105,7 +105,7 @@ def recurse_node_properties(self, node, node_height, tree_values):
self.recurse_node_properties(child1, child1_height, tree_values)
self.recurse_node_properties(child2, child2_height, tree_values)

parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, self.taxon_order)
parent_id, split_id = calculate_node_hashes(child1_clade, child2_clade, self.taxon_order)
tree_values.append((parent_id, split_id, node_height))

class DiscreteProbabilities():
Expand All @@ -116,34 +116,40 @@ def __init__(self, data):

sorted_data = [data[feature_hash] for feature_hash in self.hashes_array]
self.data_array = numpy.array(sorted_data)

self.probabilities = {}
self.probabilities_array = numpy.zeros(self.n_features, dtype = numpy.float64)
for feature_hash in sorted_hashes:
self.probabilities[feature_hash] = 0.0

def probabilities_from_counts(self, counts):
sorted_counts = []
for feature_hash in self.hashes_array:
if feature_hash in counts:
sorted_counts.append(counts[feature_hash])
else:
sorted_counts.append(0)
self.convert_probabilities()

def add_probabilities(self, probabilities):
for feature in self.hashes_array:
feature_hash = feature.tostring()
self.probabilities[feature_hash] = probabilities[feature_hash]

counts_array = numpy.array(sorted_counts, dtype = numpy.uint64)
self.convert_probabilities()

def probabilities_from_counts(self, counts):
log_counts = {}
for i in range(self.n_features):
feature_hash = self.hashes_array[i]
feature_count = counts_array[i]
all_log_counts = []

if feature_counts == 0:
self.probabilities[feature_hash] = 0.0
else:
log_counts[feature_hash] = math.log(feature_count)
for count_hash, count in counts.items():
log_count = math.log(count)
log_counts[count_hash] = log_count
all_log_counts.append(log_count)

if len(all_log_counts) > 0:
log_sum_of_counts = numpy.logaddexp.reduce(all_log_counts)
else:
log_sum_of_counts = None

if len(log_counts) > 0:
log_sum_of_counts = numpy.logaddexp.reduce(log_counts.values())
for feature_hash in log_counts:
for feature_hash in self.hashes_array:
if feature_hash in log_counts:
normalized_probability = math.exp(log_counts[feature_hash] - log_sum_of_counts)
self.probabilities[feature_hash] = normalized_probability
else:
self.probabilities[feature_hash] = 0.0

self.convert_probabilities()

Expand All @@ -161,7 +167,7 @@ def cull_probabilities(self, max_features, max_probability):
cull_indices = []
for i in topology_descending_order:
if (posterior_features >= max_features) or (posterior_probability >= max_probability):
cull_hash = self.hashes_array[i]
cull_hash = self.hashes_array[i].tostring()
self.probabilities.pop(cull_hash)
cull_indices.append(i)

Expand Down Expand Up @@ -196,12 +202,81 @@ def probabilities_from_ccs(self, cc_sets):

self.convert_probabilities()

def add_clade_support(self, clade_set):
pass
def add_clade_support(self, clade_set, taxon_order):
topologies_with_support = []
for topology_newick in self.data_array:
root_node = ete2.Tree(topology_newick)
for node in root_node.get_descendants():
if not node.is_leaf():
child1, child2 = node.get_children()
child1_clade = set(child1.get_leaf_names())
child2_clade = set(child2.get_leaf_names())

clade_hash, split_hash = calculate_node_hashes(child1_clade, child2_clade, taxon_order)
clade_hash = clade_hash.rstrip("\x00")
split_hash = clade_hash.rstrip("\x00")

clade_probability = clade_set.probabilities[clade_hash]
node.support = clade_probability

newick_with_support = root_node.write(format = 2)
topologies_with_support.append(newick_with_support)

self.data_array = numpy.array(topologies_with_support)

def add_consensus_heights(self):
pass

class CladeProbabilities(DiscreteProbabilities):
def derive_clade_probabilities(self, cc_sets, n_taxa):
reverse_ccp = reverse_cc_probabilities(cc_sets)
root_hash = calculate_root_hash(n_taxa)
self.probabilities[root_hash] = 1.0

clades_by_size = []
for i in range(n_taxa - 1):
clades_by_size.append(set())

for i in range(self.n_features):
clade_hash = self.hashes_array[i]
n_clade_taxa = self.data_array[i]
clades_by_size[n_taxa - n_clade_taxa].add(clade_hash)

# iterate through clades from largest to smallest (in number of taxa)
for i in range(1, n_taxa - 1): # skip the root hash
for clade_hash in clades_by_size[i]:
# there may be multiple paths from any clade to the root, so the sum of path probabilities is required
# as clades can only be children of larger parents, by calculating probabilities of larger clades first,
# the conditional probability of the clade of interest may be multiplied by the parent clade probability
# which is the sum of path probabilities from the parent to the root
clade_probability = 0.0
conditional_parents = reverse_ccp[clade_hash]
for parent_hash in conditional_parents:
# the product of conditional clade probabilities which link a clade to the root of the tree
path_probability = conditional_parents[parent_hash] * self.probabilities[parent_hash]
clade_probability += path_probability

self.probabilities[clade_hash] = clade_probability

self.convert_probabilities()

def melt_clade_probabilities(self, topology_set, n_taxa):
root_hash = calculate_root_hash(n_taxa)
n_bytes = len(root_hash) # maximum number of bytes required to store clade hashes
clade_dtype = "a" + str(n_bytes)

clade_probability_cache = {}
for i in range(topology_set.n_features):
topology_hash = topology_set.hashes_array[i]
topology_probability = topology_set.probabilities_array[i]

topology_char_array = numpy.array(list(topology_hash))
clades_array = topology_char_array.view(clade_dtype)
for clade_hash in clades_array:
self.probabilities[clade_hash] += topology_probability

self.convert_probabilities()

# read a nexus or newick format file containing phylogenetic trees
# if the file does not begin with a nexus header, assumes it is a newick file
# returns a list of newick strings, in the same order as the input file
Expand All @@ -224,7 +299,7 @@ def trees_from_path(trees_filepath):
newick_strings = newick_blob.strip().split("\n")
return newick_strings

def calculate_node_ids(children_a, children_b, taxon_order):
def calculate_node_hashes(children_a, children_b, taxon_order):
n_taxa = len(taxon_order)
children = set.union(children_a, children_b)

Expand Down Expand Up @@ -268,6 +343,7 @@ def calculate_topology_probabilities(ts):
topology_data = {}
cc_counts = {}
cc_data = {}
clade_sizes = {}

for i in range(ts.n_trees):
tree_array = ts.tree_arrays[i]
Expand All @@ -288,6 +364,7 @@ def calculate_topology_probabilities(ts):
split_hash = node[1].tostring() # the hash for the bifurcation

n_node_taxa = clade_size(parent_hash)
clade_sizes[parent_hash] = n_node_taxa
if n_node_taxa >= 3: # record conditional clade
if parent_hash not in cc_counts:
cc_data[parent_hash] = {split_hash: node}
Expand All @@ -298,13 +375,14 @@ def calculate_topology_probabilities(ts):
else:
cc_counts[parent_hash][split_hash] += 1

clades_set = CladeProbabilities(clade_sizes)
topology_set = TopologyProbabilities(topology_data)

cc_sets = {}
for parent_hash, splits_data in cc_data.items():
cc_sets[parent_hash] = DiscreteProbabilities(splits_data)

return topology_set, topology_counts, cc_sets, cc_counts
return topology_set, topology_counts, cc_sets, cc_counts, clades_set

def derive_best_topologies(cc_sets, taxon_order, topologies_threshold, probability_threshold):
cherry_hash = "\x80"
Expand Down Expand Up @@ -481,106 +559,47 @@ def clade_taxon_names(clade_hash, taxon_order):

return taxon_names

def nonzero_derived_topologies(cc_sets, taxon_order):
n_taxa = len(taxon_order)
def n_derived_topologies(cc_sets, n_taxa, include_zero_probability = False):
reverse_ccp = reverse_cc_probabilities(cc_sets)
clades_by_size = []
n_subtrees = {}

for i in range(n_taxa - 2):
clades_by_size.append(set())

for parent_id in ccp:
parent_size = clade_size(parent_id)
if parent_size > 2:
clades_by_size[parent_size - 3].add(parent_id)
for clade_hash in cc_sets:
n_clade_taxa = clade_size(clade_hash)
if n_clade_taxa >= 3:
clades_by_size[n_clade_taxa - 3].add(clade_hash)

for i in range(n_taxa - 2):
for parent_id in clades_by_size[i]:
for parent_hash in clades_by_size[i]:
parent_splits = cc_sets[parent_hash]
n_parent_subtrees = 0
for split_id in ccp[parent_id]:
if ccp[parent_id][split_id] > 0.0:
child1_id, child2_id = elucidate_cc_split(parent_id, split_id)
for j in range(parent_splits.n_features):
split_hash = parent_splits.hashes_array[j]
split_probability = parent_splits.probabilities_array[j]
if include_zero_probability or (split_probability > 0.0):
child1_hash, child2_hash = elucidate_cc_split(parent_hash, split_hash)

n_split_subtrees = 1
child1_size = clade_size(child1_id)
child1_size = clade_size(child1_hash)
if child1_size > 2:
n_split_subtrees = n_split_subtrees * n_subtrees[child1_id]
n_split_subtrees = n_split_subtrees * n_subtrees[child1_hash]

child2_size = clade_size(child2_id)
child2_size = clade_size(child2_hash)
if child2_size > 2:
n_split_subtrees = n_split_subtrees * n_subtrees[child2_id]
n_split_subtrees = n_split_subtrees * n_subtrees[child2_hash]

n_parent_subtrees += n_split_subtrees

n_subtrees[parent_id] = n_parent_subtrees
n_subtrees[parent_hash] = n_parent_subtrees

root_id = calculate_root_hash(n_taxa)
n_root_topologies = n_subtrees[root_id]

return n_root_topologies

def derive_clade_probabilities(cc_sets):
clade_probability_cache = {}

n_taxa = len(taxon_order)
reverse_ccp = reverse_cc_probabilities(cc_sets)

new_newick_strings = []
for i in range(self.n_features):
topology_newick = self.data_array[i].tostring()
root_node = ete2.Tree(topology_newick)
for node in root_node.get_descendants():
if not node.is_leaf():
child1, child2 = node.get_children()
child1_clade = set(child1.get_leaf_names())
child2_clade = set(child2.get_leaf_names())

parent_id, split_id = calculate_node_ids(child1_clade, child2_clade, taxon_order)

if parent_id not in clade_probability_cache:
clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp)
clade_probability_cache[parent_id] = {split_id: clade_probability}
elif split_id not in clade_probability_cache[parent_id]:
clade_probability = derive_single_clade_probability(parent_id, n_taxa, reverse_ccp)
clade_probability_cache[parent_id][split_id] = clade_probability

def derive_single_clade_probability(clade_id, n_taxa, reverse_ccp):
n_bytes, remainder = divmod(n_taxa, 8)
if remainder > 0:
n_bytes += 1

derived_struct_format = "a%d,f8" % (n_bytes)

root_hash = calculate_root_hash(n_taxa)

target_clade = numpy.array([(clade_id, 1.0)], dtype=derived_struct_format)
paths_to_root = [target_clade]

target_clade_probability = 0.0
while len(paths_to_root) > 0:
incomplete_path = paths_to_root.pop(0)
incomplete_path_head = incomplete_path[0]
incomplete_path_hash = incomplete_path_head["f0"].tostring()
possible_parents = reverse_ccp[incomplete_path_hash]

for parent_hash in possible_parents:
split_probability = possible_parents[parent_hash]
if split_probability > 0.0:
parent_row = numpy.array([(parent_hash, split_probability)], dtype=derived_struct_format)
new_path = numpy.concatenate((parent_row, incomplete_path))

if parent_hash == root_hash:
path_probability = numpy.prod(new_path["f1"])
target_clade_probability += path_probability
else:
paths_to_root.append(new_path)

return target_clade_probability

def melt_clade_probabilities(topology_set):
pass

def reverse_cc_probabilities(cc_sets):
reverse_ccp = {}
for parent_id in cc_sets:
Expand Down
Loading

0 comments on commit de1bd23

Please sign in to comment.