Skip to content

Commit

Permalink
C and python implementations of pair_coalescence_quantiles
Browse files Browse the repository at this point in the history
  • Loading branch information
nspope authored and mergify[bot] committed Aug 28, 2024
1 parent 0403de0 commit f0bc3aa
Show file tree
Hide file tree
Showing 9 changed files with 1,024 additions and 83 deletions.
122 changes: 121 additions & 1 deletion c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ verify_divergence_matrix(tsk_treeseq_t *ts, tsk_flags_t options)
}
}

/* Check coalescence counts against naive implementation */
/* Check coalescence counts */
static void
verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options)
{
Expand Down Expand Up @@ -406,6 +406,112 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options)
node_bin_map[0] = 0;
}

/* Check coalescence quantiles */
static void
verify_pair_coalescence_quantiles(tsk_treeseq_t *ts)
{
int ret;
const tsk_size_t n = tsk_treeseq_get_num_samples(ts);
const tsk_size_t N = tsk_treeseq_get_num_nodes(ts);
const tsk_size_t T = tsk_treeseq_get_num_trees(ts);
const tsk_id_t *samples = tsk_treeseq_get_samples(ts);
const double *breakpoints = tsk_treeseq_get_breakpoints(ts);
const double *nodes_time = ts->tables->nodes.time;
const double max_time = ts->max_time;
const tsk_size_t P = 2;
const tsk_size_t Q = 5;
const tsk_size_t B = 4;
const tsk_size_t I = P * (P + 1) / 2;
double quantiles[] = { 0.0, 0.25, 0.5, 0.75, 1.0 };
double epochs[] = { 0.0, max_time / 4, max_time / 2, max_time, INFINITY };
tsk_id_t sample_sets[n];
tsk_size_t sample_set_sizes[P];
tsk_id_t index_tuples[2 * I];
tsk_id_t node_bin_map[N];
tsk_size_t dim = T * Q * I;
double C[dim];
tsk_size_t i, j, k;

for (i = 0; i < N; i++) {
node_bin_map[i] = TSK_NULL;
for (j = 0; j < B; j++) {
if (nodes_time[i] >= epochs[j] && nodes_time[i] < epochs[j + 1]) {
node_bin_map[i] = (tsk_id_t) j;
}
}
}

for (i = 0; i < n; i++) {
sample_sets[i] = samples[i];
}

for (i = 0; i < P; i++) {
sample_set_sizes[i] = 0;
}
for (j = 0; j < n; j++) {
i = j / (n / P);
sample_set_sizes[i]++;
}

for (j = 0, i = 0; j < P; j++) {
for (k = j; k < P; k++) {
index_tuples[i++] = (tsk_id_t) j;
index_tuples[i++] = (tsk_id_t) k;
}
}

ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, 0);
/* TODO: compare against naive quantiles per tree */

quantiles[Q - 1] = 0.9;
ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, 0);
quantiles[Q - 1] = 1.0;
/* TODO: compare against naive quantiles per tree */

/* cover errors */
quantiles[0] = -1.0;
ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES);
quantiles[0] = 0.0;

quantiles[Q - 1] = 2.0;
ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES);
quantiles[Q - 1] = 1.0;

quantiles[1] = 0.0;
quantiles[0] = 0.25;
ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES);
quantiles[0] = 0.0;
quantiles[1] = 0.25;

for (i = 0; i < N; i++) {
if (node_bin_map[i] == 0) {
node_bin_map[i] = 2;
} else if (node_bin_map[i] == 2) {
node_bin_map[i] = 0;
}
}
ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_TIMES);
for (i = 0; i < N; i++) {
if (node_bin_map[i] == 0) {
node_bin_map[i] = 2;
} else if (node_bin_map[i] == 2) {
node_bin_map[i] = 0;
}
}
}

typedef struct {
int call_count;
int error_on;
Expand Down Expand Up @@ -2896,6 +3002,19 @@ test_pair_coalescence_counts(void)
nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0);
verify_pair_coalescence_counts(&ts, 0);
verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE);
verify_pair_coalescence_counts(&ts, TSK_STAT_PAIR_NORMALISE);
verify_pair_coalescence_counts(
&ts, TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE);
tsk_treeseq_free(&ts);
}

static void
test_pair_coalescence_quantiles(void)
{
tsk_treeseq_t ts;
tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL,
nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0);
verify_pair_coalescence_quantiles(&ts);
tsk_treeseq_free(&ts);
}

Expand Down Expand Up @@ -2998,6 +3117,7 @@ main(int argc, char **argv)
{ "test_multiroot_divergence_matrix", test_multiroot_divergence_matrix },

{ "test_pair_coalescence_counts", test_pair_coalescence_counts },
{ "test_pair_coalescence_quantiles", test_pair_coalescence_quantiles },

{ NULL, NULL },
};
Expand Down
7 changes: 7 additions & 0 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,13 @@ tsk_strerror_internal(int err)
ret = "Maximum index in node-to-bin map does not match "
"output dimension. (TSK_ERR_BAD_NODE_BIN_MAP_DIM)";
break;
case TSK_ERR_BAD_QUANTILES:
ret = "Quantiles must be between 0 and 1 (inclusive) "
"and strictly increasing. (TSK_ERR_BAD_QUANTILES)";
break;
case TSK_ERR_UNSORTED_TIMES:
ret = "Times must be strictly increasing. (TSK_ERR_UNSORTED_TIMES)";
break;

/* Mutation mapping errors */
case TSK_ERR_GENOTYPES_ALL_MISSING:
Expand Down
8 changes: 8 additions & 0 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,14 @@ The node bin map contains a value less than TSK_NULL.
Maximum index in node bin map does not match output dimension.
*/
#define TSK_ERR_BAD_NODE_BIN_MAP_DIM -915
/**
The vector of quantiles is out of bounds or in nonascending order.
*/
#define TSK_ERR_BAD_QUANTILES -916
/**
Times are not in ascending order
*/
#define TSK_ERR_UNSORTED_TIMES -917
/** @} */

/**
Expand Down
106 changes: 102 additions & 4 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -8217,7 +8217,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
void *summary_func_args, tsk_flags_t options, double *result)
{
int ret = 0;
double left, right, remaining_span, window_span, x, t;
double left, right, remaining_span, window_span, denominator, x, t;
tsk_id_t e, p, c, u, v, w, i, j;
tsk_size_t num_samples;
tsk_tree_position_t tree_pos;
Expand All @@ -8238,6 +8238,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
double *bin_weight = NULL;
double *bin_values = NULL;
double *pair_count = NULL;
double *total_pair = NULL;
double *outside = NULL;

/* row pointers */
Expand Down Expand Up @@ -8294,13 +8295,25 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
bin_weight = tsk_malloc(num_bins * num_set_indexes * sizeof(*bin_weight));
bin_values = tsk_malloc(num_bins * num_set_indexes * sizeof(*bin_values));
pair_count = tsk_malloc(num_set_indexes * sizeof(*pair_count));
total_pair = tsk_malloc(num_set_indexes * sizeof(*total_pair));
if (nodes_parent == NULL || nodes_sample == NULL || sample_count == NULL
|| coalescing_pairs == NULL || bin_weight == NULL || bin_values == NULL
|| outside == NULL || pair_count == NULL || visited == NULL) {
|| outside == NULL || pair_count == NULL || visited == NULL
|| total_pair == NULL) {
ret = TSK_ERR_NO_MEMORY;
goto out;
}

for (i = 0; i < (tsk_id_t) num_set_indexes; i++) {
u = set_indexes[2 * i];
v = set_indexes[2 * i + 1];
total_pair[i] = (double) sample_set_sizes[u] * (double) sample_set_sizes[v];
if (u == v) {
total_pair[i] -= (double) sample_set_sizes[v];
total_pair[i] /= 2;
}
}

/* initialize internal state */
for (c = 0; c < (tsk_id_t) num_nodes; c++) {
i = nodes_sample_set[c];
Expand Down Expand Up @@ -8344,6 +8357,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
below = GET_2D_ROW(sample_count, num_sample_sets, c);
state = GET_2D_ROW(nodes_sample, num_sample_sets, p);
pairs = GET_2D_ROW(coalescing_pairs, num_set_indexes, v);
times = GET_2D_ROW(coalescence_time, num_set_indexes, v);
pair_coalescence_count(num_set_indexes, set_indexes, num_sample_sets,
above, below, state, inside, outside, pair_count);
for (i = 0; i < (tsk_id_t) num_set_indexes; i++) {
Expand Down Expand Up @@ -8456,12 +8470,20 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
values[v] /= weight[v];
}
}
if (options & TSK_STAT_SPAN_NORMALISE) { /* normalise weights */
/* normalise weights */
if (options & (TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE)) {
window_span = windows[w + 1] - windows[w];
for (i = 0; i < (tsk_id_t) num_set_indexes; i++) {
denominator = 1.0;
if (options & TSK_STAT_SPAN_NORMALISE) {
denominator *= window_span;
}
if (options & TSK_STAT_PAIR_NORMALISE) {
denominator *= total_pair[i];
}
weight = GET_2D_ROW(bin_weight, num_bins, i);
for (v = 0; v < (tsk_id_t) num_bins; v++) {
weight[v] /= window_span;
weight[v] /= denominator;
}
}
}
Expand Down Expand Up @@ -8490,6 +8512,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
tsk_safe_free(bin_weight);
tsk_safe_free(bin_values);
tsk_safe_free(pair_count);
tsk_safe_free(total_pair);
tsk_safe_free(visited);
tsk_safe_free(outside);
return ret;
Expand All @@ -8516,3 +8539,78 @@ tsk_treeseq_pair_coalescence_counts(const tsk_treeseq_t *self,
sample_sets, num_set_indexes, set_indexes, num_windows, windows, num_bins,
node_bin_map, pair_coalescence_weights, num_bins, NULL, options, result);
}

static int
pair_coalescence_quantiles(tsk_size_t input_dim, const double *weight,
const double *values, tsk_size_t output_dim, double *output, void *params)
{
int ret = 0;
double coalesced, timepoint;
double *quantiles = (double *) params;
tsk_size_t i, j;
j = 0;
coalesced = 0.0;
timepoint = -INFINITY;
for (i = 0; i < input_dim; i++) {
if (weight[i] > 0) {
coalesced += weight[i];
if (values[i] <= timepoint) {
ret = TSK_ERR_UNSORTED_TIMES;
goto out;
}
timepoint = values[i];
while (j < output_dim && quantiles[j] <= coalesced) {
output[j] = timepoint;
j += 1;
}
}
}
if (quantiles[output_dim - 1] == 1.0) {
output[output_dim - 1] = timepoint;
}
out:
return ret;
}

static int
check_quantiles(const tsk_size_t num_quantiles, const double *quantiles)
{
int ret = 0;
tsk_size_t i;
double last = -INFINITY;
for (i = 0; i < num_quantiles; ++i) {
if (quantiles[i] <= last || quantiles[i] < 0.0 || quantiles[i] > 1.0) {
ret = TSK_ERR_BAD_QUANTILES;
goto out;
}
last = quantiles[i];
}
out:
return ret;
}

int
tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes,
tsk_size_t num_windows, const double *windows, tsk_size_t num_bins,
const tsk_id_t *node_bin_map, tsk_size_t num_quantiles, double *quantiles,
tsk_flags_t options, double *result)
{
int ret = 0;
void *params = (void *) quantiles;
ret = check_quantiles(num_quantiles, quantiles);
if (ret != 0) {
goto out;
}
options |= TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE;
ret = tsk_treeseq_pair_coalescence_stat(self, num_sample_sets, sample_set_sizes,
sample_sets, num_set_indexes, set_indexes, num_windows, windows, num_bins,
node_bin_map, pair_coalescence_quantiles, num_quantiles, params, options,
result);
if (ret != 0) {
goto out;
}
out:
return ret;
}
7 changes: 7 additions & 0 deletions c/tskit/trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ extern "C" {
#define TSK_STAT_POLARISED (1 << 10)
#define TSK_STAT_SPAN_NORMALISE (1 << 11)
#define TSK_STAT_ALLOW_TIME_UNCALIBRATED (1 << 12)
#define TSK_STAT_PAIR_NORMALISE (1 << 13)

/* Options for map_mutations */
#define TSK_MM_FIXED_ANCESTRAL_STATE (1 << 0)
Expand Down Expand Up @@ -1138,6 +1139,12 @@ int tsk_treeseq_pair_coalescence_counts(const tsk_treeseq_t *self,
const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes,
tsk_size_t num_windows, const double *windows, tsk_size_t num_bins,
const tsk_id_t *node_bin_map, tsk_flags_t options, double *result);
int tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes,
tsk_size_t num_windows, const double *windows, tsk_size_t num_bins,
const tsk_id_t *node_bin_map, tsk_size_t num_quantiles, double *quantiles,
tsk_flags_t options, double *result);

/****************************************************************************/
/* Tree */
Expand Down
Loading

0 comments on commit f0bc3aa

Please sign in to comment.