Skip to content

Commit

Permalink
Adding a centre=False option to genetic_relatedness statistic (#1623)
Browse files Browse the repository at this point in the history
rebase of @mmsomond's previous commits to add centered option to genetic_relatedness
initialize all nodes in branch stats; closes #2983
right normalization in genetic relatedness, closes #2984
divmat works for sample sets; closes #2888
apply @jeromekelleher comments
Apply suggestions from @brieuclehmann review
Co-authored-by: peter <[email protected]>
  • Loading branch information
mmosmond authored Sep 17, 2024
1 parent 600a18c commit 4170933
Show file tree
Hide file tree
Showing 11 changed files with 897 additions and 217 deletions.
57 changes: 42 additions & 15 deletions c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,8 @@ verify_one_way_stat_func_errors(tsk_treeseq_t *ts, one_way_sample_stat_method *m
}

static void
verify_two_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *method)
verify_two_way_stat_func_errors(
tsk_treeseq_t *ts, general_sample_stat_method *method, tsk_flags_t options)
{
int ret;
tsk_id_t samples[] = { 0, 1, 2, 3 };
Expand All @@ -875,30 +876,30 @@ verify_two_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *m
double result;

ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
TSK_STAT_SITE, &result);
options | TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS);
ret = method(ts, 1, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
TSK_STAT_SITE, &result);
options | TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS);

ret = method(ts, 2, sample_set_sizes, samples, 0, set_indexes, 0, NULL,
TSK_STAT_SITE, &result);
options | TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES);

set_indexes[0] = -1;
ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
TSK_STAT_SITE, &result);
options | TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX);
set_indexes[0] = 0;
set_indexes[1] = 2;
ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
TSK_STAT_SITE, &result);
options | TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX);
}

static void
verify_two_way_weighted_stat_func_errors(
tsk_treeseq_t *ts, two_way_weighted_method *method)
tsk_treeseq_t *ts, two_way_weighted_method *method, tsk_flags_t options)
{
int ret;
tsk_id_t indexes[] = { 0, 0, 0, 1 };
Expand All @@ -908,13 +909,17 @@ verify_two_way_weighted_stat_func_errors(

memset(weights, 0, sizeof(weights));

ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, 0);
ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, options);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = method(ts, 0, weights, 2, indexes, 0, NULL, result, 0);
ret = method(ts, 2, weights, 2, indexes, 0, NULL, result,
options | TSK_STAT_SITE | TSK_STAT_NODE);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES);

ret = method(ts, 0, weights, 2, indexes, 0, NULL, result, options);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_WEIGHTS);

ret = method(ts, 2, weights, 2, indexes, 1, bad_windows, result, 0);
ret = method(ts, 2, weights, 2, indexes, 1, bad_windows, result, options);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS);
}

Expand Down Expand Up @@ -1863,7 +1868,7 @@ test_paper_ex_divergence_errors(void)

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_divergence);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_divergence, 0);
tsk_treeseq_free(&ts);
}

Expand Down Expand Up @@ -1911,6 +1916,11 @@ test_paper_ex_genetic_relatedness(void)
ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1,
set_indexes, 0, NULL, TSK_STAT_SITE, &result);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1,
set_indexes, 0, NULL, TSK_STAT_SITE | TSK_STAT_NONCENTRED, &result);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tsk_treeseq_free(&ts);
}

Expand All @@ -1921,7 +1931,11 @@ test_paper_ex_genetic_relatedness_errors(void)

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_genetic_relatedness);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_genetic_relatedness, 0);
verify_two_way_stat_func_errors(
&ts, tsk_treeseq_genetic_relatedness, TSK_STAT_NONCENTRED);
verify_two_way_stat_func_errors(
&ts, tsk_treeseq_genetic_relatedness, TSK_STAT_POLARISED);
tsk_treeseq_free(&ts);
}

Expand All @@ -1948,6 +1962,15 @@ test_paper_ex_genetic_relatedness_weighted(void)
ret = tsk_treeseq_genetic_relatedness_weighted(
&ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_NODE);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
indexes, 0, NULL, result, TSK_STAT_SITE | TSK_STAT_NONCENTRED);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
indexes, 0, NULL, result, TSK_STAT_BRANCH | TSK_STAT_NONCENTRED);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
indexes, 0, NULL, result, TSK_STAT_NODE | TSK_STAT_NONCENTRED);
CU_ASSERT_EQUAL_FATAL(ret, 0);
}

tsk_treeseq_free(&ts);
Expand All @@ -1961,7 +1984,11 @@ test_paper_ex_genetic_relatedness_weighted_errors(void)
tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
verify_two_way_weighted_stat_func_errors(
&ts, tsk_treeseq_genetic_relatedness_weighted);
&ts, tsk_treeseq_genetic_relatedness_weighted, 0);
verify_two_way_weighted_stat_func_errors(
&ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_NONCENTRED);
verify_two_way_weighted_stat_func_errors(
&ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_POLARISED);
tsk_treeseq_free(&ts);
}

Expand All @@ -1972,7 +1999,7 @@ test_paper_ex_Y2_errors(void)

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_Y2);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_Y2, 0);
tsk_treeseq_free(&ts);
}

Expand Down Expand Up @@ -2010,7 +2037,7 @@ test_paper_ex_f2_errors(void)

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_f2);
verify_two_way_stat_func_errors(&ts, tsk_treeseq_f2, 0);
tsk_treeseq_free(&ts);
}

Expand Down
104 changes: 85 additions & 19 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -1297,19 +1297,30 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state));
double *summary = tsk_calloc(num_nodes * result_dim, sizeof(*summary));
double *running_sum = tsk_calloc(result_dim, sizeof(*running_sum));
double *zero_state = tsk_calloc(state_dim, sizeof(*zero_state));
double *zero_summary = tsk_calloc(result_dim, sizeof(*zero_state));

if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) {
ret = TSK_ERR_TIME_UNCALIBRATED;
goto out;
}

if (parent == NULL || branch_length == NULL || state == NULL || running_sum == NULL
|| summary == NULL) {
|| summary == NULL || zero_state == NULL || zero_summary == NULL) {
ret = TSK_ERR_NO_MEMORY;
goto out;
}
tsk_memset(parent, 0xff, num_nodes * sizeof(*parent));

/* If f is not strict, we may need to set conditions for non-sample nodes as well. */
ret = f(state_dim, zero_state, result_dim, zero_summary, f_params);
if (ret != 0) {
goto out;
}
for (j = 0; j < num_nodes; j++) { // we could skip this if zero_summary is zero
summary_u = GET_2D_ROW(summary, result_dim, j);
tsk_memcpy(summary_u, zero_summary, result_dim * sizeof(*zero_summary));
}
/* Set the initial conditions */
for (j = 0; j < self->num_samples; j++) {
u = self->samples[j];
Expand All @@ -1322,6 +1333,7 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
goto out;
}
}

tsk_memset(result, 0, num_windows * result_dim * sizeof(*result));

/* Iterate over the trees */
Expand Down Expand Up @@ -1425,6 +1437,8 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
tsk_safe_free(state);
tsk_safe_free(summary);
tsk_safe_free(running_sum);
tsk_safe_free(zero_state);
tsk_safe_free(zero_summary);
return ret;
}

Expand Down Expand Up @@ -2072,6 +2086,7 @@ typedef struct {
} sample_count_stat_params_t;

typedef struct {
tsk_size_t num_samples;
double *total_weights;
const tsk_id_t *index_tuples;
} indexed_weight_stat_params_t;
Expand Down Expand Up @@ -4542,21 +4557,39 @@ genetic_relatedness_summary_func(tsk_size_t state_dim, const double *state,
tsk_id_t i, j;
tsk_size_t k;
double sumx = 0;
double sumn = 0;
double meanx, ni, nj;

for (k = 0; k < state_dim; k++) {
sumx += x[k];
sumn += (double) args.sample_set_sizes[k];
sumx += x[k] / (double) args.sample_set_sizes[k];
}

meanx = sumx / sumn;
meanx = sumx / (double) state_dim;
for (k = 0; k < result_dim; k++) {
i = args.set_indexes[2 * k];
j = args.set_indexes[2 * k + 1];
ni = (double) args.sample_set_sizes[i];
nj = (double) args.sample_set_sizes[j];
result[k] = (x[i] - ni * meanx) * (x[j] - nj * meanx) / 2;
result[k] = (x[i] / ni - meanx) * (x[j] / nj - meanx);
}
return 0;
}

static int
genetic_relatedness_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim),
const double *state, tsk_size_t result_dim, double *result, void *params)
{
sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
const double *x = state;
tsk_id_t i, j;
tsk_size_t k;
double ni, nj;

for (k = 0; k < result_dim; k++) {
i = args.set_indexes[2 * k];
j = args.set_indexes[2 * k + 1];
ni = (double) args.sample_set_sizes[i];
nj = (double) args.sample_set_sizes[j];
result[k] = x[i] * x[j] / (ni * nj);
}
return 0;
}
Expand All @@ -4572,9 +4605,16 @@ tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample
if (ret != 0) {
goto out;
}
ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
sample_sets, num_index_tuples, index_tuples, genetic_relatedness_summary_func,
num_windows, windows, options, result);
if (!(options & TSK_STAT_NONCENTRED)) {
ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
sample_sets, num_index_tuples, index_tuples,
genetic_relatedness_summary_func, num_windows, windows, options, result);
} else {
ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
sample_sets, num_index_tuples, index_tuples,
genetic_relatedness_noncentred_summary_func, num_windows, windows, options,
result);
}
out:
return ret;
}
Expand All @@ -4587,15 +4627,32 @@ genetic_relatedness_weighted_summary_func(tsk_size_t state_dim, const double *st
const double *x = state;
tsk_id_t i, j;
tsk_size_t k;
double meanx, ni, nj;
double pn, ni, nj;

meanx = state[state_dim - 1] / args.total_weights[state_dim - 1];
pn = state[state_dim - 1];
for (k = 0; k < result_dim; k++) {
i = args.index_tuples[2 * k];
j = args.index_tuples[2 * k + 1];
ni = args.total_weights[i];
nj = args.total_weights[j];
result[k] = (x[i] - ni * meanx) * (x[j] - nj * meanx) / 2;
result[k] = (x[i] - ni * pn) * (x[j] - nj * pn);
}
return 0;
}

static int
genetic_relatedness_weighted_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim),
const double *state, tsk_size_t result_dim, double *result, void *params)
{
indexed_weight_stat_params_t args = *(indexed_weight_stat_params_t *) params;
const double *x = state;
tsk_id_t i, j;
tsk_size_t k;

for (k = 0; k < result_dim; k++) {
i = args.index_tuples[2 * k];
j = args.index_tuples[2 * k + 1];
result[k] = x[i] * x[j];
}
return 0;
}
Expand Down Expand Up @@ -4633,17 +4690,26 @@ tsk_treeseq_genetic_relatedness_weighted(const tsk_treeseq_t *self,
new_row[k] = row[k];
total_weights[k] += row[k];
}
new_row[num_weights] = 1.0;
new_row[num_weights] = 1.0 / (double) num_samples;
}
total_weights[num_weights] = (double) num_samples;
total_weights[num_weights] = 1.0;

args.total_weights = total_weights;
args.index_tuples = index_tuples;
ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_index_tuples,
genetic_relatedness_weighted_summary_func, &args, num_windows, windows, options,
result);
if (ret != 0) {
goto out;
if (!(options & TSK_STAT_NONCENTRED)) {
ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights,
num_index_tuples, genetic_relatedness_weighted_summary_func, &args,
num_windows, windows, options, result);
if (ret != 0) {
goto out;
}
} else {
ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights,
num_index_tuples, genetic_relatedness_weighted_noncentred_summary_func,
&args, num_windows, windows, options, result);
if (ret != 0) {
goto out;
}
}

out:
Expand Down
1 change: 1 addition & 0 deletions c/tskit/trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ extern "C" {
#define TSK_STAT_SPAN_NORMALISE (1 << 11)
#define TSK_STAT_ALLOW_TIME_UNCALIBRATED (1 << 12)
#define TSK_STAT_PAIR_NORMALISE (1 << 13)
#define TSK_STAT_NONCENTRED (1 << 14)

/* Options for map_mutations */
#define TSK_MM_FIXED_ANCESTRAL_STATE (1 << 0)
Expand Down
36 changes: 36 additions & 0 deletions docs/data-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -1095,3 +1095,39 @@ See the {meth}`TreeSequence.variants` method and {class}`Variant` class for
more information on how missing data is represented in variant data.


(sec_gotchas)=

## Possibly surprising consequences of the data model

This is a section of miscellaneous issues that might trip even an experienced user up,
also known as "gotchas".
The current examples are quite uncommon, so can be ignored for most purposes,
but the list may be expanded in the future.

### Unrelated material

Usually, all parts of a tree sequence are ancestral to at least one sample,
since that's essentially the definition of a sample: the genomes that
we're describing the ancestry of.
However, in some cases there will be portions of the tree sequence from which
no samples inherit - notably, the result of a forwards simulation that has
not been simplified.
In fact, if the simulation has not coalesced,
one can have entire portions of some marginal tree that are
unrelated to any of the samples
(for instance, an individual in the initial generation of the simulation
that had no offspring).
This can lead to a gotcha:
the *roots* of a tree are defined to be only those roots *reachable from the samples*
(and, furthermore, reachable from at least `root_threshold` samples;
see {meth}`TreeSequence.trees`).
So, our unlucky ancestor would not appear in the list of `roots`, even though
if we drew all the relationships provided by the tree sequence,
they'd definitely be a root.
Furthermore, only nodes *reachable from a root* are included in the
{meth}`Tree.nodes`. So, if you iterate over all the nodes in each marginal tree,
you won't see those parts of the tree sequence that are unrelated to the samples.
If you need to get those, too, you could either
work with the {meth}`TreeSequence.edge_diffs` directly,
or iterate over all nodes (instead of over {meth}`Tree.nodes`).

Loading

0 comments on commit 4170933

Please sign in to comment.