Adding a centre=False option to genetic_relatedness statistic (#1623)

rebase of @mmsomond's previous commits to add centered option to genetic_relatedness initialize all nodes in branch stats; closes #2983 right normalization in genetic relatedness, closes #2984 divmat works for sample sets; closes #2888 apply @jeromekelleher comments Apply suggestions from @brieuclehmann review Co-authored-by: peter <[email protected]>
tskit-dev · Sep 17, 2024 · 4170933 · 4170933
1 parent 600a18c
commit 4170933
Show file tree

Hide file tree

Showing 11 changed files with 897 additions and 217 deletions.
diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c
@@ -866,7 +866,8 @@ verify_one_way_stat_func_errors(tsk_treeseq_t *ts, one_way_sample_stat_method *m
 }
 
 static void
-verify_two_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *method)
+verify_two_way_stat_func_errors(
+    tsk_treeseq_t *ts, general_sample_stat_method *method, tsk_flags_t options)
 {
     int ret;
     tsk_id_t samples[] = { 0, 1, 2, 3 };
@@ -875,30 +876,30 @@ verify_two_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *m
     double result;
 
     ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
-        TSK_STAT_SITE, &result);
+        options | TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS);
     ret = method(ts, 1, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
-        TSK_STAT_SITE, &result);
+        options | TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS);
 
     ret = method(ts, 2, sample_set_sizes, samples, 0, set_indexes, 0, NULL,
-        TSK_STAT_SITE, &result);
+        options | TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES);
 
     set_indexes[0] = -1;
     ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
-        TSK_STAT_SITE, &result);
+        options | TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX);
     set_indexes[0] = 0;
     set_indexes[1] = 2;
     ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL,
-        TSK_STAT_SITE, &result);
+        options | TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX);
 }
 
 static void
 verify_two_way_weighted_stat_func_errors(
-    tsk_treeseq_t *ts, two_way_weighted_method *method)
+    tsk_treeseq_t *ts, two_way_weighted_method *method, tsk_flags_t options)
 {
     int ret;
     tsk_id_t indexes[] = { 0, 0, 0, 1 };
@@ -908,13 +909,17 @@ verify_two_way_weighted_stat_func_errors(
 
     memset(weights, 0, sizeof(weights));
 
-    ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, 0);
+    ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, options);
     CU_ASSERT_EQUAL_FATAL(ret, 0);
 
-    ret = method(ts, 0, weights, 2, indexes, 0, NULL, result, 0);
+    ret = method(ts, 2, weights, 2, indexes, 0, NULL, result,
+        options | TSK_STAT_SITE | TSK_STAT_NODE);
+    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES);
+
+    ret = method(ts, 0, weights, 2, indexes, 0, NULL, result, options);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_WEIGHTS);
 
-    ret = method(ts, 2, weights, 2, indexes, 1, bad_windows, result, 0);
+    ret = method(ts, 2, weights, 2, indexes, 1, bad_windows, result, options);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS);
 }
 
@@ -1863,7 +1868,7 @@ test_paper_ex_divergence_errors(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    verify_two_way_stat_func_errors(&ts, tsk_treeseq_divergence);
+    verify_two_way_stat_func_errors(&ts, tsk_treeseq_divergence, 0);
     tsk_treeseq_free(&ts);
 }
 
@@ -1911,6 +1916,11 @@ test_paper_ex_genetic_relatedness(void)
     ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1,
         set_indexes, 0, NULL, TSK_STAT_SITE, &result);
     CU_ASSERT_EQUAL_FATAL(ret, 0);
+
+    ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1,
+        set_indexes, 0, NULL, TSK_STAT_SITE | TSK_STAT_NONCENTRED, &result);
+    CU_ASSERT_EQUAL_FATAL(ret, 0);
+
     tsk_treeseq_free(&ts);
 }
 
@@ -1921,7 +1931,11 @@ test_paper_ex_genetic_relatedness_errors(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    verify_two_way_stat_func_errors(&ts, tsk_treeseq_genetic_relatedness);
+    verify_two_way_stat_func_errors(&ts, tsk_treeseq_genetic_relatedness, 0);
+    verify_two_way_stat_func_errors(
+        &ts, tsk_treeseq_genetic_relatedness, TSK_STAT_NONCENTRED);
+    verify_two_way_stat_func_errors(
+        &ts, tsk_treeseq_genetic_relatedness, TSK_STAT_POLARISED);
     tsk_treeseq_free(&ts);
 }
 
@@ -1948,6 +1962,15 @@ test_paper_ex_genetic_relatedness_weighted(void)
         ret = tsk_treeseq_genetic_relatedness_weighted(
             &ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_NODE);
         CU_ASSERT_EQUAL_FATAL(ret, 0);
+        ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
+            indexes, 0, NULL, result, TSK_STAT_SITE | TSK_STAT_NONCENTRED);
+        CU_ASSERT_EQUAL_FATAL(ret, 0);
+        ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
+            indexes, 0, NULL, result, TSK_STAT_BRANCH | TSK_STAT_NONCENTRED);
+        CU_ASSERT_EQUAL_FATAL(ret, 0);
+        ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2,
+            indexes, 0, NULL, result, TSK_STAT_NODE | TSK_STAT_NONCENTRED);
+        CU_ASSERT_EQUAL_FATAL(ret, 0);
     }
 
     tsk_treeseq_free(&ts);
@@ -1961,7 +1984,11 @@ test_paper_ex_genetic_relatedness_weighted_errors(void)
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
     verify_two_way_weighted_stat_func_errors(
-        &ts, tsk_treeseq_genetic_relatedness_weighted);
+        &ts, tsk_treeseq_genetic_relatedness_weighted, 0);
+    verify_two_way_weighted_stat_func_errors(
+        &ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_NONCENTRED);
+    verify_two_way_weighted_stat_func_errors(
+        &ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_POLARISED);
     tsk_treeseq_free(&ts);
 }
 
@@ -1972,7 +1999,7 @@ test_paper_ex_Y2_errors(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    verify_two_way_stat_func_errors(&ts, tsk_treeseq_Y2);
+    verify_two_way_stat_func_errors(&ts, tsk_treeseq_Y2, 0);
     tsk_treeseq_free(&ts);
 }
 
@@ -2010,7 +2037,7 @@ test_paper_ex_f2_errors(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    verify_two_way_stat_func_errors(&ts, tsk_treeseq_f2);
+    verify_two_way_stat_func_errors(&ts, tsk_treeseq_f2, 0);
     tsk_treeseq_free(&ts);
 }
 

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -1297,19 +1297,30 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
     double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state));
     double *summary = tsk_calloc(num_nodes * result_dim, sizeof(*summary));
     double *running_sum = tsk_calloc(result_dim, sizeof(*running_sum));
+    double *zero_state = tsk_calloc(state_dim, sizeof(*zero_state));
+    double *zero_summary = tsk_calloc(result_dim, sizeof(*zero_state));
 
     if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) {
         ret = TSK_ERR_TIME_UNCALIBRATED;
         goto out;
     }
 
     if (parent == NULL || branch_length == NULL || state == NULL || running_sum == NULL
-        || summary == NULL) {
+        || summary == NULL || zero_state == NULL || zero_summary == NULL) {
         ret = TSK_ERR_NO_MEMORY;
         goto out;
     }
     tsk_memset(parent, 0xff, num_nodes * sizeof(*parent));
 
+    /* If f is not strict, we may need to set conditions for non-sample nodes as well. */
+    ret = f(state_dim, zero_state, result_dim, zero_summary, f_params);
+    if (ret != 0) {
+        goto out;
+    }
+    for (j = 0; j < num_nodes; j++) { // we could skip this if zero_summary is zero
+        summary_u = GET_2D_ROW(summary, result_dim, j);
+        tsk_memcpy(summary_u, zero_summary, result_dim * sizeof(*zero_summary));
+    }
     /* Set the initial conditions */
     for (j = 0; j < self->num_samples; j++) {
         u = self->samples[j];
@@ -1322,6 +1333,7 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
             goto out;
         }
     }
+
     tsk_memset(result, 0, num_windows * result_dim * sizeof(*result));
 
     /* Iterate over the trees */
@@ -1425,6 +1437,8 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
     tsk_safe_free(state);
     tsk_safe_free(summary);
     tsk_safe_free(running_sum);
+    tsk_safe_free(zero_state);
+    tsk_safe_free(zero_summary);
     return ret;
 }
 
@@ -2072,6 +2086,7 @@ typedef struct {
 } sample_count_stat_params_t;
 
 typedef struct {
+    tsk_size_t num_samples;
     double *total_weights;
     const tsk_id_t *index_tuples;
 } indexed_weight_stat_params_t;
@@ -4542,21 +4557,39 @@ genetic_relatedness_summary_func(tsk_size_t state_dim, const double *state,
     tsk_id_t i, j;
     tsk_size_t k;
     double sumx = 0;
-    double sumn = 0;
     double meanx, ni, nj;
 
     for (k = 0; k < state_dim; k++) {
-        sumx += x[k];
-        sumn += (double) args.sample_set_sizes[k];
+        sumx += x[k] / (double) args.sample_set_sizes[k];
     }
 
-    meanx = sumx / sumn;
+    meanx = sumx / (double) state_dim;
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
         ni = (double) args.sample_set_sizes[i];
         nj = (double) args.sample_set_sizes[j];
-        result[k] = (x[i] - ni * meanx) * (x[j] - nj * meanx) / 2;
+        result[k] = (x[i] / ni - meanx) * (x[j] / nj - meanx);
+    }
+    return 0;
+}
+
+static int
+genetic_relatedness_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim),
+    const double *state, tsk_size_t result_dim, double *result, void *params)
+{
+    sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
+    const double *x = state;
+    tsk_id_t i, j;
+    tsk_size_t k;
+    double ni, nj;
+
+    for (k = 0; k < result_dim; k++) {
+        i = args.set_indexes[2 * k];
+        j = args.set_indexes[2 * k + 1];
+        ni = (double) args.sample_set_sizes[i];
+        nj = (double) args.sample_set_sizes[j];
+        result[k] = x[i] * x[j] / (ni * nj);
     }
     return 0;
 }
@@ -4572,9 +4605,16 @@ tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample
     if (ret != 0) {
         goto out;
     }
-    ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
-        sample_sets, num_index_tuples, index_tuples, genetic_relatedness_summary_func,
-        num_windows, windows, options, result);
+    if (!(options & TSK_STAT_NONCENTRED)) {
+        ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
+            sample_sets, num_index_tuples, index_tuples,
+            genetic_relatedness_summary_func, num_windows, windows, options, result);
+    } else {
+        ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
+            sample_sets, num_index_tuples, index_tuples,
+            genetic_relatedness_noncentred_summary_func, num_windows, windows, options,
+            result);
+    }
 out:
     return ret;
 }
@@ -4587,15 +4627,32 @@ genetic_relatedness_weighted_summary_func(tsk_size_t state_dim, const double *st
     const double *x = state;
     tsk_id_t i, j;
     tsk_size_t k;
-    double meanx, ni, nj;
+    double pn, ni, nj;
 
-    meanx = state[state_dim - 1] / args.total_weights[state_dim - 1];
+    pn = state[state_dim - 1];
     for (k = 0; k < result_dim; k++) {
         i = args.index_tuples[2 * k];
         j = args.index_tuples[2 * k + 1];
         ni = args.total_weights[i];
         nj = args.total_weights[j];
-        result[k] = (x[i] - ni * meanx) * (x[j] - nj * meanx) / 2;
+        result[k] = (x[i] - ni * pn) * (x[j] - nj * pn);
+    }
+    return 0;
+}
+
+static int
+genetic_relatedness_weighted_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim),
+    const double *state, tsk_size_t result_dim, double *result, void *params)
+{
+    indexed_weight_stat_params_t args = *(indexed_weight_stat_params_t *) params;
+    const double *x = state;
+    tsk_id_t i, j;
+    tsk_size_t k;
+
+    for (k = 0; k < result_dim; k++) {
+        i = args.index_tuples[2 * k];
+        j = args.index_tuples[2 * k + 1];
+        result[k] = x[i] * x[j];
     }
     return 0;
 }
@@ -4633,17 +4690,26 @@ tsk_treeseq_genetic_relatedness_weighted(const tsk_treeseq_t *self,
             new_row[k] = row[k];
             total_weights[k] += row[k];
         }
-        new_row[num_weights] = 1.0;
+        new_row[num_weights] = 1.0 / (double) num_samples;
     }
-    total_weights[num_weights] = (double) num_samples;
+    total_weights[num_weights] = 1.0;
 
     args.total_weights = total_weights;
     args.index_tuples = index_tuples;
-    ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_index_tuples,
-        genetic_relatedness_weighted_summary_func, &args, num_windows, windows, options,
-        result);
-    if (ret != 0) {
-        goto out;
+    if (!(options & TSK_STAT_NONCENTRED)) {
+        ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights,
+            num_index_tuples, genetic_relatedness_weighted_summary_func, &args,
+            num_windows, windows, options, result);
+        if (ret != 0) {
+            goto out;
+        }
+    } else {
+        ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights,
+            num_index_tuples, genetic_relatedness_weighted_noncentred_summary_func,
+            &args, num_windows, windows, options, result);
+        if (ret != 0) {
+            goto out;
+        }
     }
 
 out:

diff --git a/c/tskit/trees.h b/c/tskit/trees.h
@@ -53,6 +53,7 @@ extern "C" {
 #define TSK_STAT_SPAN_NORMALISE          (1 << 11)
 #define TSK_STAT_ALLOW_TIME_UNCALIBRATED (1 << 12)
 #define TSK_STAT_PAIR_NORMALISE          (1 << 13)
+#define TSK_STAT_NONCENTRED              (1 << 14)
 
 /* Options for map_mutations */
 #define TSK_MM_FIXED_ANCESTRAL_STATE (1 << 0)

diff --git a/docs/data-model.md b/docs/data-model.md
@@ -1095,3 +1095,39 @@ See the {meth}`TreeSequence.variants` method and {class}`Variant` class for
 more information on how missing data is represented in variant data.
 
 
+(sec_gotchas)=
+
+## Possibly surprising consequences of the data model
+
+This is a section of miscellaneous issues that might trip even an experienced user up,
+also known as "gotchas".
+The current examples are quite uncommon, so can be ignored for most purposes,
+but the list may be expanded in the future.
+
+### Unrelated material
+
+Usually, all parts of a tree sequence are ancestral to at least one sample,
+since that's essentially the definition of a sample: the genomes that
+we're describing the ancestry of.
+However, in some cases there will be portions of the tree sequence from which
+no samples inherit - notably, the result of a forwards simulation that has
+not been simplified.
+In fact, if the simulation has not coalesced,
+one can have entire portions of some marginal tree that are
+unrelated to any of the samples
+(for instance, an individual in the initial generation of the simulation
+that had no offspring).
+This can lead to a gotcha:
+the *roots* of a tree are defined to be only those roots *reachable from the samples*
+(and, furthermore, reachable from at least `root_threshold` samples;
+see {meth}`TreeSequence.trees`).
+So, our unlucky ancestor would not appear in the list of `roots`, even though
+if we drew all the relationships provided by the tree sequence,
+they'd definitely be a root.
+Furthermore, only nodes *reachable from a root* are included in the
+{meth}`Tree.nodes`. So, if you iterate over all the nodes in each marginal tree,
+you won't see those parts of the tree sequence that are unrelated to the samples.
+If you need to get those, too, you could either
+work with the {meth}`TreeSequence.edge_diffs` directly,
+or iterate over all nodes (instead of over {meth}`Tree.nodes`).
+