From 4c6f29ac6f7ab598c28732e44cb4230361a04532 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 6 Sep 2024 23:56:10 +0100 Subject: [PATCH] Internal samples in C (with minimal unit tests) Minimal testing in Python C code cleanup --- lib/msprime.c | 105 ++++++++++++++++++------------------- lib/tests/test_pedigrees.c | 32 ++++++----- lib/util.c | 6 --- lib/util.h | 1 - tests/test_pedigree.py | 47 +++++++---------- 5 files changed, 88 insertions(+), 103 deletions(-) diff --git a/lib/msprime.c b/lib/msprime.c index 2316fb00b..4bbdfd06c 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -1859,11 +1859,13 @@ msp_verify_overlaps(msp_t *self) int ok = overlap_counter_alloc(&counter, self->sequence_length, 0); tsk_bug_assert(ok == 0); - /* add in the overlaps for ancient samples */ - for (j = self->next_sampling_event; j < self->num_sampling_events; j++) { - se = self->sampling_events[j]; - for (u = self->root_segments[se.sample]; u != NULL; u = u->next) { - overlap_counter_increment_interval(&counter, u->left, u->right); + if (self->model.type != MSP_MODEL_WF_PED) { + /* add in the overlaps for ancient samples */ + for (j = self->next_sampling_event; j < self->num_sampling_events; j++) { + se = self->sampling_events[j]; + for (u = self->root_segments[se.sample]; u != NULL; u = u->next) { + overlap_counter_increment_interval(&counter, u->left, u->right); + } } } @@ -2495,12 +2497,10 @@ msp_store_coalescence_edge( int ret = 0; if (self->model.type == MSP_MODEL_DTWF || self->model.type == MSP_MODEL_WF_PED) { - if (self->additional_nodes & MSP_NODE_IS_RE_EVENT) { - // parent and child can be equal - // don't store edges - if (parent == child) { - return ret; - } + // parent and child can be equal if we're storing RE events, or if we've + // got internal samples. Don't store edges + if (parent == child) { + return ret; } } @@ -2914,13 +2914,30 @@ msp_pedigree_add_individual_common_ancestor( } static int MSP_WARN_UNUSED -msp_pedigree_add_sample_ancestry(msp_t *self, segment_t *segment) +msp_pedigree_add_sample_ancestry(msp_t *self, tsk_id_t node_id) { int ret = 0; tsk_size_t ploid; - tsk_id_t node_id = segment->value; tsk_id_t individual_id; individual_t *ind; + avl_node_t *a; + node_mapping_t *counter; + segment_t *seg + = msp_alloc_segment(self, 0, self->sequence_length, node_id, 0, 0, NULL, NULL); + lineage_t *lin = msp_alloc_lineage(self, seg, seg, 0, 0); + + if (seg == NULL || lin == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + ret = msp_insert_individual(self, lin); + if (ret != 0) { + goto out; + } + for (a = self->overlap_counts.head; a != self->overlap_counts.tail; a = a->next) { + counter = (node_mapping_t *) a->item; + counter->value += 1; + } tsk_bug_assert(node_id < (tsk_id_t) self->tables->nodes.num_rows); individual_id = self->tables->nodes.individual[node_id]; @@ -2934,14 +2951,7 @@ msp_pedigree_add_sample_ancestry(msp_t *self, segment_t *segment) } } tsk_bug_assert(ploid < ind->ploidy); - if (avl_count(&ind->common_ancestors[ploid]) > 0) { - /* This is where we'd deal with the internal samples. What we'll probably - * need to do is to go through the ancestry after we've processed the - * individual and then pad out any gaps in the ancestry segments. */ - ret = MSP_ERR_PEDIGREE_INTERNAL_SAMPLE; - goto out; - } - ret = msp_pedigree_add_individual_common_ancestor(self, ind->id, segment, ploid); + ret = msp_pedigree_add_individual_common_ancestor(self, ind->id, seg, ploid); if (ret != 0) { goto out; } @@ -2955,9 +2965,11 @@ msp_pedigree_initialise(msp_t *self) int ret = 0; population_t *pop; lineage_t *lin; + segment_t *seg; avl_node_t *a; label_id_t label = 0; tsk_size_t j; + node_mapping_t *counter; if (self->next_demographic_event != NULL) { ret = MSP_ERR_UNSUPPORTED_OPERATION; @@ -2981,14 +2993,21 @@ msp_pedigree_initialise(msp_t *self) for (j = 0; j < self->num_populations; j++) { pop = &self->populations[j]; + /* Rather than messing about with how we initialise from trees, it's + * easier to just remove the lineages here, before we add them + * back later when dealing with samples in the pedigree. */ for (a = pop->ancestors[label].head; a != NULL; a = a->next) { lin = (lineage_t *) a->item; - ret = msp_pedigree_add_sample_ancestry(self, lin->head); - if (ret != 0) { - goto out; + for (seg = lin->head; seg != NULL; seg = seg->next) { + msp_free_segment(self, seg); } + msp_remove_individual(self, lin); } } + tsk_bug_assert(avl_count(&self->overlap_counts) == 2); + counter = (node_mapping_t *) self->overlap_counts.head->item; + counter->value = 0; + self->pedigree.next_individual = 0; out: return ret; @@ -4589,31 +4608,6 @@ msp_apply_sampling_events(msp_t *self, double time) return ret; } -static int MSP_WARN_UNUSED -msp_pedigree_insert_ancient_samples(msp_t *self) -{ - int ret = 0; - sampling_event_t *se; - segment_t *root_seg, *new_head; - - while (self->next_sampling_event < self->num_sampling_events - && self->sampling_events[self->next_sampling_event].time <= self->time) { - se = self->sampling_events + self->next_sampling_event; - root_seg = self->root_segments[se->sample]; - ret = msp_insert_root_segments(self, root_seg, &new_head); - if (ret != 0) { - goto out; - } - ret = msp_pedigree_add_sample_ancestry(self, new_head); - if (ret != 0) { - goto out; - } - self->next_sampling_event++; - } -out: - return ret; -} - static double msp_get_next_fixed_event_time(msp_t *self) { @@ -5483,6 +5477,7 @@ msp_pedigree_process_common_ancestors(msp_t *self, individual_t *ind, tsk_size_t { int ret = 0; tsk_id_t node = ind->nodes[ploid]; + bool is_sample = (self->tables->nodes.flags[node] & TSK_NODE_IS_SAMPLE) != 0; tsk_id_t parent = ind->parents[ploid]; avl_tree_t *common_ancestors = &ind->common_ancestors[ploid]; segment_t *genome, *parent_ancestry[MSP_MAX_PED_PLOIDY], *seg; @@ -5490,6 +5485,14 @@ msp_pedigree_process_common_ancestors(msp_t *self, individual_t *ind, tsk_size_t tsk_size_t j; tsk_id_t *parent_nodes; + /* printf("Process node=%d sample=%d\n", node, is_sample); */ + if (is_sample) { + ret = msp_pedigree_add_sample_ancestry(self, node); + if (ret != 0) { + goto out; + } + } + /* FIXME - assuming 1 label here */ ret = msp_merge_n_ancestors( self, common_ancestors, ind->population, 0, node, &genome); @@ -5607,10 +5610,6 @@ msp_run_pedigree(msp_t *self, double max_time, unsigned long max_events) } tsk_bug_assert(ind->time >= self->time); self->time = ind->time; - ret = msp_pedigree_insert_ancient_samples(self); - if (ret != 0) { - goto out; - } for (ploid = 0; ploid < ind->ploidy; ploid++) { ret = msp_pedigree_process_common_ancestors(self, ind, ploid); if (ret != 0) { diff --git a/lib/tests/test_pedigrees.c b/lib/tests/test_pedigrees.c index de9cd957e..946bcfa08 100644 --- a/lib/tests/test_pedigrees.c +++ b/lib/tests/test_pedigrees.c @@ -553,8 +553,8 @@ test_replicates_ancient_samples(void) CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < 20; j++) { - msp_verify(&msp, 0); ret = msp_run(&msp, DBL_MAX, UINT32_MAX); + CU_ASSERT_EQUAL_FATAL(ret, MSP_EXIT_MODEL_COMPLETE); msp_verify(&msp, 0); ret = msp_finalise_tables(&msp); @@ -844,24 +844,28 @@ test_errors(void) static void test_internal_samples(void) { - int ret; tsk_id_t parents[] = { -1, -1, -1, -1, 0, 1 }; double time[] = { 1.0, 1.0, 0.0 }; tsk_flags_t is_sample[] = { true, true, true }; - msp_t msp; - tsk_table_collection_t tables; - gsl_rng *rng = safe_rng_alloc(); - ret = build_pedigree_sim( - &msp, &tables, rng, 100, 2, 3, parents, time, is_sample, NULL); - ret = msp_initialise(&msp); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = msp_run(&msp, DBL_MAX, UINT32_MAX); - CU_ASSERT_EQUAL_FATAL(ret, MSP_ERR_PEDIGREE_INTERNAL_SAMPLE); - tsk_table_collection_free(&tables); - msp_free(&msp); + verify_pedigree(0, 1, 3, parents, time, is_sample, NULL, 0); - gsl_rng_free(rng); + /* verify_pedigree */ + /* msp_t msp; */ + /* tsk_table_collection_t tables; */ + /* gsl_rng *rng = safe_rng_alloc(); */ + + /* ret = build_pedigree_sim( */ + /* &msp, &tables, rng, 100, 2, 3, parents, time, is_sample, NULL); */ + /* ret = msp_initialise(&msp); */ + /* msp_print_state(&msp, stdout); */ + /* CU_ASSERT_EQUAL_FATAL(ret, 0); */ + /* ret = msp_run(&msp, DBL_MAX, UINT32_MAX); */ + /* CU_ASSERT_EQUAL_FATAL(ret, MSP_ERR_PEDIGREE_INTERNAL_SAMPLE); */ + /* tsk_table_collection_free(&tables); */ + /* msp_free(&msp); */ + + /* gsl_rng_free(rng); */ } static void diff --git a/lib/util.c b/lib/util.c index 6b8342d13..270201fea 100644 --- a/lib/util.c +++ b/lib/util.c @@ -183,12 +183,6 @@ msp_strerror_internal(int err) ret = "All individuals in the input pedigree must be associated with " "exactly two parents (can be TSK_NULL, if not known)"; break; - case MSP_ERR_PEDIGREE_INTERNAL_SAMPLE: - ret = "Samples that are internal nodes in the pedigree are not " - "currently supported. Please comment on this GitHub issue if you " - "would like to see this feature implemented: " - "https://github.com/tskit-dev/msprime/issues/1855 "; - break; case MSP_ERR_BAD_PROPORTION: ret = "Proportion values must have 0 <= x <= 1"; diff --git a/lib/util.h b/lib/util.h index 619407268..ced714448 100644 --- a/lib/util.h +++ b/lib/util.h @@ -134,7 +134,6 @@ #define MSP_ERR_PEDIGREE_TIME_TRAVEL -88 #define MSP_ERR_PEDIGREE_IND_NOT_DIPLOID -89 #define MSP_ERR_PEDIGREE_IND_NOT_TWO_PARENTS -90 -#define MSP_ERR_PEDIGREE_INTERNAL_SAMPLE -91 /* clang-format on */ /* This bit is 0 for any errors originating from tskit */ diff --git a/tests/test_pedigree.py b/tests/test_pedigree.py index 6db662cbd..765ec41b1 100644 --- a/tests/test_pedigree.py +++ b/tests/test_pedigree.py @@ -750,6 +750,19 @@ def test_just_samples(self): parent_nodes = {0, 1, 2, 3} assert set(ts.first().roots) == parent_nodes + @pytest.mark.parametrize("num_founders", [2, 3, 10, 20]) + def test_all_samples(self, num_founders): + tables = simulate_pedigree( + num_founders=num_founders, + num_children_prob=[0, 0, 1], + num_generations=6, + sequence_length=100, + ) + flags = tables.nodes.flags + flags[:] = tskit.NODE_IS_SAMPLE + tables.nodes.flags = flags + self.verify(tables) + class TestSimulateThroughPedigreeEventByEvent(TestSimulateThroughPedigree): def verify(self, input_tables, recombination_rate=0): @@ -792,11 +805,14 @@ def verify(self, input_tables, recombination_rate=0): for node_id in tree.nodes(): if tree.num_children(node_id) == 1: # Any unary nodes should be associated with a pedigree - # founder. + # founder or a sapmle node = ts2.node(node_id) assert node.individual != tskit.NULL individual = ts2.individual(node.individual) - assert list(individual.parents) == [tskit.NULL, tskit.NULL] + assert node.is_sample() or list(individual.parents) == [ + tskit.NULL, + tskit.NULL, + ] tables1 = ts1.tables tables2 = ts2.tables tables1.individuals.assert_equals( @@ -912,19 +928,6 @@ def test_10_percent_parents_removed(self): class TestSimulateThroughPedigreeErrors: - def test_parents_are_samples(self): - tables = get_base_tables(100) - parents = [ - add_pedigree_individual(tables, time=1, is_sample=True) for _ in range(2) - ] - add_pedigree_individual(tables, parents=parents, time=0) - - with pytest.raises(_msprime.LibraryError, match="1855"): - msprime.sim_ancestry( - initial_state=tables, - model="fixed_pedigree", - ) - @pytest.mark.parametrize("num_parents", [0, 1, 3]) def test_not_two_parents(self, num_parents): tables = get_base_tables(100) @@ -984,20 +987,6 @@ def test_pedigree_time_travel(self): with pytest.raises(_msprime.InputError, match="time for a parent must be"): msprime.sim_ancestry(initial_state=tables, model="fixed_pedigree") - @pytest.mark.parametrize("num_founders", [2, 3, 10, 20]) - def test_all_samples(self, num_founders): - tables = simulate_pedigree( - num_founders=num_founders, - num_children_prob=[0, 0, 1], - num_generations=6, - sequence_length=100, - ) - flags = tables.nodes.flags - flags[:] = tskit.NODE_IS_SAMPLE - tables.nodes.flags = flags - with pytest.raises(_msprime.LibraryError, match="1855"): - msprime.sim_ancestry(initial_state=tables, model="fixed_pedigree") - class TestSimulateThroughPedigreeMultiplePops: def test_demography_raises_error(self):