diff --git a/GeneticInheritanceGraphLibrary/tables.py b/GeneticInheritanceGraphLibrary/tables.py index ef56152..dae07f2 100644 --- a/GeneticInheritanceGraphLibrary/tables.py +++ b/GeneticInheritanceGraphLibrary/tables.py @@ -97,7 +97,7 @@ class IndividualTableRow( class BaseTable: _RowClass = None - _non_int_fieldtypes = {} + _non_int64_fieldtypes = {} # By default all fields are int64 initial_size = 64 # default max_resize = 2**18 # maximum number of rows by which we expand internal storage _frozen = None # Will be overridden during init @@ -106,7 +106,7 @@ def _create_datastore(self): self._datastore = np.empty( self.initial_size, dtype=[ - (name, self._non_int_fieldtypes.get(name, np.int64)) + (name, self._non_int64_fieldtypes.get(name, np.int64)) for name in self._RowClass._fields ], ) @@ -259,7 +259,7 @@ def _create_datastore(self): self._datastore = np.empty( self.initial_size, dtype=[ - (name, self._non_int_fieldtypes.get(name, np.int64)) + (name, self._non_int64_fieldtypes.get(name, np.int64)) for name in self._RowClass._fields if name not in self._extra_names ], @@ -295,6 +295,10 @@ class IEdgeTable(BaseTable): """ _RowClass = IEdgeTableRow + _non_int64_fieldtypes = { + "child_chromosome": np.int16, # Save some space + "parent_chromosome": np.int16, # Save some space + } # define each property by hand, for speed @property @@ -464,22 +468,22 @@ def add_row( # need to check the values before they were put into the data array, # as numpy silently converts floats to integers on assignment if validate & ValidFlags.IEDGES_INTEGERS: - for is_edge, i in enumerate( + for i, val in enumerate( ( - edge, - child_left, - child_right, - parent_left, - parent_right, - child, - parent, - child_chromosome, - parent_chromosome, + child_left, # 0 + child_right, # 1 + parent_left, # 2 + parent_right, # 3 + child, # 4 + parent, # 5 + child_chromosome, # 6 + parent_chromosome, # 7 + edge, # 8 ) ): - if int(i) != i: + if int(val) != val: raise ValueError("Iedge data must be integers") - if is_edge != 0 and i < 0: + if i < 6 and val < 0: raise ValueError( "Iedge data must be non-negative (except edge ID)" ) @@ -708,7 +712,7 @@ class NodeTable(BaseExtraTable): """ _RowClass = NodeTableRow - _non_int_fieldtypes = {"time": np.float64, "flags": np.uint32} + _non_int64_fieldtypes = {"time": np.float64, "flags": np.uint32} # define each property by hand, for speed @property @@ -758,7 +762,7 @@ def append(self, obj) -> int: class IndividualTable(BaseExtraTable): _RowClass = IndividualTableRow - _non_int_fieldtypes = {"flags": np.uint32} + _non_int64_fieldtypes = {"flags": np.uint32} _extra_names = ["location", "parents", "metadata"] @property diff --git a/requirements.txt b/requirements.txt index ec804e6..83f4bbb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ portion pytest pytest-cov pytest-xdist +tqdm tskit msprime numpy diff --git a/tests/gigutil.py b/tests/gigutil.py index 9caf89b..6090ebd 100644 --- a/tests/gigutil.py +++ b/tests/gigutil.py @@ -1,5 +1,6 @@ import GeneticInheritanceGraphLibrary as gigl import numpy as np +from tqdm.auto import tqdm # Utilities for creating and editing gigs @@ -168,6 +169,7 @@ def run( random_seed=None, initial_node_flags=None, further_node_flags=None, + progress_monitor=None, ): """ Initialise and run a new population for a given number of generations. The last @@ -190,6 +192,8 @@ def run( gens, num_diploids[-gens - 1], node_flags=initial_node_flags ) # First generation by hand, so that we can specify the sequence length + if progress_monitor: + progress_monitor = tqdm(total=gens, desc="Simulating", unit="gen") gens -= 1 self.new_population( gens, @@ -204,6 +208,8 @@ def run( self.new_population( gens, size=num_diploids[-gens - 1], node_flags=further_node_flags ) + if progress_monitor: + progress_monitor.update(1) self.tables.sort() # We should probably simplify or at least sample_resolve here? @@ -211,7 +217,9 @@ def run( # Probably a parameter `simplify` would be useful? return self.tables.copy().graph() - def run_more(self, num_diploids, seq_len, gens, random_seed=None): + def run_more( + self, num_diploids, seq_len, gens, random_seed=None, progress_monitor=None + ): """ The num_diploids parameter can be an array of length `gens` giving the diploid population size in each generation. @@ -225,9 +233,13 @@ def run_more(self, num_diploids, seq_len, gens, random_seed=None): # augment the generations self.tables.change_times(gens) + if progress_monitor: + progress_monitor = tqdm(total=gens, desc="Simulating more", unit="gen") while gens > 0: gens -= 1 self.new_population(gens, size=num_diploids[-gens - 1]) + if progress_monitor: + progress_monitor.update(1) self.tables.sort() return self.tables.copy().graph() @@ -347,6 +359,7 @@ def run( random_seed=None, initial_node_flags=None, further_node_flags=None, + progress_monitor=None, ): """ The num_diploids param can be an array of length `gens + 1` giving the diploid @@ -354,6 +367,8 @@ def run( """ self.rng = np.random.default_rng(random_seed) self.num_tries_for_breakpoint = 20 # number of tries to find a breakpoint + if progress_monitor: + progress_monitor = tqdm(total=gens, desc="Simulating", unit="gen") if isinstance(num_diploids, int): num_diploids = [num_diploids] * (gens + 1) self.pop = self.initialise_population( @@ -364,9 +379,11 @@ def run( self.new_population( gens, size=num_diploids[-gens - 1], node_flags=further_node_flags ) + if progress_monitor: + progress_monitor.update(1) return self.tables_to_gig_without_grand_mrca() - def run_more(self, num_diploids, gens, random_seed=None): + def run_more(self, num_diploids, gens, random_seed=None, progress_monitor=None): """ The num_diploids parameter can be an array of length `gens` giving the diploid population size in each generation. @@ -378,9 +395,13 @@ def run_more(self, num_diploids, gens, random_seed=None): if isinstance(num_diploids, int): num_diploids = [num_diploids] * (gens) self.tables.change_times(gens) + if progress_monitor: + progress_monitor = tqdm(total=gens, desc="Simulating more", unit="gen") while gens > 0: gens -= 1 self.new_population(gens, size=num_diploids[-gens - 1]) + if progress_monitor: + progress_monitor.update(1) return self.tables_to_gig_without_grand_mrca() def find_comparable_points(self, tables, parent_nodes): diff --git a/tests/test_gigutil.py b/tests/test_gigutil.py index 34d2400..8091ea5 100644 --- a/tests/test_gigutil.py +++ b/tests/test_gigutil.py @@ -7,6 +7,7 @@ INVERTED_CHILD_FLAG = 1 << 17 +DUPLICATION_CHILD_FLAG = 1 << 18 # Tests for functions in tests/gigutil.py @@ -168,15 +169,21 @@ class TestDTWF_one_break_no_rec_inversions_slow: default_gens = 5 seq_len = 531 inversion = tskit.Interval(100, 200) - + duplication = tskit.Interval(50, 500) # make a huge duplicated region + progress_monitor = False simulator = None ts = None # only used to extract a tree sequence from subfunctions + iedge_validate = True def test_plain_sim(self): gens = self.default_gens - self.simulator = sim.DTWF_one_break_no_rec_inversions_slow() + self.simulator = sim.DTWF_one_break_no_rec_inversions_slow(self.iedge_validate) gig = self.simulator.run( - num_diploids=10, seq_len=self.seq_len, gens=gens, random_seed=1 + num_diploids=10, + seq_len=self.seq_len, + gens=gens, + random_seed=1, + progress_monitor=self.progress_monitor, ) assert len(np.unique(gig.tables.nodes.time)) == gens + 1 assert gig.num_iedges > 0 @@ -187,12 +194,21 @@ def test_plain_sim(self): def test_run_more(self): gens = self.default_gens - self.simulator = sim.DTWF_one_break_no_rec_inversions_slow() + self.simulator = sim.DTWF_one_break_no_rec_inversions_slow(self.iedge_validate) gig = self.simulator.run( - num_diploids=10, seq_len=self.seq_len, gens=gens, random_seed=1 + num_diploids=10, + seq_len=self.seq_len, + gens=gens, + random_seed=1, + progress_monitor=self.progress_monitor, ) new_gens = 2 - gig = self.simulator.run_more(num_diploids=(4, 2), gens=new_gens, random_seed=1) + gig = self.simulator.run_more( + num_diploids=(4, 2), + gens=new_gens, + random_seed=1, + progress_monitor=self.progress_monitor, + ) times, counts = np.unique(gig.tables.nodes.time, return_counts=True) assert len(times) == gens + new_gens + 1 assert times[0] == 0 @@ -206,9 +222,15 @@ def test_run_more(self): def test_vs_tskit_implementation(self, seed): # The tskit_DTWF_simulator should produce identical results to the GIG simulator gens = self.default_gens - self.simulator = DTWF_one_break_no_rec_inversions_test() + self.simulator = DTWF_one_break_no_rec_inversions_test(self.iedge_validate) ts_simulator = tskit_DTWF_simulator(sequence_length=self.seq_len) - gig = self.simulator.run(7, self.seq_len, gens=gens, random_seed=seed) + gig = self.simulator.run( + 7, + self.seq_len, + gens=gens, + random_seed=seed, + progress_monitor=self.progress_monitor, + ) ts = ts_simulator.run(7, gens=gens, random_seed=seed) ts.tables.assert_equals(gig.to_tree_sequence().tables, ignore_provenance=True) assert ts.num_trees > 0 @@ -225,12 +247,13 @@ def test_inversion(self): Note that this routine can be called directly e.g. from tests.test_gigutil import TestDTWF_one_break_no_rec_inversions_slow - cls = TestDTWF_one_break_no_rec_inversions_slow() - cls.test_inversion() - print(cls.ts) + test = TestDTWF_one_break_no_rec_inversions_slow() + test.test_inversion() + print(test.ts) """ final_pop_size = 100 self.simulator = sim.DTWF_one_break_no_rec_inversions_slow( + self.iedge_validate, initial_sizes={ "nodes": 2 * final_pop_size * self.default_gens, "edges": 2 * final_pop_size * self.default_gens * 2, @@ -246,6 +269,7 @@ def test_inversion(self): further_node_flags=np.array( [[INVERTED_CHILD_FLAG, 0], [0, 0]], dtype=np.int32 ), + progress_monitor=self.progress_monitor, ) # Insert an inversion by editing the tables tables = self.simulator.tables @@ -266,7 +290,11 @@ def test_inversion(self): if ie.child == inverted_child_id and ie.child_left == 0: assert ie.parent_left == 0 assert ie.child_right == ie.parent_right - assert ie.child_right > 200 # check seed gives breakpoint a bit along + # the chosen seed should give a breakpoint a bit along + if ie.child_right <= self.inversion.right: + raise ValueError( + "Choose a different seed (breakpoint not far right enough)" + ) new_tables.add_iedge_row( 0, @@ -320,6 +348,7 @@ def test_inversion(self): num_diploids=final_pop_size, gens=self.default_gens - 1, random_seed=1, + progress_monitor=self.progress_monitor, ) # should have deleted the grand MRCA (used for matching) assert len(gig.nodes) == len(self.simulator.tables.nodes) - 1 @@ -341,6 +370,9 @@ def test_inversion(self): # Check we can turn the decapitated gig tables into a tree sequence # (as long as this isn't simplified, decapitation should remove the only SV) tables = gig.tables.copy() + # shouldn't be able to edit a row like this really (in case + # we change the time) + # tables.nodes[inverted_child_id] = tables. node_map = tables.decapitate(time=gig.max_time) decapitated_gig = tables.graph() # check that decapitation has removed the inversion @@ -389,3 +421,109 @@ def test_inversion(self): assert divergence[0] < max_divergence * 0.999 assert np.isclose(divergence[1], max_divergence) assert divergence[2] < max_divergence * 0.999 + + def test_tandem_duplication(self): + """ + Run a simulation in which a single tandem duplication is introduced then the + population explodes so that the duplication is not lost. + + Unlike an inversion, we can't convert this to a tree sequence because + unequal recombination will keep generating new SVs during the lifetime + of the simulation. + + Note that this routine can be called directly e.g. + from tests.test_gigutil import TestDTWF_one_break_no_rec_inversions_slow + + test = TestDTWF_one_break_no_rec_inversions_slow() + test.progress_monitor = True # to show how the sim is progressing + test.default_gens = 40 # increase number of gens + test.test_tandem_duplication() + print(test.gig) + """ + final_pop_size = 100 + self.simulator = sim.DTWF_one_break_no_rec_inversions_slow( + self.iedge_validate, + initial_sizes={ + "nodes": 2 * final_pop_size * self.default_gens, + "edges": 2 * final_pop_size * self.default_gens * 2, + "individuals": final_pop_size * self.default_gens, + }, + ) + self.simulator.run( + num_diploids=2, + seq_len=self.seq_len, + gens=1, + random_seed=1234, # chosen to give >= 3-fold tandem repeats after 5 gens + further_node_flags=np.array( + [[DUPLICATION_CHILD_FLAG, 0], [0, 0]], dtype=np.int32 + ), + progress_monitor=self.progress_monitor, + ) + # Insert a tandem duplication by editing the tables + tables = self.simulator.tables + times, inverses = np.unique(tables.nodes.time, return_inverse=True) + assert len(times) == 3 + first_gen = np.where(inverses == np.where([times == 1])[0])[0] + second_gen = np.where(inverses == np.where([times == 0])[0])[0] + assert len(first_gen) == 4 # haploid genomes + assert len(second_gen) == 4 # haploid genomes + # Edit the existing iedges to create a duplication in one child + new_tables = tables.copy(omit_iedges=True) + dup_child_id = np.where(tables.nodes.flags == DUPLICATION_CHILD_FLAG)[0] + assert len(dup_child_id) == 1 + dup_child_id = dup_child_id[0] + assert dup_child_id in second_gen + for ie in tables.iedges: + if ie.child == dup_child_id: + assert ie.child_right == ie.parent_right + if ie.child_left == 0: + assert ie.parent_left == 0 + new_tables.add_iedge_row( + 0, + self.duplication.right, + 0, + self.duplication.right, + child=dup_child_id, + parent=ie.parent, + **self.simulator.add_iedge_params(), + ) + new_tables.add_iedge_row( + self.duplication.right, + self.seq_len + self.duplication.span, + self.duplication.left, + self.seq_len, + child=dup_child_id, + parent=ie.parent, + **self.simulator.add_iedge_params(), + ) + else: + # Don't add in the edge for the other parent. This duplication + # will not be associated with a recombination + pass + else: + new_tables.add_iedge_row( + **ie._asdict(), **self.simulator.add_iedge_params() + ) + new_tables.sort() + self.simulator.tables = new_tables + # Check it gives a valid gig + gig = self.simulator.tables.copy().graph() + + # Can progress the simulation + gig = self.simulator.run_more( + num_diploids=final_pop_size, + gens=self.default_gens - 1, + random_seed=1, + progress_monitor=self.progress_monitor, + ) + # should have deleted the grand MRCA (used for matching) + assert len(gig.nodes) == len(self.simulator.tables.nodes) - 1 + + self.gig = gig + # Check that there are a number of duplicated regions: + + lengths = [gig.sequence_length(u) for u in gig.samples] + unique_lengths = np.unique(lengths) + assert len(unique_lengths) > 2 + assert np.all((np.diff(unique_lengths) % np.diff(self.duplication)) == 0) + # These should also form a tree that traces the duplicated regions diff --git a/tests/test_tables.py b/tests/test_tables.py index 74a6173..99dc982 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -389,6 +389,20 @@ def test_flags(self, flag, skip): ) assert tables.iedges.flags == flag + def test_add_iedge_allow_negative_edge_chrom(self): + # Negative chromosome numbers could be used e.g. for circular chromosomes + flags = ValidFlags.IEDGES_INTEGERS + tables = gigl.Tables() + tables.nodes.add_row(time=1) + tables.nodes.add_row(time=0) + tables.add_iedge_row(0, 1, 0, 1, child=1, parent=0, edge=-1, validate=flags) + tables.add_iedge_row( + 0, 1, 0, 1, child=1, parent=0, parent_chromosome=-1, validate=flags + ) + tables.add_iedge_row( + 0, 1, 0, 1, child=1, parent=0, parent_chromosome=-1, validate=flags + ) + def test_bad_flags(self): flags = ValidFlags.IEDGES_COMBO_NODE_TABLE tables = gigl.Tables() @@ -410,8 +424,18 @@ def test_add_iedge_row_fail_negative(self): tables = gigl.Tables() tables.nodes.add_row(time=1) tables.nodes.add_row(time=0) + with pytest.raises(ValueError, match="non-negative"): + tables.add_iedge_row(-1, 0, 0, 1, child=1, parent=0, validate=flags) + with pytest.raises(ValueError, match="non-negative"): + tables.add_iedge_row(0, -1, 0, 1, child=1, parent=0, validate=flags) + with pytest.raises(ValueError, match="non-negative"): + tables.add_iedge_row(0, 1, 0, -1, child=1, parent=0, validate=flags) with pytest.raises(ValueError, match="non-negative"): tables.add_iedge_row(0, 1, -1, 0, child=1, parent=0, validate=flags) + with pytest.raises(ValueError, match="non-negative"): + tables.add_iedge_row(0, 1, 0, 1, child=-1, parent=0, validate=flags) + with pytest.raises(ValueError, match="non-negative"): + tables.add_iedge_row(0, 1, 0, 1, child=0, parent=-1, validate=flags) def test_add_iedge_row_fail_child_nonoverlapping(self): flags = ValidFlags.IEDGES_FOR_CHILD_NONOVERLAPPING