Remove uuid

tskit-dev · Nov 3, 2022 · 0a0f60c · 0a0f60c
1 parent 5ce3c24
commit 0a0f60c
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 54 deletions.
diff --git a/tests/test_formats.py b/tests/test_formats.py
@@ -1077,7 +1077,6 @@ def test_copy_new_uuid(self):
         data.finalise()
         copy = data.copy()
         copy.finalise()
-        assert copy.uuid != data.uuid
         assert copy.data_equal(data)
 
     def test_copy_update_sites_time(self):
@@ -1921,8 +1920,6 @@ def verify_data_round_trip(self, sample_data, ancestor_data, ancestors):
         ancestor_data.record_provenance("verify_data_round_trip")
         ancestor_data.finalise()
 
-        assert len(ancestor_data.uuid) > 0
-        assert ancestor_data.sample_data_uuid == sample_data.uuid
         assert ancestor_data.sequence_length == sample_data.sequence_length
         assert ancestor_data.format_name == formats.AncestorData.FORMAT_NAME
         assert ancestor_data.format_version == formats.AncestorData.FORMAT_VERSION
@@ -2195,11 +2192,9 @@ def test_bad_insert_proxy_samples(self):
     def test_insert_proxy_bad_sample_data(self):
         sample_data, _ = self.get_example_data(10, 10, 40)
         ancestors = tsinfer.generate_ancestors(sample_data)
-        # by default, sample_data must be the same
         sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
-        with pytest.raises(ValueError):
-            ancestors.insert_proxy_samples(sd_copy)
-        # But works if we don't require same data
+        ancestors.insert_proxy_samples(sd_copy)
+        # Deprecated flag should change nothing
         ancestors.insert_proxy_samples(sd_copy, require_same_sample_data=False)
         # Unless seq lengths differ
         sd_copy, _ = self.get_example_data(10, sequence_length=11, num_ancestors=40)
@@ -2229,8 +2224,8 @@ def test_insert_proxy_no_samples(self):
         sample_data, _ = self.get_example_data(10, 10, 40)
         ancestors = tsinfer.generate_ancestors(sample_data)
         ancestors_extra = ancestors.insert_proxy_samples(sample_data, sample_ids=[])
-        assert ancestors != ancestors_extra  # UUIDs should differ ...
-        assert ancestors.data_equal(ancestors_extra)  # but data be identical
+        assert ancestors == ancestors_extra  # Equality based on data
+        assert ancestors.data_equal(ancestors_extra)  # data should be identical
 
     def test_insert_proxy_1_sample(self):
         sample_data, _ = self.get_example_data(10, 10, 40)

diff --git a/tsinfer/formats.py b/tsinfer/formats.py
@@ -28,7 +28,6 @@
 import queue
 import sys
 import threading
-import uuid
 import warnings
 
 import attr
@@ -385,7 +384,6 @@ def __init__(
             self.data = zarr.open_group(store=store, mode="w")
         self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
         self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
-        self.data.attrs["uuid"] = str(uuid.uuid4())
 
         chunks = self._chunk_size
         provenances_group = self.data.create_group("provenances")
@@ -486,8 +484,7 @@ def copy(self, path=None, max_file_size=None):
         """
         Returns a copy of this DataContainer opened in 'edit' mode. If path
         is specified, this must not be equal to the path of the current
-        data container. The new container will have a different UUID to the
-        current.
+        data container.
         """
         if self._mode != self.READ_MODE:
             raise ValueError("Cannot copy unless in read mode.")
@@ -516,8 +513,6 @@ def copy(self, path=None, max_file_size=None):
             store = other._new_lmdb_store(max_file_size)
             zarr.copy_store(self.data.store, store)
             other.data = zarr.group(store)
-        # Set a new UUID
-        other.data.attrs["uuid"] = str(uuid.uuid4())
         other.data.attrs[FINALISED_KEY] = False
         other._mode = self.EDIT_MODE
         return other
@@ -663,10 +658,6 @@ def finalised(self):
             ret = self.data.attrs[FINALISED_KEY]
         return ret
 
-    @property
-    def uuid(self):
-        return str(self.data.attrs["uuid"])
-
     @property
     def num_provenances(self):
         return self.provenances_timestamp.shape[0]
@@ -693,7 +684,7 @@ def _format_str(self, values):
     def __eq__(self, other):
         ret = NotImplemented
         if isinstance(other, type(self)):
-            ret = self.uuid == other.uuid and self.data_equal(other)
+            ret = self.data_equal(other)
         return ret
 
     def __str__(self):
@@ -703,7 +694,6 @@ def __str__(self):
             ("format_name", self.format_name),
             ("format_version", self.format_version),
             ("finalised", self.finalised),
-            ("uuid", self.uuid),
             ("num_provenances", self.num_provenances),
             ("provenances/timestamp", zarr_summary(self.provenances_timestamp)),
             ("provenances/record", zarr_summary(self.provenances_record)),
@@ -1320,10 +1310,10 @@ def data_equal(self, other):
         """
         Returns True if all the data attributes of this input file and the
         specified input file are equal. This compares every attribute except
-        the UUID and provenance.
+        the provenance.
 
         To compare two :class:`SampleData` instances for exact equality of
-        all data including UUIDs and provenance data, use ``s1 == s2``.
+        all data including provenance data, use ``s1 == s2``.
 
         :param SampleData other: The other :class:`SampleData` instance to
             compare with.
@@ -2246,13 +2236,6 @@ def __metadata_schema_getter(self, zarr_group):
         except KeyError:
             return {"codec": "json"}
 
-    @property
-    def uuid(self):
-        return (
-            "Hmm, not sure, could just generate a UUID, but then it wouldn't"
-            "be in the file - maybe we do need to write back on init"
-        )
-
     @property
     def format_name(self):
         return self.FORMAT_NAME
@@ -2548,7 +2531,6 @@ def __init__(self, sample_data, **kwargs):
         super().__init__(**kwargs)
         sample_data._check_finalised()
         self.sample_data = sample_data
-        self.data.attrs["sample_data_uuid"] = sample_data.uuid
         if self.sample_data.sequence_length == 0:
             raise ValueError("Bad samples file: sequence_length cannot be zero")
         self.data.attrs["sequence_length"] = self.sample_data.sequence_length
@@ -2628,7 +2610,6 @@ def summary(self):
     def __str__(self):
         values = [
             ("sequence_length", self.sequence_length),
-            ("sample_data_uuid", self.sample_data_uuid),
             ("num_ancestors", self.num_ancestors),
             ("num_sites", self.num_sites),
             ("sites/position", zarr_summary(self.sites_position)),
@@ -2643,12 +2624,10 @@ def __str__(self):
     def data_equal(self, other):
         """
         Returns True if all the data attributes of this input file and the
-        specified input file are equal. This compares every attribute except
-        the UUID.
+        specified input file are equal. This compares every attribute.
         """
         return (
             self.sequence_length == other.sequence_length
-            and self.sample_data_uuid == other.sample_data_uuid
             and self.format_name == other.format_name
             and self.format_version == other.format_version
             and self.num_ancestors == other.num_ancestors
@@ -2670,10 +2649,6 @@ def sequence_length(self):
         """
         return self.data.attrs["sequence_length"]
 
-    @property
-    def sample_data_uuid(self):
-        return self.data.attrs["sample_data_uuid"]
-
     @property
     def num_ancestors(self):
         return self.ancestors_start.shape[0]
@@ -2788,14 +2763,7 @@ def insert_proxy_samples(
             (i.e. breaking the infinite sites assumption), allowing them to possess
             derived alleles at sites where there are no pre-existing mutations in
             older ancestors.
-        :param bool require_same_sample_data: If ``True`` (default) then the
-            the ``sample_data`` parameter must point to the same :class:`.SampleData`
-            instance as that used to generate the current ancestors. If ``False``,
-            this requirement is not enforced, and it is the user's responsibility
-            to ensure that the encoding of alleles in ``sample_data`` matches the
-            encoding in the current :class:`AncestorData` instance (i.e. that in the
-            original :class:`.SampleData` instance on which the current ancestors
-            are based).
+        :param bool require_same_sample_data: **Deprecated**; has no effect.
         :param \\**kwargs: Further arguments passed to the constructor when creating
             the new :class:`AncestorData` instance which will be returned.
 
@@ -2804,11 +2772,6 @@ def insert_proxy_samples(
         """
         self._check_finalised()
         sample_data._check_finalised()
-        if require_same_sample_data:
-            if sample_data.uuid != self.sample_data_uuid:
-                raise ValueError(
-                    "sample_data differs from that used to build the initial ancestors"
-                )
         if self.sequence_length != sample_data.sequence_length:
             raise ValueError("sample_data does not have the correct sequence length")
         used_sites = np.isin(sample_data.sites_position[:], self.sites_position[:])
@@ -2903,8 +2866,6 @@ def insert_proxy_samples(
             other.clear_provenances()
             for timestamp, record in self.provenances():
                 other.add_provenance(timestamp, record)
-            if sample_data.uuid != self.sample_data_uuid:
-                pass  # TODO: if sample files don't match, we need extra provenance info
             other.record_provenance(command="insert_proxy_samples", **kwargs)
 
         assert other.num_ancestors == self.num_ancestors + len(sample_ids)

diff --git a/tsinfer/inference.py b/tsinfer/inference.py
@@ -1568,6 +1568,10 @@ def get_ancestors_tables(self):
         tables.build_index()
         tables.compute_mutation_parents()
         logger.debug("Sorting ancestors tree sequence done")
+        for timestamp, record in self.ancestor_data.provenances():
+            tables.provenances.add_row(timestamp=timestamp, record=json.dumps(record))
+        record = provenance.get_provenance_dict(command="match_ancestors")
+        tables.provenances.add_row(record=json.dumps(record))
         logger.info(
             "Built ancestors tree sequence: {} nodes ({} pc ancestors); {} edges; "
             "{} sites; {} mutations".format(