Skip to content

Commit

Permalink
Remove uuid
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Nov 3, 2022
1 parent 5ce3c24 commit 0a0f60c
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 54 deletions.
13 changes: 4 additions & 9 deletions tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,7 +1077,6 @@ def test_copy_new_uuid(self):
data.finalise()
copy = data.copy()
copy.finalise()
assert copy.uuid != data.uuid
assert copy.data_equal(data)

def test_copy_update_sites_time(self):
Expand Down Expand Up @@ -1921,8 +1920,6 @@ def verify_data_round_trip(self, sample_data, ancestor_data, ancestors):
ancestor_data.record_provenance("verify_data_round_trip")
ancestor_data.finalise()

assert len(ancestor_data.uuid) > 0
assert ancestor_data.sample_data_uuid == sample_data.uuid
assert ancestor_data.sequence_length == sample_data.sequence_length
assert ancestor_data.format_name == formats.AncestorData.FORMAT_NAME
assert ancestor_data.format_version == formats.AncestorData.FORMAT_VERSION
Expand Down Expand Up @@ -2195,11 +2192,9 @@ def test_bad_insert_proxy_samples(self):
def test_insert_proxy_bad_sample_data(self):
sample_data, _ = self.get_example_data(10, 10, 40)
ancestors = tsinfer.generate_ancestors(sample_data)
# by default, sample_data must be the same
sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
with pytest.raises(ValueError):
ancestors.insert_proxy_samples(sd_copy)
# But works if we don't require same data
ancestors.insert_proxy_samples(sd_copy)
# Deprecated flag should change nothing
ancestors.insert_proxy_samples(sd_copy, require_same_sample_data=False)
# Unless seq lengths differ
sd_copy, _ = self.get_example_data(10, sequence_length=11, num_ancestors=40)
Expand Down Expand Up @@ -2229,8 +2224,8 @@ def test_insert_proxy_no_samples(self):
sample_data, _ = self.get_example_data(10, 10, 40)
ancestors = tsinfer.generate_ancestors(sample_data)
ancestors_extra = ancestors.insert_proxy_samples(sample_data, sample_ids=[])
assert ancestors != ancestors_extra # UUIDs should differ ...
assert ancestors.data_equal(ancestors_extra) # but data be identical
assert ancestors == ancestors_extra # Equality based on data
assert ancestors.data_equal(ancestors_extra) # data should be identical

def test_insert_proxy_1_sample(self):
sample_data, _ = self.get_example_data(10, 10, 40)
Expand Down
51 changes: 6 additions & 45 deletions tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import queue
import sys
import threading
import uuid
import warnings

import attr
Expand Down Expand Up @@ -385,7 +384,6 @@ def __init__(
self.data = zarr.open_group(store=store, mode="w")
self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
self.data.attrs["uuid"] = str(uuid.uuid4())

chunks = self._chunk_size
provenances_group = self.data.create_group("provenances")
Expand Down Expand Up @@ -486,8 +484,7 @@ def copy(self, path=None, max_file_size=None):
"""
Returns a copy of this DataContainer opened in 'edit' mode. If path
is specified, this must not be equal to the path of the current
data container. The new container will have a different UUID to the
current.
data container.
"""
if self._mode != self.READ_MODE:
raise ValueError("Cannot copy unless in read mode.")
Expand Down Expand Up @@ -516,8 +513,6 @@ def copy(self, path=None, max_file_size=None):
store = other._new_lmdb_store(max_file_size)
zarr.copy_store(self.data.store, store)
other.data = zarr.group(store)
# Set a new UUID
other.data.attrs["uuid"] = str(uuid.uuid4())
other.data.attrs[FINALISED_KEY] = False
other._mode = self.EDIT_MODE
return other
Expand Down Expand Up @@ -663,10 +658,6 @@ def finalised(self):
ret = self.data.attrs[FINALISED_KEY]
return ret

@property
def uuid(self):
return str(self.data.attrs["uuid"])

@property
def num_provenances(self):
return self.provenances_timestamp.shape[0]
Expand All @@ -693,7 +684,7 @@ def _format_str(self, values):
def __eq__(self, other):
ret = NotImplemented
if isinstance(other, type(self)):
ret = self.uuid == other.uuid and self.data_equal(other)
ret = self.data_equal(other)
return ret

def __str__(self):
Expand All @@ -703,7 +694,6 @@ def __str__(self):
("format_name", self.format_name),
("format_version", self.format_version),
("finalised", self.finalised),
("uuid", self.uuid),
("num_provenances", self.num_provenances),
("provenances/timestamp", zarr_summary(self.provenances_timestamp)),
("provenances/record", zarr_summary(self.provenances_record)),
Expand Down Expand Up @@ -1320,10 +1310,10 @@ def data_equal(self, other):
"""
Returns True if all the data attributes of this input file and the
specified input file are equal. This compares every attribute except
the UUID and provenance.
the provenance.
To compare two :class:`SampleData` instances for exact equality of
all data including UUIDs and provenance data, use ``s1 == s2``.
all data including provenance data, use ``s1 == s2``.
:param SampleData other: The other :class:`SampleData` instance to
compare with.
Expand Down Expand Up @@ -2246,13 +2236,6 @@ def __metadata_schema_getter(self, zarr_group):
except KeyError:
return {"codec": "json"}

@property
def uuid(self):
return (
"Hmm, not sure, could just generate a UUID, but then it wouldn't"
"be in the file - maybe we do need to write back on init"
)

@property
def format_name(self):
return self.FORMAT_NAME
Expand Down Expand Up @@ -2548,7 +2531,6 @@ def __init__(self, sample_data, **kwargs):
super().__init__(**kwargs)
sample_data._check_finalised()
self.sample_data = sample_data
self.data.attrs["sample_data_uuid"] = sample_data.uuid
if self.sample_data.sequence_length == 0:
raise ValueError("Bad samples file: sequence_length cannot be zero")
self.data.attrs["sequence_length"] = self.sample_data.sequence_length
Expand Down Expand Up @@ -2628,7 +2610,6 @@ def summary(self):
def __str__(self):
values = [
("sequence_length", self.sequence_length),
("sample_data_uuid", self.sample_data_uuid),
("num_ancestors", self.num_ancestors),
("num_sites", self.num_sites),
("sites/position", zarr_summary(self.sites_position)),
Expand All @@ -2643,12 +2624,10 @@ def __str__(self):
def data_equal(self, other):
"""
Returns True if all the data attributes of this input file and the
specified input file are equal. This compares every attribute except
the UUID.
specified input file are equal. This compares every attribute.
"""
return (
self.sequence_length == other.sequence_length
and self.sample_data_uuid == other.sample_data_uuid
and self.format_name == other.format_name
and self.format_version == other.format_version
and self.num_ancestors == other.num_ancestors
Expand All @@ -2670,10 +2649,6 @@ def sequence_length(self):
"""
return self.data.attrs["sequence_length"]

@property
def sample_data_uuid(self):
return self.data.attrs["sample_data_uuid"]

@property
def num_ancestors(self):
return self.ancestors_start.shape[0]
Expand Down Expand Up @@ -2788,14 +2763,7 @@ def insert_proxy_samples(
(i.e. breaking the infinite sites assumption), allowing them to possess
derived alleles at sites where there are no pre-existing mutations in
older ancestors.
:param bool require_same_sample_data: If ``True`` (default) then the
``sample_data`` parameter must point to the same :class:`.SampleData`
instance as that used to generate the current ancestors. If ``False``,
this requirement is not enforced, and it is the user's responsibility
to ensure that the encoding of alleles in ``sample_data`` matches the
encoding in the current :class:`AncestorData` instance (i.e. that in the
original :class:`.SampleData` instance on which the current ancestors
are based).
:param bool require_same_sample_data: **Deprecated** Has no effect.
:param \\**kwargs: Further arguments passed to the constructor when creating
the new :class:`AncestorData` instance which will be returned.
Expand All @@ -2804,11 +2772,6 @@ def insert_proxy_samples(
"""
self._check_finalised()
sample_data._check_finalised()
if require_same_sample_data:
if sample_data.uuid != self.sample_data_uuid:
raise ValueError(
"sample_data differs from that used to build the initial ancestors"
)
if self.sequence_length != sample_data.sequence_length:
raise ValueError("sample_data does not have the correct sequence length")
used_sites = np.isin(sample_data.sites_position[:], self.sites_position[:])
Expand Down Expand Up @@ -2903,8 +2866,6 @@ def insert_proxy_samples(
other.clear_provenances()
for timestamp, record in self.provenances():
other.add_provenance(timestamp, record)
if sample_data.uuid != self.sample_data_uuid:
pass # TODO: if sample files don't match, we need extra provenance info
other.record_provenance(command="insert_proxy_samples", **kwargs)

assert other.num_ancestors == self.num_ancestors + len(sample_ids)
Expand Down
4 changes: 4 additions & 0 deletions tsinfer/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,6 +1568,10 @@ def get_ancestors_tables(self):
tables.build_index()
tables.compute_mutation_parents()
logger.debug("Sorting ancestors tree sequence done")
for timestamp, record in self.ancestor_data.provenances():
tables.provenances.add_row(timestamp=timestamp, record=json.dumps(record))
record = provenance.get_provenance_dict(command="match_ancestors")
tables.provenances.add_row(record=json.dumps(record))
logger.info(
"Built ancestors tree sequence: {} nodes ({} pc ancestors); {} edges; "
"{} sites; {} mutations".format(
Expand Down

0 comments on commit 0a0f60c

Please sign in to comment.