From f10553e41f84036c9ce704293eaa4b7ed2be4675 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 25 Apr 2024 17:57:34 -0700 Subject: [PATCH 01/20] initial cleanup --- zarrtraj/ZARRTRAJ.py | 83 ++++++++++---------------------------------- 1 file changed, 19 insertions(+), 64 deletions(-) diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index 0cbf75c..97e400c 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -75,6 +75,7 @@ from MDAnalysis.due import due, Doi from MDAnalysis.lib.util import store_init_arguments import dask.array as da +from enum import Enum import time # NOTE: REMOVE after test @@ -95,78 +96,43 @@ class MockZarrFile: HAS_ZARR = True +class ZarrTrajBoundaryConditions(Enum): + ZARRTRAJ_NONE = 0 + ZARRTRAJ_PERIODIC = 1 + class ZarrTrajReader(base.ReaderBase): format = 'ZARRTRAJ' - # This dictionary is used to translate from Zarrtraj units to MDAnalysis units - _unit_translation = { - 'time': { - 'ps': 'ps', - 'fs': 'fs', - 'ns': 'ns', - 'second': 's', - 'sec': 's', - 's': 's', - 'AKMA': 'AKMA', - }, - 'length': { - 'Angstrom': 'Angstrom', - 'angstrom': 'Angstrom', - 'A': 'Angstrom', - 'nm': 'nm', - 'pm': 'pm', - 'fm': 'fm', - }, - 'velocity': { - 'Angstrom ps-1': 'Angstrom/ps', - 'A ps-1': 'Angstrom/ps', - 'Angstrom fs-1': 'Angstrom/fs', - 'A fs-1': 'Angstrom/fs', - 'Angstrom AKMA-1': 'Angstrom/AKMA', - 'A AKMA-1': 'Angstrom/AKMA', - 'nm ps-1': 'nm/ps', - 'nm ns-1': 'nm/ns', - 'pm ps-1': 'pm/ps', - 'm s-1': 'm/s' - }, - 'force': { - 'kJ mol-1 Angstrom-1': 'kJ/(mol*Angstrom)', - 'kJ mol-1 nm-1': 'kJ/(mol*nm)', - 'Newton': 'Newton', - 'N': 'N', - 'J m-1': 'J/m', - 'kcal mol-1 Angstrom-1': 'kcal/(mol*Angstrom)', - 'kcal mol-1 A-1': 'kcal/(mol*Angstrom)' - } - } - @store_init_arguments def __init__(self, filename, convert_units=True, **kwargs): - + if not HAS_ZARR: raise RuntimeError("Please install zarr") super(ZarrTrajReader, self).__init__(filename, **kwargs) + # ReaderBase calls self.filename = str(filename), which we want to undo self.filename = filename + if not self.filename: + raise PermissionError("The Zarr group is not readable") self.convert_units = convert_units + self._frame = -1 + self._file = self.filename + self._particle_group = self._file['particles'] - self.open_trajectory() - - # _has dictionary used for checking whether zarrtraj file has - # 'position', 'velocity', or 'force' groups in the file # IO CALL self._has = {name: name in self._particle_group for - name in ('position', 'velocity', 'force')} + name in ('positions', 'velocities', 'forces')} # IO CALL - self._has_edges = 'edges' in self._particle_group['box'] - - # Gets some info about what settings the datasets were created with - # from first available group + self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_NONE if + self._particle_group["boundary"] == "none" + else ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC) + # Get n_atoms + numcodecs compressor & filter objects from + # first available dataset # IO CALLS for name, value in self._has.items(): if value: - dset = self._particle_group[f'{name}/value'] + dset = self._particle_group[name] self.n_atoms = dset.shape[1] self.compressor = dset.compressor break @@ -187,17 +153,6 @@ def __init__(self, filename, self._set_translated_units() # fills units dictionary self._read_next_timestep() - def open_trajectory(self): - """opens the trajectory file using zarr library""" - if not self.filename: - raise PermissionError("The Zarr group is not readable") - self._frame = -1 - self._file = self.filename - # pulls first key out of 'particles' - # allows for arbitrary name of group1 in 'particles' - self._particle_group = self._file['particles'][ - list(self._file['particles'])[0]] - def _set_translated_units(self): """converts units from ZARRTRAJ to MDAnalysis notation and fills units dictionary""" From 295e3bdeafc25e3cb758b6e9fb5d3a40db114692 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Thu, 25 Apr 2024 17:58:15 -0700 Subject: [PATCH 02/20] revising compression requirements --- docs/source/zarrtraj-file-spec/v0.1.0.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/zarrtraj-file-spec/v0.1.0.rst b/docs/source/zarrtraj-file-spec/v0.1.0.rst index 14efc51..6027fe3 100644 --- a/docs/source/zarrtraj-file-spec/v0.1.0.rst +++ b/docs/source/zarrtraj-file-spec/v0.1.0.rst @@ -7,6 +7,9 @@ The *ZarrTraj* trajectory file format is based on the streamable, compressible Units ----- +Zarrtraj uses the following units by default, but supports reading and writing all +`units used by MDAnalysis `_ + .. list-table :widths: 25 25 25 :header-rows: 1 @@ -91,7 +94,9 @@ Additional Requirements ----------------------- * The zarrtraj file must contain at least one of positions, velocities, or forces. -* Positions, velocities, and forces must be sampled at the same rate. +* Positions, velocities, and forces must be sampled at the same rate +* Positions, velocities, and forces must be compressed & filtered with the same + numcodecs compressor and filter objects * The step & time arrays must increase monotonically. From 82a1bbd4219a19b80c6c44c6b66c63fee1ab0fdd Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 26 Apr 2024 18:20:09 -0700 Subject: [PATCH 03/20] refactor --- docs/source/zarrtraj-file-spec/v0.1.0.rst | 2 +- zarrtraj/ZARRTRAJ.py | 237 ++++++++-------------- 2 files changed, 89 insertions(+), 150 deletions(-) diff --git a/docs/source/zarrtraj-file-spec/v0.1.0.rst b/docs/source/zarrtraj-file-spec/v0.1.0.rst index 6027fe3..0be6c5d 100644 --- a/docs/source/zarrtraj-file-spec/v0.1.0.rst +++ b/docs/source/zarrtraj-file-spec/v0.1.0.rst @@ -96,7 +96,7 @@ Additional Requirements * The zarrtraj file must contain at least one of positions, velocities, or forces. * Positions, velocities, and forces must be sampled at the same rate * Positions, velocities, and forces must be compressed & filtered with the same - numcodecs compressor and filter objects + numcodecs compressor object and filter list * The step & time arrays must increase monotonically. diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index 97e400c..e87fbf7 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -100,6 +100,7 @@ class ZarrTrajBoundaryConditions(Enum): ZARRTRAJ_NONE = 0 ZARRTRAJ_PERIODIC = 1 + class ZarrTrajReader(base.ReaderBase): format = 'ZARRTRAJ' @@ -119,182 +120,118 @@ def __init__(self, filename, self._frame = -1 self._file = self.filename self._particle_group = self._file['particles'] - - # IO CALL - self._has = {name: name in self._particle_group for - name in ('positions', 'velocities', 'forces')} + self._step_array = self._particle_group['step'] + self._time_array = self._particle_group['time'] # IO CALL self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_NONE if self._particle_group["boundary"] == "none" else ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC) + + if (self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC and + 'dimensions' not in self._particle_group['box']): + raise NoDataError("Triclinic box dimension array must " + + "be provided if `boundary` is set to 'periodic'") + + # IO CALL + self._has = set(name in self._particle_group for + name in ('positions', 'velocities', 'forces')) + # Get n_atoms + numcodecs compressor & filter objects from # first available dataset # IO CALLS - for name, value in self._has.items(): - if value: - dset = self._particle_group[name] - self.n_atoms = dset.shape[1] - self.compressor = dset.compressor - break + for name in self._has: + dset = self._particle_group[name] + self.n_atoms = dset.shape[1] + self.compressor = dset.compressor + # NOTE: fixme + self.filters = dset.store[f"{name}/.zarray"] + break else: - raise NoDataError("Provide at least a position, velocity" - " or force group in the h5md file.") - + raise NoDataError("Provide at least a position, velocity " + + "or force array in the ZarrTraj file.") + + for name in self._has: + self._n_frames = self._particle_group[name].shape[0] + break + self.ts = self._Timestep(self.n_atoms, positions=self.has_positions, velocities=self.has_velocities, forces=self.has_forces, **self._ts_kwargs) - + self.units = {'time': None, 'length': None, 'velocity': None, 'force': None} - self._set_translated_units() # fills units dictionary + self._fill_units_dict() self._read_next_timestep() - def _set_translated_units(self): - """converts units from ZARRTRAJ to MDAnalysis notation - and fills units dictionary""" + def _fill_units_dict(self): + """Verify time unit is present and that units for each + measurement in (positions, forces, and velocites) in the file + is present and fill units dict""" - # need this dictionary to associate 'position': 'length' _group_unit_dict = {'time': 'time', - 'position': 'length', - 'velocity': 'velocity', - 'force': 'force' + 'positions': 'length', + 'velocities': 'velocity', + 'forces': 'force' } for group, unit in _group_unit_dict.items(): - self._translate_zarrtraj_units(group, unit) - self._check_units(group, unit) - - def _translate_zarrtraj_units(self, group, unit): - """stores the translated unit string into the units dictionary""" - - errmsg = "{} unit '{}' is not recognized by ZarrTrajReader." - - if unit == 'time' or self._has[group]: - try: - self.units[unit] = self._unit_translation[unit][ - self._file['particles']['units'].attrs[unit]] - except KeyError: - raise RuntimeError(errmsg.format( - unit, self._file['particles'][ - 'units'].attrs[unit]) - ) from None - - def _check_units(self, group, unit): - """Raises error if no units are provided from Zarrtraj file - and convert_units=True""" - - if not self.convert_units: - return - - errmsg = "Zarrtraj file must have readable units if ``convert_units`` is" - " set to ``True``. MDAnalysis sets ``convert_units=True`` by default." - " Set ``convert_units=False`` to load Universe without units." - - if unit == 'time': - if self.units['time'] is None: - raise ValueError(errmsg) + if group in self._has: + try: + self.units[unit] = self._particle_group[ + "units"].attrs[unit] + except KeyError: + raise ValueError("Zarrtraj file must have units set for " + + "time and each of positions, " + + "velocities, and forces present") - else: - if self._has[group]: - if self.units[unit] is None: - raise ValueError(errmsg) - def _read_next_timestep(self): - """read next frame in trajectory""" + """Read next frame in trajectory""" return self._read_frame(self._frame + 1) def _read_frame(self, frame): - """reads data from zarrtraj file and copies to current timestep""" - try: - for name, value in self._has.items(): - if value and 'step' in self._particle_group[name]: - _ = self._particle_group[name]['step'][frame] - break - else: - if self._has_edges and 'step' in self._particle_group['box']['edges']: - _ = self._particle_group['box']['edges']['step'][frame] - else: - raise NoDataError("Provide at least a position, velocity" - " or force group in the zarrtraj file.") - - except (ValueError, IndexError): - raise IOError from None - + """Reads data from zarrtraj file and copies to current timestep""" self._frame = frame ts = self.ts particle_group = self._particle_group ts.frame = frame - # fills data dictionary from 'observables' group + # NOTE: handle observables + # Fills data dictionary from 'observables' group # Note: dt is not read into data as it is not decided whether # Timestep should have a dt attribute (see Issue #2825) - self._copy_to_data() + self.ts.time = self._time_array[self._frame] + self.ts.data['step'] = self._step_array[self._frame] # Sets frame box dimensions - # Note: Zarrtraj files must contain 'box' group in each 'particles' group - if "edges" in particle_group["box"]: - edges = particle_group["box/edges/value"][frame, :] - # A D-dimensional vector or a D × D matrix, depending on the - # geometry of the box, of Float or Integer type. If edges is a - # vector, it specifies the space diagonal of a cuboid-shaped box. - # If edges is a matrix, the box is of triclinic shape with the edge - # vectors given by the rows of the matrix. - if edges.shape == (3,): - ts.dimensions = [*edges, 90, 90, 90] - else: - ts.dimensions = core.triclinic_box(*edges) + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: + edges = particle_group["box/dimensions"][frame, :] + ts.dimensions = core.triclinic_box(*edges) else: ts.dimensions = None # set the timestep positions, velocities, and forces with # current frame dataset - if self._has['position']: - self._read_dataset_into_ts('position', ts.positions) - if self._has['velocity']: - self._read_dataset_into_ts('velocity', ts.velocities) - if self._has['force']: - self._read_dataset_into_ts('force', ts.forces) + if self.has_positions: + self._read_dataset_into_ts('positions', ts.positions) + if self.has_velocities: + self._read_dataset_into_ts('velocities', ts.velocities) + if self.has_forces: + self._read_dataset_into_ts('forces', ts.forces) if self.convert_units: self._convert_units() return ts - - def _copy_to_data(self): - """assigns values to keys in data dictionary""" - - # pulls 'time' and 'step' out of first available parent group - time_found = False - step_found = False - - for name, value in self._has.items(): - if value: - if 'time' in self._particle_group[name]: - self.ts.time = self._particle_group[name][ - 'time'][self._frame] - break - if not time_found: - self.ts.time = self._particle_group['box']['edges']['time'][self._frame] - - for name, value in self._has.items(): - if value: - if 'step' in self._particle_group[name]: - self.ts.data['step'] = self._particle_group[name][ - 'step'][self._frame] - break - if not step_found: - - self.ts.data['step'] = self._particle_group['box']['edges']['step'][self._frame] def _read_dataset_into_ts(self, dataset, attribute): - """reads position, velocity, or force dataset array at current frame + """Reads position, velocity, or force dataset array at current frame into corresponding ts attribute""" - n_atoms_now = self._particle_group[f'{dataset}/value'][ + n_atoms_now = self._particle_group[dataset][ self._frame].shape[0] if n_atoms_now != self.n_atoms: raise ValueError(f"Frame {self._frame} of the {dataset} dataset" @@ -303,11 +240,8 @@ def _read_dataset_into_ts(self, dataset, attribute): f" dataset had {self.n_atoms} atoms." " MDAnalysis is unable to deal" " with variable topology!") - - self._particle_group[f'{dataset}/value'].get_basic_selection( - selection=np.s_[self._frame, :], - out=attribute - ) + + attribute[:] = self._particle_group[dataset][self._frame, :] def _convert_units(self): """converts time, position, velocity, and force values if they @@ -333,8 +267,6 @@ def _convert_units(self): @staticmethod def _format_hint(thing): """Can this Reader read *thing*""" - # Check if the object is already a zarr.Group - # If it isn't, try opening it as a group and if it excepts, return False if not HAS_ZARR or not isinstance(thing, zarr.Group): return False else: @@ -342,31 +274,29 @@ def _format_hint(thing): @staticmethod def parse_n_atoms(filename, storage_options=None): - for group in filename['particles/trajectory']: + for group in filename['particles']: if group in ('position', 'velocity', 'force'): - n_atoms = filename[f'particles/trajectory/{group}/value'].shape[1] + n_atoms = filename[f'particles/{group}'].shape[1] return n_atoms - raise NoDataError("Could not construct minimal topology from the " - "Zarrtraj trajectory file, as it did not contain a " - "'position', 'velocity', or 'force' group. " - "You must include a topology file.") + raise NoDataError("Could not construct minimal topology from the " + + "Zarrtraj trajectory file, as it did not contain " + + "a 'position', 'velocity', or 'force' group. " + + "You must include a topology file.") def close(self): - """close reader""" + """Close reader""" self._file.store.close() - + def _reopen(self): """reopen trajectory""" self.close() - self.open_trajectory() + self._frame = -1 @property def n_frames(self): """number of frames in trajectory""" - for name, value in self._has.items(): - if value: - return self._particle_group[name]['value'].shape[0] + return self._n_frames def Writer(self, filename, n_atoms=None, **kwargs): """Return writer for trajectory format @@ -383,29 +313,38 @@ def Writer(self, filename, n_atoms=None, **kwargs): @property def has_positions(self): """``True`` if 'position' group is in trajectory.""" - return self._has['position'] + return "positions" in self._has @has_positions.setter def has_positions(self, value: bool): - self._has['position'] = value + if value: + self._has.add("positions") + else: + self._has.remove("positions") @property def has_velocities(self): """``True`` if 'velocity' group is in trajectory.""" - return self._has['velocity'] + return "velocities" in self._has @has_velocities.setter def has_velocities(self, value: bool): - self._has['velocity'] = value + if value: + self._has.add("velocities") + else: + self._has.remove("velocities") @property def has_forces(self): """``True`` if 'force' group is in trajectory.""" - return self._has['force'] + return "forces" in self._has @has_forces.setter def has_forces(self, value: bool): - self._has['force'] = value + if value: + self._has.add("forces") + else: + self._has.remove("forces") def __getstate__(self): state = self.__dict__.copy() From bca57a88ebd0794b5d957aa6933ccc6bf489be8a Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Mon, 29 Apr 2024 14:13:52 -0700 Subject: [PATCH 04/20] passing base api --- docs/source/zarrtraj-file-spec/v0.1.0.rst | 2 +- pyproject.toml | 1 - zarrtraj/ZARRTRAJ.py | 409 +++++++++------------- zarrtraj/tests/create_zarrtraj_data.py | 2 +- zarrtraj/tests/test_zarrtraj_s3.py | 15 +- 5 files changed, 161 insertions(+), 268 deletions(-) diff --git a/docs/source/zarrtraj-file-spec/v0.1.0.rst b/docs/source/zarrtraj-file-spec/v0.1.0.rst index 0be6c5d..611cd2d 100644 --- a/docs/source/zarrtraj-file-spec/v0.1.0.rst +++ b/docs/source/zarrtraj-file-spec/v0.1.0.rst @@ -94,7 +94,7 @@ Additional Requirements ----------------------- * The zarrtraj file must contain at least one of positions, velocities, or forces. -* Positions, velocities, and forces must be sampled at the same rate +* Positions, velocities, forces, and observables must be sampled at the same rate * Positions, velocities, and forces must be compressed & filtered with the same numcodecs compressor object and filter list * The step & time arrays must increase monotonically. diff --git a/pyproject.toml b/pyproject.toml index 4b08c38..4a12f92 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,6 @@ doc = [ [tool.setuptools] packages = ["zarrtraj"] -py-modules = ["__init__"] # Installs tests datafiles [tool.setuptools.package-data] diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index e87fbf7..886eaa7 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -95,6 +95,9 @@ class MockZarrFile: else: HAS_ZARR = True +ZARRTRAJ_DEFAULT_BUFSIZE = 10485760 # 10 MB +ZARRTRAJ_DEFAULT_FPC = 10 # 10 frames per chunk + class ZarrTrajBoundaryConditions(Enum): ZARRTRAJ_NONE = 0 @@ -124,7 +127,7 @@ def __init__(self, filename, self._time_array = self._particle_group['time'] # IO CALL self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_NONE if - self._particle_group["boundary"] == "none" + self._particle_group["box"].attrs["boundary"] == "none" else ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC) if (self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC and @@ -133,7 +136,7 @@ def __init__(self, filename, "be provided if `boundary` is set to 'periodic'") # IO CALL - self._has = set(name in self._particle_group for + self._has = set(name for name in self._particle_group if name in ('positions', 'velocities', 'forces')) # Get n_atoms + numcodecs compressor & filter objects from @@ -143,8 +146,7 @@ def __init__(self, filename, dset = self._particle_group[name] self.n_atoms = dset.shape[1] self.compressor = dset.compressor - # NOTE: fixme - self.filters = dset.store[f"{name}/.zarray"] + # NOTE: add filters break else: raise NoDataError("Provide at least a position, velocity " + @@ -177,7 +179,10 @@ def _fill_units_dict(self): 'velocities': 'velocity', 'forces': 'force' } - + try: + self.units['time'] = self._particle_group["units"].attrs["time"] + except KeyError: + raise ValueError("Zarrtraj file must have unit set for time") for group, unit in _group_unit_dict.items(): if group in self._has: try: @@ -185,7 +190,7 @@ def _fill_units_dict(self): "units"].attrs[unit] except KeyError: raise ValueError("Zarrtraj file must have units set for " + - "time and each of positions, " + + "each of positions, " + "velocities, and forces present") def _read_next_timestep(self): @@ -194,6 +199,9 @@ def _read_next_timestep(self): def _read_frame(self, frame): """Reads data from zarrtraj file and copies to current timestep""" + if frame >= self._n_frames: + raise IOError from None + self._frame = frame ts = self.ts particle_group = self._particle_group @@ -244,7 +252,7 @@ def _read_dataset_into_ts(self, dataset, attribute): attribute[:] = self._particle_group[dataset][self._frame, :] def _convert_units(self): - """converts time, position, velocity, and force values if they + """Converts time, position, velocity, and force values if they are not given in MDAnalysis standard units See https://userguide.mdanalysis.org/stable/units.html @@ -252,16 +260,17 @@ def _convert_units(self): self.ts.time = self.convert_time_from_native(self.ts.time) - if self._has_edges and self.ts.dimensions is not None: + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: + # Only convert [:3] since last 3 elements are angle self.convert_pos_from_native(self.ts.dimensions[:3]) - if self._has['position']: + if self.has_positions: self.convert_pos_from_native(self.ts.positions) - if self._has['velocity']: + if self.has_velocities: self.convert_velocities_from_native(self.ts.velocities) - if self._has['force']: + if self.has_forces: self.convert_forces_from_native(self.ts.forces) @staticmethod @@ -275,7 +284,7 @@ def _format_hint(thing): @staticmethod def parse_n_atoms(filename, storage_options=None): for group in filename['particles']: - if group in ('position', 'velocity', 'force'): + if group in ('positions', 'velocities', 'forces'): n_atoms = filename[f'particles/{group}'].shape[1] return n_atoms @@ -305,6 +314,7 @@ def Writer(self, filename, n_atoms=None, **kwargs): n_atoms = self.n_atoms kwargs.setdefault('format', "ZARRTRAJ") kwargs.setdefault('compressor', self.compressor) + # NOTE: add filters kwargs.setdefault('positions', self.has_positions) kwargs.setdefault('velocities', self.has_velocities) kwargs.setdefault('forces', self.has_forces) @@ -353,8 +363,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state - self._particle_group = self._file['particles'][ - list(self._file['particles'])[0]] + self._particle_group = self._file['particles'] self[self.ts.frame] @@ -362,114 +371,73 @@ class ZarrTrajWriter(base.WriterBase): format = 'ZARRTRAJ' multiframe = True + #: These variables are not written from :attr:`Timestep.data` + #: dictionary to the observables group in the H5MD file + data_blacklist = ['step', 'time', 'dt'] + #: currently written version of the file format - ZARRTRAJ_VERSION = 1 - - _unit_translation_dict = { - 'time': { - 'ps': 'ps', - 'fs': 'fs', - 'ns': 'ns', - 'second': 's', - 'sec': 's', - 's': 's', - 'AKMA': 'AKMA'}, - 'length': { - 'Angstrom': 'Angstrom', - 'angstrom': 'Angstrom', - 'A': 'Angstrom', - 'nm': 'nm', - 'pm': 'pm', - 'fm': 'fm'}, - 'velocity': { - 'Angstrom/ps': 'Angstrom ps-1', - 'A/ps': 'Angstrom ps-1', - 'Angstrom/fs': 'Angstrom fs-1', - 'A/fs': 'Angstrom fs-1', - 'Angstrom/AKMA': 'Angstrom AKMA-1', - 'A/AKMA': 'Angstrom AKMA-1', - 'nm/ps': 'nm ps-1', - 'nm/ns': 'nm ns-1', - 'pm/ps': 'pm ps-1', - 'm/s': 'm s-1'}, - 'force': { - 'kJ/(mol*Angstrom)': 'kJ mol-1 Angstrom-1', - 'kJ/(mol*nm)': 'kJ mol-1 nm-1', - 'Newton': 'Newton', - 'N': 'N', - 'J/m': 'J m-1', - 'kcal/(mol*Angstrom)': 'kcal mol-1 Angstrom-1', - 'kcal/(mol*A)': 'kcal mol-1 Angstrom-1'}} + ZARRTRAJ_VERSION = '0.1.0' def __init__(self, filename, n_atoms, n_frames=None, convert_units=True, chunks=None, positions=True, velocities=True, forces=True, timeunit=None, lengthunit=None, velocityunit=None, forceunit=None, compressor=None, - filters=None, max_memory=None, **kwargs): - + filters=None, max_memory=None, + force_buffered=False, **kwargs): + if not HAS_ZARR: raise RuntimeError("ZarrTrajWriter: Please install zarr") + if not isinstance(filename, zarr.Group): + raise TypeError("Expected a Zarr group object, but " + + "received an instance of type {}" + .format(type(filename).__name__)) + # Verify group is open for writing + if not filename.store.is_writeable(): + raise PermissionError("The Zarr group is not writeable") if n_atoms == 0: raise ValueError("ZarrTrajWriter: no atoms in output trajectory") self.filename = filename + self._file = filename self.n_atoms = n_atoms self.n_frames = n_frames - self.zarr_group = None - self._open_file() + self.force_buffered = force_buffered + + # Fill in Zarrtraj metadata from kwargs + # IO CALL + self._file.require_group('metadata') + self._file['metadata'].attrs['version'] = self.ZARRTRAJ_VERSION + self._determine_if_cloud_storage() - if self._is_cloud_storage: + if self._is_cloud_storage or self.force_buffered: # Ensure n_frames exists if n_frames is None: - raise TypeError("ZarrTrajWriter: Cloud writing requires " + + raise TypeError("ZarrTrajWriter: Buffered writing requires " + "'n_frames' kwarg") - # Default buffer size is 1MB - self.max_memory = 10485760 if max_memory is None else max_memory - - self.chunks = chunks - else: - self.chunks = (1, n_atoms, 3) if chunks is None else chunks + self.max_memory = ZARRTRAJ_DEFAULT_BUFSIZE if max_memory is None else max_memory - self.filters = filters - self.compressor = compressor + self.chunks = (ZARRTRAJ_DEFAULT_FPC, self.n_atoms, 3) if chunks is None else chunks + self.filters = filters if filters is not None else [] + self.compressor = compressor if compressor is not None else zarr.storage.default_compressor self.convert_units = convert_units - # The writer defaults to writing all data from the parent Timestep if # it exists. If these are True, the writer will check each # Timestep.has_* value and fill the self._has dictionary accordingly # in _initialize_hdf5_datasets() - self._write_positions = positions - self._write_velocities = velocities - self._write_forces = forces - if not any([self._write_positions, - self._write_velocities, - self._write_velocities]): + self._write = set() + if positions: self._write.add('positions') + if velocities: self._write.add('velocities') + if forces: self._write.add('forces') + + if not self._write: raise ValueError("At least one of positions, velocities, or " "forces must be set to ``True``.") - self._new_units = {'time': timeunit, - 'length': lengthunit, - 'velocity': velocityunit, - 'force': forceunit} self._initial_write = True - def _open_file(self): - if not isinstance(self.filename, zarr.Group): - raise TypeError("Expected a Zarr group object, but " + - "received an instance of type {}" - .format(type(self.filename).__name__)) - # Verify group is open for writing - if not self.filename.store.is_writeable(): - raise PermissionError("The Zarr group is not writeable") - self.zarr_group = self.filename - - # fill in Zarrtraj metadata from kwargs - self.zarr_group.require_group('zarrtraj') - self.zarr_group['zarrtraj'].attrs['version'] = self.ZARRTRAJ_VERSION - def _determine_if_cloud_storage(self): # Check if we are working with a cloud storage type - store = self.zarr_group.store + store = self._file.store if isinstance(store, zarr.storage.FSStore): if 's3' in store.fs.protocol: # Verify s3fs is installed @@ -492,7 +460,6 @@ def _determine_if_cloud_storage(self): self._is_cloud_storage = True else: self._is_cloud_storage = False - def _write_next_frame(self, ag): """Write information associated with ``ag`` at current frame @@ -532,36 +499,12 @@ def _write_next_frame(self, ag): self._initialize_zarr_datasets(ts) self._initial_write = False return self._write_next_timestep(ts) - - def _determine_units(self, ag): - """determine which units the file will be written with - - By default, it fills the :attr:`self.units` dictionary by copying the - units dictionary of the parent file. Because Zarrtraj files have no - standard unit restrictions, users may pass a kwarg in ``(timeunit, - lengthunit, velocityunit, forceunit)`` to the writer so long as - MDAnalysis has a conversion factor for it (:exc:`ValueError` raised if - it does not). These custom unit arguments must be in - `MDAnalysis notation`_. If custom units are supplied from the user, - :attr`self.units[unit]` is replaced with the corresponding - `unit` argument. - """ + def _determine_units(self, ag): + """Determine which units the file will be written with""" self.units = ag.universe.trajectory.units.copy() - # set user input units - for key, value in self._new_units.items(): - if value is not None: - if value not in self._unit_translation_dict[key]: - raise ValueError(f"{value} is not a unit recognized by" - " MDAnalysis. Allowed units are:" - f" {self._unit_translation_dict.keys()}" - " For more information on units, see" - " `MDAnalysis units`_.") - else: - self.units[key] = self._new_units[key] - if self.convert_units: # check if all units are None if not any(self.units.values()): @@ -571,14 +514,17 @@ def _determine_units(self, ag): "with no units, set ``convert_units=False``.") def _determine_has(self, ts): - # ask the parent file if it has positions, velocities, and forces - # if prompted by the writer with the self._write_* attributes - self._has = {group: getattr(ts, f'has_{attr}') - if getattr(self, f'_write_{attr}') - else False for group, attr in zip( - ('position', 'velocity', 'force'), - ('positions', 'velocities', 'forces'))} - self._has_edges = True if ts.dimensions is not None and np.all(ts.dimensions > 0) else False + # ask the parent file if it has positions, velocities, and forces, + # and dimensions + self._has = set() + if "positions" in self._write and ts.has_positions: + self._has.add("positions") + if "velocities" in self._write and ts.has_velocities: + self._has.add("velocities") + if "forces" in self._write and ts.has_forces: + self._has.add("forces") + self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC if ts.dimensions is not None + else ZarrTrajBoundaryConditions.ZARRTRAJ_NONE) def _check_max_memory(self): """ @@ -594,7 +540,7 @@ def _check_max_memory(self): # Write edges, step, and time in the same pattern as # velocity, force, and position, though it is not # strictly necessary for simplicity - if self._has_edges: + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: mem_per_chunk += float32_size * self.chunks[0] * 9 # Step mem_per_chunk += int32_size * self.chunks[0] @@ -609,7 +555,7 @@ def _check_max_memory(self): if self.has_velocities: mem_per_chunk += float32_size * self.chunks[0] * self.n_atoms * 3 - + if mem_per_chunk > self.max_memory: raise ValueError(f"`max_memory` kwarg " + "must be at least {mem_per_chunk} for " + @@ -617,7 +563,7 @@ def _check_max_memory(self): else: self.n_buffer_frames = self.chunks[0] * (self.max_memory // mem_per_chunk) - + def _determine_chunks(self): """ If ``chunks`` is not provided as a `kwarg` to the writer in the case @@ -634,23 +580,23 @@ def _determine_chunks(self): n_data_buffers = (self.has_positions + self.has_velocities + self.has_forces) if n_data_buffers: - data_buffer_size = (10485760 - (float32_size * 9) - + data_buffer_size = (ZARRTRAJ_DEFAULT_BUFSIZE - (float32_size * 9) - int32_size - float32_size) // n_data_buffers mem_per_buffer = float32_size * self.n_atoms * 3 if mem_per_buffer <= data_buffer_size: mem_per_frame = (n_data_buffers * mem_per_buffer + (float32_size * 9) + int32_size + float32_size) - self.chunks = (10485760 // mem_per_frame, self.n_atoms, 3) + self.chunks = (ZARRTRAJ_DEFAULT_BUFSIZE // mem_per_frame, self.n_atoms, 3) else: raise TypeError(f"Trajectory frame cannot fit in " + - "default buffer size of 10MB") + "default buffer size of {ZARRTRAJ_DEFAULT_BUFSIZE}MB") def _initialize_zarr_datasets(self, ts): """initializes all datasets that will be written to by :meth:`_write_next_timestep`. Datasets must be sampled at the same - rate in version 0.0.0 of zarrtraj + rate in version 0.1.0 of zarrtraj Note ---- @@ -659,142 +605,102 @@ def _initialize_zarr_datasets(self, ts): """ + if self.n_frames is None: + self._first_dim = 0 + else: + self._first_dim = self.n_frames # for keeping track of where to write in the dataset self._counter = 0 - first_dim = self.n_frames if self.n_frames is not None else 0 - - # initialize trajectory group - self.zarr_group.require_group('particles').require_group('trajectory') - self._traj = self.zarr_group['particles/trajectory'] + self._particle_group = self._file.require_group('particles') # NOTE: subselection init goes here when implemented # box group is required for every group in 'particles' # Initialize units group - self.zarr_group['particles'].require_group('units') - self._unit_group = self.zarr_group['particles']['units'] - - self._traj.require_group('box') - if self._has_edges: - self._traj['box'].attrs['boundary'] = 3*['periodic'] - self._traj['box'].require_group('edges') - self._edges = self._traj.require_dataset('box/edges/value', - shape=(first_dim, 3, 3), - dtype=np.float32) - self._step = self._traj.require_dataset('box/edges/step', - shape=(first_dim,), - dtype=np.int32) - self._time = self._traj.require_dataset('box/edges/time', - shape=(first_dim,), - dtype=np.float32) - self._set_attr_unit('length') - self._set_attr_unit('time') + self._particle_group.require_group('units') + + self._particle_group.require_group('box') + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: + self._particle_group['box'].attrs['boundary'] = 'periodic' + self._particle_group['box']['dimensions'] = zarr.empty( + shape=(self._first_dim, 3, 3), + dtype=np.float32, + compressor=self.compressor, + filters=self.filters + ) + self._edges = self._particle_group['box']['dimensions'] else: # if no box, boundary attr must be "none" - self._traj['box'].attrs['boundary'] = 3*['none'] - self._create_step_and_time_datasets() + self._particle_group['box'].attrs['boundary'] = 'none' + + self._particle_group['step'] = (zarr.empty(shape=(self._first_dim,), + dtype=np.int32)) + self._step = self._particle_group['step'] + self._particle_group['time'] = (zarr.empty(shape=(self._first_dim,), + dtype=np.int32)) + self._time = self._particle_group['time'] + self._particle_group['units'].attrs['time'] = self.units['time'] if self.has_positions: - self._create_trajectory_dataset('position') - self._pos = self._traj['position/value'] - self._set_attr_unit('length') + self._create_trajectory_dataset('positions') + self._pos = self._particle_group['positions'] + self._particle_group['units'].attrs['length'] = self.units['length'] if self.has_velocities: - self._create_trajectory_dataset('velocity') - self._vel = self._traj['velocity/value'] - self._set_attr_unit('velocity') + self._create_trajectory_dataset('velocities') + self._vel = self._particle_group['velocities'] + self._particle_group['units'].attrs['velocity'] = self.units['velocity'] if self.has_forces: - self._create_trajectory_dataset('force') - self._force = self._traj['force/value'] - self._set_attr_unit('force') + self._create_trajectory_dataset('forces') + self._force = self._particle_group['forces'] + self._particle_group['units'].attrs['force'] = self.units['force'] # intialize observable datasets from ts.data dictionary that # are NOT in self.data_blacklist - #if self.data_keys: - # for key in self.data_keys: - # self._create_observables_dataset(key, ts.data[key]) - - def _create_step_and_time_datasets(self): - """helper function to initialize a dataset for step and time - - Hunts down first available location to create the step and time - datasets. This should only be called if the trajectory has no - dimension, otherwise the 'box/edges' group creates step and time - datasets since 'box' is the only required group in 'particles'. - - :attr:`self._step` and :attr`self._time` serve as links to the created - datasets that other datasets can also point to for their step and time. - This serves two purposes: - 1. Avoid redundant writing of multiple datasets that share the - same step and time data. - 2. In HDF5, each chunked dataset has a cache (default 1 MiB), - so only 1 read is required to access step and time data - for all datasets that share the same step and time. - - """ - first_dim = self.n_frames if self.n_frames is not None else 0 - for group, value in self._has.items(): - if value: - self._step = self._traj.require_dataset(f'{group}/step', - shape=(first_dim,), - dtype=np.int32) - self._time = self._traj.require_dataset(f'{group}/time', - shape=(first_dim,), - dtype=np.float32) - self._set_attr_unit('time') - break + self.data_keys = [ + key for key in ts.data.keys() if key not in self.data_blacklist] + if self.data_keys: + self._obsv = self._particle_group.require_group('observables') + if self.data_keys: + for key in self.data_keys: + self._create_observables_dataset(key, ts.data[key]) + + def _create_observables_dataset(self, group, data): + """helper function to initialize a dataset for each observable""" + + self._obsv.require_group(group) + # guarantee ints and floats have a shape () + data = np.asarray(data) + self._obsv[group] = zarr.empty(shape=(0,) + data.shape, + dtype=data.dtype) def _create_trajectory_dataset(self, group): """helper function to initialize a dataset for position, velocity, and force""" - - if self.n_frames is None: - shape = (0, self.n_atoms, 3) - else: - shape = (self.n_frames, self.n_atoms, 3) - - self._traj.require_group(group) - self._traj.require_dataset(f'{group}/value', - shape=shape, - dtype=np.float32, - chunks=self.chunks, - filters=self.filters, - compressor=self.compressor) - # Hard linking in zarr is not possible, so we only keep one step and time aray - # if 'step' not in self._traj[group]: - # self._traj[f'{group}/step'] = self._step - # if 'time' not in self._traj[group]: - # self._traj[f'{group}/time'] = self._time - - def _set_attr_unit(self, unit): - """helper function to set a unit attribute for an Zarr dataset""" - - if self.units[unit] is None: - return - - self._unit_group.attrs[unit] = self._unit_translation_dict[unit][self.units[unit]] + self._particle_group[group] = zarr.empty(shape=(self._first_dim, + self.n_atoms, 3), + dtype=np.float32, + chunks=self.chunks, + filters=self.filters, + compressor=self.compressor) def _initialize_memory_buffers(self): self._time_buffer = np.zeros((self.n_buffer_frames,), dtype=np.float32) self._step_buffer = np.zeros((self.n_buffer_frames,), dtype=np.int32) - self._edges_buffer = np.zeros((self.n_buffer_frames, 3, 3), dtype=np.float32) + self._edges_buffer = np.zeros((self.n_buffer_frames, 3, 3), + dtype=np.float32) if self.has_positions: - self._pos_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, + self._pos_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, 3), dtype=np.float32) if self.has_velocities: - self._force_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, - 3), dtype=np.float32) + self._force_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, + 3), dtype=np.float32) if self.has_forces: - self._vel_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, + self._vel_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, 3), dtype=np.float32) # Reduce cloud I/O by storing previous step val for error checking self._prev_step = None def _write_next_cloud_timestep(self, ts): """ - This method performs two steps: - - 1. First, it appends the next frame to - - """ i = self._counter buffer_index = i % self.n_buffer_frames @@ -808,41 +714,40 @@ def _write_next_cloud_timestep(self, ts): raise ValueError("The Zarrtraj standard dictates that the step " "dataset must increase monotonically in value.") self._prev_step = curr_step - start = time.time() - if self.units['time'] is not None: + if self.units.attrs['time'] is not None: self._time_buffer[buffer_index] = self.convert_time_to_native(ts.time) else: self._time_buffer[buffer_index] = ts.time - if self._has_edges: - if self.units['length'] is not None: + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: + if self.units.attrs['length'] is not None: self._edges_buffer[buffer_index, :] = self.convert_pos_to_native(ts.triclinic_dimensions) else: self._edges_buffer[buffer_index, :] = ts.triclinic_dimensions if self.has_positions: - if self.units['length'] is not None: + if self.units.attrs['length'] is not None: self._pos_buffer[buffer_index, :] = self.convert_pos_to_native(ts.positions) else: self._pos_buffer[buffer_index, :] = ts.positions if self.has_velocities: - if self.units['velocity'] is not None: + if self.units.attrs['velocity'] is not None: self._vel_buffer[buffer_index, :] = self.convert_velocities_to_native(ts.velocities) else: self._vel_buffer[buffer_index, :] = ts.velocities if self.has_forces: - if self.units['force'] is not None: + if self.units.attrs['force'] is not None: self._force_buffer[buffer_index, :] = self.convert_forces_to_native(ts.forces) else: self._force_buffer[buffer_index, :] = ts.forces - + # If buffer is full or last write call, write buffer to cloud if (((i + 1) % self.n_buffer_frames == 0) or (i == self.n_frames - 1)): - da.from_array(self._step_buffer[:buffer_index + 1]).to_zarr(self._step, region=(slice(i - buffer_index, i + 1),), return_stored=True) + da.from_array(self._step_buffer[:buffer_index + 1]).to_zarr(self._step, region=(slice(i - buffer_index, i + 1),)) da.from_array(self._time_buffer[:buffer_index + 1]).to_zarr(self._time, region=(slice(i - buffer_index, i + 1),)) if self._has_edges: da.from_array(self._edges_buffer[:buffer_index + 1]).to_zarr(self._edges, region=(slice(i - buffer_index, i + 1),)) @@ -873,7 +778,7 @@ def _write_next_timestep(self, ts): # sampled, therefore ts.data['step'] is the most appropriate value # to use. However, step is also necessary in Zarrtraj to allow # temporal matching of the data, so ts.frame is used as an alternative - new_shape = (self._step.shape[0] + 1,) + self._step.shape[1:] + new_shape = (self._step.shape[0] + 1,) self._step.resize(new_shape) try: self._step[i] = ts.data['step'] @@ -884,11 +789,11 @@ def _write_next_timestep(self, ts): "dataset must increase monotonically in value.") # the dataset.resize() method should work with any chunk shape - new_shape = (self._time.shape[0] + 1,) + self._time.shape[1:] + new_shape = (self._time.shape[0] + 1,) self._time.resize(new_shape) self._time[i] = ts.time - if 'edges' in self._traj['box']: + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: new_shape = (self._edges.shape[0] + 1,) + self._edges.shape[1:] self._edges.resize(new_shape) self._edges[i, :] = ts.triclinic_dimensions @@ -909,7 +814,7 @@ def _write_next_timestep(self, ts): new_shape = (self._force.shape[0] + 1,) + self._force.shape[1:] self._force.resize(new_shape) self._force[i, :] = ts.forces - + # NOTE: Fix me. add observables #if self.convert_units: # self._convert_dataset_with_units(i) @@ -918,17 +823,17 @@ def _write_next_timestep(self, ts): @property def has_positions(self): """``True`` if writer is writing positions from Timestep.""" - return self._has['position'] + return "positions" in self._has @property def has_velocities(self): """``True`` if writer is writing velocities from Timestep.""" - return self._has['velocity'] + return "velocities" in self._has @property def has_forces(self): """``True`` if writer is writing forces from Timestep.""" - return self._has['force'] + return "forces" in self._has ### Developer utils ### def get_frame_size(universe): diff --git a/zarrtraj/tests/create_zarrtraj_data.py b/zarrtraj/tests/create_zarrtraj_data.py index e2b13ff..203ce91 100644 --- a/zarrtraj/tests/create_zarrtraj_data.py +++ b/zarrtraj/tests/create_zarrtraj_data.py @@ -21,7 +21,7 @@ def create_test_trj(uni, fname): with mda.Writer(root, n_atoms, format='ZARRTRAJ', positions=True, - velocities=True, + velocities=True, forces=True, convert_units=False) as w: for i in range(5): diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 0803a12..b1f90e0 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -242,8 +242,6 @@ def test_write_different_box(self, ref, universe, tmpdir): with tmpdir.as_cwd(): with ref.writer(outfile, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, - chunks=(5, universe.trajectory.n_atoms, 3), - max_memory=520, format='ZARRTRAJ') as W: for ts in universe.trajectory: universe.dimensions[:3] += 1 @@ -268,8 +266,6 @@ def test_write_selection(self, ref, reader, universe, u_no_resnames, with tmpdir.as_cwd(): with ref.writer(outfile, sel.n_atoms, n_frames=universe.trajectory.n_frames, - chunks=(5, sel.n_atoms, 3), - max_memory=520, format='ZARRTRAJ') as W: for ts in universe.trajectory: W.write(sel.atoms) @@ -298,9 +294,8 @@ def test_write_not_changing_ts(self, ref, universe, tmpdir): copy_ts = universe.trajectory.ts.copy() with tmpdir.as_cwd(): - with ref.writer(outfile, n_atoms=5, n_frames=1, - chunks=(1, universe.trajectory.n_atoms, 3), - max_memory=104, format='ZARRTRAJ') as W: + with ref.writer(outfile, n_atoms=5, n_frames=1, + format='ZARRTRAJ') as W: W.write(universe) assert_timestep_almost_equal(copy_ts, universe.trajectory.ts) @@ -310,8 +305,6 @@ def test_write_trajectory_atomgroup(self, ref, reader, universe, tmpdir): with tmpdir.as_cwd(): with ref.writer(outfile, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, - chunks=(5, universe.trajectory.n_atoms, 3), - max_memory=520, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe.atoms) @@ -323,8 +316,6 @@ def test_write_trajectory_universe(self, ref, reader, universe, tmpdir): with tmpdir.as_cwd(): with ref.writer(outfile, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, - chunks=(5, universe.trajectory.n_atoms, 3), - max_memory=520, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe) @@ -337,8 +328,6 @@ def test_max_memory_too_low(self, ref, reader, universe, tmpdir): with pytest.raises(ValueError): with ref.writer(outfile, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, - chunks=(1, universe.trajectory.n_atoms, 3), - max_memory=223, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe) From 0304ea1350ec4ce45130e6360c475f49735ab5d7 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Mon, 29 Apr 2024 14:15:56 -0700 Subject: [PATCH 05/20] edges bug fix --- zarrtraj/ZARRTRAJ.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index 886eaa7..6ec09c9 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -715,31 +715,31 @@ def _write_next_cloud_timestep(self, ts): "dataset must increase monotonically in value.") self._prev_step = curr_step - if self.units.attrs['time'] is not None: + if self.units['time'] is not None: self._time_buffer[buffer_index] = self.convert_time_to_native(ts.time) else: self._time_buffer[buffer_index] = ts.time if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: - if self.units.attrs['length'] is not None: + if self.units['length'] is not None: self._edges_buffer[buffer_index, :] = self.convert_pos_to_native(ts.triclinic_dimensions) else: self._edges_buffer[buffer_index, :] = ts.triclinic_dimensions if self.has_positions: - if self.units.attrs['length'] is not None: + if self.units['length'] is not None: self._pos_buffer[buffer_index, :] = self.convert_pos_to_native(ts.positions) else: self._pos_buffer[buffer_index, :] = ts.positions if self.has_velocities: - if self.units.attrs['velocity'] is not None: + if self.units['velocity'] is not None: self._vel_buffer[buffer_index, :] = self.convert_velocities_to_native(ts.velocities) else: self._vel_buffer[buffer_index, :] = ts.velocities if self.has_forces: - if self.units.attrs['force'] is not None: + if self.units['force'] is not None: self._force_buffer[buffer_index, :] = self.convert_forces_to_native(ts.forces) else: self._force_buffer[buffer_index, :] = ts.forces @@ -749,7 +749,7 @@ def _write_next_cloud_timestep(self, ts): (i == self.n_frames - 1)): da.from_array(self._step_buffer[:buffer_index + 1]).to_zarr(self._step, region=(slice(i - buffer_index, i + 1),)) da.from_array(self._time_buffer[:buffer_index + 1]).to_zarr(self._time, region=(slice(i - buffer_index, i + 1),)) - if self._has_edges: + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: da.from_array(self._edges_buffer[:buffer_index + 1]).to_zarr(self._edges, region=(slice(i - buffer_index, i + 1),)) if self.has_positions: da.from_array(self._pos_buffer[:buffer_index + 1]).to_zarr(self._pos, region=(slice(i - buffer_index, i + 1),)) From ea8289adf652f79c8a6586052d98b81f763beedd Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Mon, 29 Apr 2024 15:53:17 -0700 Subject: [PATCH 06/20] format fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a12f92..7dcc0cb 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ "pytest-xdist>=3.5.0", "pytest-cov>=4.1.0", "MDAnalysisTests>=2.7.0", - "s3fs=2024.3.0", + "s3fs==2024.3.0", ] doc = [ "sphinx", From 5aabf65a2828b70722fe1b51df630b5db586a878 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 3 May 2024 16:27:01 -0700 Subject: [PATCH 07/20] completed rewrite with passing synthetic tests --- docs/source/zarrtraj-file-spec/v0.1.0.rst | 77 ++-- zarrtraj/ZARRTRAJ.py | 404 ++++++++++-------- zarrtraj/data/test.zarrtraj/.zattrs | 3 + zarrtraj/data/test.zarrtraj/metadata/.zattrs | 5 + .../trajectory => metadata}/.zgroup | 0 .../data/test.zarrtraj/particles/box/.zattrs | 3 + .../particles/{trajectory => }/box/.zgroup | 0 .../edges/value => box/dimensions}/.zarray | 0 .../particles/box/dimensions/0.0.0 | Bin 0 -> 52 bytes .../particles/box/dimensions/1.0.0 | Bin 0 -> 52 bytes .../particles/box/dimensions/2.0.0 | Bin 0 -> 52 bytes .../particles/box/dimensions/3.0.0 | Bin 0 -> 52 bytes .../particles/box/dimensions/4.0.0 | Bin 0 -> 52 bytes .../force/value => forces}/.zarray | 10 +- .../data/test.zarrtraj/particles/forces/0.0.0 | Bin 0 -> 215 bytes .../box/edges => observables}/.zgroup | 0 .../particles/observables/occupancy/.zarray | 22 + .../particles/observables/tempfactor/.zarray | 22 + .../position/value => positions}/.zarray | 10 +- .../test.zarrtraj/particles/positions/0.0.0 | Bin 0 -> 215 bytes .../{trajectory/box/edges => }/step/.zarray | 0 .../{trajectory/box/edges => }/step/0 | Bin .../{trajectory/box/edges => }/step/1 | Bin .../{trajectory/box/edges => }/step/2 | Bin .../{trajectory/box/edges => }/step/3 | Bin .../{trajectory/box/edges => }/step/4 | Bin .../{trajectory/box/edges => }/time/.zarray | 4 +- .../{trajectory/box/edges => }/time/0 | Bin zarrtraj/data/test.zarrtraj/particles/time/1 | Bin 0 -> 20 bytes zarrtraj/data/test.zarrtraj/particles/time/2 | Bin 0 -> 20 bytes zarrtraj/data/test.zarrtraj/particles/time/3 | Bin 0 -> 20 bytes zarrtraj/data/test.zarrtraj/particles/time/4 | Bin 0 -> 20 bytes .../particles/trajectory/box/.zattrs | 7 - .../particles/trajectory/box/edges/time/1 | Bin 20 -> 0 bytes .../particles/trajectory/box/edges/time/2 | Bin 20 -> 0 bytes .../particles/trajectory/box/edges/time/3 | Bin 20 -> 0 bytes .../particles/trajectory/box/edges/time/4 | Bin 20 -> 0 bytes .../trajectory/box/edges/value/0.0.0 | Bin 52 -> 0 bytes .../trajectory/box/edges/value/1.0.0 | Bin 52 -> 0 bytes .../trajectory/box/edges/value/2.0.0 | Bin 52 -> 0 bytes .../trajectory/box/edges/value/3.0.0 | Bin 52 -> 0 bytes .../trajectory/box/edges/value/4.0.0 | Bin 52 -> 0 bytes .../particles/trajectory/force/.zgroup | 3 - .../particles/trajectory/force/value/0.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/force/value/1.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/force/value/2.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/force/value/3.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/force/value/4.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/position/.zgroup | 3 - .../particles/trajectory/position/value/0.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/position/value/1.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/position/value/2.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/position/value/3.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/position/value/4.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/velocity/.zgroup | 3 - .../particles/trajectory/velocity/value/0.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/velocity/value/1.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/velocity/value/2.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/velocity/value/3.0.0 | Bin 60 -> 0 bytes .../particles/trajectory/velocity/value/4.0.0 | Bin 60 -> 0 bytes .../test.zarrtraj/particles/units/.zattrs | 6 +- .../velocity/value => velocities}/.zarray | 10 +- .../test.zarrtraj/particles/velocities/0.0.0 | Bin 0 -> 203 bytes zarrtraj/data/test.zarrtraj/zarrtraj/.zattrs | 3 - zarrtraj/data/test.zarrtraj/zarrtraj/.zgroup | 3 - 65 files changed, 361 insertions(+), 237 deletions(-) create mode 100644 zarrtraj/data/test.zarrtraj/.zattrs create mode 100644 zarrtraj/data/test.zarrtraj/metadata/.zattrs rename zarrtraj/data/test.zarrtraj/{particles/trajectory => metadata}/.zgroup (100%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/.zattrs rename zarrtraj/data/test.zarrtraj/particles/{trajectory => }/box/.zgroup (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges/value => box/dimensions}/.zarray (100%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/dimensions/0.0.0 create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/dimensions/1.0.0 create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/dimensions/2.0.0 create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/dimensions/3.0.0 create mode 100644 zarrtraj/data/test.zarrtraj/particles/box/dimensions/4.0.0 rename zarrtraj/data/test.zarrtraj/particles/{trajectory/force/value => forces}/.zarray (57%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/forces/0.0.0 rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => observables}/.zgroup (100%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/observables/occupancy/.zarray create mode 100644 zarrtraj/data/test.zarrtraj/particles/observables/tempfactor/.zarray rename zarrtraj/data/test.zarrtraj/particles/{trajectory/position/value => positions}/.zarray (57%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/positions/0.0.0 rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/.zarray (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/0 (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/1 (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/2 (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/3 (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/step/4 (100%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/time/.zarray (86%) rename zarrtraj/data/test.zarrtraj/particles/{trajectory/box/edges => }/time/0 (100%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/time/1 create mode 100644 zarrtraj/data/test.zarrtraj/particles/time/2 create mode 100644 zarrtraj/data/test.zarrtraj/particles/time/3 create mode 100644 zarrtraj/data/test.zarrtraj/particles/time/4 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/.zattrs delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/time/1 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/time/2 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/time/3 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/time/4 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/0.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/1.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/2.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/3.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/4.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/.zgroup delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/0.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/1.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/2.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/3.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/4.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/.zgroup delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/0.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/1.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/2.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/3.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/4.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/.zgroup delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/0.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/1.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/2.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/3.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/4.0.0 rename zarrtraj/data/test.zarrtraj/particles/{trajectory/velocity/value => velocities}/.zarray (57%) create mode 100644 zarrtraj/data/test.zarrtraj/particles/velocities/0.0.0 delete mode 100644 zarrtraj/data/test.zarrtraj/zarrtraj/.zattrs delete mode 100644 zarrtraj/data/test.zarrtraj/zarrtraj/.zgroup diff --git a/docs/source/zarrtraj-file-spec/v0.1.0.rst b/docs/source/zarrtraj-file-spec/v0.1.0.rst index 611cd2d..c53ad92 100644 --- a/docs/source/zarrtraj-file-spec/v0.1.0.rst +++ b/docs/source/zarrtraj-file-spec/v0.1.0.rst @@ -1,3 +1,5 @@ +.. _zarrtraj_spec: + ZarrTraj Specification ====================== @@ -7,8 +9,8 @@ The *ZarrTraj* trajectory file format is based on the streamable, compressible Units ----- -Zarrtraj uses the following units by default, but supports reading and writing all -`units used by MDAnalysis `_ +Zarrtraj only supports the following units, meaning implementations must +handle conversion: .. list-table :widths: 25 25 25 @@ -19,17 +21,21 @@ Zarrtraj uses the following units by default, but supports reading and writing a - Abbreviation * - Time - picosecond - - pm + - ps * - Distance - nanometer - nm * - Velocity - nanometer / picosecond - - nm / ps + - nm/ps * - Force - kilojoules / (mol * nanometer) - kJ/(mol*nm) +Zarrtraj implementations should fail out if any other units are encountered. +All zarrtraj units must be present in file even if they are not used. + +Positions are in absolute units and are not relative to the box vectors. Structure --------- @@ -38,66 +44,65 @@ Notation ^^^^^^^^ - ``(name)`` is a Zarr group -- ``{name}`` is a Zarr group with an arbitrary name +- ``{name}`` is an optional Zarr group - ``[variable]`` is a Zarr array - ```` is the Zarr array datatype -- ``+--`` is an attribute of a group or Zarr array +- ``+--`` is a required attribute of a group or Zarr array +- ``---`` is an optional attribute of a group or Zarr array .. code-block:: none Zarr root + +-- version : , Zarrtraj specification version \-- (metadata) - +-- version - +-- authors + --- authors : + --- : , arbitrary attribute which contains user-defined metadata \-- (particles) \-- (units) - +-- length - +-- velocity - +-- force - +-- time + +-- length : , "nm" + +-- velocity : , "nm/ps" + +-- force : , "kJ/(mol*nm)" + +-- time : , "ps" \-- (box) +-- boundary : , boundary conditions of unit cell. either "periodic" or "none" (see below) - \-- [dimensions] , gives box dimensions array in triclinic vectors - with shape (n_frames, 3, 3) - +-- [subselection] : , optional array of indices of atoms which - make up a selected subsystem of the trajectory - at each frame with shape - (n_frames, n_selected_atoms) + \-- [dimensions] : , gives box dimensions array in triclinic vectors + with shape (n_frames, 3, 3) + +-- [step] : , array of integration steps at which positions, velocities, and forces were sampled with shape (n_frames) - +-- [time] : , array of simulation times at which + +-- [time] : , array of simulation times at which positions, velocities, and forces were sampled given in physical units with shape (n_frames) - +-- [positions] : , gives array of positions - with shape (n_frames, n_atoms, 3) - +-- [velocities] : , gives array of velocities + +-- [positions] : , gives array of positions with shape (n_frames, n_atoms, 3) - +-- [forces] : , gives array of forces - with shape (n_frames, n_atoms, 3) - \-- (observables) - +-- [] - \-- (userdata) : Optional scratch space for arbitrary data the user wishes to include in the file + +-- [velocities] : , gives array of velocities + with shape (n_frames, n_atoms, 3) + +-- [forces] : , gives array of forces + with shape (n_frames, n_atoms, 3) + --- [subselection] : , optional array of indices of atoms which + make up a selected subsystem of the trajectory + at each frame with shape + (n_frames, n_selected_atoms) + \-- {observables} + --- [] : + \-- {userdata} : Optional scratch space for arbitrary data the user wishes to include in the file Boundary values: """""""""""""""" "periodic" : The simulation box is periodically continued along the given dimension and serves as the unit cell for an infinite tiling of space. - In this case, the position value for each atom is its absolute position in space of an arbitrary periodic image of that particle. + Any arbitrary box shape is supported and only 3 dimensionional boxes are allowed. "none" : No boundary condition is imposed. This summarizes the situations of open systems (i.e., an infinitely large box) and closed systems (e.g., due to an impenetrable wall). - In such systems, the "dimensions" array is not necessary. The position value for each atom is its absolute position in space. + In such systems, the "dimensions" array is not necessary. Additional Requirements ----------------------- * The zarrtraj file must contain at least one of positions, velocities, or forces. -* Positions, velocities, forces, and observables must be sampled at the same rate -* Positions, velocities, and forces must be compressed & filtered with the same - numcodecs compressor object and filter list -* The step & time arrays must increase monotonically. - - - +* Positions, velocities, forces, and observables must be sampled at the same rate. + Future versions of Zarrtraj may relax this requirement. +* The step & time arrays must increase monotonically. \ No newline at end of file diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index 6ec09c9..bcb6f09 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -97,6 +97,8 @@ class MockZarrFile: ZARRTRAJ_DEFAULT_BUFSIZE = 10485760 # 10 MB ZARRTRAJ_DEFAULT_FPC = 10 # 10 frames per chunk +#: currently implemented version of the file format +ZARRTRAJ_VERSION = '0.1.0' class ZarrTrajBoundaryConditions(Enum): @@ -105,13 +107,43 @@ class ZarrTrajBoundaryConditions(Enum): class ZarrTrajReader(base.ReaderBase): + """Reader for the `Zarrtraj` format version 0.1.0 + + For more information on the format, see the :ref:`zarrtraj-spec` + """ + format = 'ZARRTRAJ' @store_init_arguments def __init__(self, filename, convert_units=True, **kwargs): - + """ + Parameters + ---------- + filename : :class:`zarr.Group` + Open, readable zarrtraj file + convert_units : bool (optional) + convert units to MDAnalysis units + **kwargs : dict + General reader arguments. + + Raises + ------ + RuntimeError + when `zarr` is not installed + PermissionError + when the Zarr group is not readable + RuntimeError + when an incorrect unit is provided + ValueError + when ``n_atoms`` changes values between timesteps + NoDataError + when the Zarrtraj file has no 'position', 'velocity', or + 'force' group + RuntimeError + when the Zarrtraj file version is incompatibile with the reader + """ if not HAS_ZARR: raise RuntimeError("Please install zarr") super(ZarrTrajReader, self).__init__(filename, **kwargs) @@ -119,6 +151,12 @@ def __init__(self, filename, self.filename = filename if not self.filename: raise PermissionError("The Zarr group is not readable") + if self.filename.attrs["version"] != ZARRTRAJ_VERSION: + raise RuntimeError("Zarrtraj file version " + + f"{self.filename.attrs['version']} " + + "is not compatible with reader version " + + f"{ZARRTRAJ_VERSION}") + self.convert_units = convert_units self._frame = -1 self._file = self.filename @@ -127,7 +165,8 @@ def __init__(self, filename, self._time_array = self._particle_group['time'] # IO CALL self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_NONE if - self._particle_group["box"].attrs["boundary"] == "none" + self._particle_group["box"].attrs[ + "boundary"] == "none" else ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC) if (self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC and @@ -162,36 +201,29 @@ def __init__(self, filename, forces=self.has_forces, **self._ts_kwargs) - self.units = {'time': None, - 'length': None, - 'velocity': None, - 'force': None} - self._fill_units_dict() + self._verify_correct_units() + self.units = {'time': 'ps', + 'length': 'nm', + 'velocity': 'nm/ps', + 'force': 'kJ/(mol*nm)'} self._read_next_timestep() - def _fill_units_dict(self): - """Verify time unit is present and that units for each - measurement in (positions, forces, and velocites) in the file - is present and fill units dict""" - - _group_unit_dict = {'time': 'time', - 'positions': 'length', - 'velocities': 'velocity', - 'forces': 'force' - } - try: - self.units['time'] = self._particle_group["units"].attrs["time"] - except KeyError: - raise ValueError("Zarrtraj file must have unit set for time") - for group, unit in _group_unit_dict.items(): - if group in self._has: - try: - self.units[unit] = self._particle_group[ - "units"].attrs[unit] - except KeyError: - raise ValueError("Zarrtraj file must have units set for " + - "each of positions, " + - "velocities, and forces present") + def _verify_correct_units(self): + self._unit_group = self._particle_group['units'] + if ('length' not in self._unit_group.attrs or + self._unit_group.attrs['length'] != "nm"): + raise RuntimeError("Zarrtraj file with positions must contain " + + "'nm' length unit") + if ('velocity' not in self._unit_group.attrs or + self._unit_group.attrs['velocity'] != "nm/ps"): + raise RuntimeError("Zarrtraj file must contain " + + "'nm/ps' velocity unit") + if ('force' not in self._unit_group.attrs or + self._unit_group.attrs['force'] != "kJ/(mol*nm)"): + raise RuntimeError("Zarrtraj file with forces must contain " + + "'kJ/(mol*nm)' force unit") + if (self._unit_group.attrs['time'] != "ps"): + raise RuntimeError("Zarrtraj file must contain 'ps' for time unit") def _read_next_timestep(self): """Read next frame in trajectory""" @@ -252,13 +284,8 @@ def _read_dataset_into_ts(self, dataset, attribute): attribute[:] = self._particle_group[dataset][self._frame, :] def _convert_units(self): - """Converts time, position, velocity, and force values if they - are not given in MDAnalysis standard units - - See https://userguide.mdanalysis.org/stable/units.html - """ - - self.ts.time = self.convert_time_from_native(self.ts.time) + """Converts position, velocity, and force values to + MDAnalysis units. Time does not need to be converted""" if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: # Only convert [:3] since last 3 elements are angle @@ -368,6 +395,8 @@ def __setstate__(self, state): class ZarrTrajWriter(base.WriterBase): + """Writer for `Zarrtraj` format version 0.1.0.""" + format = 'ZARRTRAJ' multiframe = True @@ -375,23 +404,81 @@ class ZarrTrajWriter(base.WriterBase): #: dictionary to the observables group in the H5MD file data_blacklist = ['step', 'time', 'dt'] - #: currently written version of the file format - ZARRTRAJ_VERSION = '0.1.0' - def __init__(self, filename, n_atoms, n_frames=None, - convert_units=True, chunks=None, + chunks=None, positions=True, velocities=True, - forces=True, timeunit=None, lengthunit=None, - velocityunit=None, forceunit=None, compressor=None, + forces=True, compressor=None, filters=None, max_memory=None, - force_buffered=False, **kwargs): + force_buffered=False, + author_email=None, author='N/A', + creator='MDAnalysis', creator_version=mda.__version__, + **kwargs): + """ + All data from the input + :class:`~MDAnalysis.coordinates.timestep.Timestep` is + written by default. For detailed information on how + :class:`ZarrTrajWriter`handles units, compression, and chunking, + see the Notes section below. + + Parameters + ---------- + filename : :class:`zarr.Group` + Open, readable zarrtraj file + n_atoms : int + number of atoms in trajectory + n_frames : int (required for cloud and buffered writing) + number of frames to be written in trajectory + chunks : tuple (optional) + Custom chunk layout to be applied to the position, + velocity, and force datasets. By default, these datasets + are chunked in ``{10, n_atoms, 3}`` blocks + compressor : str or int (optional) + `numcodecs` compressor object to be applied + to position, velocity, force, and observables datasets. + filters : list (optional) + list of `numcodecs` filter objects to be applied to + to position, velocity, force, and observables datasets. + positions : bool (optional) + Write positions into the trajectory [``True``] + velocities : bool (optional) + Write velocities into the trajectory [``True``] + forces : bool (optional) + Write forces into the trajectory [``True``] + author : str (optional) + Name of the author of the file + author_email : str (optional) + Email of the author of the file + creator : str (optional) + Software that wrote the file [``MDAnalysis``] + creator_version : str (optional) + Version of software that wrote the file + [:attr:`MDAnalysis.__version__`] + + Raises + ------ + RuntimeError + when `zarr` is not installed + PermissionError + when the Zarr group is not writeable + ValueError + when `n_atoms` is 0 + ValueError + when ``chunks=False`` but the user did not specify `n_frames` + ValueError + when `positions`, `velocities`, and `forces` are all + set to ``False`` + TypeError + when the input object is not a :class:`Universe` or + :class:`AtomGroup` + IOError + when `n_atoms` of the :class:`Universe` or :class:`AtomGroup` + being written does not match `n_atoms` passed as an argument + to the writer + + """ if not HAS_ZARR: raise RuntimeError("ZarrTrajWriter: Please install zarr") - if not isinstance(filename, zarr.Group): - raise TypeError("Expected a Zarr group object, but " + - "received an instance of type {}" - .format(type(filename).__name__)) # Verify group is open for writing if not filename.store.is_writeable(): raise PermissionError("The Zarr group is not writeable") @@ -405,8 +492,14 @@ def __init__(self, filename, n_atoms, n_frames=None, # Fill in Zarrtraj metadata from kwargs # IO CALL + self._file.attrs['version'] = ZARRTRAJ_VERSION self._file.require_group('metadata') - self._file['metadata'].attrs['version'] = self.ZARRTRAJ_VERSION + self._file['metadata'].attrs['author'] = author + if author_email is not None: + self._file['metadata'].attrs['author_email'] = author_email + self._file['metadata'].attrs['creator'] = creator + if creator == 'MDAnalysis': + self._file['metadata'].attrs['creator_version'] = creator_version self._determine_if_cloud_storage() if self._is_cloud_storage or self.force_buffered: @@ -414,20 +507,30 @@ def __init__(self, filename, n_atoms, n_frames=None, if n_frames is None: raise TypeError("ZarrTrajWriter: Buffered writing requires " + "'n_frames' kwarg") - self.max_memory = ZARRTRAJ_DEFAULT_BUFSIZE if max_memory is None else max_memory + self.max_memory = (ZARRTRAJ_DEFAULT_BUFSIZE if max_memory is None + else max_memory) - self.chunks = (ZARRTRAJ_DEFAULT_FPC, self.n_atoms, 3) if chunks is None else chunks + self.chunks = ((ZARRTRAJ_DEFAULT_FPC, self.n_atoms, 3) + if chunks is None else chunks) self.filters = filters if filters is not None else [] - self.compressor = compressor if compressor is not None else zarr.storage.default_compressor - self.convert_units = convert_units + self.compressor = (compressor if compressor is not None else + zarr.storage.default_compressor) # The writer defaults to writing all data from the parent Timestep if # it exists. If these are True, the writer will check each # Timestep.has_* value and fill the self._has dictionary accordingly # in _initialize_hdf5_datasets() self._write = set() - if positions: self._write.add('positions') - if velocities: self._write.add('velocities') - if forces: self._write.add('forces') + if positions: + self._write.add('positions') + if velocities: + self._write.add('velocities') + if forces: + self._write.add('forces') + + self.units = {'time': 'ps', + 'length': 'nm', + 'velocity': 'nm/ps', + 'force': 'kJ/(mol*nm)'} if not self._write: raise ValueError("At least one of positions, velocities, or " @@ -485,12 +588,11 @@ def _write_next_frame(self, ag): raise IOError("ZarrTrajWriter: Timestep does not have" " the correct number of atoms") - # This should only be called once when first timestep is read. + # This will only be called once when first timestep is read. if self._initial_write: - self._determine_units(ag) self._determine_has(ts) + self._determine_units(ag) if self._is_cloud_storage: - self._determine_chunks() self._check_max_memory() self._initialize_zarr_datasets(ts) self._initialize_memory_buffers() @@ -501,17 +603,19 @@ def _write_next_frame(self, ag): return self._write_next_timestep(ts) def _determine_units(self, ag): - """Determine which units the file will be written with""" + """Verifies the trajectory contains all + necessary units so conversion to zarrtraj units can happen""" - self.units = ag.universe.trajectory.units.copy() + from_units = ag.universe.trajectory.units.copy() - if self.convert_units: - # check if all units are None - if not any(self.units.values()): - raise ValueError("The trajectory has no units, but " - "`convert_units` is set to ``True`` by " - "default in MDAnalysis. To write the file " - "with no units, set ``convert_units=False``.") + if self.has_positions and not from_units["length"]: + raise ValueError("The trajectory is missing length units.") + if self.has_velocities and not from_units["velocity"]: + raise ValueError("The trajectory is missing velocity units.") + if self.has_forces and not from_units["force"]: + raise ValueError("The trajectory is missing force units.") + if not from_units["time"]: + raise ValueError("The trajectory is missing time units.") def _determine_has(self, ts): # ask the parent file if it has positions, velocities, and forces, @@ -523,14 +627,15 @@ def _determine_has(self, ts): self._has.add("velocities") if "forces" in self._write and ts.has_forces: self._has.add("forces") - self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC if ts.dimensions is not None + self._boundary = (ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC + if ts.dimensions is not None else ZarrTrajBoundaryConditions.ZARRTRAJ_NONE) def _check_max_memory(self): """ - Determines if at least one chunk of size ``chunks`` fits in the ``max_memory`` - sized buffer. If not, the writer will fail without allocating space for - the trajectory on the cloud. + Determines if at least one chunk of size ``chunks`` fits in the + ``max_memory``sized buffer. If not, the writer will fail without + allocating space for trajectory data on the cloud. """ float32_size = np.dtype(np.float32).itemsize @@ -557,42 +662,13 @@ def _check_max_memory(self): mem_per_chunk += float32_size * self.chunks[0] * self.n_atoms * 3 if mem_per_chunk > self.max_memory: - raise ValueError(f"`max_memory` kwarg " + - "must be at least {mem_per_chunk} for " + - "chunking pattern of {self.chunks}") + raise ValueError("`max_memory` kwarg " + + f"must be at least {mem_per_chunk} for " + + f"chunking pattern of {self.chunks}") else: - self.n_buffer_frames = self.chunks[0] * (self.max_memory // + self.n_buffer_frames = self.chunks[0] * (self.max_memory // mem_per_chunk) - def _determine_chunks(self): - """ - If ``chunks`` is not provided as a `kwarg` to the writer in the case - of cloud writing, this method will determine a default - chunking strategy of 10 MB per sum total of - position, force, velocity, step, and time chunks - based on the `zarr` reccomendation of >= 1mb per chunk. - """ - if self.chunks: - return - - float32_size = np.dtype(np.float32).itemsize - int32_size = np.dtype(np.int32).itemsize - - n_data_buffers = (self.has_positions + self.has_velocities + self.has_forces) - if n_data_buffers: - data_buffer_size = (ZARRTRAJ_DEFAULT_BUFSIZE - (float32_size * 9) - - int32_size - float32_size) // n_data_buffers - mem_per_buffer = float32_size * self.n_atoms * 3 - if mem_per_buffer <= data_buffer_size: - mem_per_frame = (n_data_buffers * mem_per_buffer + - (float32_size * 9) + int32_size + - float32_size) - self.chunks = (ZARRTRAJ_DEFAULT_BUFSIZE // mem_per_frame, self.n_atoms, 3) - else: - raise TypeError(f"Trajectory frame cannot fit in " + - "default buffer size of {ZARRTRAJ_DEFAULT_BUFSIZE}MB") - - def _initialize_zarr_datasets(self, ts): """initializes all datasets that will be written to by :meth:`_write_next_timestep`. Datasets must be sampled at the same @@ -602,8 +678,6 @@ def _initialize_zarr_datasets(self, ts): ---- :exc:`NoDataError` is raised if no positions, velocities, or forces are found in the input trajectory. - - """ if self.n_frames is None: self._first_dim = 0 @@ -614,9 +688,13 @@ def _initialize_zarr_datasets(self, ts): self._counter = 0 self._particle_group = self._file.require_group('particles') # NOTE: subselection init goes here when implemented - # box group is required for every group in 'particles' + # Initialize units group self._particle_group.require_group('units') + self._particle_group["units"].attrs['time'] = self.units['time'] + self._particle_group["units"].attrs['length'] = self.units['length'] + self._particle_group["units"].attrs['velocity'] = self.units['velocity'] + self._particle_group["units"].attrs['force'] = self.units['force'] self._particle_group.require_group('box') if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: @@ -627,9 +705,9 @@ def _initialize_zarr_datasets(self, ts): compressor=self.compressor, filters=self.filters ) - self._edges = self._particle_group['box']['dimensions'] + self._dimensions = self._particle_group['box']['dimensions'] else: - # if no box, boundary attr must be "none" + # boundary attr must be "none" self._particle_group['box'].attrs['boundary'] = 'none' self._particle_group['step'] = (zarr.empty(shape=(self._first_dim,), @@ -638,20 +716,16 @@ def _initialize_zarr_datasets(self, ts): self._particle_group['time'] = (zarr.empty(shape=(self._first_dim,), dtype=np.int32)) self._time = self._particle_group['time'] - self._particle_group['units'].attrs['time'] = self.units['time'] if self.has_positions: self._create_trajectory_dataset('positions') self._pos = self._particle_group['positions'] - self._particle_group['units'].attrs['length'] = self.units['length'] if self.has_velocities: self._create_trajectory_dataset('velocities') self._vel = self._particle_group['velocities'] - self._particle_group['units'].attrs['velocity'] = self.units['velocity'] if self.has_forces: self._create_trajectory_dataset('forces') self._force = self._particle_group['forces'] - self._particle_group['units'].attrs['force'] = self.units['force'] # intialize observable datasets from ts.data dictionary that # are NOT in self.data_blacklist @@ -669,7 +743,7 @@ def _create_observables_dataset(self, group, data): self._obsv.require_group(group) # guarantee ints and floats have a shape () data = np.asarray(data) - self._obsv[group] = zarr.empty(shape=(0,) + data.shape, + self._obsv[group] = zarr.empty(shape=(self._first_dim,) + data.shape, dtype=data.dtype) def _create_trajectory_dataset(self, group): @@ -685,8 +759,8 @@ def _create_trajectory_dataset(self, group): def _initialize_memory_buffers(self): self._time_buffer = np.zeros((self.n_buffer_frames,), dtype=np.float32) self._step_buffer = np.zeros((self.n_buffer_frames,), dtype=np.int32) - self._edges_buffer = np.zeros((self.n_buffer_frames, 3, 3), - dtype=np.float32) + self._dimensions_buffer = np.zeros((self.n_buffer_frames, 3, 3), + dtype=np.float32) if self.has_positions: self._pos_buffer = np.zeros((self.n_buffer_frames, self.n_atoms, 3), dtype=np.float32) @@ -700,8 +774,8 @@ def _initialize_memory_buffers(self): self._prev_step = None def _write_next_cloud_timestep(self, ts): - """ - """ + """Write the next timestep to a cloud or buffered zarr group. + Will only actually perform write if buffer is full""" i = self._counter buffer_index = i % self.n_buffer_frames # Add the current timestep information to the buffer @@ -715,42 +789,33 @@ def _write_next_cloud_timestep(self, ts): "dataset must increase monotonically in value.") self._prev_step = curr_step - if self.units['time'] is not None: - self._time_buffer[buffer_index] = self.convert_time_to_native(ts.time) - else: - self._time_buffer[buffer_index] = ts.time - + self._time_buffer[buffer_index] = (self.convert_time_to_native( + ts.time, inplace=False)) + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: - if self.units['length'] is not None: - self._edges_buffer[buffer_index, :] = self.convert_pos_to_native(ts.triclinic_dimensions) - else: - self._edges_buffer[buffer_index, :] = ts.triclinic_dimensions + self._dimensions_buffer[buffer_index, :] = ( + self.convert_pos_to_native(ts.triclinic_dimensions, + inplace=False)) if self.has_positions: - if self.units['length'] is not None: - self._pos_buffer[buffer_index, :] = self.convert_pos_to_native(ts.positions) - else: - self._pos_buffer[buffer_index, :] = ts.positions + self._pos_buffer[buffer_index, :] = self.convert_pos_to_native( + ts.positions, inplace=False) if self.has_velocities: - if self.units['velocity'] is not None: - self._vel_buffer[buffer_index, :] = self.convert_velocities_to_native(ts.velocities) - else: - self._vel_buffer[buffer_index, :] = ts.velocities + self._vel_buffer[buffer_index, :] = ( + self.convert_velocities_to_native(ts.velocities, inplace=False)) if self.has_forces: - if self.units['force'] is not None: - self._force_buffer[buffer_index, :] = self.convert_forces_to_native(ts.forces) - else: - self._force_buffer[buffer_index, :] = ts.forces + self._force_buffer[buffer_index, :] = ( + self.convert_forces_to_native(ts.forces, inplace=False)) - # If buffer is full or last write call, write buffer to cloud + # If buffer is full or last write call, write buffers to cloud if (((i + 1) % self.n_buffer_frames == 0) or (i == self.n_frames - 1)): da.from_array(self._step_buffer[:buffer_index + 1]).to_zarr(self._step, region=(slice(i - buffer_index, i + 1),)) da.from_array(self._time_buffer[:buffer_index + 1]).to_zarr(self._time, region=(slice(i - buffer_index, i + 1),)) if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: - da.from_array(self._edges_buffer[:buffer_index + 1]).to_zarr(self._edges, region=(slice(i - buffer_index, i + 1),)) + da.from_array(self._dimensions_buffer[:buffer_index + 1]).to_zarr(self._dimensions, region=(slice(i - buffer_index, i + 1),)) if self.has_positions: da.from_array(self._pos_buffer[:buffer_index + 1]).to_zarr(self._pos, region=(slice(i - buffer_index, i + 1),)) if self.has_velocities: @@ -771,15 +836,31 @@ def _write_next_timestep(self, ts): then the data is written to the new slot. """ - i = self._counter + # Resize all datasets if needed + # These datasets are not resized if n_frames was provided as an + # argument, as they were initialized with their full size. + if self.n_frames is None: + self._step.resize((self._step.shape[0] + 1,)) + self._time.resize((self._time.shape[0] + 1,)) + if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: + self._dimensions.resize((self._dimensions.shape[0] + 1,) + + self._dimensions.shape[1:]) + if self.has_positions: + self._pos.resize((self._pos.shape[0] + 1,) + + self._pos.shape[1:]) + if self.has_velocities: + self._vel.resize((self._vel.shape[0] + 1,) + + self._vel.shape[1:]) + if self.has_forces: + self._force.resize((self._force.shape[0] + 1,) + + self._force.shape[1:]) + # Zarrtraj step refers to the integration step at which the data were # sampled, therefore ts.data['step'] is the most appropriate value # to use. However, step is also necessary in Zarrtraj to allow # temporal matching of the data, so ts.frame is used as an alternative - new_shape = (self._step.shape[0] + 1,) - self._step.resize(new_shape) try: self._step[i] = ts.data['step'] except KeyError: @@ -787,35 +868,22 @@ def _write_next_timestep(self, ts): if len(self._step) > 1 and self._step[i] < self._step[i-1]: raise ValueError("The Zarrtraj standard dictates that the step " "dataset must increase monotonically in value.") - - # the dataset.resize() method should work with any chunk shape - new_shape = (self._time.shape[0] + 1,) - self._time.resize(new_shape) - self._time[i] = ts.time + self._time[i] = self.convert_time_to_native(ts.time, inplace=False) if self._boundary == ZarrTrajBoundaryConditions.ZARRTRAJ_PERIODIC: - new_shape = (self._edges.shape[0] + 1,) + self._edges.shape[1:] - self._edges.resize(new_shape) - self._edges[i, :] = ts.triclinic_dimensions - # These datasets are not resized if n_frames was provided as an - # argument, as they were initialized with their full size. + self._dimensions[i, :] = self.convert_pos_to_native( + ts.triclinic_dimensions, inplace=False) if self.has_positions: - if self.n_frames is None: - new_shape = (self._pos.shape[0] + 1,) + self._pos.shape[1:] - self._pos.resize(new_shape) - self._pos[i, :] = ts.positions + self._pos[i, :] = self.convert_pos_to_native( + ts.positions, inplace=False) if self.has_velocities: - if self.n_frames is None: - new_shape = (self._vel.shape[0] + 1,) + self._vel.shape[1:] - self._vel.resize(new_shape) - self._vel[i, :] = ts.velocities + self._vel[i, :] = self.convert_velocities_to_native( + ts.velocities, inplace=False) if self.has_forces: - if self.n_frames is None: - new_shape = (self._force.shape[0] + 1,) + self._force.shape[1:] - self._force.resize(new_shape) - self._force[i, :] = ts.forces + self._force[i, :] = self.convert_forces_to_native( + ts.forces, inplace=False) # NOTE: Fix me. add observables - #if self.convert_units: + # if self.convert_units: # self._convert_dataset_with_units(i) self._counter += 1 diff --git a/zarrtraj/data/test.zarrtraj/.zattrs b/zarrtraj/data/test.zarrtraj/.zattrs new file mode 100644 index 0000000..dfba51c --- /dev/null +++ b/zarrtraj/data/test.zarrtraj/.zattrs @@ -0,0 +1,3 @@ +{ + "version": "0.1.0" +} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/metadata/.zattrs b/zarrtraj/data/test.zarrtraj/metadata/.zattrs new file mode 100644 index 0000000..7810d50 --- /dev/null +++ b/zarrtraj/data/test.zarrtraj/metadata/.zattrs @@ -0,0 +1,5 @@ +{ + "author": "N/A", + "creator": "MDAnalysis", + "creator_version": "2.7.0" +} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/.zgroup b/zarrtraj/data/test.zarrtraj/metadata/.zgroup similarity index 100% rename from zarrtraj/data/test.zarrtraj/particles/trajectory/.zgroup rename to zarrtraj/data/test.zarrtraj/metadata/.zgroup diff --git a/zarrtraj/data/test.zarrtraj/particles/box/.zattrs b/zarrtraj/data/test.zarrtraj/particles/box/.zattrs new file mode 100644 index 0000000..7128417 --- /dev/null +++ b/zarrtraj/data/test.zarrtraj/particles/box/.zattrs @@ -0,0 +1,3 @@ +{ + "boundary": "periodic" +} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/box/.zgroup b/zarrtraj/data/test.zarrtraj/particles/box/.zgroup similarity index 100% rename from zarrtraj/data/test.zarrtraj/particles/trajectory/box/.zgroup rename to zarrtraj/data/test.zarrtraj/particles/box/.zgroup diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/.zarray b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/.zarray similarity index 100% rename from zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/.zarray rename to zarrtraj/data/test.zarrtraj/particles/box/dimensions/.zarray diff --git a/zarrtraj/data/test.zarrtraj/particles/box/dimensions/0.0.0 b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..3a84ec4aa60eea98f828ba792a70bd3ba8db838c GIT binary patch literal 52 vcmZQ#G-gp@U|;}Y6CmzC#OMg5z#t&q-2OicGnhX?ZKwUY3?_$ekADsTtjY=X literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/particles/box/dimensions/1.0.0 b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/1.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..b0840a9f7c8bec03b397b04e851788fd6841b8d2 GIT binary patch literal 52 vcmZQ#G-gp@U|;}Y6Cl=%VRi&kV37XK)Lv&13z)yXWvBgn4`v6RgA9%Uov{fQ literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/particles/box/dimensions/2.0.0 b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/2.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..b2078bf163372768ee5f32e5cc69d9500afa777c GIT binary patch literal 52 vcmZQ#G-gp@U|;}Y6CghPmBkTAfkA(qiG8d+8<<~weW$&o3X6kz9Frpev;hc` literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/particles/box/dimensions/3.0.0 b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/3.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..16a0736a872ba09ef10d55a289dd2a96b8fa05e2 GIT binary patch literal 52 wcmZQ#G-gp@U|;}Y6CjT1XLAHnV6b(&k^SU5>|p*Lxn1`5zgQfO{$_Rr0J+Hu(EtDd literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/particles/box/dimensions/4.0.0 b/zarrtraj/data/test.zarrtraj/particles/box/dimensions/4.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..f29d2647be47c1468869efe432f0035654613693 GIT binary patch literal 52 wcmZQ#G-gp@U|;}Y6CnPk!Qlv`z~IkM1N&2zoM8Uqs9p9+r&%4g&SZ520LuIe3jhEB literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/.zarray b/zarrtraj/data/test.zarrtraj/particles/forces/.zarray similarity index 57% rename from zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/.zarray rename to zarrtraj/data/test.zarrtraj/particles/forces/.zarray index 349b639..851d760 100644 --- a/zarrtraj/data/test.zarrtraj/particles/trajectory/force/value/.zarray +++ b/zarrtraj/data/test.zarrtraj/particles/forces/.zarray @@ -1,10 +1,16 @@ { "chunks": [ - 1, + 10, 5, 3 ], - "compressor": null, + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, "dtype": "kJGGB0ww$#Qzx>&YYQfhGC}hnRJG;Gt-Ru8MOEr8H@seau_Po z7|x)p&;^;n&v3>EXojS*Pg+CI%w?Mye9lPvG|b$5<{1Nzq?)lDgEYcOFQA4mf(*8H pc6Rn~zyRksfB=IX7Xu>`!xtt7dwZ~egM%XoFxZ2|kJGGB0ww$#Qzx>&YqohmSL9h*>r}pv(k2D* diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/1.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/1.0.0 deleted file mode 100644 index 9aec48444dbde53ab4f179fd7da57ac241475109..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 wcmZQ#G-gp@U|;}Y6CgG=Ug89#z<@F9vBUOLOTqjnPtzT>rI$GRe_Y@M0K`WNw*UYD diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/2.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/box/edges/value/2.0.0 deleted file mode 100644 index 7cc7e838e63d45f4b9d87142b7e6aa0e44826c8e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52 wcmZQ#G-gp@U|;}Y6CgG=UgiX(z`(fdp@Yf!V8N=@Y0N`mEYybcN diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/.zgroup b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/.zgroup deleted file mode 100644 index 3b7daf2..0000000 --- a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/.zgroup +++ /dev/null @@ -1,3 +0,0 @@ -{ - "zarr_format": 2 -} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/0.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/0.0.0 deleted file mode 100644 index 0e2dad818f9798509d9d958ad72b0a3cebe0749f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 vcmZQz0D%U3AmIQ+K->Vt3xN0l5I+E721f=40U%ZYVgn#{0O9~3PH+SO@j?h! diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/1.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/1.0.0 deleted file mode 100644 index 536e1d9fa7f73446052d934b5376d38790df0027..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 wcmZQz00RdGh6W%$0K^QA3=9fD>;S|GK->Vt6M%RD5N`nD13-KMh#xot0Pq6}TmS$7 diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/2.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/2.0.0 deleted file mode 100644 index d41c27b8aff51eb4f23e47d78770cd8d43141d9b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 wcmZQz0D%SvAmPZs-~hx8K)e8m4*>B4AZBo4U=RRe1t2y6Vh11&0OABE0P?&DWdHyG diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/3.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/3.0.0 deleted file mode 100644 index 8c0931be0ac304310d6962fb56f19e8d518146e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 wcmZQz00Tz`h6W%$0K^PV3=9fD>;S|GK->Vt6M%RD5N`nD13-KMh#xos0P%eaY5)KL diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/4.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/position/value/4.0.0 deleted file mode 100644 index da208b8d28c9f97a971752492989037652ed1f0c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 wcmZQz0D%TaAmPNo-~hx8K)e8m4*>B4AZBo8U=RRe1t2y6Vh11&0OAB^0Q5Epa{vGU diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/.zgroup b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/.zgroup deleted file mode 100644 index 3b7daf2..0000000 --- a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/.zgroup +++ /dev/null @@ -1,3 +0,0 @@ -{ - "zarr_format": 2 -} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/0.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/0.0.0 deleted file mode 100644 index f20f57af5096e6e3a0fcc934d17bbf178a9ab8d6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 zcmZQzU|=|V=8WyxGd^~+X3n$&vh9FE_OoV6+8Y}i+XKbz)6&xH85kOX>>i-_Odx+5 IP;Rq50M;BA;Q#;t diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/1.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/1.0.0 deleted file mode 100644 index 08a266a063440dc133a7f01894c79fd89aeec431..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 zcmZQzU|=|V#>ejLnKO2?W=h%v+4c+!4feBU&a^i+-VEfQu}@2T1{7m(ID3Z20jS0y MElthA*x1+s0NT_R=>Px# diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/2.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/2.0.0 deleted file mode 100644 index 5bdb5a34b3a384faef3960fc8e9d9de6d798b6b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 zcmZQzU|=|V=8WChGd}jSX3n$+vh9IF4zp%TIv5)pI{?KU($dl#7#JFW>>i-_Odx+5 IP;RpW0N0im?*IS* diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/3.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/3.0.0 deleted file mode 100644 index 0b7226fe8179f2859e13be1c8d1ce685988f147f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 zcmZQzU|=|V#>f8bnKSmYW=c8$*$xa04Gyzr&U7#~-VEfQaY##h1{7m(JbQ-65vay7 MElthQ*x1++0NhR%_W%F@ diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/4.0.0 b/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/4.0.0 deleted file mode 100644 index 91196684b298b15ad83a1d344e967a0d2c01339a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60 zcmZQzU|=|V=8XN>Gd>QpX3lf~vK@dzj>i-_Odx+5 IP;Rp$0ND^1{Qv*} diff --git a/zarrtraj/data/test.zarrtraj/particles/units/.zattrs b/zarrtraj/data/test.zarrtraj/particles/units/.zattrs index d73138e..6daae20 100644 --- a/zarrtraj/data/test.zarrtraj/particles/units/.zattrs +++ b/zarrtraj/data/test.zarrtraj/particles/units/.zattrs @@ -1,6 +1,6 @@ { - "force": "kJ mol-1 Angstrom-1", - "length": "Angstrom", + "force": "kJ/(mol*nm)", + "length": "nm", "time": "ps", - "velocity": "Angstrom ps-1" + "velocity": "nm/ps" } \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/.zarray b/zarrtraj/data/test.zarrtraj/particles/velocities/.zarray similarity index 57% rename from zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/.zarray rename to zarrtraj/data/test.zarrtraj/particles/velocities/.zarray index 349b639..851d760 100644 --- a/zarrtraj/data/test.zarrtraj/particles/trajectory/velocity/value/.zarray +++ b/zarrtraj/data/test.zarrtraj/particles/velocities/.zarray @@ -1,10 +1,16 @@ { "chunks": [ - 1, + 10, 5, 3 ], - "compressor": null, + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, "dtype": "xVa~ApPit|{rc?s2|G0T8MOEr8H@sea&Q&b zuOGU8=1|P_z%%ZL09kP?U;qFB literal 0 HcmV?d00001 diff --git a/zarrtraj/data/test.zarrtraj/zarrtraj/.zattrs b/zarrtraj/data/test.zarrtraj/zarrtraj/.zattrs deleted file mode 100644 index 5ce59aa..0000000 --- a/zarrtraj/data/test.zarrtraj/zarrtraj/.zattrs +++ /dev/null @@ -1,3 +0,0 @@ -{ - "version": 1 -} \ No newline at end of file diff --git a/zarrtraj/data/test.zarrtraj/zarrtraj/.zgroup b/zarrtraj/data/test.zarrtraj/zarrtraj/.zgroup deleted file mode 100644 index 3b7daf2..0000000 --- a/zarrtraj/data/test.zarrtraj/zarrtraj/.zgroup +++ /dev/null @@ -1,3 +0,0 @@ -{ - "zarr_format": 2 -} \ No newline at end of file From 4d5df049bb819124d27b6ad0af66cbd0b53fefb9 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 3 May 2024 16:31:43 -0700 Subject: [PATCH 08/20] ci fix- minoconda --- .github/workflows/gh-ci.yaml | 3 +++ .../{tests => data}/create_zarrtraj_data.py | 5 ++--- zarrtraj/tests/sandbox.py | 21 ------------------- zarrtraj/tests/test_zarrtraj_s3.py | 11 +++++----- 4 files changed, 11 insertions(+), 29 deletions(-) rename zarrtraj/{tests => data}/create_zarrtraj_data.py (92%) delete mode 100644 zarrtraj/tests/sandbox.py diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index 64bb0ea..40e8d20 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -74,6 +74,9 @@ jobs: auto-update-conda: true auto-activate-base: false show-channel-urls: true + # Need this to ensure miniconda installed + # if not already on MacOS + miniconda-version: latest - name: Install MDAnalysis version diff --git a/zarrtraj/tests/create_zarrtraj_data.py b/zarrtraj/data/create_zarrtraj_data.py similarity index 92% rename from zarrtraj/tests/create_zarrtraj_data.py rename to zarrtraj/data/create_zarrtraj_data.py index 203ce91..6f60ee0 100644 --- a/zarrtraj/tests/create_zarrtraj_data.py +++ b/zarrtraj/data/create_zarrtraj_data.py @@ -21,9 +21,8 @@ def create_test_trj(uni, fname): with mda.Writer(root, n_atoms, format='ZARRTRAJ', positions=True, - velocities=True, - forces=True, - convert_units=False) as w: + velocities=True, + forces=True) as w: for i in range(5): uni.atoms.positions = 2 ** i * pos uni.trajectory.ts.time = i diff --git a/zarrtraj/tests/sandbox.py b/zarrtraj/tests/sandbox.py deleted file mode 100644 index 50f4c68..0000000 --- a/zarrtraj/tests/sandbox.py +++ /dev/null @@ -1,21 +0,0 @@ -import zarr -import zarrtraj -from zarrtraj.tests.datafiles import * -import MDAnalysis as mda -from MDAnalysisTests.datafiles import (TPR_xvf, TRR_xvf, - COORDINATES_TOPOLOGY) - - -print(zarrtraj.__path__) - -# -# -z = zarr.open_group(COORDINATES_ZARRTRAJ) - -print(z.tree()) -#print(COORDINATES_ZARRTRAJ) - -#u = mda.Universe(COORDINATES_TOPOLOGY, zarr.open_group(COORDINATES_ZARRTRAJ, 'r')) -# -#for ts in u: -# print(ts) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index b1f90e0..952aec1 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -112,7 +112,6 @@ class ZARRTRAJAWSReference(BaseReference): copied from test_xdr.TRRReference""" def __init__(self): super(ZARRTRAJAWSReference, self).__init__() - # self.trajectory already setup in setup_class self.trajectory = zarr_file_to_s3_bucket(COORDINATES_ZARRTRAJ) self.topology = COORDINATES_TOPOLOGY self.reader = zarrtraj.ZarrTrajReader @@ -158,7 +157,7 @@ def run_server(self): def reset_server(self): yield requests.post("http://localhost:5000/moto-api/reset") - + @staticmethod @pytest.fixture() def ref(): @@ -214,7 +213,7 @@ def test_copying(self, ref, reader): @pytest.mark.skipif(not HAS_ZARR, reason="zarr not installed") -class TestZarrTrajWriterBaseAPI(BaseWriterTest): +class TestZarrTrajAWSWriterBaseAPI(BaseWriterTest): """Tests ZarrTrajWriter with with synthetic trajectory.""" @@ -328,6 +327,8 @@ def test_max_memory_too_low(self, ref, reader, universe, tmpdir): with pytest.raises(ValueError): with ref.writer(outfile, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, + chunks=(1, universe.trajectory.n_atoms, 3), + max_memory=223, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe) @@ -348,7 +349,7 @@ def test_max_memory_usage(self, ref, reader, universe, tmpdir): # Helper Functions def get_memory_usage(writer): - mem = (writer._time_buffer.nbytes + writer._step_buffer.nbytes + - writer._edges_buffer.nbytes + writer._pos_buffer.nbytes + + mem = (writer._time_buffer.nbytes + writer._step_buffer.nbytes + + writer._dimensions_buffer.nbytes + writer._pos_buffer.nbytes + writer._force_buffer.nbytes + writer._vel_buffer.nbytes) return mem \ No newline at end of file From f1e6e46ac8667b91e6ecb21ca368c39c484a9a31 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 3 May 2024 17:36:14 -0700 Subject: [PATCH 09/20] uuid fix for GH Actions --- .github/workflows/gh-ci.yaml | 231 +++++++++++++++-------------- devtools/conda-envs/test_env.yaml | 5 +- zarrtraj/tests/test_zarrtraj_s3.py | 24 +-- 3 files changed, 134 insertions(+), 126 deletions(-) diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index 40e8d20..762fed8 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -1,5 +1,5 @@ name: GH Actions CI -on: +'on': push: branches: - main @@ -10,7 +10,7 @@ on: # Weekly tests at midnight on Sundays run on main by default: # Scheduled workflows run on the latest commit on the default or base branch. # (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule) - - cron: "0 0 * * 0" + - cron: 0 0 * * 0 concurrency: # Specific group naming so CI is only cancelled @@ -20,7 +20,7 @@ concurrency: defaults: run: - shell: bash -l {0} + shell: bash -l {0} jobs: environment-config: @@ -31,142 +31,143 @@ jobs: steps: - uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: '3.11' - id: get-compatible-python uses: MDAnalysis/mdanalysis-compatible-python@main with: - release: "latest" + release: latest main-tests: - if: "github.repository == 'Becksteinlab/zarrtraj'" + if: github.repository == 'Becksteinlab/zarrtraj' needs: environment-config runs-on: ${{ matrix.os }} strategy: - fail-fast: false - matrix: - os: [macOS-latest, ubuntu-latest, windows-latest] - python-version: ${{ fromJSON(needs.environment-config.outputs.python-matrix) }} - mdanalysis-version: ["latest", "develop"] + fail-fast: false + matrix: + os: + - macOS-latest + - ubuntu-latest + - windows-latest + python-version: ${{ fromJSON(needs.environment-config.outputs.python-matrix) }} + mdanalysis-version: + - latest + - develop steps: - - uses: actions/checkout@v4 - - - name: Build information - run: | - uname -a - df -h - ulimit -a - - - # More info on options: https://github.com/conda-incubator/setup-miniconda - - name: Install conda dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - python-version: ${{ matrix.python-version }} - environment-file: devtools/conda-envs/test_env.yaml - add-pip-as-python-dependency: true - architecture: x64 - - channels: defaults - - activate-environment: zarrtraj-test - auto-update-conda: true - auto-activate-base: false - show-channel-urls: true - # Need this to ensure miniconda installed - # if not already on MacOS - miniconda-version: latest - - - - name: Install MDAnalysis version - uses: MDAnalysis/install-mdanalysis@main - with: - version: ${{ matrix.mdanalysis-version }} - install-tests: true - installer: conda - shell: bash -l {0} - - - name: Install package - run: | - python --version - python -m pip install . --no-deps - - - name: Python information - run: | - which python - which pip - pip list - - conda info - conda list - - - - name: Run tests - run: | - pytest -n 2 -v --cov=zarrtraj --cov-report=xml --color=yes zarrtraj/tests/ - - - name: codecov - if: github.repository == 'Becksteinlab/zarrtraj' && github.event_name != 'schedule' - uses: codecov/codecov-action@v4.0.1 - with: - file: coverage.xml - name: codecov-${{ matrix.os }}-py${{ matrix.python-version }} - verbose: True - token: ${{ secrets.CODECOV_TOKEN }} - slug: Becksteinlab/zarrtraj + - uses: actions/checkout@v4 + - name: Build information + run: | + uname -a + df -h + ulimit -a + + # More info on options: https://github.com/conda-incubator/setup-miniconda + - name: Install conda dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: ${{ matrix.python-version }} + environment-file: devtools/conda-envs/test_env.yaml + add-pip-as-python-dependency: true + architecture: x64 + + channels: defaults + + activate-environment: zarrtraj-test + auto-update-conda: true + auto-activate-base: false + show-channel-urls: true + # Need this to ensure miniconda installed + # if not already on MacOS + miniconda-version: latest + + - name: Install MDAnalysis version + uses: MDAnalysis/install-mdanalysis@main + with: + version: ${{ matrix.mdanalysis-version }} + install-tests: true + installer: conda + shell: bash -l {0} + + - name: Install package + run: | + python --version + python -m pip install . --no-deps + + - name: Python information + run: | + which python + which pip + pip list + conda info + conda list + + - name: Run tests + run: | + pytest -n 2 -v --cov=zarrtraj --cov-report=xml --color=yes zarrtraj/tests/ + + - name: codecov + if: github.repository == 'Becksteinlab/zarrtraj' && github.event_name != + 'schedule' + uses: codecov/codecov-action@v4.0.1 + with: + file: coverage.xml + name: codecov-${{ matrix.os }}-py${{ matrix.python-version }} + verbose: true + token: ${{ secrets.CODECOV_TOKEN }} + slug: Becksteinlab/zarrtraj pylint_check: - if: "github.repository == 'Becksteinlab/zarrtraj'" + if: github.repository == 'Becksteinlab/zarrtraj' needs: environment-config runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ needs.environment-config.outputs.stable-python-version }} - - - name: Install Pylint - run: | - which pip - which python - pip install pylint mdanalysis + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ needs.environment-config.outputs.stable-python-version }} - - name: Run Pylint - env: - PYLINTRC: .pylintrc - run: | - pylint zarrtraj + - name: Install Pylint + run: | + which pip + which python + pip install pylint mdanalysis + - name: Run Pylint + env: + PYLINTRC: .pylintrc + run: | + pylint zarrtraj pypi_check: - if: "github.repository == 'Becksteinlab/zarrtraj'" + if: github.repository == 'Becksteinlab/zarrtraj' needs: environment-config runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ needs.environment-config.outputs.stable-python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ needs.environment-config.outputs.stable-python-version }} - - - name: Install dependencies - run: | - pip install pipx twine - - - name: Build package - run: | - python -m pipx run build --sdist - - - name: Check package build - run: | - DISTRIBUTION=$(ls -t1 dist/zarrtraj-*.tar.gz | head -n 1) - test -n "${DISTRIBUTION}" || { echo "no distribution dist/zarrtraj-*.tar.gz found"; exit 1; } - echo "twine check $DISTRIBUTION" - twine check $DISTRIBUTION + - uses: actions/checkout@v4 + + - name: Set up Python ${{ needs.environment-config.outputs.stable-python-version + }} + uses: actions/setup-python@v4 + with: + python-version: ${{ needs.environment-config.outputs.stable-python-version }} + + - name: Install dependencies + run: | + pip install pipx twine + + - name: Build package + run: | + python -m pipx run build --sdist + + - name: Check package build + run: | + DISTRIBUTION=$(ls -t1 dist/zarrtraj-*.tar.gz | head -n 1) + test -n "${DISTRIBUTION}" || { echo "no distribution dist/zarrtraj-*.tar.gz found"; exit 1; } + echo "twine check $DISTRIBUTION" + twine check $DISTRIBUTION \ No newline at end of file diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 38f93be..3736603 100755 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -21,7 +21,7 @@ dependencies: # AWS testing - moto=5.0.3 - ### Testing ### + ### General testing ### - MDAnalysisTests>=2.7.0 - pytest>=7.4.0 - pytest-xdist>=3.5.0 @@ -31,6 +31,9 @@ dependencies: ### Notebooks ### - jupyter + ### Benchmarking ### + - asv + # Pip-only installs # - pip: diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 952aec1..f3e0edc 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -27,9 +27,13 @@ assert_array_almost_equal) from .conftest import ZARRTRAJReference import requests +# Must ensure unique bucket name is created for GH actions +import uuid # Only call this once a Moto Server is running def zarr_file_to_s3_bucket(fname): + bucket_name = f"testbucket-{uuid.uuid4()}" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" os.environ["AWS_SECURITY_TOKEN"] = "testing" @@ -42,7 +46,7 @@ def zarr_file_to_s3_bucket(fname): region_name="us-east-1", endpoint_url="http://localhost:5000" ) - s3_resource.create_bucket(Bucket="testbucket") + s3_resource.create_bucket(Bucket=bucket_name) source = zarr.open_group(fname, mode='r') @@ -54,7 +58,7 @@ def zarr_file_to_s3_bucket(fname): ) ) cloud_store = s3fs.S3Map( - root=f'testbucket/{os.path.basename(fname)}', + root=f'{bucket_name}/{os.path.basename(fname)}', s3=s3_fs, check=False ) @@ -67,7 +71,8 @@ def zarr_file_to_s3_bucket(fname): # Only call this once a Moto Server is running def new_zarrgroup_in_bucket(fname): - + bucket_name = f"testbucket-{uuid.uuid4()}" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" os.environ["AWS_SECURITY_TOKEN"] = "testing" @@ -80,7 +85,7 @@ def new_zarrgroup_in_bucket(fname): region_name="us-east-1", endpoint_url="http://localhost:5000" ) - s3_resource.create_bucket(Bucket="testbucket") + s3_resource.create_bucket(Bucket=bucket_name) s3_fs = s3fs.S3FileSystem( anon=False, @@ -90,7 +95,7 @@ def new_zarrgroup_in_bucket(fname): ) ) cloud_store = s3fs.S3Map( - root=f'testbucket/{os.path.basename(fname)}', + root=f'{bucket_name}/{os.path.basename(fname)}', s3=s3_fs, check=False ) @@ -140,7 +145,7 @@ def iter_ts(self, i): ts.time = i ts.frame = i return ts - + @pytest.mark.skipif(not HAS_ZARR, reason="zarr not installed") class TestZarrTrajAWSReaderBaseAPI(MultiframeReaderTest): @@ -240,8 +245,8 @@ def test_write_different_box(self, ref, universe, tmpdir): outfile = new_zarrgroup_in_bucket(outfn) with tmpdir.as_cwd(): with ref.writer(outfile, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - format='ZARRTRAJ') as W: + n_frames=universe.trajectory.n_frames, + format='ZARRTRAJ') as W: for ts in universe.trajectory: universe.dimensions[:3] += 1 W.write(universe) @@ -261,7 +266,6 @@ def test_write_selection(self, ref, reader, universe, u_no_resnames, outfn = 'write-selection-test.' + ref.ext outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): with ref.writer(outfile, sel.n_atoms, n_frames=universe.trajectory.n_frames, @@ -319,7 +323,7 @@ def test_write_trajectory_universe(self, ref, reader, universe, tmpdir): for ts in universe.trajectory: w.write(universe) self._check_copy(outfile, ref, reader) - + def test_max_memory_too_low(self, ref, reader, universe, tmpdir): outfn = 'write-max-memory-test.' + ref.ext outfile = new_zarrgroup_in_bucket(outfn) From c8951fb5cad6c18f18fe8a20bd3d1f64fb7dcc17 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 3 May 2024 18:07:07 -0700 Subject: [PATCH 10/20] benchmarks + location constraint fix --- benchmarks/__init__.py | 1 + benchmarks/asv.conf.json | 184 +++++++++++++++++++++++++++++ benchmarks/benchmarks.py | 31 +++++ zarrtraj/tests/test_zarrtraj_s3.py | 8 +- 4 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/asv.conf.json create mode 100644 benchmarks/benchmarks.py diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ + diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 0000000..41e6388 --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,184 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "zarrtraj", + + // The project's homepage + "project_url": "https://zarrtraj.readthedocs.io/en/latest/index.html", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + // "branches": ["master"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/Becksteinlab/zarrtraj/commit", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + "pythons": ["3.10"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py new file mode 100644 index 0000000..2678143 --- /dev/null +++ b/benchmarks/benchmarks.py @@ -0,0 +1,31 @@ +# Write the benchmarking functions here. +# See "Writing benchmarks" in the asv docs for more information. + + +class TimeSuite: + """ + An example benchmark that times the performance of various kinds + of iterating over dictionaries in Python. + """ + def setup(self): + self.d = {} + for x in range(500): + self.d[x] = None + + def time_keys(self): + for key in self.d.keys(): + pass + + def time_values(self): + for value in self.d.values(): + pass + + def time_range(self): + d = self.d + for key in range(500): + x = d[key] + + +class MemSuite: + def mem_list(self): + return [0] * 256 diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index f3e0edc..8118d78 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -43,7 +43,7 @@ def zarr_file_to_s3_bucket(fname): # Need granular control s3_resource = boto3.resource( "s3", - region_name="us-east-1", + region_name="us-west-1", endpoint_url="http://localhost:5000" ) s3_resource.create_bucket(Bucket=bucket_name) @@ -53,7 +53,7 @@ def zarr_file_to_s3_bucket(fname): s3_fs = s3fs.S3FileSystem( anon=False, client_kwargs=dict( - region_name='us-east-1', + region_name='us-west-1', endpoint_url="http://localhost:5000" ) ) @@ -82,7 +82,7 @@ def new_zarrgroup_in_bucket(fname): # Need granular control s3_resource = boto3.resource( "s3", - region_name="us-east-1", + region_name="us-west-1", endpoint_url="http://localhost:5000" ) s3_resource.create_bucket(Bucket=bucket_name) @@ -90,7 +90,7 @@ def new_zarrgroup_in_bucket(fname): s3_fs = s3fs.S3FileSystem( anon=False, client_kwargs=dict( - region_name='us-east-1', + region_name='us-west-1', endpoint_url="http://localhost:5000" ) ) From f962397d8e82cb8529662ee03bc8188fcdb3ab0e Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Fri, 3 May 2024 18:29:43 -0700 Subject: [PATCH 11/20] Location constraint --- zarrtraj/tests/test_zarrtraj_s3.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 8118d78..4efaef5 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -46,7 +46,9 @@ def zarr_file_to_s3_bucket(fname): region_name="us-west-1", endpoint_url="http://localhost:5000" ) - s3_resource.create_bucket(Bucket=bucket_name) + s3_resource.create_bucket(Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': + 'us-west-1'}) source = zarr.open_group(fname, mode='r') @@ -85,7 +87,9 @@ def new_zarrgroup_in_bucket(fname): region_name="us-west-1", endpoint_url="http://localhost:5000" ) - s3_resource.create_bucket(Bucket=bucket_name) + s3_resource.create_bucket(Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': + 'us-west-1'}) s3_fs = s3fs.S3FileSystem( anon=False, From a8dcc865470d068ca37b3f9d572f7840867c40d0 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 15:12:44 -0700 Subject: [PATCH 12/20] test optimizations --- .github/workflows/gh-ci.yaml | 3 +- zarrtraj/tests/test_zarrtraj_s3.py | 211 +++++++++++++---------------- 2 files changed, 93 insertions(+), 121 deletions(-) diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index 762fed8..fa613c7 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -151,8 +151,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ needs.environment-config.outputs.stable-python-version - }} + - name: Set up Python ${{ needs.environment-config.outputs.stable-python-version }} uses: actions/setup-python@v4 with: python-version: ${{ needs.environment-config.outputs.stable-python-version }} diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 4efaef5..8c47af1 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -30,6 +30,7 @@ # Must ensure unique bucket name is created for GH actions import uuid + # Only call this once a Moto Server is running def zarr_file_to_s3_bucket(fname): bucket_name = f"testbucket-{uuid.uuid4()}" @@ -68,9 +69,10 @@ def zarr_file_to_s3_bucket(fname): zarr.convenience.copy_store(source.store, cloud_store, if_exists='replace') - cloud_dest = zarr.open_group(store=cloud_store, mode='a') + cloud_dest = zarr.open_group(store=cloud_store, mode='r') return cloud_dest + # Only call this once a Moto Server is running def new_zarrgroup_in_bucket(fname): bucket_name = f"testbucket-{uuid.uuid4()}" @@ -108,12 +110,6 @@ def new_zarrgroup_in_bucket(fname): return cloud_dest -# Helper function to calculate the memory usage of a writer at a frame -def get_memory_usage(writer): - mem = (writer._time_buffer.nbytes + writer._step_buffer.nbytes + - writer._edges_buffer.nbytes + writer._pos_buffer.nbytes + - writer._force_buffer.nbytes + writer._vel_buffer.nbytes) - return mem @pytest.mark.skipif(not HAS_ZARR, reason="Zarr not installed") class ZARRTRAJAWSReference(BaseReference): @@ -162,17 +158,12 @@ def run_server(self): yield self.server.stop() - @pytest.fixture(autouse=True) - def reset_server(self): - yield - requests.post("http://localhost:5000/moto-api/reset") - + # Only create one ref to avoid high memory usage + @pytest.fixture(scope='class') @staticmethod - @pytest.fixture() def ref(): - yield ZARRTRAJAWSReference() - - + r = ZARRTRAJAWSReference() + yield r def test_get_writer_1(self, ref, reader, tmpdir): with tmpdir.as_cwd(): @@ -224,140 +215,122 @@ def test_copying(self, ref, reader): @pytest.mark.skipif(not HAS_ZARR, reason="zarr not installed") class TestZarrTrajAWSWriterBaseAPI(BaseWriterTest): """Tests ZarrTrajWriter with with synthetic trajectory.""" - - + # Run one moto server for the entire class + # And only keep one zarr group to write to @pytest.fixture(autouse=True, scope='class') def run_server(self): self.server = ThreadedMotoServer() self.server.start() + self.outgroup = new_zarrgroup_in_bucket("test-write.zarrtraj") yield self.server.stop() + @pytest.fixture(scope='class') + @staticmethod + def outgroup(): + r = new_zarrgroup_in_bucket("test-write.zarrtraj") + yield r + + # After each test, clear the cloud zarr group @pytest.fixture(autouse=True) - def reset_server(self): + def clear_outgroup(self, outgroup): yield - requests.post("http://localhost:5000/moto-api/reset") + outgroup.clear() @staticmethod @pytest.fixture() def ref(): yield ZARRTRAJReference() - def test_write_different_box(self, ref, universe, tmpdir): + def test_write_different_box(self, ref, universe, outgroup): if ref.changing_dimensions: - outfn = 'write-dimensions-test' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with ref.writer(outfile, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - format='ZARRTRAJ') as W: - for ts in universe.trajectory: - universe.dimensions[:3] += 1 - W.write(universe) - - written = ref.reader(outfile) - - for ts_ref, ts_w in zip(universe.trajectory, written): + with ref.writer(outgroup, universe.atoms.n_atoms, + n_frames=universe.trajectory.n_frames, + format='ZARRTRAJ') as W: + for ts in universe.trajectory: universe.dimensions[:3] += 1 - assert_array_almost_equal(universe.dimensions, - ts_w.dimensions, - decimal=ref.prec) + W.write(universe) + written = ref.reader(outgroup) + + for ts_ref, ts_w in zip(universe.trajectory, written): + universe.dimensions[:3] += 1 + assert_array_almost_equal(universe.dimensions, + ts_w.dimensions, + decimal=ref.prec) def test_write_selection(self, ref, reader, universe, u_no_resnames, - u_no_resids, u_no_names, tmpdir): + u_no_resids, u_no_names, outgroup): sel_str = 'resid 1' sel = universe.select_atoms(sel_str) - outfn = 'write-selection-test.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - - with tmpdir.as_cwd(): - with ref.writer(outfile, sel.n_atoms, - n_frames=universe.trajectory.n_frames, - format='ZARRTRAJ') as W: - for ts in universe.trajectory: - W.write(sel.atoms) - - copy = ref.reader(outfile) - for orig_ts, copy_ts in zip(universe.trajectory, copy): - assert_array_almost_equal( - copy_ts._pos, sel.atoms.positions, ref.prec, - err_msg="coordinate mismatch between original and written " - "trajectory at frame {} (orig) vs {} (copy)".format( - orig_ts.frame, copy_ts.frame)) - - def test_write_none(self, ref, tmpdir): - outfn = 'write-none.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with pytest.raises(TypeError): - with ref.writer(outfile, 42, n_frames=1, - max_memory=1, - chunks=(1, 5, 3), format='ZARRTRAJ') as w: - w.write(None) - - def test_write_not_changing_ts(self, ref, universe, tmpdir): - outfn = 'write-not-changing-ts.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) + with ref.writer(outgroup, sel.n_atoms, + n_frames=universe.trajectory.n_frames, + format='ZARRTRAJ') as W: + for ts in universe.trajectory: + W.write(sel.atoms) + copy = ref.reader(outgroup) + for orig_ts, copy_ts in zip(universe.trajectory, copy): + assert_array_almost_equal( + copy_ts._pos, sel.atoms.positions, ref.prec, + err_msg="coordinate mismatch between original and written " + "trajectory at frame {} (orig) vs {} (copy)".format( + orig_ts.frame, copy_ts.frame)) + + def test_write_none(self, ref, outgroup): + with pytest.raises(TypeError): + with ref.writer(outgroup, 42, n_frames=1, + max_memory=1, + chunks=(1, 5, 3), format='ZARRTRAJ') as w: + w.write(None) + + def test_write_not_changing_ts(self, ref, universe, outgroup): copy_ts = universe.trajectory.ts.copy() - with tmpdir.as_cwd(): - with ref.writer(outfile, n_atoms=5, n_frames=1, - format='ZARRTRAJ') as W: - W.write(universe) - assert_timestep_almost_equal(copy_ts, universe.trajectory.ts) - - def test_write_trajectory_atomgroup(self, ref, reader, universe, tmpdir): - outfn = 'write-atoms-test.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with ref.writer(outfile, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - format='ZARRTRAJ') as w: - for ts in universe.trajectory: - w.write(universe.atoms) - self._check_copy(outfile, ref, reader) - - def test_write_trajectory_universe(self, ref, reader, universe, tmpdir): - outfn = 'write-uni-test.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with ref.writer(outfile, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - format='ZARRTRAJ') as w: - for ts in universe.trajectory: - w.write(universe) - self._check_copy(outfile, ref, reader) - - def test_max_memory_too_low(self, ref, reader, universe, tmpdir): - outfn = 'write-max-memory-test.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with pytest.raises(ValueError): - with ref.writer(outfile, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - chunks=(1, universe.trajectory.n_atoms, 3), - max_memory=223, - format='ZARRTRAJ') as w: - for ts in universe.trajectory: - w.write(universe) - - def test_max_memory_usage(self, ref, reader, universe, tmpdir): - outfn = 'write-max-memory-test.' + ref.ext - outfile = new_zarrgroup_in_bucket(outfn) - with tmpdir.as_cwd(): - with ref.writer(outfile, universe.atoms.n_atoms, + with ref.writer(outgroup, n_atoms=5, n_frames=1, + format='ZARRTRAJ') as W: + W.write(universe) + assert_timestep_almost_equal(copy_ts, universe.trajectory.ts) + + def test_write_trajectory_atomgroup(self, ref, reader, universe, outgroup): + with ref.writer(outgroup, universe.atoms.n_atoms, + n_frames=universe.trajectory.n_frames, + format='ZARRTRAJ') as w: + for ts in universe.trajectory: + w.write(universe.atoms) + self._check_copy(outgroup, ref, reader) + + def test_write_trajectory_universe(self, ref, reader, universe, outgroup): + with ref.writer(outgroup, universe.atoms.n_atoms, + n_frames=universe.trajectory.n_frames, + format='ZARRTRAJ') as w: + for ts in universe.trajectory: + w.write(universe) + self._check_copy(outgroup, ref, reader) + + def test_max_memory_too_low(self, ref, reader, universe, outgroup): + with pytest.raises(ValueError): + with ref.writer(outgroup, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, chunks=(1, universe.trajectory.n_atoms, 3), - max_memory=224, + max_memory=223, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe) - # Each frame of synthetic trajectory should be 224 bytes - assert get_memory_usage(w) <= 224 + + def test_max_memory_usage(self, ref, reader, universe, outgroup): + with ref.writer(outgroup, universe.atoms.n_atoms, + n_frames=universe.trajectory.n_frames, + chunks=(1, universe.trajectory.n_atoms, 3), + max_memory=224, + format='ZARRTRAJ') as w: + for ts in universe.trajectory: + w.write(universe) + # Each frame of synthetic trajectory should be 224 bytes + assert get_memory_usage(w) <= 224 + # Helper Functions def get_memory_usage(writer): mem = (writer._time_buffer.nbytes + writer._step_buffer.nbytes + writer._dimensions_buffer.nbytes + writer._pos_buffer.nbytes + writer._force_buffer.nbytes + writer._vel_buffer.nbytes) - return mem \ No newline at end of file + return mem From 5dbfe0b59b69bdd7cd9a208e9e546f0908db5a56 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 15:26:44 -0700 Subject: [PATCH 13/20] removed static modifiers --- zarrtraj/tests/test_zarrtraj_s3.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 8c47af1..33dfef2 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -160,8 +160,7 @@ def run_server(self): # Only create one ref to avoid high memory usage @pytest.fixture(scope='class') - @staticmethod - def ref(): + def ref(self): r = ZARRTRAJAWSReference() yield r @@ -226,9 +225,8 @@ def run_server(self): self.server.stop() @pytest.fixture(scope='class') - @staticmethod - def outgroup(): - r = new_zarrgroup_in_bucket("test-write.zarrtraj") + def outgroup(self): + r = new_zarrgroup_in_bucket("test-write.zarrtraj") yield r # After each test, clear the cloud zarr group From 97f72d6a790005ed89cb9d522b66b8ea7f875a0d Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 15:51:21 -0700 Subject: [PATCH 14/20] test optimization --- .github/workflows/gh-ci.yaml | 2 +- zarrtraj/tests/test_zarrtraj_s3.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index fa613c7..85fc339 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -49,7 +49,7 @@ jobs: - macOS-latest - ubuntu-latest - windows-latest - python-version: ${{ fromJSON(needs.environment-config.outputs.python-matrix) }} + python-version: ["3.10", "3.11", "3.12"] mdanalysis-version: - latest - develop diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 33dfef2..a0e7dfa 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -314,16 +314,16 @@ def test_max_memory_too_low(self, ref, reader, universe, outgroup): for ts in universe.trajectory: w.write(universe) - def test_max_memory_usage(self, ref, reader, universe, outgroup): + def test_max_memory_usage(self, ref, universe, outgroup): with ref.writer(outgroup, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, chunks=(1, universe.trajectory.n_atoms, 3), max_memory=224, format='ZARRTRAJ') as w: - for ts in universe.trajectory: + for ts in universe.trajectory[:5]: w.write(universe) # Each frame of synthetic trajectory should be 224 bytes - assert get_memory_usage(w) <= 224 + assert get_memory_usage(w) <= 224 # Helper Functions From f132501bc1ba0c1c488a2e1a9d625dea5cd48402 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 15:57:15 -0700 Subject: [PATCH 15/20] test remove max mem usage --- zarrtraj/tests/test_zarrtraj_s3.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index a0e7dfa..f2182c0 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -314,16 +314,6 @@ def test_max_memory_too_low(self, ref, reader, universe, outgroup): for ts in universe.trajectory: w.write(universe) - def test_max_memory_usage(self, ref, universe, outgroup): - with ref.writer(outgroup, universe.atoms.n_atoms, - n_frames=universe.trajectory.n_frames, - chunks=(1, universe.trajectory.n_atoms, 3), - max_memory=224, - format='ZARRTRAJ') as w: - for ts in universe.trajectory[:5]: - w.write(universe) - # Each frame of synthetic trajectory should be 224 bytes - assert get_memory_usage(w) <= 224 # Helper Functions From 47f6d36e1a4cf0dbd75b9346992003034a94ac5e Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 18:03:01 -0700 Subject: [PATCH 16/20] de parallelize tests --- .github/workflows/gh-ci.yaml | 2 +- zarrtraj/ZARRTRAJ.py | 1 + zarrtraj/tests/test_zarrtraj_s3.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index 85fc339..c8f8ac7 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -105,7 +105,7 @@ jobs: - name: Run tests run: | - pytest -n 2 -v --cov=zarrtraj --cov-report=xml --color=yes zarrtraj/tests/ + pytest -v --cov=zarrtraj --cov-report=xml --color=yes zarrtraj/tests/ - name: codecov if: github.repository == 'Becksteinlab/zarrtraj' && github.event_name != diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index bcb6f09..7257bf2 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -596,6 +596,7 @@ def _write_next_frame(self, ag): self._check_max_memory() self._initialize_zarr_datasets(ts) self._initialize_memory_buffers() + # pylint: disable=method-hidden self._write_next_timestep = self._write_next_cloud_timestep else: self._initialize_zarr_datasets(ts) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index f2182c0..a0e7dfa 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -314,6 +314,16 @@ def test_max_memory_too_low(self, ref, reader, universe, outgroup): for ts in universe.trajectory: w.write(universe) + def test_max_memory_usage(self, ref, universe, outgroup): + with ref.writer(outgroup, universe.atoms.n_atoms, + n_frames=universe.trajectory.n_frames, + chunks=(1, universe.trajectory.n_atoms, 3), + max_memory=224, + format='ZARRTRAJ') as w: + for ts in universe.trajectory[:5]: + w.write(universe) + # Each frame of synthetic trajectory should be 224 bytes + assert get_memory_usage(w) <= 224 # Helper Functions From 45e676addf6dd160855f0bc9f7b7b9eca341b64f Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 18:22:48 -0700 Subject: [PATCH 17/20] docs + test fix --- zarrtraj/ZARRTRAJ.py | 41 +++++++++++++++++++++--------- zarrtraj/tests/test_zarrtraj_s3.py | 1 - 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index 7257bf2..b56bef0 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -3,33 +3,50 @@ Example: Loading a .zarrtraj file from disk ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To load a ZarrTraj simulation from a .zarrtraj trajectory file, pass a topology file -and a `zarr.Group` object to :class:`~MDAnalysis.core.universe.Universe`:: +To load a ZarrTraj simulation from a .zarrtraj trajectory file, pass a +topology file and a `zarr.Group` object to +:class:`~MDAnalysis.core.universe.Universe`:: import zarrtraj import MDAnalysis as mda - u = mda.Universe("topology.tpr", zarr.open_group("trajectory.zarrtraj", mode="r")) + u = mda.Universe("topology.tpr", zarr.open_group("trajectory.zarrtraj", + mode="r")) Example: Reading from cloud services ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Zarrtraj currently supports AWS, Google Cloud, and Azure Block Storage backed Zarr groups. +Zarrtraj currently supports AWS, Google Cloud, and Azure Block Storage backed +Zarr groups. -To read from AWS S3, wrap your Zarr group in a Least-Recently-Used cache to reduce I/O:: +To read from AWS S3, wrap your Zarr group in a Least-Recently-Used cache to +reduce I/O:: import s3fs import zarrtraj import MDAnalysis as mda - key = os.getenv('AWS_KEY') - secret = os.getenv('AWS_SECRET_KEY') - s3 = s3fs.S3FileSystem(key=key, secret=secret) - store = s3fs.S3Map(root='/trajectory.zarrtraj', s3=s3, check=False) - cache = LRUStoreCache(store, max_size=2**25) # max_size is cache size in bytes + + # Profiles are setup in ./aws/credentials + s3 = s3fs.S3FileSystem( + anon=False, + profile='sample_profile', + client_kwargs=dict( + region_name='us-east-1'' + ) + ) + + cloud_store = s3fs.S3Map( + root='/trajectory.zarrtraj', + s3=s3, + check=False + ) + + # max_size is cache size in bytes + cache = LRUStoreCache(cloud_store, max_size=2**25) root = zarr.group(store=cache) u = mda.Universe("topology.tpr", root) -Because of this local cache model, random-access trajectory reading for cloud-backed Zarr Groups -is not currently supported. +Because of this local cache model, random-access trajectory reading for +cloud-backed Zarr Groups is not currently supported. Example: Writing to cloud services ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index a0e7dfa..203e152 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -220,7 +220,6 @@ class TestZarrTrajAWSWriterBaseAPI(BaseWriterTest): def run_server(self): self.server = ThreadedMotoServer() self.server.start() - self.outgroup = new_zarrgroup_in_bucket("test-write.zarrtraj") yield self.server.stop() From 19e5a1d0699325d10f02ed9e83a85fbe8d11576a Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 18:34:58 -0700 Subject: [PATCH 18/20] test cleanup improvement --- zarrtraj/tests/test_zarrtraj_s3.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index 203e152..e9b56ea 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -223,16 +223,11 @@ def run_server(self): yield self.server.stop() - @pytest.fixture(scope='class') + @pytest.fixture def outgroup(self): r = new_zarrgroup_in_bucket("test-write.zarrtraj") yield r - - # After each test, clear the cloud zarr group - @pytest.fixture(autouse=True) - def clear_outgroup(self, outgroup): - yield - outgroup.clear() + zarr.storage.rmdir(r.store) @staticmethod @pytest.fixture() From 26c1fa9ba6737968beb3a18099950d03fe8a38e8 Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 18:46:18 -0700 Subject: [PATCH 19/20] explicit method assignment --- .github/workflows/gh-ci.yaml | 3 +++ zarrtraj/ZARRTRAJ.py | 42 ++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/.github/workflows/gh-ci.yaml b/.github/workflows/gh-ci.yaml index c8f8ac7..a7ab2c8 100755 --- a/.github/workflows/gh-ci.yaml +++ b/.github/workflows/gh-ci.yaml @@ -104,6 +104,9 @@ jobs: conda list - name: Run tests + # Tests must be sequential to avoid + # moto buckets and groups interfering with + # one another run: | pytest -v --cov=zarrtraj --cov-report=xml --color=yes zarrtraj/tests/ diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index b56bef0..c47f021 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -42,10 +42,10 @@ # max_size is cache size in bytes cache = LRUStoreCache(cloud_store, max_size=2**25) - root = zarr.group(store=cache) - u = mda.Universe("topology.tpr", root) + zgroup = zarr.group(store=cache) + u = mda.Universe("topology.tpr", zgroup) -Because of this local cache model, random-access trajectory reading for +Because of this local cache model, random-access trajectory reading for cloud-backed Zarr Groups is not currently supported. Example: Writing to cloud services @@ -60,14 +60,24 @@ import s3fs import zarrtraj import MDAnalysis as mda - key = os.getenv('AWS_KEY') - secret = os.getenv('AWS_SECRET_KEY') - s3 = s3fs.S3FileSystem(key=key, secret=secret) - store = s3fs.S3Map(root='/trajectory.zarrtraj', s3=s3, check=False) - root = zarr.open_group(store=store) - with mda.Writer(root, u.trajectory.n_atoms, n_frames=u.trajectory.n_frames, - chunks=(10, u.trajectory.n_atoms, 3), - max_memory=2**20, + + s3 = s3fs.S3FileSystem( + anon=False, + profile='sample_profile', + client_kwargs=dict( + region_name='us-east-1'' + ) + ) + + cloud_store = s3fs.S3Map( + root='/trajectory.zarrtraj', + s3=s3, + check=False + ) + + zgroup = zarr.open_group(store=cloud_store) + with mda.Writer(zgroup, u.trajectory.n_atoms, + n_frames=u.trajectory.n_frames, format='ZARRTRAJ') as w: for ts in u.trajectory: w.write(u.atoms) @@ -84,7 +94,6 @@ :inherited-members: """ - import numpy as np import MDAnalysis as mda from MDAnalysis.coordinates import base, core @@ -93,7 +102,6 @@ from MDAnalysis.lib.util import store_init_arguments import dask.array as da from enum import Enum -import time # NOTE: REMOVE after test try: @@ -613,12 +621,14 @@ def _write_next_frame(self, ag): self._check_max_memory() self._initialize_zarr_datasets(ts) self._initialize_memory_buffers() - # pylint: disable=method-hidden - self._write_next_timestep = self._write_next_cloud_timestep else: self._initialize_zarr_datasets(ts) self._initial_write = False - return self._write_next_timestep(ts) + + if self._is_cloud_storage: + return self._write_next_cloud_timestep(ts) + else: + return self._write_next_timestep(ts) def _determine_units(self, ag): """Verifies the trajectory contains all From 7fd230633a3ef8b14ff543accea0bbce198ab8ec Mon Sep 17 00:00:00 2001 From: Lawson Woods Date: Sat, 4 May 2024 18:57:22 -0700 Subject: [PATCH 20/20] fix force buffer --- zarrtraj/ZARRTRAJ.py | 24 +++++++++++++----------- zarrtraj/tests/test_zarrtraj_s3.py | 6 ++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/zarrtraj/ZARRTRAJ.py b/zarrtraj/ZARRTRAJ.py index c47f021..5b36681 100755 --- a/zarrtraj/ZARRTRAJ.py +++ b/zarrtraj/ZARRTRAJ.py @@ -526,8 +526,8 @@ def __init__(self, filename, n_atoms, n_frames=None, if creator == 'MDAnalysis': self._file['metadata'].attrs['creator_version'] = creator_version - self._determine_if_cloud_storage() - if self._is_cloud_storage or self.force_buffered: + self._determine_if_buffered_storage() + if self._is_buffered_store: # Ensure n_frames exists if n_frames is None: raise TypeError("ZarrTrajWriter: Buffered writing requires " + @@ -563,7 +563,7 @@ def __init__(self, filename, n_atoms, n_frames=None, self._initial_write = True - def _determine_if_cloud_storage(self): + def _determine_if_buffered_storage(self): # Check if we are working with a cloud storage type store = self._file.store if isinstance(store, zarr.storage.FSStore): @@ -575,7 +575,7 @@ def _determine_if_cloud_storage(self): except ImportError: raise Exception("Writing to AWS S3 requires installing " + + "s3fs") - self._is_cloud_storage = True + self._is_buffered_store = True elif 'gcs' in store.fs.protocol: # Verify gcsfs is installed try: @@ -583,11 +583,13 @@ def _determine_if_cloud_storage(self): except ImportError: raise Exception("Writing to Google Cloud Storage " + + "requires installing gcsfs") - self._is_cloud_storage = True + self._is_buffered_store = True elif isinstance(store, zarr.storage.ABSStore): - self._is_cloud_storage = True + self._is_buffered_store = True + elif self.force_buffered: + self._is_buffered_store = True else: - self._is_cloud_storage = False + self._is_buffered_store = False def _write_next_frame(self, ag): """Write information associated with ``ag`` at current frame @@ -617,7 +619,7 @@ def _write_next_frame(self, ag): if self._initial_write: self._determine_has(ts) self._determine_units(ag) - if self._is_cloud_storage: + if self._is_buffered_store: self._check_max_memory() self._initialize_zarr_datasets(ts) self._initialize_memory_buffers() @@ -625,8 +627,8 @@ def _write_next_frame(self, ag): self._initialize_zarr_datasets(ts) self._initial_write = False - if self._is_cloud_storage: - return self._write_next_cloud_timestep(ts) + if self._is_buffered_store: + return self._write_next_buffered_timestep(ts) else: return self._write_next_timestep(ts) @@ -801,7 +803,7 @@ def _initialize_memory_buffers(self): # Reduce cloud I/O by storing previous step val for error checking self._prev_step = None - def _write_next_cloud_timestep(self, ts): + def _write_next_buffered_timestep(self, ts): """Write the next timestep to a cloud or buffered zarr group. Will only actually perform write if buffer is full""" i = self._counter diff --git a/zarrtraj/tests/test_zarrtraj_s3.py b/zarrtraj/tests/test_zarrtraj_s3.py index e9b56ea..98f4575 100644 --- a/zarrtraj/tests/test_zarrtraj_s3.py +++ b/zarrtraj/tests/test_zarrtraj_s3.py @@ -238,6 +238,7 @@ def test_write_different_box(self, ref, universe, outgroup): if ref.changing_dimensions: with ref.writer(outgroup, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, + force_buffered=True, format='ZARRTRAJ') as W: for ts in universe.trajectory: universe.dimensions[:3] += 1 @@ -257,6 +258,7 @@ def test_write_selection(self, ref, reader, universe, u_no_resnames, with ref.writer(outgroup, sel.n_atoms, n_frames=universe.trajectory.n_frames, + force_buffered=True, format='ZARRTRAJ') as W: for ts in universe.trajectory: W.write(sel.atoms) @@ -278,6 +280,7 @@ def test_write_none(self, ref, outgroup): def test_write_not_changing_ts(self, ref, universe, outgroup): copy_ts = universe.trajectory.ts.copy() with ref.writer(outgroup, n_atoms=5, n_frames=1, + force_buffered=True, format='ZARRTRAJ') as W: W.write(universe) assert_timestep_almost_equal(copy_ts, universe.trajectory.ts) @@ -285,6 +288,7 @@ def test_write_not_changing_ts(self, ref, universe, outgroup): def test_write_trajectory_atomgroup(self, ref, reader, universe, outgroup): with ref.writer(outgroup, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, + force_buffered=True, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe.atoms) @@ -293,6 +297,7 @@ def test_write_trajectory_atomgroup(self, ref, reader, universe, outgroup): def test_write_trajectory_universe(self, ref, reader, universe, outgroup): with ref.writer(outgroup, universe.atoms.n_atoms, n_frames=universe.trajectory.n_frames, + force_buffered=True, format='ZARRTRAJ') as w: for ts in universe.trajectory: w.write(universe) @@ -313,6 +318,7 @@ def test_max_memory_usage(self, ref, universe, outgroup): n_frames=universe.trajectory.n_frames, chunks=(1, universe.trajectory.n_atoms, 3), max_memory=224, + force_buffered=True, format='ZARRTRAJ') as w: for ts in universe.trajectory[:5]: w.write(universe)