Skip to content

Commit

Permalink
Merge pull request #368 from OpenFreeEnergy/serialization
Browse files Browse the repository at this point in the history
JSON serialization on `GufeTokenizable`
  • Loading branch information
jthorton authored Oct 31, 2024
2 parents 1b65523 + f15ce1e commit 4e13ab8
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 4 deletions.
43 changes: 40 additions & 3 deletions docs/guide/serialization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,54 @@ Similarly, you can reload the object with:
.. code::
import json
from gufe.tokenization import JSON_HANDLER
from gufe.tokenization import JSON_HANDLER, GufeTokenizable
with open(filename, mode='r') as f:
obj = json.load(f, cls=JSON_HANDLER.decoder)
obj = GufeTokenizable.from_dict(json.load(f, cls=JSON_HANDLER.decoder))
Note that these objects are not space-efficient: that is, if you have
the same object in memory referenced by multiple objects (e.g., an identical
``ProteinComponent`` in more than one ``ChemicalSystem``), then you will
save multiple copies of its JSON representation.

On reloading, tools that use the recommended ``from_dict`` method will undo
do this duplication; see :ref:`gufe-memory-deduplication` for details.
this duplication; see :ref:`gufe-memory-deduplication` for details.

As a more space-efficient alternative to ``to_dict``/``from_dict``, consider
using ``to_keyed_chain``/``from_keyed_chain`` instead.
This deals in a representation using the :class:`.KeyedChain` approach, which
avoids duplication of dependent :class:`.GufeTokenizable` objects in the serialized
JSON representation.

Convenient serialization
~~~~~~~~~~~~~~~~~~~~~~~~

We also provide convenience methods to convert any :class:`.GufeTokenizable` to
and from JSON using a space-efficient serialization strategy based on our
:class:`.KeyedChain` representation. This is intended for developers who want
to serialize these objects using the current best practice and are not
concerned with the details of the process. The :func:`to_json
<gufe.tokenization.GufeTokenizable.to_json>` API offers the flexibility to
convert to JSON directly or to write to a filelike object:

.. code::
# get a json representation in-memory
json_str = obj.to_json()
# save to a file directly
obj.to_json(file=filename)
Similarly, you can recreate the object using the :func:`from_json <gufe.tokenization.GufeTokenizable.from_json>`
classmethod:

.. code::
# load the object from a json file produced with `to_json`
obj = cls.from_json(file=filename)
# load from a string produced with `to_json`
obj = cls.from_json(content=json_str)
.. Using JSON codecs outside of JSON
.. ---------------------------------
Expand Down
52 changes: 52 additions & 0 deletions gufe/tests/test_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,21 @@ def leaf_dict(a):
':version:': 1,
}

self.expected_keyed_chain = [
(str(leaf.key),
leaf_dict("foo")),
(str(bar.key),
leaf_dict({':gufe-key:': str(leaf.key)})),
(str(self.cont.key),
{':version:': 1,
'__module__': __name__,
'__qualname__': 'Container',
'dct': {'a': 'b',
'leaf': {':gufe-key:': str(leaf.key)}},
'lst': [{':gufe-key:': str(leaf.key)}, 0],
'obj': {':gufe-key:': str(bar.key)}})
]

def test_set_key(self):
leaf = Leaf("test-set-key")
key = leaf.key
Expand Down Expand Up @@ -232,6 +247,43 @@ def test_from_keyed_dict(self):
assert recreated == self.cont
assert recreated is self.cont

def test_to_keyed_chain(self):
    """The container converts to the expected keyed chain."""
    produced = self.cont.to_keyed_chain()
    assert produced == self.expected_keyed_chain

def test_from_keyed_chain(self):
    """The expected keyed chain is turned back into the container."""
    rebuilt = self.cls.from_keyed_chain(self.expected_keyed_chain)

    # equal is not enough: the very same object must come back
    assert rebuilt == self.cont
    assert rebuilt is self.cont

def test_to_json_string(self):
    """``to_json()`` without a file returns the keyed chain as a JSON string."""
    decoded = json.loads(self.cont.to_json(), cls=JSON_HANDLER.decoder)

    # JSON has no tuple type, so every (key, dict) pair comes back as a list
    chain_as_lists = list(map(list, self.expected_keyed_chain))
    assert decoded == chain_as_lists

def test_from_json_string(self):
    """``from_json(content=...)`` rebuilds the container from a JSON string."""
    serialized = json.dumps(self.expected_keyed_chain, cls=JSON_HANDLER.encoder)
    rebuilt = self.cls.from_json(content=serialized)

    # equal is not enough: the very same object must come back
    assert rebuilt == self.cont
    assert rebuilt is self.cont

def test_to_json_file(self, tmpdir):
    """``to_json(file=...)`` writes the keyed chain as JSON to the given path."""
    file_path = tmpdir / "container.json"
    self.cont.to_json(file=file_path)

    # read back inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector
    with file_path.open(mode="r") as f:
        loaded = json.load(f, cls=JSON_HANDLER.decoder)

    # tuples are converted to lists in JSON so fix the expected result to use lists
    expected_key_chain = [list(tok) for tok in self.expected_keyed_chain]
    assert loaded == expected_key_chain

def test_from_json_file(self, tmpdir):
    """``from_json(file=...)`` rebuilds the container from a JSON file."""
    file_path = tmpdir / "container.json"

    # close the handle before reading the file back: this guarantees the
    # JSON has been flushed to disk rather than relying on the interpreter
    # finalizing the temporary file object in time
    with file_path.open(mode="w") as f:
        json.dump(self.expected_keyed_chain, f, cls=JSON_HANDLER.encoder)

    recreated = self.cls.from_json(file=file_path)

    # equal is not enough: the very same object must come back
    assert recreated == self.cont
    assert recreated is self.cont

def test_to_shallow_dict(self):
    """The container converts to the expected shallow dict."""
    shallow = self.cont.to_shallow_dict()
    assert shallow == self.expected_shallow

Expand Down
93 changes: 92 additions & 1 deletion gufe/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import warnings
import weakref
from itertools import chain
from typing import Any, Union, List, Tuple, Dict, Generator
from os import PathLike
from typing import Any, Union, List, Tuple, Dict, Generator, TextIO, Optional
from typing_extensions import Self

from gufe.custom_codecs import (
Expand Down Expand Up @@ -622,6 +623,96 @@ def copy_with_replacements(self, **replacements):
dct.update(replacements)
return self._from_dict(dct)

def to_keyed_chain(self) -> List[Tuple[str, Dict]]:
    """
    Build the keyed chain representation of this object.

    The conversion is delegated to ``KeyedChain.gufe_to_keyed_chain_rep``.

    Returns
    -------
    List[Tuple[str, Dict]]
        The keyed chain representation of this object.

    See Also
    --------
    KeyedChain
    """
    keyed_chain_rep = KeyedChain.gufe_to_keyed_chain_rep(self)
    return keyed_chain_rep

@classmethod
def from_keyed_chain(cls, keyed_chain: List[Tuple[str, Dict]]):
    """
    Rebuild an instance from its keyed chain representation.

    Parameters
    ----------
    keyed_chain : List[Tuple[str, Dict]]
        The keyed_chain representation of the GufeTokenizable.

    Returns
    -------
    The object produced by ``KeyedChain.to_gufe`` for this chain.

    See Also
    --------
    KeyedChain
    """
    # `cls` is not consulted here; the work is done entirely by KeyedChain
    wrapped = KeyedChain(keyed_chain=keyed_chain)
    return wrapped.to_gufe()

def to_json(self, file: Optional[Union[PathLike, TextIO]] = None) -> Optional[str]:
    """
    Generate a JSON keyed chain representation.

    This will be written to the filepath or filelike object if passed.

    Parameters
    ----------
    file
        A filepath or filelike object to write the JSON to.

    Returns
    -------
    str
        A minimal JSON representation of the object if `file` is `None`;
        else None.

    See Also
    --------
    from_json
    """
    # build the chain before touching the destination, so a failure while
    # generating it cannot leave behind a freshly truncated file
    keyed_chain = self.to_keyed_chain()

    if file is None:
        return json.dumps(keyed_chain, cls=JSON_HANDLER.encoder)

    # function-scope import kept from the original implementation
    # NOTE(review): presumably avoids a circular import with gufe.utils -- confirm
    from gufe.utils import ensure_filelike

    with ensure_filelike(file, mode="w") as out:
        json.dump(keyed_chain, out, cls=JSON_HANDLER.encoder)

    return None

@classmethod
def from_json(cls, file: Optional[Union[PathLike, TextIO]] = None, content: Optional[str] = None):
    """
    Generate an instance from JSON keyed chain representation.

    Can provide either a filepath/filelike as `file`, or JSON content via
    `content`.

    Parameters
    ----------
    file
        A filepath or filelike object to read JSON data from.
    content
        A string to read JSON data from.

    Raises
    ------
    ValueError
        If both `file` and `content` are given, or if neither is given.

    See Also
    --------
    to_json
    """
    # exactly one of the two inputs must be supplied
    if content is not None and file is not None:
        raise ValueError("Cannot specify both `content` and `file`; only one input allowed")
    if content is None and file is None:
        raise ValueError("Must specify either `content` or `file` for JSON input")

    if content is not None:
        keyed_chain = json.loads(content, cls=JSON_HANDLER.decoder)
    else:
        # function-scope import kept from the original implementation
        # NOTE(review): presumably avoids a circular import with gufe.utils -- confirm
        from gufe.utils import ensure_filelike

        with ensure_filelike(file, mode="r") as f:
            keyed_chain = json.load(f, cls=JSON_HANDLER.decoder)

    return cls.from_keyed_chain(keyed_chain=keyed_chain)


class GufeKey(str):
def __repr__(self): # pragma: no cover
Expand Down

0 comments on commit 4e13ab8

Please sign in to comment.