Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add trailing_commas to dump/dumps #372

Merged
merged 6 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions amazon/ion/simpleion.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@


def dump(obj, fp, imports=None, binary=True, sequence_as_stream=False, indent=None,
tuple_as_sexp=False, omit_version_marker=False):
tuple_as_sexp=False, omit_version_marker=False, trailing_commas=False):
"""Serialize ``obj`` as an Ion formatted stream and write it to fp.

The python object hierarchy is mapped to the Ion data model as described in the module pydoc.
Expand All @@ -121,11 +121,15 @@ def dump(obj, fp, imports=None, binary=True, sequence_as_stream=False, indent=No
indent (Str): If binary is False and indent is a string, then members of containers will be pretty-printed with
a newline followed by that string repeated for each level of nesting. None (the default) selects the most
compact representation without any newlines. Example: to indent with four spaces per level of nesting,
use ``' '``.
use ``' '``. Supported only in the pure Python implementation (because pretty printing is not yet
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np!

supported in the C implementation).
tuple_as_sexp (Optional[True|False]): When True, all tuple values will be written as Ion s-expressions.
When False, all tuple values will be written as Ion lists. Default: False.
omit_version_marker (Optional[True|False]): If binary is False and omit_version_marker is True, omits the
Ion Version Marker ($ion_1_0) from the output. Default: False.
trailing_commas (Optional[True|False]): If binary is False and pretty printing (indent is not None), includes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: it may actually be clearer to a user to make it independent of indent I know you don't intend to use it that way, and TBH I don't know the value of doing so, but it would be avoid the need to explain and users understand the dependent relationship, or test for it in the code. 🤷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a thought.

I actually had it accidentally including trailing commas in non-pretty-printed output for the first revision, so it would be easy to put back to that (except for the problem explained in the next paragraph). I guess if you say you don't want indent but you do want trailing commas, there's no strong reason not to.

However ... there is one reason not to. This is a reason why I'd rather that be a feature we could consider but not try to include here. That reason is that the Python C module implementation already doesn't support pretty-printing, so I was able to leave it out of scope. If we make trailing_commas applicable to non-pretty-printed output, then I have to go and do all the C module implementation.

I'd rather not hold this PR up to implement non-pretty-printed trailing-comma output in the C module; especially since I'm not sure we're likely to see any strong demand for trailing commas in non-pretty-printed output. I guess we could make it not depend on indent, but still disclaim it as "not supported in the C implementation". So that's an option.

trailing commas in containers. Default: False. Supported only in the pure Python implementation (because
pretty printing is not yet supported in the C implementation).

Returns None.
"""
Expand All @@ -135,11 +139,12 @@ def dump(obj, fp, imports=None, binary=True, sequence_as_stream=False, indent=No
else:
return dump_python(obj, fp, imports=imports, binary=binary, sequence_as_stream=sequence_as_stream,
indent=indent,
tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker)
tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker,
trailing_commas=trailing_commas)


def dumps(obj, imports=None, binary=True, sequence_as_stream=False,
indent=None, tuple_as_sexp=False, omit_version_marker=False):
indent=None, tuple_as_sexp=False, omit_version_marker=False, trailing_commas=False):
"""Serialize obj as described by dump, return the serialized data as bytes or unicode.

Returns:
Expand All @@ -149,7 +154,8 @@ def dumps(obj, imports=None, binary=True, sequence_as_stream=False,
ion_buffer = io.BytesIO()

dump(obj, ion_buffer, imports=imports, sequence_as_stream=sequence_as_stream, binary=binary,
indent=indent, tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker)
indent=indent, tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker,
trailing_commas=trailing_commas)

ret_val = ion_buffer.getvalue()
ion_buffer.close()
Expand Down Expand Up @@ -292,9 +298,10 @@ def loads(ion_str: Union[bytes, str], catalog=None, single_value=True, parse_eag


def dump_python(obj, fp, imports=None, binary=True, sequence_as_stream=False,
indent=None, tuple_as_sexp=False, omit_version_marker=False):
indent=None, tuple_as_sexp=False, omit_version_marker=False,
trailing_commas=False):
"""'pure' Python implementation. Users should prefer to call ``dump``."""
raw_writer = binary_writer(imports) if binary else text_writer(indent=indent)
raw_writer = binary_writer(imports) if binary else text_writer(indent=indent, trailing_commas=trailing_commas)
writer = blocking_writer(raw_writer, fp)
from_type = _FROM_TYPE_TUPLE_AS_SEXP if tuple_as_sexp else _FROM_TYPE
if binary or not omit_version_marker:
Expand Down Expand Up @@ -470,10 +477,11 @@ def add(obj):
event = reader.send(NEXT_EVENT)


def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp=False, omit_version_marker=False):
def dump_extension(obj, fp, binary=True, sequence_as_stream=False, tuple_as_sexp=False, omit_version_marker=False,
trailing_commas=False):
"""C-extension implementation. Users should prefer to call ``dump``."""

res = ionc.ionc_write(obj, binary, sequence_as_stream, tuple_as_sexp)
res = ionc.ionc_write(obj, binary, sequence_as_stream, tuple_as_sexp, trailing_commas)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This call is what is causing the segfault in the unit tests. ionc_write isn't expecting to get the trailing_commas argument, and is not handling it well.

If you want to put in a # TODO:, and drop the trailing_commas from the ionc.ionc_write call, I can dig into why it isn't handling that a bit more gracefully, and that should clear up the test errors (the ones that aren't already explained).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops! I meant to omit the argument from the call to ionc...

I'll fix that and post a new rev.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, I must be confused. I'm not passing the arg to the C code, as far as I can see?
(Note lack of trailing_commas=trailing_commas in the first branch.)

    if c_ext and __IS_C_EXTENSION_SUPPORTED and (imports is None and indent is None):
        return dump_extension(obj, fp, binary=binary, sequence_as_stream=sequence_as_stream,
                              tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker)
    else:
        return dump_python(obj, fp, imports=imports, binary=binary, sequence_as_stream=sequence_as_stream,
                           indent=indent,
                           tuple_as_sexp=tuple_as_sexp, omit_version_marker=omit_version_marker,
                           trailing_commas=trailing_commas)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, oops, in dump_extension. D'oh.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

(Removed trailing_commas from dump_extension() and added a TODO).


# TODO support "omit_version_marker" rather than hacking.
if not binary and not omit_version_marker:
Expand Down
11 changes: 6 additions & 5 deletions amazon/ion/writer_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def serialize(ion_event):


@coroutine
def _raw_writer_coroutine(depth=0, container_event=None, whence=None, indent=None):
def _raw_writer_coroutine(depth=0, container_event=None, whence=None, indent=None, trailing_commas=False):
pretty = indent is not None
serialize_container_delimiter = \
_serialize_container_delimiter_pretty if pretty else _serialize_container_delimiter_normal
Expand All @@ -369,7 +369,7 @@ def _raw_writer_coroutine(depth=0, container_event=None, whence=None, indent=Non
ion_event, self = (yield transition)
delegate = self

if has_written_values and not ion_event.event_type.ends_container:
if has_written_values and ((indent and trailing_commas) or not ion_event.event_type.ends_container):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I may be wrong but I think this will also add a space after the last item in an s-exp: (foo bar bar ) I'm not sure that's bad but I don't think it's intended.

Copy link
Contributor Author

@mavjop mavjop Oct 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I did think you were right, but now I actually think not, because there's a separate set of delimiters defined for use when pretty printing:

_CONTAINER_DELIMITER_MAP_PRETTY = {
    IonType.STRUCT: b',',
    IonType.LIST: b',',
    IonType.SEXP: b'', # we use newlines when pretty printing
}

which uses an empty string as the delimiter for s-expressions, since they add newlines.

I could test this to make sure, but I'm pretty sure.

If we do want to apply this to non-pretty-printing, then we could do that, but then your concern expressed here would need an additional work-around, or there would indeed be an extra space.

I think the solution, if needed, would be to fork:

_CONTAINER_DELIMITER_MAP_NORMAL = {
    IonType.STRUCT: b',',
    IonType.LIST: b',',
    IonType.SEXP: b' ',
}

into:

_CONTAINER_DELIMITER_MAP_NORMAL = {
    IonType.STRUCT: b',',
    IonType.LIST: b',',
    IonType.SEXP: b' ',
}
_CONTAINER_END_DELIMITER_MAP_NORMAL = {
    IonType.STRUCT: b',',
    IonType.LIST: b',',
    IonType.SEXP: b'',
}

and use _CONTAINER_END_DELIMITER_MAP_NORMAL for the final trailing delimiter.

# TODO This will always emit a delimiter for containers--should make it not do that.
# Write the delimiter for the next value.
if depth == 0:
Expand Down Expand Up @@ -402,7 +402,8 @@ def _raw_writer_coroutine(depth=0, container_event=None, whence=None, indent=Non

if ion_event.event_type is IonEventType.CONTAINER_START:
writer_event = DataEvent(WriteEventType.NEEDS_INPUT, _serialize_container_start(ion_event))
delegate = _raw_writer_coroutine(depth + 1, ion_event, self, indent=indent)
delegate = _raw_writer_coroutine(depth + 1, ion_event, self, indent=indent,
trailing_commas=trailing_commas)
elif depth == 0:
# Serialize at the top-level.
if ion_event.event_type is IonEventType.STREAM_END:
Expand All @@ -429,7 +430,7 @@ def _raw_writer_coroutine(depth=0, container_event=None, whence=None, indent=Non


# TODO Add options for text formatting.
def raw_writer(indent=None):
def raw_writer(indent=None, trailing_commas=False):
"""Returns a raw text writer co-routine.

Yields:
Expand All @@ -447,7 +448,7 @@ def raw_writer(indent=None):
# of writing (Dec 2022).
indent_bytes = indent.encode("UTF-8") if isinstance(indent, str) else indent

return writer_trampoline(_raw_writer_coroutine(indent=indent_bytes))
return writer_trampoline(_raw_writer_coroutine(indent=indent_bytes, trailing_commas=trailing_commas))

# TODO Determine if we need to do anything special for non-raw writer. Validation?
text_writer = raw_writer
93 changes: 58 additions & 35 deletions tests/test_simpleion.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,34 +556,35 @@ def dump_func(*args, **kw):
def _generate_roundtrips(roundtrips):
for is_binary in (True, False):
for indent in ('not used',) if is_binary else (None, '', ' ', ' ', '\t', '\n\t\n '):
def _adjust_sids(annotations=()):
if is_binary and isinstance(obj, SymbolToken):
return SymbolToken(obj.text, 10 + len(annotations))
return obj

def _to_obj(to_type=None, annotations=(), tuple_as_sexp=False):
if to_type is None:
to_type = ion_type
obj_out = _adjust_sids(annotations)
return _FROM_ION_TYPE[ion_type].from_value(to_type, obj_out, annotations=annotations), is_binary, indent, tuple_as_sexp

for obj in roundtrips:
obj = _adjust_sids()
yield obj, is_binary, indent, False
if not obj_has_ion_type_and_annotation(obj):
ion_type = _ion_type(obj, _FROM_TYPE)
yield _to_obj()
else:
ion_type = obj.ion_type
if isinstance(obj, IonPyNull):
obj = None
yield _to_obj(annotations=(u'annot1', u'annot2'))
if isinstance(obj, list):
yield _to_obj(IonType.SEXP)
yield _to_obj(IonType.SEXP, annotations=(u'annot1', u'annot2'))
if isinstance(obj, tuple) and not isinstance(obj, SymbolToken):
yield _to_obj(IonType.SEXP, tuple_as_sexp=True)
yield _to_obj(IonType.SEXP, annotations=(u'annot1', u'annot2'), tuple_as_sexp=True)
for trailing_commas in (True, False):
def _adjust_sids(annotations=()):
if is_binary and isinstance(obj, SymbolToken):
return SymbolToken(obj.text, 10 + len(annotations))
return obj

def _to_obj(to_type=None, annotations=(), tuple_as_sexp=False):
if to_type is None:
to_type = ion_type
obj_out = _adjust_sids(annotations)
return _FROM_ION_TYPE[ion_type].from_value(to_type, obj_out, annotations=annotations), is_binary, indent, tuple_as_sexp, trailing_commas

for obj in roundtrips:
obj = _adjust_sids()
yield obj, is_binary, indent, False, trailing_commas
if not obj_has_ion_type_and_annotation(obj):
ion_type = _ion_type(obj, _FROM_TYPE)
yield _to_obj()
else:
ion_type = obj.ion_type
if isinstance(obj, IonPyNull):
obj = None
yield _to_obj(annotations=(u'annot1', u'annot2'))
if isinstance(obj, list):
yield _to_obj(IonType.SEXP)
yield _to_obj(IonType.SEXP, annotations=(u'annot1', u'annot2'))
if isinstance(obj, tuple) and not isinstance(obj, SymbolToken):
yield _to_obj(IonType.SEXP, tuple_as_sexp=True)
yield _to_obj(IonType.SEXP, annotations=(u'annot1', u'annot2'), tuple_as_sexp=True)


def _assert_roundtrip(before, after, tuple_as_sexp):
Expand All @@ -594,9 +595,9 @@ def _assert_roundtrip(before, after, tuple_as_sexp):
*tuple(_generate_roundtrips(_ROUNDTRIPS))
)
def test_roundtrip(p):
obj, is_binary, indent, tuple_as_sexp = p
obj, is_binary, indent, tuple_as_sexp, trailing_commas = p
out = BytesIO()
dump(obj, out, binary=is_binary, indent=indent, tuple_as_sexp=tuple_as_sexp)
dump(obj, out, binary=is_binary, indent=indent, tuple_as_sexp=tuple_as_sexp, trailing_commas=trailing_commas)
out.seek(0)
res = load(out)
_assert_roundtrip(obj, res, tuple_as_sexp)
Expand All @@ -606,10 +607,10 @@ def test_roundtrip(p):
*tuple(_generate_roundtrips(_ROUNDTRIPS))
)
def test_roundtrip_ion_stream(p):
obj, is_binary, indent, tuple_as_sexp = p
obj, is_binary, indent, tuple_as_sexp, trailing_commas = p
expected = [obj]
out = BytesIO()
dump(obj, out, binary=is_binary, indent=indent, tuple_as_sexp=tuple_as_sexp)
dump(obj, out, binary=is_binary, indent=indent, tuple_as_sexp=tuple_as_sexp, trailing_commas=trailing_commas)
out.seek(0)
res = load(out, single_value=False, parse_eagerly=True)
_assert_roundtrip(expected, res, tuple_as_sexp)
Expand Down Expand Up @@ -643,6 +644,7 @@ class PrettyPrintParams(NamedTuple):
indent: str
exact_text: Optional[str] = None
regexes: Sequence = []
trailing_commas: bool = False

@parametrize(
PrettyPrintParams(ion_text='a', indent=' ', exact_text="$ion_1_0\na"),
Expand All @@ -659,15 +661,36 @@ class PrettyPrintParams(NamedTuple):
exact_text="$ion_1_0\n[\n\tapple,\n\t\"banana\",\n\t{\n\t\troof: false\n\t}\n]"),
PrettyPrintParams(ion_text='[apple, {roof: false, walls:4, door: wood::large::true}]', indent='\t',
regexes=["\\A\\$ion_1_0\n\\[\n\tapple,\n\t\\{", "\n\t\tdoor: wood::large::true,?\n",
"\n\t\troof: false,?\n", "\n\t\twalls: 4,?\n", "\n\t\\}\n\\]\\Z"])
"\n\t\troof: false,?\n", "\n\t\twalls: 4,?\n", "\n\t\\}\n\\]\\Z"]),

PrettyPrintParams(ion_text='a', indent=' ', trailing_commas=True, exact_text="$ion_1_0\na"),
PrettyPrintParams(ion_text='"a"', indent=' ', trailing_commas=True, exact_text="$ion_1_0\n\"a\""),
PrettyPrintParams(ion_text='\'$a__9\'', indent=' ', trailing_commas=True,
exact_text="$ion_1_0\n$a__9"),
PrettyPrintParams(ion_text='\'$a_\\\'_9\'', indent=' ', trailing_commas=True,
exact_text="$ion_1_0\n\'$a_\\\'_9\'"),
PrettyPrintParams(ion_text='[a, b, chair::2008-08-08T]', indent=' ', trailing_commas=True,
exact_text="$ion_1_0\n[\n a,\n b,\n chair::2008-08-08T,\n]"),
PrettyPrintParams(ion_text='[a, b, chair::2008-08-08T]',
indent=None, trailing_commas=True, # not pretty print
exact_text="$ion_1_0 [a,b,chair::2008-08-08T]"),
PrettyPrintParams(ion_text='[apple, {roof: false}]', indent='\t', trailing_commas=True,
exact_text="$ion_1_0\n[\n\tapple,\n\t{\n\t\troof: false,\n\t},\n]"),
PrettyPrintParams(ion_text='[apple, "banana", {roof: false}]', indent='\t', trailing_commas=True,
exact_text="$ion_1_0\n[\n\tapple,\n\t\"banana\",\n\t{\n\t\troof: false,\n\t},\n]"),
PrettyPrintParams(ion_text='[apple, {roof: false, walls:4, door: wood::large::true}]', indent='\t',
trailing_commas=True,
regexes=["\\A\\$ion_1_0\n\\[\n\tapple,\n\t\\{", "\n\t\tdoor: wood::large::true,\n",
"\n\t\troof: false,\n", "\n\t\twalls: 4,\n", "\n\t\\},\n\\]\\Z"])
)
def test_pretty_print(p):
if c_ext:
# TODO support pretty print for C extension.
return
ion_text, indent, exact_text, regexes = p
ion_text, indent, exact_text, regexes, trailing_commas = p
ion_value = loads(ion_text)
actual_pretty_ion_text = dumps(ion_value, binary=False, indent=indent)
actual_pretty_ion_text = dumps(ion_value, binary=False, indent=indent,
trailing_commas=trailing_commas)
if exact_text is not None:
assert actual_pretty_ion_text == exact_text
for regex_str in regexes:
Expand Down
18 changes: 18 additions & 0 deletions tests/test_writer_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def new_writer():
def test_raw_writer(p):
assert_writer_events(p, new_writer)


@parametrize(
(None, True),
('', True),
Expand Down Expand Up @@ -250,6 +251,7 @@ class _P_Q(NamedTuple):

pass


@parametrize(
_P_Q('a', False),
_P_Q('a0', False),
Expand Down Expand Up @@ -296,3 +298,19 @@ def test_quote_symbols(p):
assert needs_quotes == quoted
assert len(ion_value.ion_annotations) == 1
assert ion_value.ion_annotations[0].text == symbol_text


@parametrize(
(' ', True),
(' ', False),
(None, True),
(None, False))
def test_trailing_commas(p):
indent, trailing_commas = p

# Ensure raw_writer can handle trailing_commas value with indent value for all value combinations.
raw_writer(indent, trailing_commas)

# Ensure that a value dumped using the indent and trailing_commas values can be successfully loaded.
ion_val = loads('[a, {x:2, y: height::17}]')
assert ion_val == loads(dumps(ion_val, binary=False, indent=indent, trailing_commas=trailing_commas))
Loading