Skip to content

Commit

Permalink
Manually merged #222
Browse files Browse the repository at this point in the history
  • Loading branch information
cowtowncoder committed Oct 29, 2020
1 parent 1f3cbdc commit 314bd30
Show file tree
Hide file tree
Showing 5 changed files with 238 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ public class CBORGenerator extends GeneratorBase
*/
final static int BYTE_BUFFER_FOR_OUTPUT = 16000;

/**
* The replacement character to use to fix invalid Unicode sequences
* (mismatched surrogate pair).
*
* @since 2.12
*/
final static int REPLACEMENT_CHAR = 0xfffd;

/**
* Longest char chunk we will output is chosen so that it is guaranteed to
* fit in an empty buffer even if everything encoded in 3-byte sequences;
Expand Down Expand Up @@ -58,13 +66,25 @@ public enum Feature implements FormatFeature {
* 55799, encoded as 3-byte sequence of <code>0xD9, 0xD9, 0xF7</code>)
* should be written at the beginning of document or not.
* <p>
* Default value is <code>false</code> meaning that type tag will not be
* Default value is {@code false} meaning that type tag will not be
* written at the beginning of a new document.
*
* @since 2.5
*/
WRITE_TYPE_HEADER(false)
WRITE_TYPE_HEADER(false),

/**
* Feature that determines how an invalid surrogate encoding found in the
* incoming String is handled: if enabled, the invalid sequence is silently
* output as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
* an exception will be thrown to indicate invalid content.
*<p>
* Default value is {@code false} (for backwards compatibility) meaning that
* an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
*
* @since 2.12
*/
LENIENT_UTF_ENCODING(false),
;

protected final boolean _defaultState;
Expand Down Expand Up @@ -201,7 +221,7 @@ public int getMask() {

/**
* Number of elements remaining in the current complex structure (if any),
* when writing defined-length Arrays, Objects; marker {@link #INDEFINITE_LENGTH}
* when writing defined-length Arrays, Objects; marker {@code INDEFINITE_LENGTH}
* otherwise.
*/
protected int _currentRemainingElements = INDEFINITE_LENGTH;
Expand Down Expand Up @@ -1452,29 +1472,25 @@ private final int _shortUTF8Encode2(char[] str, int i, int end,
continue;
}
// 3 or 4 bytes (surrogate)
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
}
// ... meaning it must have a pair
if (i >= end) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str[i++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
final int d = str[i];
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++i;
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
continue;
}
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
continue;
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
}
return (outputPtr - outputStart);
}
Expand Down Expand Up @@ -1510,70 +1526,76 @@ private final int _encode2(int i, int outputPtr, String str, int len,
continue;
}
// 3 or 4 bytes (surrogate)
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte
// character
if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
}
// ... meaning it must have a pair
if (i >= len) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str.charAt(i++));
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (i < len)) { // must be from first range and have another char
final int d = str.charAt(i);
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++i;
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
continue;
}
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
continue;
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
}
return (outputPtr - outputStart);
}

/**
 * Combines the two halves of a surrogate pair into the single Unicode
 * code point they encode.
 *
 * @param firstPart first (leading) char of the pair; not range-checked here
 * @param secondPart second (trailing) char of the pair; must lie within
 *   {@code [SURR2_FIRST, SURR2_LAST]}
 * @return the decoded code point ({@code 0x10000} plus the 20 bits carried by the pair)
 * @throws IllegalArgumentException if {@code secondPart} is outside the
 *   second-surrogate range
 */
private int _convertSurrogate(int firstPart, int secondPart) {
    final boolean secondInRange = (secondPart >= SURR2_FIRST) && (secondPart <= SURR2_LAST);
    if (!secondInRange) {
        throw new IllegalArgumentException(
                "Broken surrogate pair: first char 0x"
                + Integer.toHexString(firstPart) + ", second 0x"
                + Integer.toHexString(secondPart)
                + "; illegal combination");
    }
    // Leading surrogate contributes the upper 10 bits, trailing one the lower 10
    final int upperBits = (firstPart - SURR1_FIRST) << 10;
    final int lowerBits = secondPart - SURR2_FIRST;
    return 0x10000 + upperBits + lowerBits;
}

private void _throwIllegalSurrogate(int code) {
if (code > 0x10FFFF) { // over max?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code)
+ ") to output; max is 0x10FFFF as per RFC 4627");
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without
// second part?)
throw new IllegalArgumentException(
"Unmatched first part of surrogate pair (0x"
+ Integer.toHexString(code) + ")");
}
throw new IllegalArgumentException(
"Unmatched second part of surrogate pair (0x"
+ Integer.toHexString(code) + ")");
/**
 * Handles a surrogate char that cannot start a valid pair: either it is in
 * the first-surrogate range but nothing follows it, or it lies above that range
 * (i.e. a second-part char appearing where a first part was expected).
 *
 * @param code the offending surrogate char
 * @param outBuf output buffer the replacement character is appended to in lenient mode
 * @param outputPtr current write offset in {@code outBuf}
 * @return write offset after the appended replacement character (lenient mode only)
 * @throws IllegalArgumentException if {@code Feature.LENIENT_UTF_ENCODING} is not enabled
 */
private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr) {
    if (!isEnabled(Feature.LENIENT_UTF_ENCODING)) {
        final String msg;
        if (code > SURR1_LAST) {
            // Char is from the second-part range, so it cannot open a pair
            msg = String.format(
                    "Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
                    code);
        } else {
            // Valid first part, but there is no second part to encode
            msg = String.format(
                    "Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
                    code);
        }
        throw new IllegalArgumentException(msg);
    }
    return _appendReplacementChar(outBuf, outputPtr);
}

/**
* Handles the case where a first surrogate char is followed by a char that
* does not complete the pair. If {@code Feature.LENIENT_UTF_ENCODING} is
* enabled, the replacement character is appended to {@code outBuf} and the
* advanced write offset is returned; otherwise an
* {@link IllegalArgumentException} is thrown.
*
* @param surr1 first char of the attempted pair
* @param surr2 char that followed {@code surr1}
* @param outBuf output buffer written to in lenient mode
* @param outputPtr current write offset in {@code outBuf}
* @return write offset after the appended replacement character (lenient mode only)
* @throws IllegalArgumentException if lenient encoding is not enabled
*/
private int _invalidSurrogateEnd(int surr1, int surr2,
byte[] outBuf, int outputPtr)
{
// Lenient mode: substitute the replacement character instead of failing
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
return _appendReplacementChar(outBuf, outputPtr);
}
// should we ever get this?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code) + ") to output");
throw new IllegalArgumentException(String.format(
"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
+" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
surr1, surr2));
}

/**
 * Appends the 3-byte UTF-8 encoding of {@code REPLACEMENT_CHAR} to the
 * output buffer.
 *
 * @param outBuf buffer to write into
 * @param outputPtr offset of the first byte to write
 * @return offset just past the three bytes written
 */
private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
    final int ch = REPLACEMENT_CHAR;
    outBuf[outputPtr] = (byte) (0xe0 | (ch >> 12));
    outBuf[outputPtr + 1] = (byte) (0x80 | ((ch >> 6) & 0x3f));
    outBuf[outputPtr + 2] = (byte) (0x80 | (ch & 0x3f));
    return outputPtr + 3;
}

/**
 * Decodes a surrogate pair into its code point and appends the 4-byte
 * UTF-8 encoding of that code point to the output buffer. Performs no
 * range validation of its own.
 *
 * @param surr1 first (leading) surrogate of the pair
 * @param surr2 second (trailing) surrogate of the pair
 * @param outBuf buffer to write into
 * @param outputPtr offset of the first byte to write
 * @return offset just past the four bytes written
 */
private int _decodeAndWriteSurrogate(int surr1, int surr2,
        byte[] outBuf, int outputPtr)
{
    final int upperBits = (surr1 - SURR1_FIRST) << 10;
    final int codePoint = 0x10000 + upperBits + (surr2 - SURR2_FIRST);
    outBuf[outputPtr] = (byte) (0xf0 | (codePoint >> 18));
    outBuf[outputPtr + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3f));
    outBuf[outputPtr + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
    outBuf[outputPtr + 3] = (byte) (0x80 | (codePoint & 0x3f));
    return outputPtr + 4;
}

/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,16 @@ protected CBORGenerator cborGenerator(CBORFactory f,
return f.createGenerator(result, null);
}

/**
 * Creates a generator, writing to the given stream, from a factory that
 * has {@code CBORGenerator.Feature.LENIENT_UTF_ENCODING} enabled.
 *
 * @param result stream the generator writes its output to
 * @return generator with lenient UTF encoding enabled
 * @since 2.12
 */
protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
    throws IOException
{
    final CBORFactory lenientFactory = cborFactoryBuilder()
            .enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING)
            .build();
    return lenientFactory.createGenerator(result);
}

/*
/**********************************************************
/* Additional assertion methods
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package com.fasterxml.jackson.dataformat.cbor.gen;

import java.io.ByteArrayOutputStream;

import com.fasterxml.jackson.dataformat.cbor.*;

/**
 * Tests for handling of invalid surrogate sequences when writing Strings:
 * with default settings they must be rejected with an exception, and with
 * {@link CBORGenerator.Feature#LENIENT_UTF_ENCODING} enabled they must be
 * replaced by the Unicode replacement character (U+FFFD).
 */
public class LenientUnicodeGenerationTest extends CBORTestBase
{
    /**
     * Test that encoding a String containing invalid surrogates fails with an exception
     * when lenient encoding is not enabled.
     */
    public void testFailForInvalidSurrogate() throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        CBORGenerator gen = cborGenerator(out);

        assertEquals(0, gen.getOutputBuffered());

        // Unmatched first surrogate character: input ends right after it
        try {
            gen.writeString("x\ud83d");
            // Without this the test would silently pass if nothing is thrown
            fail("Should not pass: unmatched first surrogate must be rejected");
        } catch (IllegalArgumentException e) {
            verifyException(e, "Unmatched surrogate pair");
            verifyException(e, "0xD83D");
            verifyException(e, "without low surrogate");
        }
        assertEquals(0, gen.getOutputBuffered());

        // Invalid first surrogate character: second-range char where a first part is expected
        try {
            gen.writeString("x\ude01");
            fail("Should not pass: invalid first surrogate must be rejected");
        } catch (IllegalArgumentException e) {
            verifyException(e, "Invalid surrogate pair");
            verifyException(e, "0xDE01");
            verifyException(e, "invalid high surrogate");
        }
        assertEquals(0, gen.getOutputBuffered());

        // Invalid second surrogate character (1): followed by another first-range surrogate
        try {
            gen.writeString("x\ud801\ud802");
            fail("Should not pass: invalid second surrogate must be rejected");
        } catch (IllegalArgumentException e) {
            verifyException(e, "Invalid surrogate pair");
            verifyException(e, "0xD801");
            verifyException(e, "0xD802");
            verifyException(e, "valid high surrogate");
            verifyException(e, "invalid low surrogate");
        }
        assertEquals(0, gen.getOutputBuffered());

        // Invalid second surrogate character (2): followed by a regular character
        try {
            gen.writeString("x\ud83dx");
            fail("Should not pass: invalid second surrogate must be rejected");
        } catch (IllegalArgumentException e) {
            verifyException(e, "Invalid surrogate pair");
            verifyException(e, "0xD83D");
            verifyException(e, "0x0078");
            verifyException(e, "valid high surrogate");
            verifyException(e, "invalid low surrogate");
        }
        assertEquals(0, gen.getOutputBuffered());
    }

    /**
     * Test that when the lenient unicode feature is enabled, the replacement character
     * is used to fix invalid sequences.
     */
    public void testRecoverInvalidSurrogate1() throws Exception
    {
        // Unmatched first surrogate character
        _verifyLenientOutput("x\ud83d", "x\ufffd");

        // Unmatched second surrogate character
        _verifyLenientOutput("x\ude01", "x\ufffd");

        // Unmatched second surrogate character (2)
        _verifyLenientOutput("x\ude01x", "x\ufffdx");
    }

    /**
     * Test that a first surrogate followed by a regular character is replaced
     * by the replacement character when the lenient unicode feature is enabled.
     */
    public void testRecoverInvalidSurrogate2() throws Exception
    {
        // Broken surrogate pair
        _verifyLenientOutput("X\ud83dY", "X\ufffdY");
    }

    /**
     * Writes {@code input} with a fresh lenient generator and verifies that the
     * encoded document is a short text value whose payload is the UTF-8
     * encoding of {@code expected}.
     */
    private void _verifyLenientOutput(String input, String expected) throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        CBORGenerator gen = lenientUnicodeCborGenerator(out);
        assertEquals(0, gen.getOutputBuffered());

        gen.writeString(input);
        gen.close();
        byte[] b = expected.getBytes("utf-8");
        _verifyBytes(out.toByteArray(),
                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
    }
}
6 changes: 6 additions & 0 deletions release-notes/CREDITS-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,9 @@ Michael Liedtke (mcliedtke@github)
* Contributed fix for #212: (ion) Optimize `IonParser.getNumberType()` using
`IonReader.getIntegerSize()`
(2.12.0)

Guillaume Bort (guillaumebort@github)

* Contributed implementation of #222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING`
for lenient handling of Unicode surrogate pairs on writing
(2.12.0)
3 changes: 3 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ Project: jackson-datatypes-binaryModules:
(contributed by Paul F)
#212: (ion) Optimize `IonParser.getNumberType()` using `IonReader.getIntegerSize()`
(contributed by Michael L)
#222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling of
Unicode surrogate pairs on writing
(contributed by Guillaume B)
- Add Gradle Module Metadata (https://blog.gradle.org/alignment-with-gradle-module-metadata)

2.11.3 (02-Oct-2020)
Expand Down

0 comments on commit 314bd30

Please sign in to comment.