Skip to content

Commit

Permalink
Add a CBORGenerator feature for lenient unicode encoding
Browse files Browse the repository at this point in the history
If enabled, the generator will output the Unicode Replacement Character
for invalid Unicode sequences (invalid surrogate chars in the Java
String) instead of failing with an IllegalArgumentException.
  • Loading branch information
guillaumebort committed Sep 30, 2020
1 parent f5853dc commit 02a2cbc
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 69 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
{
private final static int[] NO_INTS = new int[0];

/**
* The replacement character to use to fix invalid unicode sequences.
*/
final static int REPLACEMENT_CHAR = 0xfffd;

/**
* Let's ensure that we have big enough output buffer because of safety
* margins we need for UTF-8 encoding.
Expand Down Expand Up @@ -63,7 +68,14 @@ public enum Feature implements FormatFeature {
*
* @since 2.5
*/
WRITE_TYPE_HEADER(false)
WRITE_TYPE_HEADER(false),

/**
* Feature that determines whether an invalid surrogate encoding found in the
* incoming String should fail with an exception or be silently output
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD).
*/
LENIENT_UTF_ENCODING(false),

;

Expand Down Expand Up @@ -140,6 +152,8 @@ public int getMask() {

protected boolean _cfgMinimalInts;

protected boolean _cfgLenientUnicodeEncoding;

/*
/**********************************************************
/* Output state
Expand Down Expand Up @@ -234,6 +248,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
_cborContext = CBORWriteContext.createRootContext(dups);
_formatFeatures = formatFeatures;
_cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
_cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
_ioContext = ctxt;
_out = out;
_bufferRecyclable = true;
Expand Down Expand Up @@ -406,6 +421,9 @@ public CBORGenerator enable(Feature f) {
if (f == Feature.WRITE_MINIMAL_INTS) {
_cfgMinimalInts = true;
}
if (f == Feature.LENIENT_UTF_ENCODING) {
_cfgLenientUnicodeEncoding = true;
}
return this;
}

Expand All @@ -414,6 +432,9 @@ public CBORGenerator disable(Feature f) {
if (f == Feature.WRITE_MINIMAL_INTS) {
_cfgMinimalInts = false;
}
if (f == Feature.LENIENT_UTF_ENCODING) {
_cfgLenientUnicodeEncoding = false;
}
return this;
}

Expand Down Expand Up @@ -1424,81 +1445,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
do {
int c = str[i];
if (c > 0x7F) {
return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
return _encode2(i, outputPtr, str, end, outputStart);
}
outBuf[outputPtr++] = (byte) c;
} while (++i < end);
return outputPtr - outputStart;
}

/**
 * Slow-path UTF-8 encoder, used once a non-ASCII character has been seen in
 * input that the caller already knows will fit in the output buffer.
 * Encodes {@code str[i..end)} into the output buffer starting at
 * {@code outputPtr}.
 *
 * @param str source characters
 * @param i index of the first character to encode
 * @param end index one past the last character to encode
 * @param outputPtr offset in the output buffer at which to start writing
 * @param outputStart offset the byte count is measured from
 * @return number of bytes between {@code outputStart} and the final write
 *    position
 */
private final int _shortUTF8Encode2(char[] str, int i, int end,
        int outputPtr, int outputStart) {
    final byte[] target = _outputBuffer;
    int in = i;
    int out = outputPtr;
    while (in < end) {
        final int ch = str[in++];
        if (ch <= 0x7F) {
            // plain ASCII, single byte
            target[out++] = (byte) ch;
        } else if (ch < 0x800) {
            // 2-byte sequence
            target[out++] = (byte) (0xc0 | (ch >> 6));
            target[out++] = (byte) (0x80 | (ch & 0x3f));
        } else if (ch >= SURR1_FIRST && ch <= SURR2_LAST) {
            // Surrogate: must be a first-range char...
            if (ch > SURR1_LAST) {
                _throwIllegalSurrogate(ch);
            }
            // ... and must be followed by another char to pair with
            if (in >= end) {
                _throwIllegalSurrogate(ch);
            }
            final int codePoint = _convertSurrogate(ch, str[in++]);
            if (codePoint > 0x10FFFF) { // illegal in JSON as well as in XML
                _throwIllegalSurrogate(codePoint);
            }
            // 4-byte sequence
            target[out++] = (byte) (0xf0 | (codePoint >> 18));
            target[out++] = (byte) (0x80 | ((codePoint >> 12) & 0x3f));
            target[out++] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
            target[out++] = (byte) (0x80 | (codePoint & 0x3f));
        } else {
            // regular 3-byte character
            target[out++] = (byte) (0xe0 | (ch >> 12));
            target[out++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
            target[out++] = (byte) (0x80 | (ch & 0x3f));
        }
    }
    return out - outputStart;
}

private final int _encode(int outputPtr, String str, int len) {
final byte[] outBuf = _outputBuffer;
final int outputStart = outputPtr;

for (int i = 0; i < len; ++i) {
int c = str.charAt(i);
if (c > 0x7F) {
return _encode2(i, outputPtr, str, len, outputStart);
return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
}
outBuf[outputPtr++] = (byte) c;
}
return (outputPtr - outputStart);
}

private final int _encode2(int i, int outputPtr, String str, int len,
private final int _encode2(int i, int outputPtr, char[] str, int len,
int outputStart) {
final byte[] outBuf = _outputBuffer;
// no; non-ASCII stuff, slower loop
while (i < len) {
int c = str.charAt(i++);
int c = str[i++];
if (c <= 0x7F) {
outBuf[outputPtr++] = (byte) c;
continue;
Expand All @@ -1520,20 +1493,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
if (_cfgLenientUnicodeEncoding) {
c = REPLACEMENT_CHAR;
} else {
_throwIllegalSurrogate(c);
}
}
// ... meaning it must have a pair
if (i >= len) {
_throwIllegalSurrogate(c);
else if (i >= len) {
if (_cfgLenientUnicodeEncoding) {
c = REPLACEMENT_CHAR;
} else {
_throwIllegalSurrogate(c);
}
}
c = _convertSurrogate(c, str.charAt(i++));
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// ... verify that the next character is in range
else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
if (_cfgLenientUnicodeEncoding) {
c = REPLACEMENT_CHAR;
} else {
_throwIllegalSurrogatePair(c, str[i]);
}
}
// ... we have a valid surrogate pair
else {
c = _convertSurrogate(c, str[i++]);
}
// if we substituted the replacement char, we actually have a 3-byte char
if (c == REPLACEMENT_CHAR) {
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
} else {
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
}
return (outputPtr - outputStart);
}
Expand All @@ -1542,16 +1538,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
* Method called to calculate UTF codepoint, from a surrogate pair.
*/
private int _convertSurrogate(int firstPart, int secondPart) {
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
throw new IllegalArgumentException(
int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+ (secondPart - SURR2_FIRST);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
if (_cfgLenientUnicodeEncoding) {
c = REPLACEMENT_CHAR;
} else {
_throwIllegalSurrogate(c);
}
}
return c;
}

private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
throw new IllegalArgumentException(
"Broken surrogate pair: first char 0x"
+ Integer.toHexString(firstPart) + ", second 0x"
+ Integer.toHexString(secondPart)
+ "; illegal combination");
}
return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+ (secondPart - SURR2_FIRST);
}

private void _throwIllegalSurrogate(int code) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
return f.createGenerator(result, null);
}

/**
 * Creates a generator writing to {@code result} with
 * {@link CBORGenerator.Feature#LENIENT_UTF_ENCODING} switched on.
 *
 * @param result stream the generator writes to
 * @return the newly created generator, with lenient encoding enabled
 */
protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
    throws IOException
{
    // enable() hands back the generator itself, so the call can be chained
    return cborGenerator(result).enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
}

/*
/**********************************************************
/* Additional assertion methods
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@

import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.*;

import org.junit.Assert;

import com.fasterxml.jackson.core.JsonGenerationException;

import com.fasterxml.jackson.databind.ObjectMapper;

import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;

public class UnicodeGenerationTest extends CBORTestBase
{
    /**
     * Test that encoding a String containing invalid surrogates fails with an
     * exception, and that nothing is left in the output buffer afterwards.
     */
    public void testFailForInvalidSurrogate() throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        CBORGenerator gen = cborGenerator(out);

        assertEquals(0, gen.getOutputBuffered());

        // Unmatched first surrogate character
        _verifyWriteFails(gen, "x\ud83d");

        // Unmatched second surrogate character
        _verifyWriteFails(gen, "x\ude01");

        // Unmatched second surrogate character (2)
        _verifyWriteFails(gen, "x\ude01x");

        // Broken surrogate pair
        _verifyWriteFails(gen, "x\ud83dx");
    }

    /**
     * Test that when the lenient unicode feature is enabled, the replacement
     * character is used to fix invalid sequences.
     */
    public void testRecoverInvalidSurrogate() throws Exception
    {
        // Unmatched first surrogate character
        _verifyLenientOutput("x\ud83d", "x\ufffd");

        // Unmatched second surrogate character
        _verifyLenientOutput("x\ude01", "x\ufffd");

        // Unmatched second surrogate character (2)
        _verifyLenientOutput("x\ude01x", "x\ufffdx");

        // Broken surrogate pair
        _verifyLenientOutput("x\ud83dx", "x\ufffdx");
    }

    /**
     * Asserts that writing {@code input} is rejected with an
     * {@link IllegalArgumentException} and that the generator has no
     * buffered output afterwards.
     *
     * @param gen generator to write to
     * @param input String containing an invalid surrogate sequence
     */
    private void _verifyWriteFails(CBORGenerator gen, String input) throws Exception
    {
        try {
            gen.writeString(input);
            // Without this the test would pass even if no exception is thrown
            Assert.fail("Should not pass: expected IllegalArgumentException for invalid surrogate");
        } catch (IllegalArgumentException expected) {
            // expected failure; the check below verifies nothing was buffered
        }
        assertEquals(0, gen.getOutputBuffered());
    }

    /**
     * Writes {@code input} with a fresh lenient generator and verifies that
     * the bytes produced are a text prefix followed by the UTF-8 encoding
     * of {@code expected}.
     *
     * @param input String containing an invalid surrogate sequence
     * @param expected text expected in the output, with U+FFFD in place of
     *    the invalid sequence
     */
    private void _verifyLenientOutput(String input, String expected) throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        CBORGenerator gen = lenientUnicodeCborGenerator(out);
        assertEquals(0, gen.getOutputBuffered());

        gen.writeString(input);
        gen.close();

        byte[] b = expected.getBytes("utf-8");
        _verifyBytes(out.toByteArray(),
                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
    }
}

0 comments on commit 02a2cbc

Please sign in to comment.