Add a CBORGenerator feature for lenient unicode encoding

If enabled, the generator will output the Unicode Replacement Character for invalid unicode sequence (invalid surrogate chars in the Java String) instead of failing with an IllegalArgumentException
FasterXML · Sep 30, 2020 · 02a2cbc · 02a2cbc
1 parent f5853dc
commit 02a2cbc
Show file tree

Hide file tree

Showing 3 changed files with 195 additions and 69 deletions.
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
 {
     private final static int[] NO_INTS = new int[0];
 
+    /**
+     * The replacement character to use to fix invalid unicode sequences.
+     */
+    final static int REPLACEMENT_CHAR = 0xfffd;
+
     /**
      * Let's ensure that we have big enough output buffer because of safety
      * margins we need for UTF-8 encoding.
@@ -63,7 +68,14 @@ public enum Feature implements FormatFeature {
          *
          * @since 2.5
          */
-        WRITE_TYPE_HEADER(false)
+        WRITE_TYPE_HEADER(false),
+
+        /**
+         * Feature that determines if an invalid surrogate encoding found in the
+         * incoming String should fail with an exception or silently be outputed
+         * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
+         */
+        LENIENT_UTF_ENCODING(false),
 
         ;
 
@@ -140,6 +152,8 @@ public int getMask() {
 
     protected boolean _cfgMinimalInts;
 
+    protected boolean _cfgLenientUnicodeEncoding;
+
     /*
     /**********************************************************
     /* Output state
@@ -234,6 +248,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
         _cborContext = CBORWriteContext.createRootContext(dups);
         _formatFeatures = formatFeatures;
         _cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
+        _cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
         _ioContext = ctxt;
         _out = out;
         _bufferRecyclable = true;
@@ -406,6 +421,9 @@ public CBORGenerator enable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = true;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = true;
+        }
         return this;
     }
 
@@ -414,6 +432,9 @@ public CBORGenerator disable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = false;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = false;
+        }
         return this;
     }
 
@@ -1424,81 +1445,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
         do {
             int c = str[i];
             if (c > 0x7F) {
-                return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
+                return _encode2(i, outputPtr, str, end, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         } while (++i < end);
         return outputPtr - outputStart;
     }
 
-    /**
-     * Helper method called when the whole character sequence is known to fit in
-     * the output buffer, but not all characters are single-byte (ASCII)
-     * characters.
-     */
-    private final int _shortUTF8Encode2(char[] str, int i, int end,
-            int outputPtr, int outputStart) {
-        final byte[] outBuf = _outputBuffer;
-        while (i < end) {
-            int c = str[i++];
-            if (c <= 0x7F) {
-                outBuf[outputPtr++] = (byte) c;
-                continue;
-            }
-            // Nope, multi-byte:
-            if (c < 0x800) { // 2-byte
-                outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
-                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
-                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
-            }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-        }
-        return (outputPtr - outputStart);
-    }
-
     private final int _encode(int outputPtr, String str, int len) {
         final byte[] outBuf = _outputBuffer;
         final int outputStart = outputPtr;
 
         for (int i = 0; i < len; ++i) {
             int c = str.charAt(i);
             if (c > 0x7F) {
-                return _encode2(i, outputPtr, str, len, outputStart);
+                return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         }
         return (outputPtr - outputStart);
     }
 
-    private final int _encode2(int i, int outputPtr, String str, int len,
+    private final int _encode2(int i, int outputPtr, char[] str, int len,
             int outputStart) {
         final byte[] outBuf = _outputBuffer;
         // no; non-ASCII stuff, slower loop
         while (i < len) {
-            int c = str.charAt(i++);
+            int c = str[i++];
             if (c <= 0x7F) {
                 outBuf[outputPtr++] = (byte) c;
                 continue;
@@ -1520,20 +1493,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
             }
             // Yup, a surrogate pair
             if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
             // ... meaning it must have a pair
-            if (i >= len) {
-                _throwIllegalSurrogate(c);
+            else if (i >= len) {
+               if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // ... verify that the next character is in range
+            else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogatePair(c, str[i]);
+                }
+            }
+            // ... we have a valid surrogate pair
+            else {
+                c = _convertSurrogate(c, str[i++]);
+            }
+            // if we replaced by the replacement char we actually have a 3 bytes char
+            if (c == REPLACEMENT_CHAR) {
+                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            } else {
+                outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
         }
         return (outputPtr - outputStart);
     }
@@ -1542,16 +1538,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
      * Method called to calculate UTF codepoint, from a surrogate pair.
      */
     private int _convertSurrogate(int firstPart, int secondPart) {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            throw new IllegalArgumentException(
+        int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+                + (secondPart - SURR2_FIRST);
+        if (c > 0x10FFFF) { // illegal in JSON as well as in XML
+            if (_cfgLenientUnicodeEncoding) {
+                c = REPLACEMENT_CHAR;
+            } else {
+                _throwIllegalSurrogate(c);
+            }
+        }
+        return c;
+    }
+
+    private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
+         throw new IllegalArgumentException(
                     "Broken surrogate pair: first char 0x"
                             + Integer.toHexString(firstPart) + ", second 0x"
                             + Integer.toHexString(secondPart)
                             + "; illegal combination");
-        }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
-                + (secondPart - SURR2_FIRST);
     }
 
     private void _throwIllegalSurrogate(int code) {

diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
         return f.createGenerator(result, null);
     }
 
+    protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
+        throws IOException
+    {
+        CBORGenerator gen = cborGenerator(result);
+        gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
+        return gen;
+    }
+
     /*
     /**********************************************************
     /* Additional assertion methods

diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java
@@ -0,0 +1,114 @@
+
+import java.io.*;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.*;
+
+import org.junit.Assert;
+
+import com.fasterxml.jackson.core.JsonGenerationException;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import com.fasterxml.jackson.dataformat.cbor.CBORParser;
+import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
+
+public class UnicodeGenerationTest extends CBORTestBase
+{   
+    /**
+     * Test that encoding a String containing invalid surrogates fail with an exception
+     */
+    public void testFailForInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        CBORGenerator gen = cborGenerator(out);
+
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character
+        try { 
+            gen.writeString("x\ud83d");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character
+        try { 
+            gen.writeString("x\ude01");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character (2)
+        try { 
+            gen.writeString("x\ude01x");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Broken surrogate pair
+        try { 
+            gen.writeString("x\ud83dx");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+    }
+
+    /**
+     * Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
+     */
+    public void testRecoverInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out;
+        CBORGenerator gen;
+        byte[] b;
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character
+        gen.writeString("x\ud83d");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character
+        gen.writeString("x\ude01");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character (2)
+        gen.writeString("x\ude01x");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Broken surrogate pair
+        gen.writeString("x\ud83dx");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+    }
+
+}