Manually merged #222

FasterXML · Oct 29, 2020 · 314bd30 · 314bd30
1 parent 1f3cbdc
commit 314bd30
Show file tree

Hide file tree

Showing 5 changed files with 238 additions and 73 deletions.
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -28,6 +28,14 @@ public class CBORGenerator extends GeneratorBase
      */
     final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
 
+    /**
+     * The replacement character to use to fix invalid Unicode sequences
+     * (mismatched surrogate pair).
+     *
+     * @since 2.12
+     */
+    final static int REPLACEMENT_CHAR = 0xfffd;
+
     /**
      * Longest char chunk we will output is chosen so that it is guaranteed to
      * fit in an empty buffer even if everything encoded in 3-byte sequences;
@@ -58,13 +66,25 @@ public enum Feature implements FormatFeature {
          * 55799, encoded as 3-byte sequence of <code>0xD9, 0xD9, 0xF7</code>)
          * should be written at the beginning of document or not.
          * <p>
-         * Default value is <code>false</code> meaning that type tag will not be
+         * Default value is {@code false} meaning that type tag will not be
          * written at the beginning of a new document.
          *
          * @since 2.5
          */
-        WRITE_TYPE_HEADER(false)
+        WRITE_TYPE_HEADER(false),
 
+        /**
+         * Feature that determines whether an invalid surrogate encoding found in the
+         * incoming String is silently output as the Unicode 'REPLACEMENT CHARACTER'
+         * (U+FFFD) when enabled; when disabled, an exception will be thrown
+         * to indicate invalid content.
+         *<p>
+         * Default value is {@code false} (for backwards compatibility) meaning that
+         * an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
+         *
+         * @since 2.12
+         */
+        LENIENT_UTF_ENCODING(false),
         ;
 
         protected final boolean _defaultState;
@@ -201,7 +221,7 @@ public int getMask() {
 
     /**
      * Number of elements remaining in the current complex structure (if any),
-     * when writing defined-length Arrays, Objects; marker {@link #INDEFINITE_LENGTH}
+     * when writing defined-length Arrays, Objects; marker {@code INDEFINITE_LENGTH}
      * otherwise.
      */
     protected int _currentRemainingElements = INDEFINITE_LENGTH;
@@ -1452,29 +1472,25 @@ private final int _shortUTF8Encode2(char[] str, int i, int end,
                 continue;
             }
             // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
+            if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
                 outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
                 outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                 outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
                 continue;
             }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // Yup, looks like a surrogate pair... but is it?
+            if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
+                final int d = str[i];
+                if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                    ++i;
+                    outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
+                    continue;
+                }
+                outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
+                continue;
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            // Nah, something wrong
+            outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
         }
         return (outputPtr - outputStart);
     }
@@ -1510,70 +1526,76 @@ private final int _encode2(int i, int outputPtr, String str, int len,
                 continue;
             }
             // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte
-                                                     // character
+            if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
                 outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
                 outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                 outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
                 continue;
             }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= len) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // Yup, looks like a surrogate pair... but is it?
+            if ((c <= SURR1_LAST) && (i < len)) { // must be from first range and have another char
+                final int d = str.charAt(i);
+                if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                    ++i;
+                    outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
+                    continue;
+                }
+                outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
+                continue;
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            // Nah, something wrong
+            outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
         }
         return (outputPtr - outputStart);
     }
 
-    /**
-     * Method called to calculate UTF codepoint, from a surrogate pair.
-     */
-    private int _convertSurrogate(int firstPart, int secondPart) {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            throw new IllegalArgumentException(
-                    "Broken surrogate pair: first char 0x"
-                            + Integer.toHexString(firstPart) + ", second 0x"
-                            + Integer.toHexString(secondPart)
-                            + "; illegal combination");
-        }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
-                + (secondPart - SURR2_FIRST);
-    }
-
-    private void _throwIllegalSurrogate(int code) {
-        if (code > 0x10FFFF) { // over max?
-            throw new IllegalArgumentException("Illegal character point (0x"
-                    + Integer.toHexString(code)
-                    + ") to output; max is 0x10FFFF as per RFC 4627");
-        }
-        if (code >= SURR1_FIRST) {
-            if (code <= SURR1_LAST) { // Unmatched first part (closing without
-                                      // second part?)
-                throw new IllegalArgumentException(
-                        "Unmatched first part of surrogate pair (0x"
-                                + Integer.toHexString(code) + ")");
-            }
-            throw new IllegalArgumentException(
-                    "Unmatched second part of surrogate pair (0x"
-                            + Integer.toHexString(code) + ")");
+    private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr) {
+        if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
+            return _appendReplacementChar(outBuf, outputPtr);
+        }
+        // Will be called in two distinct cases: either first character is
+        // invalid (code range of second part), or first character is valid
+        // but there is no second part to encode
+        if (code <= SURR1_LAST) {
+            // Unmatched first part (closing without second part?)
+            throw new IllegalArgumentException(String.format(
+"Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
+code));
+        }
+        throw new IllegalArgumentException(String.format(
+"Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
+code));
+    }
+
+    private int _invalidSurrogateEnd(int surr1, int surr2,
+            byte[] outBuf, int outputPtr)
+    {
+        if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
+            return _appendReplacementChar(outBuf, outputPtr);
         }
-        // should we ever get this?
-        throw new IllegalArgumentException("Illegal character point (0x"
-                + Integer.toHexString(code) + ") to output");
+        throw new IllegalArgumentException(String.format(
+"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
++" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
+surr1, surr2));
+    }
+
+    private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
+        outBuf[outputPtr++] = (byte) (0xe0 | (REPLACEMENT_CHAR >> 12));
+        outBuf[outputPtr++] = (byte) (0x80 | ((REPLACEMENT_CHAR >> 6) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | (REPLACEMENT_CHAR & 0x3f));
+        return outputPtr;
+    }
+
+    private int _decodeAndWriteSurrogate(int surr1, int surr2,
+            byte[] outBuf, int outputPtr)
+    {
+        final int c = 0x10000 + ((surr1 - SURR1_FIRST) << 10)
+                + (surr2 - SURR2_FIRST);
+        outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+        outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+        return outputPtr;
     }
 
     /*

diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -85,6 +85,16 @@ protected CBORGenerator cborGenerator(CBORFactory f,
         return f.createGenerator(result, null);
     }
 
+    // @since 2.12
+    protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
+        throws IOException
+    {
+        return cborFactoryBuilder()
+                .enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING)
+                .build()
+                .createGenerator(result);
+    }
+
     /*
     /**********************************************************
     /* Additional assertion methods

diff --git a/...src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/LenientUnicodeGenerationTest.java b/...src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/LenientUnicodeGenerationTest.java
@@ -0,0 +1,124 @@
+package com.fasterxml.jackson.dataformat.cbor.gen;
+
+import java.io.ByteArrayOutputStream;
+
+import com.fasterxml.jackson.dataformat.cbor.*;
+
+public class LenientUnicodeGenerationTest extends CBORTestBase
+{
+    /**
+     * Test that encoding a String containing invalid surrogates fails with an exception.
+     */
+    public void testFailForInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        CBORGenerator gen = cborGenerator(out);
+
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character (valid high surrogate with nothing after it)
+        try { 
+            gen.writeString("x\ud83d");
+        } catch (IllegalArgumentException e) {
+            verifyException(e, "Unmatched surrogate pair");
+            verifyException(e, "0xD83D");
+            verifyException(e, "without low surrogate");
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Invalid first surrogate character (low surrogate where a high surrogate is expected)
+        try { 
+            gen.writeString("x\ude01");
+        } catch (IllegalArgumentException e) {
+            verifyException(e, "Invalid surrogate pair");
+            verifyException(e, "0xDE01");
+            verifyException(e, "invalid high surrogate");
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Invalid second surrogate character (1)
+        try { 
+            gen.writeString("x\ud801\ud802");
+        } catch (IllegalArgumentException e) {
+            verifyException(e, "Invalid surrogate pair");
+            verifyException(e, "0xD801");
+            verifyException(e, "0xD802");
+            verifyException(e, "valid high surrogate");
+            verifyException(e, "invalid low surrogate");
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Invalid second surrogate character (2)
+        try { 
+            gen.writeString("x\ud83dx");
+        } catch (IllegalArgumentException e) {
+            verifyException(e, "Invalid surrogate pair");
+            verifyException(e, "0xD83D");
+            verifyException(e, "0x0078");
+            verifyException(e, "valid high surrogate");
+            verifyException(e, "invalid low surrogate");
+        }
+        assertEquals(0, gen.getOutputBuffered());
+    }
+
+    /**
+     * Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
+     */
+    public void testRecoverInvalidSurrogate1() throws Exception
+    {
+        ByteArrayOutputStream out;
+        CBORGenerator gen;
+        byte[] b;
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character
+        gen.writeString("x\ud83d");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character
+        gen.writeString("x\ude01");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character (2)
+        gen.writeString("x\ude01x");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+    }
+
+    public void testRecoverInvalidSurrogate2() throws Exception
+    {
+        ByteArrayOutputStream out;
+        CBORGenerator gen;
+        byte[] b;
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Broken surrogate pair
+        gen.writeString("X\ud83dY");
+        gen.close();
+        b = "X\ufffdY".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+    }
+}
diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x
@@ -143,3 +143,9 @@ Michael Liedtke (mcliedtke@github)
 * Contributed fix for #212: (ion) Optimize `IonParser.getNumberType()` using
   `IonReader.getIntegerSize()`
  (2.12.0)
+
+Guillaume Bort (guillaumebort@github)
+
+* Contributed implementation of #222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING`
+  for lenient handling of Unicode surrogate pairs on writing
+ (2.12.0)
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -14,6 +14,9 @@ Project: jackson-datatypes-binaryModules:
  (contributed by Paul F)
 #212: (ion) Optimize `IonParser.getNumberType()` using `IonReader.getIntegerSize()`
  (contributed by Michael L)
+#222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling of
+  Unicode surrogate pairs on writing
+ (contributed by Guillaume B)
 - Add Gradle Module Metadata (https://blog.gradle.org/alignment-with-gradle-module-metadata)
 
 2.11.3 (02-Oct-2020)