diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java index ec7454811c..9c84771d83 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java @@ -645,11 +645,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException _flushBuffer(); } _outputBuffer[_outputTail++] = _quoteChar; + + // When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into + // using the byte value. + final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits(); + // One or multiple segments? if (len <= _outputMaxContiguous) { - _writeUTF8Segment(text, offset, len); + _writeUTF8Segment(text, offset, len, extendedOutputEscapes); } else { - _writeUTF8Segments(text, offset, len); + _writeUTF8Segments(text, offset, len, extendedOutputEscapes); } if (_outputTail >= _outputEnd) { _flushBuffer(); @@ -1844,28 +1849,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in * to fit in the output buffer after escaping; as such, we just need to * chunk writes. 
*/ - private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen) + private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes) throws IOException, JsonGenerationException { do { int len = Math.min(_outputMaxContiguous, totalLen); - _writeUTF8Segment(utf8, offset, len); + _writeUTF8Segment(utf8, offset, len, extendedOutputEscapes); offset += len; totalLen -= len; } while (totalLen > 0); } - private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len) + private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes) throws IOException, JsonGenerationException { // fast loop to see if escaping is needed; don't copy, just look - final int[] escCodes = _outputEscapes; - for (int ptr = offset, end = offset + len; ptr < end; ) { // 28-Feb-2011, tatu: escape codes just cover 7-bit range, so: - int ch = utf8[ptr++]; - if ((ch >= 0) && escCodes[ch] != 0) { - _writeUTF8Segment2(utf8, offset, len); + int ch = utf8[ptr++] & 0xFF; + if (extendedOutputEscapes[ch] != 0) { + _writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes); return; } } @@ -1878,7 +1881,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le _outputTail += len; } - private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) + private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes) throws IOException, JsonGenerationException { int outputPtr = _outputTail; @@ -1890,17 +1893,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) } final byte[] outputBuffer = _outputBuffer; - final int[] escCodes = _outputEscapes; len += offset; // so 'len' becomes 'end' while (offset < len) { byte b = utf8[offset++]; - int ch = b; - if (ch < 0 || escCodes[ch] == 0) { + int ch = b & 0xFF; + int escape = extendedOutputEscapes[ch]; + if (escape == 0) { 
outputBuffer[outputPtr++] = b; continue; } - int escape = escCodes[ch]; if (escape > 0) { // 2-char escape, fine outputBuffer[outputPtr++] = BYTE_BACKSLASH; outputBuffer[outputPtr++] = (byte) escape; @@ -1912,6 +1914,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) _outputTail = outputPtr; } + private int[] _extendOutputEscapesTo8Bits() { + final int[] escapes = _outputEscapes; + if (escapes.length >= 0xFF) { + return escapes; + } + + final int[] extended = new int[0xFF]; + System.arraycopy(escapes, 0, extended, 0, escapes.length); + _outputEscapes = extended; + return extended; + } + /* /********************************************************** /* Internal methods, low-level writing, base64 encoded diff --git a/src/test/java/perf/ManualUtf8WriteTest.java b/src/test/java/perf/ManualUtf8WriteTest.java new file mode 100644 index 0000000000..ca5442ec3a --- /dev/null +++ b/src/test/java/perf/ManualUtf8WriteTest.java @@ -0,0 +1,203 @@ +package perf; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.io.CharTypes; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Benchmarks the performance of writing UTF-8 encoded bytes, in particular the difference between using a 7-bit wide + * lookup table for escapes, versus a full 8-bit wide table. The latter is beneficial when processing encoded UTF-8 + * bytes, as the byte itself can directly be used as table index instead of needing an additional branch. + *
+ * This benchmark implements the escaping UTF-8 write loops using both 7-bit and 8-bit tables to show their respective
+ * differences, as well as testing {@link JsonGenerator#writeUTF8String} for benchmarking the production implementation.
+ *
+ * @see Github PR
+ */
+public class ManualUtf8WriteTest
+{
+ /**
+  * Benchmarks the three write variants against {@code utf8}, interleaving them round-robin,
+  * and returns a one-line summary of the average time per variant.
+  * <p>
+  * Every variant is run {@code ROUNDS} times; timings from the first {@code WARMUP_ROUNDS}
+  * rounds are printed but excluded from the averages.
+  *
+  * @param utf8 UTF-8 encoded input to write; expected to be non-empty, since the repetition
+  *             count is derived by dividing by its length
+  * @return formatted averages in milliseconds for the 7-bit loop, the 8-bit loop and
+  *         {@code JsonGenerator.writeUTF8String}, in that order
+  * @throws Exception if the generator fails to write or the pauses around GC are interrupted
+  */
+ private String test(byte[] utf8) throws Exception
+ {
+     // Two output bytes per input byte is enough for the 2-character escapes the manual loops emit.
+     final byte[] OUTPUT = new byte[utf8.length * 2];
+     final ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream(utf8.length * 2);
+
+     // Let's try to guesstimate suitable size, N megs of output
+     final int REPS = (int) ((double) (80 * 1000 * 1000) / (double) utf8.length);
+     System.out.printf("%d bytes to scan, will do %d repetitions\n",
+             utf8.length, REPS);
+
+     final int TYPES = 3;
+     final int WARMUP_ROUNDS = 5;
+     final int ROUNDS = WARMUP_ROUNDS + 10;
+
+     final long[] times = new long[TYPES];
+     int roundsDone = 0;
+
+     // The generator is closed once all rounds are done instead of being left open.
+     try (JsonGenerator generator = new JsonFactory().createGenerator(OUTPUT_STREAM)) {
+         int i = 0;
+         while (i < ROUNDS * TYPES) {
+             int round = i++ % TYPES;
+
+             String msg;
+             long msecs;
+             switch (round) {
+             case 0:
+                 msg = "Write UTF-8 [7-bit escaping table]";
+                 msecs = writeUtf8_7BitEscapingTable(REPS, utf8, OUTPUT);
+                 break;
+             case 1:
+                 msg = "Write UTF-8 [8-bit escaping table]";
+                 msecs = writeUtf8_8BitEscapingTable(REPS, utf8, OUTPUT);
+                 break;
+             case 2:
+                 msg = "JsonGenerator.writeUTF8String     ";
+                 msecs = writeUtf8_JsonGenerator(REPS, utf8, OUTPUT_STREAM, generator);
+                 break;
+             default:
+                 // Unreachable while TYPES matches the number of cases above.
+                 throw new IllegalStateException("Unexpected test type: " + round);
+             }
+             // skip the warm-up rounds to let results stabilize
+             if (roundsDone >= WARMUP_ROUNDS) {
+                 times[round] += msecs;
+             }
+
+             System.out.printf("Test '%s' -> %3d msecs\n", msg, msecs);
+             // A round is complete once the last variant has run.
+             if (round == TYPES - 1) {
+                 ++roundsDone;
+                 if ((roundsDone % 3) == 0) {
+                     System.out.println("[GC]");
+                     Thread.sleep(100L);
+                     System.gc();
+                     Thread.sleep(100L);
+                 }
+                 System.out.println();
+             }
+         }
+     }
+     // Number of measured (non-warm-up) rounds that contributed to 'times'.
+     double den = roundsDone - WARMUP_ROUNDS;
+
+     return String.format("(7-bit, 8-bit, JsonGenerator): %5.1f / %5.1f / %5.1f msecs",
+             times[0] / den, times[1] / den, times[2] / den);
+ }
+
+ /**
+  * Baseline variant: escapes {@code input} into {@code output} using the table returned by
+  * {@code CharTypes.get7BitOutputEscapes()}, repeating the whole pass {@code REPS} times.
+  * <p>
+  * Bytes that are negative as a Java {@code byte} (high bit set) are never looked up and are
+  * copied through unchanged; that sign check is the extra branch this variant pays for.
+  * The table is deliberately consulted twice for bytes that need escaping.
+  *
+  * @param REPS number of full passes over {@code input}
+  * @param input bytes to escape
+  * @param output destination buffer; each pass writes from index 0
+  * @return elapsed wall-clock time in milliseconds
+  * @throws UnsupportedOperationException if a byte maps to a non-positive, non-zero table entry
+  */
+ private final long writeUtf8_7BitEscapingTable(int REPS, byte[] input, byte[] output)
+ {
+     final long startedAt = System.currentTimeMillis();
+     final int[] escapeTable = CharTypes.get7BitOutputEscapes();
+
+     for (int remaining = REPS; remaining > 0; --remaining) {
+         final int inputEnd = input.length;
+         int writePos = 0;
+
+         for (int readPos = 0; readPos < inputEnd; ++readPos) {
+             final byte current = input[readPos];
+             // Only non-negative bytes may index the table; everything else passes through.
+             final boolean needsEscape = (current >= 0) && (escapeTable[current] != 0);
+             if (!needsEscape) {
+                 output[writePos++] = current;
+                 continue;
+             }
+             final int escapeCode = escapeTable[current];
+             if (escapeCode <= 0) {
+                 throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
+             }
+             // Positive entry: two-character escape, backslash followed by the code.
+             output[writePos++] = (byte) '\\';
+             output[writePos++] = (byte) escapeCode;
+         }
+     }
+     return System.currentTimeMillis() - startedAt;
+ }
+
+ /**
+  * Variant that escapes {@code input} into {@code output} using a table wide enough to be
+  * indexed directly by the unsigned byte value, repeating the whole pass {@code REPS} times.
+  * <p>
+  * The table is the one returned by {@code CharTypes.get7BitOutputEscapes()} copied into a
+  * 256-entry array; the added entries are zero, so bytes beyond the copied range are
+  * written through unchanged without a separate sign check.
+  *
+  * @param REPS number of full passes over {@code input}
+  * @param input bytes to escape
+  * @param output destination buffer; each pass writes from index 0
+  * @return elapsed wall-clock time in milliseconds
+  * @throws UnsupportedOperationException if a byte maps to a negative table entry
+  */
+ private final long writeUtf8_8BitEscapingTable(int REPS, byte[] input, byte[] output)
+ {
+     long start = System.currentTimeMillis();
+
+     int[] outputEscapes = CharTypes.get7BitOutputEscapes();
+     // The lookup index is (b & 0xFF), i.e. 0..255 inclusive, so the table needs 256 slots.
+     // A size of 0xFF (255) would throw ArrayIndexOutOfBoundsException for an input byte of 0xFF.
+     int[] extendedOutputEscapes = new int[0x100];
+     System.arraycopy(outputEscapes, 0, extendedOutputEscapes, 0, outputEscapes.length);
+
+     while (--REPS >= 0) {
+         int inOffset = 0;
+         int outOffset = 0;
+         int len = input.length;
+
+         while (inOffset < len) {
+             byte b = input[inOffset++];
+             // Mask to the unsigned value so the byte can index the table directly.
+             int ch = b & 0xFF;
+             int escape = extendedOutputEscapes[ch];
+             if (escape == 0) {
+                 output[outOffset++] = b;
+                 continue;
+             }
+             if (escape > 0) {
+                 // Two-character escape: backslash followed by the code.
+                 output[outOffset++] = (byte) '\\';
+                 output[outOffset++] = (byte) escape;
+             } else {
+                 throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
+             }
+         }
+     }
+
+     long time = System.currentTimeMillis() - start;
+     return time;
+ }
+
+ /**
+  * Times {@code REPS} calls to {@code generator.writeUTF8String} over the whole of {@code input}.
+  * Before each call {@code output} is reset, and after each call the generator is flushed.
+  *
+  * @param REPS number of write calls to perform
+  * @param input bytes passed to {@code writeUTF8String}
+  * @param output stream that is reset before every write
+  * @param generator generator used for writing
+  * @return elapsed wall-clock time in milliseconds
+  * @throws IOException if writing or flushing fails
+  */
+ private final long writeUtf8_JsonGenerator(int REPS, byte[] input, ByteArrayOutputStream output, JsonGenerator generator) throws IOException {
+     final long startedAt = System.currentTimeMillis();
+
+     for (int remaining = REPS; remaining > 0; --remaining) {
+         // Discard what the previous repetition produced.
+         output.reset();
+         generator.writeUTF8String(input, 0, input.length);
+         generator.flush();
+     }
+
+     return System.currentTimeMillis() - startedAt;
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ if (args.length != 0) {
+ System.err.println("Usage: java ...");
+ System.exit(1);
+ }
+
+ final int[] LENGTHS = new int[]{8, 16, 32, 256, 512, 1024, 1024 * 8};
+ final String[] ESCAPE_VARIANTS = new String[] {"none", "start", "end"};
+ final List