Index the output-escape table directly with the unsigned byte value (b & 0xFF) when writing
raw UTF-8 bytes; the table is extended to 256 entries on demand. Adds a manual benchmark.

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
index ec7454811c..9c84771d83 100644
--- a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
+++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
@@ -645,11 +645,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
             _flushBuffer();
         }
         _outputBuffer[_outputTail++] = _quoteChar;
+
+        // When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into
+        // using the byte value.
+        final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits();
+
         // One or multiple segments?
         if (len <= _outputMaxContiguous) {
-            _writeUTF8Segment(text, offset, len);
+            _writeUTF8Segment(text, offset, len, extendedOutputEscapes);
         } else {
-            _writeUTF8Segments(text, offset, len);
+            _writeUTF8Segments(text, offset, len, extendedOutputEscapes);
         }
         if (_outputTail >= _outputEnd) {
             _flushBuffer();
@@ -1844,28 +1849,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in
      * to fit in the output buffer after escaping; as such, we just need to
      * chunk writes.
      */
-    private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
+    private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         do {
             int len = Math.min(_outputMaxContiguous, totalLen);
-            _writeUTF8Segment(utf8, offset, len);
+            _writeUTF8Segment(utf8, offset, len, extendedOutputEscapes);
             offset += len;
             totalLen -= len;
         } while (totalLen > 0);
     }
 
-    private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len)
+    private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         // fast loop to see if escaping is needed; don't copy, just look
-        final int[] escCodes = _outputEscapes;
-
         for (int ptr = offset, end = offset + len; ptr < end; ) {
-            // 28-Feb-2011, tatu: escape codes just cover 7-bit range, so:
-            int ch = utf8[ptr++];
-            if ((ch >= 0) && escCodes[ch] != 0) {
-                _writeUTF8Segment2(utf8, offset, len);
+            // escape table has an entry for all 256 byte values, so index with the unsigned byte
+            int ch = utf8[ptr++] & 0xFF;
+            if (extendedOutputEscapes[ch] != 0) {
+                _writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes);
                 return;
             }
         }
@@ -1878,7 +1881,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le
         _outputTail += len;
     }
 
-    private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
+    private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         int outputPtr = _outputTail;
@@ -1890,17 +1893,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
         }
 
         final byte[] outputBuffer = _outputBuffer;
-        final int[] escCodes = _outputEscapes;
         len += offset; // so 'len' becomes 'end'
 
         while (offset < len) {
             byte b = utf8[offset++];
-            int ch = b;
-            if (ch < 0 || escCodes[ch] == 0) {
+            int ch = b & 0xFF;
+            int escape = extendedOutputEscapes[ch];
+            if (escape == 0) {
                 outputBuffer[outputPtr++] = b;
                 continue;
             }
-            int escape = escCodes[ch];
             if (escape > 0) { // 2-char escape, fine
                 outputBuffer[outputPtr++] = BYTE_BACKSLASH;
                 outputBuffer[outputPtr++] = (byte) escape;
@@ -1912,6 +1914,24 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
         _outputTail = outputPtr;
     }
 
+    /**
+     * Returns an escape table that has an entry for every unsigned byte value (0 - 255), so
+     * that callers can index it with {@code b & 0xFF} without a range check. When
+     * {@code _outputEscapes} is shorter, it is copied into a 256-entry table (added entries
+     * are 0) that also replaces {@code _outputEscapes}, letting later calls reuse it.
+     */
+    private int[] _extendOutputEscapesTo8Bits() {
+        final int[] escapes = _outputEscapes;
+        if (escapes.length >= 0x100) {
+            return escapes;
+        }
+
+        final int[] extended = new int[0x100];
+        System.arraycopy(escapes, 0, extended, 0, escapes.length);
+        _outputEscapes = extended;
+        return extended;
+    }
+
     /*
     /**********************************************************
     /* Internal methods, low-level writing, base64 encoded
diff --git a/src/test/java/perf/ManualUtf8WriteTest.java b/src/test/java/perf/ManualUtf8WriteTest.java
new file mode 100644
index 0000000000..ca5442ec3a
--- /dev/null
+++ b/src/test/java/perf/ManualUtf8WriteTest.java
@@ -0,0 +1,203 @@
+package perf;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.io.CharTypes;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Benchmarks the performance of writing UTF-8 encoded bytes, in particular the difference between using a 7-bit wide
+ * lookup table for escapes, versus a full 8-bit wide table. The latter is beneficial when processing encoded UTF-8
+ * bytes, as the byte itself can directly be used as table index instead of needing an additional branch.
+ * <p>
+ * This benchmark implements the escaping UTF-8 write loops using both 7-bit and 8-bit tables to show their respective
+ * differences, as well as testing {@link JsonGenerator#writeUTF8String} for benchmarking the production implementation.
+ *
+ * @see Github PR
+ */
+public class ManualUtf8WriteTest
+{
+    private String test(byte[] utf8) throws Exception
+    {
+        final byte[] OUTPUT = new byte[utf8.length * 2];
+        ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream(utf8.length * 2);
+        JsonGenerator generator = new JsonFactory().createGenerator(OUTPUT_STREAM);
+
+        // Let's try to guestimate suitable size, N megs of output
+        final int REPS = (int) ((double) (80 * 1000 * 1000) / (double) utf8.length);
+        System.out.printf("%d bytes to scan, will do %d repetitions\n",
+                utf8.length, REPS);
+
+        int i = 0;
+        int roundsDone = 0;
+        final int TYPES = 3;
+        final int WARMUP_ROUNDS = 5;
+        final int ROUNDS = WARMUP_ROUNDS + 10;
+
+        final long[] times = new long[TYPES];
+
+        while (i < ROUNDS * TYPES) {
+            int round = i++ % TYPES;
+
+            String msg;
+
+            long msecs;
+            switch (round) {
+                case 0:
+                    msg = "Write UTF-8 [7-bit escaping table]";
+                    msecs = writeUtf8_7BitEscapingTable(REPS, utf8, OUTPUT);
+                    break;
+                case 1:
+                    msg = "Write UTF-8 [8-bit escaping table]";
+                    msecs = writeUtf8_8BitEscapingTable(REPS, utf8, OUTPUT);
+                    break;
+                case 2:
+                    msg = "JsonGenerator.writeUTF8String ";
+                    msecs = writeUtf8_JsonGenerator(REPS, utf8, OUTPUT_STREAM, generator);
+                    break;
+                default:
+                    throw new Error();
+            }
+            // skip first 5 rounds to let results stabilize
+            if (roundsDone >= WARMUP_ROUNDS) {
+                times[round] += msecs;
+            }
+
+            System.out.printf("Test '%s' -> %3d msecs\n", msg, msecs);
+            if (round == TYPES - 1) {
+                ++roundsDone;
+                if ((roundsDone % 3) == 0) {
+                    System.out.println("[GC]");
+                    Thread.sleep(100L);
+                    System.gc();
+                    Thread.sleep(100L);
+                }
+                System.out.println();
+            }
+        }
+        double den = roundsDone - WARMUP_ROUNDS;
+
+        return String.format("(7-bit, 8-bit, JsonGenerator): %5.1f / %5.1f / %5.1f msecs",
+                times[0] / den, times[1] / den, times[2] / den);
+    }
+
+    private final long writeUtf8_7BitEscapingTable(int REPS, byte[] input, byte[] output)
+    {
+        long start = System.currentTimeMillis();
+        int[] outputEscapes = CharTypes.get7BitOutputEscapes();
+
+        while (--REPS >= 0) {
+            int inOffset = 0;
+            int outOffset = 0;
+            int len = input.length;
+
+            while (inOffset < len) {
+                byte b = input[inOffset++];
+                int ch = b;
+                if (ch < 0 || outputEscapes[ch] == 0) {
+                    output[outOffset++] = b;
+                    continue;
+                }
+                int escape = outputEscapes[ch];
+                if (escape > 0) {
+                    output[outOffset++] = (byte) '\\';
+                    output[outOffset++] = (byte) escape;
+                } else {
+                    throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
+                }
+            }
+        }
+        long time = System.currentTimeMillis() - start;
+        return time;
+    }
+
+    private final long writeUtf8_8BitEscapingTable(int REPS, byte[] input, byte[] output)
+    {
+        long start = System.currentTimeMillis();
+
+        int[] outputEscapes = CharTypes.get7BitOutputEscapes();
+        int[] extendedOutputEscapes = new int[0x100]; // one entry per unsigned byte value (0 - 255)
+        System.arraycopy(outputEscapes, 0, extendedOutputEscapes, 0, outputEscapes.length);
+
+        while (--REPS >= 0) {
+            int inOffset = 0;
+            int outOffset = 0;
+            int len = input.length;
+
+            while (inOffset < len) {
+                byte b = input[inOffset++];
+                int ch = b & 0xFF;
+                int escape = extendedOutputEscapes[ch];
+                if (escape == 0) {
+                    output[outOffset++] = b;
+                    continue;
+                }
+                if (escape > 0) {
+                    output[outOffset++] = (byte) '\\';
+                    output[outOffset++] = (byte) escape;
+                } else {
+                    throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
+                }
+            }
+        }
+
+        long time = System.currentTimeMillis() - start;
+        return time;
+    }
+
+    private final long writeUtf8_JsonGenerator(int REPS, byte[] input, ByteArrayOutputStream output, JsonGenerator generator) throws IOException {
+        long start = System.currentTimeMillis();
+
+        while (--REPS >= 0) {
+            output.reset();
+            generator.writeUTF8String(input, 0, input.length);
+            generator.flush();
+        }
+
+        long time = System.currentTimeMillis() - start;
+        return time;
+    }
+
+    public static void main(String[] args) throws Exception
+    {
+        if (args.length != 0) {
+            System.err.println("Usage: java ...");
+            System.exit(1);
+        }
+
+        final int[] LENGTHS = new int[]{8, 16, 32, 256, 512, 1024, 1024 * 8};
+        final String[] ESCAPE_VARIANTS = new String[] {"none", "start", "end"};
+        final List<String> results = new ArrayList<String>();
+        for (int length : LENGTHS){
+            final byte[] buffer = new byte[length];
+
+            for (int j = 0; j < ESCAPE_VARIANTS.length; j++) {
+                Arrays.fill(buffer, (byte) 'a');
+
+                if (j == 1) {
+                    buffer[0] = '"';
+                } else if (j == 2) {
+                    buffer[buffer.length - 1] = '"';
+                }
+
+                String LABEL = String.format("Length %4d, %5s escape", length, ESCAPE_VARIANTS[j]);
+
+                System.out.printf("Starting %s %n", LABEL);
+                String result = new ManualUtf8WriteTest().test(buffer);
+                System.out.printf("Finished %s %n", LABEL);
+                System.out.println("================================================================================");
+
+                results.add(String.format("%s: %s", LABEL, result));
+            }
+        }
+
+        for (String result : results) {
+            System.out.println(result);
+        }
+    }
+}