Skip to content

Commit

Permalink
Improve performance of writing raw UTF-8 encoded byte arrays
Browse files Browse the repository at this point in the history
The output escape table covers just 7 bits, meaning that a raw UTF-8 byte cannot
be used to index into the table without a branch test for negative bytes (i.e. bytes
larger than 0x7F when read as unsigned). This extra check occurs in a tight loop and
can be avoided if the lookup table covers all 8-bit indices.

This commit introduces ad-hoc logic in `UTF8JsonGenerator#writeUTF8String` to create
an extended copy of `_outputEscapes` if necessary, writing the copy back into the field
to avoid having to compute it again (unless it is changed). This ad-hoc strategy was
chosen as it is the least disruptive to existing code, as a larger-scale change around
`CharacterEscapes` would impact public API or otherwise introduce subtle chances of breakage.
  • Loading branch information
JoostK committed Oct 20, 2024
1 parent 2128a70 commit 2493a9f
Showing 1 changed file with 29 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -647,11 +647,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
_flushBuffer();
}
_outputBuffer[_outputTail++] = _quoteChar;

// When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into
// using the byte value.
final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits();

// One or multiple segments?
if (len <= _outputMaxContiguous) {
_writeUTF8Segment(text, offset, len);
_writeUTF8Segment(text, offset, len, extendedOutputEscapes);
} else {
_writeUTF8Segments(text, offset, len);
_writeUTF8Segments(text, offset, len, extendedOutputEscapes);
}
if (_outputTail >= _outputEnd) {
_flushBuffer();
Expand Down Expand Up @@ -1846,28 +1851,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in
* to fit in the output buffer after escaping; as such, we just need to
* chunk writes.
*/
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes)
throws IOException, JsonGenerationException
{
do {
int len = Math.min(_outputMaxContiguous, totalLen);
_writeUTF8Segment(utf8, offset, len);
_writeUTF8Segment(utf8, offset, len, extendedOutputEscapes);
offset += len;
totalLen -= len;
} while (totalLen > 0);
}

private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len)
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes)
throws IOException, JsonGenerationException
{
// fast loop to see if escaping is needed; don't copy, just look
final int[] escCodes = _outputEscapes;

for (int ptr = offset, end = offset + len; ptr < end; ) {
// 28-Feb-2011, tatu: escape codes just cover 7-bit range, so:
int ch = utf8[ptr++];
if ((ch >= 0) && escCodes[ch] != 0) {
_writeUTF8Segment2(utf8, offset, len);
int ch = utf8[ptr++] & 0xFF;
if (extendedOutputEscapes[ch] != 0) {
_writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes);
return;
}
}
Expand All @@ -1880,7 +1883,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le
_outputTail += len;
}

private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes)
throws IOException, JsonGenerationException
{
int outputPtr = _outputTail;
Expand All @@ -1892,17 +1895,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
}

final byte[] outputBuffer = _outputBuffer;
final int[] escCodes = _outputEscapes;
len += offset; // so 'len' becomes 'end'

while (offset < len) {
byte b = utf8[offset++];
int ch = b;
if (ch < 0 || escCodes[ch] == 0) {
int ch = b & 0xFF;
int escape = extendedOutputEscapes[ch];
if (escape == 0) {
outputBuffer[outputPtr++] = b;
continue;
}
int escape = escCodes[ch];
if (escape > 0) { // 2-char escape, fine
outputBuffer[outputPtr++] = BYTE_BACKSLASH;
outputBuffer[outputPtr++] = (byte) escape;
Expand All @@ -1914,6 +1916,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
_outputTail = outputPtr;
}

/**
 * Returns an output-escape table that can be indexed directly with an
 * unsigned byte value ({@code b & 0xFF}, i.e. 0..255 inclusive).
 * <p>
 * If the current {@code _outputEscapes} table already has at least 256
 * entries it is returned as-is. Otherwise a zero-padded copy with exactly
 * 256 entries is created, stored back into {@code _outputEscapes} so the
 * copy is not recomputed on the next call, and returned. The original
 * array is never modified.
 *
 * @return escape table with at least 256 entries
 */
private int[] _extendOutputEscapesTo8Bits() {
    // An unsigned byte takes 256 distinct values (0x00..0xFF), so the table
    // needs 0x100 slots; a length of 0xFF would leave index 255 out of bounds.
    final int tableSize = 0x100;

    final int[] escapes = _outputEscapes;
    if (escapes.length >= tableSize) {
        return escapes;
    }

    // Entries past the original length stay 0, meaning "no escaping needed".
    final int[] extended = new int[tableSize];
    System.arraycopy(escapes, 0, extended, 0, escapes.length);
    _outputEscapes = extended;
    return extended;
}

/*
/**********************************************************
/* Internal methods, low-level writing, base64 encoded
Expand Down

0 comments on commit 2493a9f

Please sign in to comment.