From 15cab5bdcacff596af2c613db44a4732e90d3da6 Mon Sep 17 00:00:00 2001 From: Fabien Renaud Date: Tue, 23 Apr 2019 22:13:31 -0700 Subject: [PATCH] Account for bytes processed by encoding detection UTF8StreamJsonParser tracks the read pointer (offset) and the bytes processed separately and uses both to generate JsonLocation. When the byte payload starts with a UTF BOM, ByteSourceJsonBootstrapper processes a few bytes ahead of the parser, advances the offset and passes the newly computed offset to the parser without telling it that some bytes have been pre-processed. With this change, the number of bytes pre-processed for encoding detection is passed to the parser. JsonLocation instances returned by the parser now point to the correct byte offset when the payload has a BOM. Issue: https://github.com/FasterXML/jackson-core/issues/533 --- .../core/json/ByteSourceJsonBootstrapper.java | 4 +- .../core/json/UTF8StreamJsonParser.java | 6 +- .../core/json/LocationOffsetsTest.java | 125 +++++++++++++++++- .../jackson/core/read/JsonParserTest.java | 2 +- .../core/util/JsonParserSequenceTest.java | 2 +- 5 files changed, 128 insertions(+), 11 deletions(-) diff --git a/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java b/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java index baa4d5d20d..b40778d215 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java +++ b/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java @@ -242,7 +242,9 @@ public JsonParser constructParser(ObjectReadContext readCtxt, ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols, int factoryFeatures) throws IOException { + int prevInputPtr = _inputPtr; JsonEncoding enc = detectEncoding(); + int bytesProcessed = _inputPtr - prevInputPtr; if (enc == JsonEncoding.UTF8) { /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader @@ -252,7 +254,7 @@ 
public JsonParser constructParser(ObjectReadContext readCtxt, ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures); return new UTF8StreamJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures, _in, can, - _inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable); + _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable); } } return new ReaderBasedJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures, diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java index cfd096c326..340f448fe1 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java @@ -122,7 +122,7 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt, int stdFeatures, int formatReadFeatures, InputStream in, ByteQuadsCanonicalizer sym, - byte[] inputBuffer, int start, int end, + byte[] inputBuffer, int start, int end, int bytesPreProcessed, boolean bufferRecyclable) { super(readCtxt, ctxt, stdFeatures, formatReadFeatures); @@ -131,9 +131,9 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt, _inputBuffer = inputBuffer; _inputPtr = start; _inputEnd = end; - _currInputRowStart = start; + _currInputRowStart = start - bytesPreProcessed; // If we have offset, need to omit that from byte offset, so: - _currInputProcessed = -start; + _currInputProcessed = -start + bytesPreProcessed; _bufferRecyclable = bufferRecyclable; } diff --git a/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java b/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java index c5f1ad0e6a..524ff4504f 100644 --- a/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java +++ b/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java @@ -23,7 +23,7 @@ public void 
testSimpleInitialOffsets() throws Exception assertEquals(0L, loc.getCharOffset()); assertEquals(1, loc.getLineNr()); assertEquals(1, loc.getColumnNr()); - + loc = p.getCurrentLocation(); assertEquals(-1L, loc.getByteOffset()); assertEquals(1L, loc.getCharOffset()); @@ -33,7 +33,7 @@ public void testSimpleInitialOffsets() throws Exception p.close(); // then byte-based - + p = JSON_F.createParser(ObjectReadContext.empty(), DOC.getBytes("UTF-8")); assertToken(JsonToken.START_OBJECT, p.nextToken()); @@ -42,7 +42,7 @@ public void testSimpleInitialOffsets() throws Exception assertEquals(-1L, loc.getCharOffset()); assertEquals(1, loc.getLineNr()); assertEquals(1, loc.getColumnNr()); - + loc = p.getCurrentLocation(); assertEquals(1L, loc.getByteOffset()); assertEquals(-1L, loc.getCharOffset()); @@ -61,7 +61,7 @@ public void testOffsetWithInputOffset() throws Exception byte[] b = " { } ".getBytes("UTF-8"); // and then peel them off - p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length-5); + p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length - 5); assertToken(JsonToken.START_OBJECT, p.nextToken()); loc = p.getTokenLocation(); @@ -69,7 +69,7 @@ public void testOffsetWithInputOffset() throws Exception assertEquals(-1L, loc.getCharOffset()); assertEquals(1, loc.getLineNr()); assertEquals(1, loc.getColumnNr()); - + loc = p.getCurrentLocation(); assertEquals(1L, loc.getByteOffset()); assertEquals(-1L, loc.getCharOffset()); @@ -78,4 +78,119 @@ public void testOffsetWithInputOffset() throws Exception p.close(); } + + public void testOffsetWithoutInputOffset() throws Exception + { + JsonLocation loc; + JsonParser p; + // 3 spaces before, 2 after, just for padding + byte[] b = " { } ".getBytes("UTF-8"); + + // parse the whole array, leading padding included + p = JSON_F.createParser(ObjectReadContext.empty(), b); + assertToken(JsonToken.START_OBJECT, p.nextToken()); + + loc = p.getTokenLocation(); + assertEquals(3L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); 
+ assertEquals(1, loc.getLineNr()); + assertEquals(4, loc.getColumnNr()); + + loc = p.getCurrentLocation(); + assertEquals(4L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(5, loc.getColumnNr()); + + p.close(); + } + + // for [core#533] + public void testUtf8Bom() throws Exception + { + JsonLocation loc; + JsonParser p; + + byte[] b = withUtf8Bom("{ }".getBytes()); + + // parse the whole array, BOM included + p = JSON_F.createParser(ObjectReadContext.empty(), b); + assertToken(JsonToken.START_OBJECT, p.nextToken()); + + loc = p.getTokenLocation(); + assertEquals(3L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(4, loc.getColumnNr()); + + loc = p.getCurrentLocation(); + assertEquals(4L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(5, loc.getColumnNr()); + + p.close(); + } + + public void testUtf8BomWithPadding() throws Exception + { + JsonLocation loc; + JsonParser p; + + byte[] b = withUtf8Bom(" { }".getBytes()); + + // parse the whole array, BOM and padding included + p = JSON_F.createParser(ObjectReadContext.empty(), b); + assertToken(JsonToken.START_OBJECT, p.nextToken()); + + loc = p.getTokenLocation(); + assertEquals(6L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(7, loc.getColumnNr()); + + loc = p.getCurrentLocation(); + assertEquals(7L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(8, loc.getColumnNr()); + + p.close(); + } + + public void testUtf8BomWithInputOffset() throws Exception + { + JsonLocation loc; + JsonParser p; + + byte[] b = withUtf8Bom(" { }".getBytes()); + + // NOTE(review): no offset/length is passed here, so this repeats testUtf8BomWithPadding + p = JSON_F.createParser(ObjectReadContext.empty(), b); + assertToken(JsonToken.START_OBJECT, p.nextToken()); + + loc = p.getTokenLocation(); + 
assertEquals(6L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(7, loc.getColumnNr()); + + loc = p.getCurrentLocation(); + assertEquals(7L, loc.getByteOffset()); + assertEquals(-1L, loc.getCharOffset()); + assertEquals(1, loc.getLineNr()); + assertEquals(8, loc.getColumnNr()); + + p.close(); + } + + private byte[] withUtf8Bom(byte[] bytes) { + byte[] arr = new byte[bytes.length + 3]; + // write UTF-8 BOM + arr[0] = (byte) 0xEF; + arr[1] = (byte) 0xBB; + arr[2] = (byte) 0xBF; + System.arraycopy(bytes, 0, arr, 3, bytes.length); + return arr; + } } diff --git a/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java b/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java index 3970c2c994..2289241d12 100644 --- a/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java +++ b/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java @@ -440,7 +440,7 @@ public void testUtf8BOMHandling() throws Exception */ JsonLocation loc = p.getTokenLocation(); // so if BOM was consider in-stream (part of input), this should expect 3: - assertEquals(0, loc.getByteOffset()); + assertEquals(3, loc.getByteOffset()); assertEquals(-1, loc.getCharOffset()); assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); assertEquals(JsonToken.END_ARRAY, p.nextToken()); diff --git a/src/test/java/com/fasterxml/jackson/core/util/JsonParserSequenceTest.java b/src/test/java/com/fasterxml/jackson/core/util/JsonParserSequenceTest.java index aa4412e294..bacabd3ded 100644 --- a/src/test/java/com/fasterxml/jackson/core/util/JsonParserSequenceTest.java +++ b/src/test/java/com/fasterxml/jackson/core/util/JsonParserSequenceTest.java @@ -49,7 +49,7 @@ public void testSkipChildren() throws IOException { UTF8StreamJsonParser uTF8StreamJsonParser = new UTF8StreamJsonParser(ObjectReadContext.empty(), ioContext, 0, 0, byteArrayInputStream, ByteQuadsCanonicalizer.createRoot(), - byteArray, -1, (byte) 9, true); + 
byteArray, -1, (byte) 9, 0, true); JsonParserDelegate jsonParserDelegate = new JsonParserDelegate(jsonParserArray[0]); JsonParserSequence jsonParserSequence = JsonParserSequence.createFlattened(true, uTF8StreamJsonParser, jsonParserDelegate); JsonParserSequence jsonParserSequenceTwo = (JsonParserSequence) jsonParserSequence.skipChildren();