From 598a126c0dbf4be2c84eaba5a57779b74c23e85b Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 27 Jan 2023 16:28:35 +0100 Subject: [PATCH 1/2] ORC-1361: InvalidProtocolBufferException when reading large stripe statistics Catch `InvalidProtocolBufferException` when parsing stripe statistics, log a warning message, and return an empty list. In some cases ORC files may be created with very large Metadata section due to stripe statistics. The bigger the ORC file gets the easier is to endup with a large Metadata section. Nevertheless, it is still possible to hit the problem with smaller ORC files and `TestOrcWithLargeStripeStatistics` demonstrates some extremes. Any attempt to read back the stripe statistics from the file will fail with `InvalidProtocolBufferException`. The exact exception may differ slighly and depending on: a) the size of stripe statistics; b) protobuf size limit. Stripe statistics less than 2GB, and protobuf limit less than stats size: ``` com.google.protobuf.InvalidProtocolBufferException: Protocol message was too large. May be malicious. Use CodedInputStream.setSizeLimit() to increase the size limit. ``` Stripe statistics greater than 2GB, and protobuf limit 2GB (`Integer.MAX_VALUE`): ``` com.google.protobuf.InvalidProtocolBufferException: While parsing a protocol message, the input ended unexpectedly in the middle of a field. This could mean either that the input has been truncated or that an embedded message misreported its own length. ``` The `Protocol message was too large` problem could be alleviated by increasing the limit as it was done in the past. However, increasing the limit would hide the underlying problem and probably lead to permanent metadata corruption in the near future. Moreover, the limit is already high enough (1GB) and bumping it further would lead to more memory being used potentially triggering `OutOfMemoryError`, OOM Killer, GC pauses, and other problems that are usually harder to debug and find the root cause. When the stripe statistics exceeds the 2GB there is nothing to be done to parse back the statistics since protobuf cannot deserialize such messages (https://github.com/protocolbuffers/protobuf/issues/11729). The problem cannot be solved unless the metadata storage changes but this can only happen in newer versions. On the other hand, stripe statistics are important but not vital for readers. In those situations where limits are breached it is acceptable to log a warning and return nothing instead of raising a fatal error to the caller. Existing unit tests plus new tests added in TestOrcWithLargeStripeStatistics. --- .../java/org/apache/orc/impl/ReaderImpl.java | 16 ++- .../orc/TestOrcWithLargeStripeStatistics.java | 115 ++++++++++++++++++ 2 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index 6a4830c38d..c8f64e7314 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -19,6 +19,7 @@ package org.apache.orc.impl; import com.google.protobuf.CodedInputStream; +import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.TextFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -55,6 +56,7 @@ import java.security.Key; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.function.Supplier; @@ -1035,11 +1037,15 @@ private static List deserializeStripeStats( long offset, int length, InStream.StreamOptions options) throws IOException { - InStream stream = InStream.create("stripe stats", tailBuffer, offset, - length, options); - OrcProto.Metadata meta = OrcProto.Metadata.parseFrom( - InStream.createCodedInputStream(stream)); - return meta.getStripeStatsList(); + try (InStream stream = InStream.create("stripe stats", tailBuffer, offset, + length, options)) { + OrcProto.Metadata meta = OrcProto.Metadata.parseFrom( + InStream.createCodedInputStream(stream)); + return meta.getStripeStatsList(); + } catch (InvalidProtocolBufferException e) { + LOG.warn("Failed to parse stripe statistics; check ORC-1361 for more details.", e); + return Collections.emptyList(); + } } private List convertFromProto(List list) { diff --git a/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java new file mode 100644 index 0000000000..a766cb01df --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests for operations on Orc file with very large stripe statistics. + *

+ * The test is disabled by default cause it is rather slow (approx 14 minutes) and memory greedy + * (it requires about 4g heap space when creating the files). If you want to run it remove the + * {@code Disabled} annotation and ensure that max heap (Xmx) is at least 4g. + *

+ */ +@Disabled("ORC-1361") +public class TestOrcWithLargeStripeStatistics { + + @ParameterizedTest + @EnumSource(value = OrcFile.Version.class, mode = EnumSource.Mode.EXCLUDE, names = "FUTURE") + public void testGetStripeStatisticsNoProtocolBufferExceptions(OrcFile.Version version) + throws Exception { + // Use a size that exceeds the protobuf limit (e.g., 1GB) to trigger protobuf exception + Path p = createOrcFile(1024L << 20, version); + try (Reader reader = OrcFile.createReader(p, OrcFile.readerOptions(new Configuration()))) { + assertTrue(reader.getStripeStatistics().isEmpty()); + } + } + + /** + * Creates an Orc file with a metadata section of the specified size and return its path in the + * filesystem. + * + * The file has a fixed schema (500 string columns) and content (every column contains 200 + * characters, which is roughly 200 bytes). Each row is roughly 100KB uncompressed and each stripe + * holds exactly one row thus stripe metadata (column statistics) per row is 200KB (100KB for min, + * 100KB for max, few bytes for sum). + * + * @param metadataSize the desired size of the resulting metadata section in bytes + * @param version the desired version to create the file + * @return the path to filesystem where the file was created. + * @throws IOException if an IO problem occurs while creating the file + */ + private static Path createOrcFile(long metadataSize, OrcFile.Version version) throws IOException { + // Calculate the number of rows/stripes to create based on the size of one row (200KB). + final long ROW_STRIPE_NUM = metadataSize / 200_000L; + Path p = new Path(System.getProperty("test.tmp.dir"), + TestOrcWithLargeStripeStatistics.class.getSimpleName() + + "_" + ROW_STRIPE_NUM + "_" + version + ".orc"); + // Modify defaults to force one row per stripe. + Configuration conf = new Configuration(); + conf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "0"); + TypeDescription schema = createTypeDescription(); + OrcFile.WriterOptions writerOptions = + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(1) + .encodingStrategy(OrcFile.EncodingStrategy.SPEED) + .version(version); + try (Writer writer = OrcFile.createWriter(p, writerOptions)) { + VectorizedRowBatch batch = createSingleRowBatch(schema); + for (long i = 0; i < ROW_STRIPE_NUM; i++) { + writer.addRowBatch(batch); + } + } + return p; + } + + private static VectorizedRowBatch createSingleRowBatch(TypeDescription schema) { + VectorizedRowBatch batch = schema.createRowBatch(); + batch.size = 1; + byte[] bigString = new byte[200]; + Arrays.fill(bigString, (byte) 'A'); + for (int i = 0; i < batch.numCols; i++) { + BytesColumnVector col = (BytesColumnVector) batch.cols[i]; + col.setVal(0, bigString); + } + return batch; + } + + private static TypeDescription createTypeDescription() { + String strCols = IntStream.range(0, 500) + .mapToObj(i -> "col" + i + ":string") + .collect(Collectors.joining(",")); + return TypeDescription.fromString("struct<" + strCols + ">"); + } + +} From 55eebc6c6b779e6063528b3514670b33d46756cf Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 17 Feb 2023 16:13:09 +0100 Subject: [PATCH 2/2] Simplify logging message in ReaderImpl --- java/core/src/java/org/apache/orc/impl/ReaderImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index c8f64e7314..f2e150cea5 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -1043,7 +1043,7 @@ private static List deserializeStripeStats( InStream.createCodedInputStream(stream)); return meta.getStripeStatsList(); } catch (InvalidProtocolBufferException e) { - LOG.warn("Failed to parse stripe statistics; check ORC-1361 for more details.", e); + LOG.warn("Failed to parse stripe statistics", e); return Collections.emptyList(); } }