diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 6a4830c38d..f2e150cea5 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -19,6 +19,7 @@
 package org.apache.orc.impl;
 
 import com.google.protobuf.CodedInputStream;
+import com.google.protobuf.InvalidProtocolBufferException;
 import com.google.protobuf.TextFormat;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -55,6 +56,7 @@
 import java.security.Key;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
 import java.util.function.Supplier;
@@ -1035,11 +1037,15 @@ private static List<OrcProto.StripeStatistics> deserializeStripeStats(
       long offset, int length,
       InStream.StreamOptions options) throws IOException {
-    InStream stream = InStream.create("stripe stats", tailBuffer, offset,
-        length, options);
-    OrcProto.Metadata meta = OrcProto.Metadata.parseFrom(
-        InStream.createCodedInputStream(stream));
-    return meta.getStripeStatsList();
+    try (InStream stream = InStream.create("stripe stats", tailBuffer, offset,
+        length, options)) {
+      OrcProto.Metadata meta = OrcProto.Metadata.parseFrom(
+          InStream.createCodedInputStream(stream));
+      return meta.getStripeStatsList();
+    } catch (InvalidProtocolBufferException e) {
+      LOG.warn("Failed to parse stripe statistics", e);
+      return Collections.emptyList();
+    }
   }
 
   private List<StripeStatistics> convertFromProto(List<OrcProto.StripeStatistics> list) {
diff --git a/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java
new file mode 100644
index 0000000000..a766cb01df
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestOrcWithLargeStripeStatistics.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for operations on an ORC file with very large stripe statistics.
+ * <p>
+ * The test is disabled by default because it is rather slow (approx. 14 minutes) and memory
+ * greedy (it requires about 4g heap space when creating the files). If you want to run it,
+ * remove the {@code Disabled} annotation and ensure that max heap (Xmx) is at least 4g.
+ * </p>
+ */
+@Disabled("ORC-1361")
+public class TestOrcWithLargeStripeStatistics {
+
+  @ParameterizedTest
+  @EnumSource(value = OrcFile.Version.class, mode = EnumSource.Mode.EXCLUDE, names = "FUTURE")
+  public void testGetStripeStatisticsNoProtocolBufferExceptions(OrcFile.Version version)
+      throws Exception {
+    // Use a size that exceeds the protobuf limit (e.g., 1GB) to trigger the protobuf exception.
+    Path p = createOrcFile(1024L << 20, version);
+    try (Reader reader = OrcFile.createReader(p, OrcFile.readerOptions(new Configuration()))) {
+      assertTrue(reader.getStripeStatistics().isEmpty());
+    }
+  }
+
+  /**
+   * Creates an ORC file with a metadata section of the specified size and returns its path in
+   * the filesystem.
+   *
+   * The file has a fixed schema (500 string columns) and content (every column contains 200
+   * characters, which is roughly 200 bytes). Each row is roughly 100KB uncompressed and each
+   * stripe holds exactly one row, thus the stripe metadata (column statistics) per row is 200KB
+   * (100KB for min, 100KB for max, a few bytes for sum).
+   *
+   * @param metadataSize the desired size of the resulting metadata section in bytes
+   * @param version the desired version to create the file
+   * @return the path in the filesystem where the file was created
+   * @throws IOException if an IO problem occurs while creating the file
+   */
+  private static Path createOrcFile(long metadataSize, OrcFile.Version version) throws IOException {
+    // Calculate the number of rows/stripes to create based on the size of one row (200KB).
+    final long ROW_STRIPE_NUM = metadataSize / 200_000L;
+    Path p = new Path(System.getProperty("test.tmp.dir"),
+        TestOrcWithLargeStripeStatistics.class.getSimpleName()
+            + "_" + ROW_STRIPE_NUM + "_" + version + ".orc");
+    // Modify defaults to force one row per stripe.
+    Configuration conf = new Configuration();
+    conf.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "0");
+    TypeDescription schema = createTypeDescription();
+    OrcFile.WriterOptions writerOptions =
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(1)
+            .encodingStrategy(OrcFile.EncodingStrategy.SPEED)
+            .version(version);
+    try (Writer writer = OrcFile.createWriter(p, writerOptions)) {
+      VectorizedRowBatch batch = createSingleRowBatch(schema);
+      for (long i = 0; i < ROW_STRIPE_NUM; i++) {
+        writer.addRowBatch(batch);
+      }
+    }
+    return p;
+  }
+
+  private static VectorizedRowBatch createSingleRowBatch(TypeDescription schema) {
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.size = 1;
+    byte[] bigString = new byte[200];
+    Arrays.fill(bigString, (byte) 'A');
+    for (int i = 0; i < batch.numCols; i++) {
+      BytesColumnVector col = (BytesColumnVector) batch.cols[i];
+      col.setVal(0, bigString);
+    }
+    return batch;
+  }
+
+  private static TypeDescription createTypeDescription() {
+    String strCols = IntStream.range(0, 500)
+        .mapToObj(i -> "col" + i + ":string")
+        .collect(Collectors.joining(","));
+    return TypeDescription.fromString("struct<" + strCols + ">");
+  }
+
+}
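
For reviewers, a minimal caller-side sketch (not part of the patch; the class name and file-path argument are hypothetical) of the behavior the `ReaderImpl` hunk produces: when the stripe-statistics (`Metadata`) section cannot be parsed, e.g. because it exceeds protobuf's message size limit, `Reader.getStripeStatistics()` now logs a warning and returns an empty list instead of propagating `InvalidProtocolBufferException`:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeStatistics;

import java.util.List;

public class StripeStatsFallbackDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical input: an ORC file whose Metadata section is too large to parse.
    Path exampleOrcPath = new Path(args[0]);
    try (Reader reader = OrcFile.createReader(exampleOrcPath,
        OrcFile.readerOptions(new Configuration()))) {
      // Before this patch: InvalidProtocolBufferException escapes to the caller.
      // After this patch: a warning is logged and the list is simply empty.
      List<StripeStatistics> stats = reader.getStripeStatistics();
      System.out.println("Stripe statistics entries: " + stats.size());
    }
  }
}
```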
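
For background on the failure mode the test provokes, a standalone sketch of protobuf's size limit, the mechanism an oversized Metadata section trips. The 64-byte limit here is artificial, chosen only to reproduce the exception without writing a gigabyte file; real streams fail only at protobuf's much larger default limit:

```java
import com.google.protobuf.CodedInputStream;
import com.google.protobuf.InvalidProtocolBufferException;

import java.io.ByteArrayInputStream;

public class SizeLimitDemo {
  public static void main(String[] args) throws Exception {
    byte[] payload = new byte[128];
    CodedInputStream in = CodedInputStream.newInstance(new ByteArrayInputStream(payload));
    in.setSizeLimit(64); // artificially small limit for demonstration
    try {
      in.readRawBytes(payload.length); // reads past the limit
    } catch (InvalidProtocolBufferException e) {
      // Same exception type that deserializeStripeStats() now catches.
      System.out.println("Caught: " + e.getMessage());
    }
  }
}
```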