From 43afdcec2ff51958fb5e3698fb12007268633c18 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 21 Jan 2025 17:28:43 +0800 Subject: [PATCH] GH-45283: [C++][Parquet] Omit level histogram when max level is 0 (#45285) ### Rationale for this change The level histogram of size statistics can be omitted if its max level is 0. We haven't implemented this yet and enforces histogram size to be equal to `max_level + 1`. However, when reading a Parquet file with omitted level histogram, exception will be thrown. ### What changes are included in this PR? Omit level histogram when max level is 0. ### Are these changes tested? Yes, a test case has been added to reflect the change. ### Are there any user-facing changes? No. * GitHub Issue: #45283 Lead-authored-by: Gang Wu Co-authored-by: Antoine Pitrou Signed-off-by: Gang Wu --- cpp/src/parquet/column_writer.cc | 3 ++ cpp/src/parquet/size_statistics.cc | 40 +++++++++++++------- cpp/src/parquet/size_statistics_test.cc | 49 +++++++++++++++++++++---- 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 683a5ab735aed..4998e6f301a00 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< auto add_levels = [](std::vector& level_histogram, ::arrow::util::span levels, int16_t max_level) { + if (max_level == 0) { + return; + } ARROW_DCHECK_EQ(static_cast(max_level) + 1, level_histogram.size()); ::parquet::UpdateLevelHistogram(levels, level_histogram); }; diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index 7292f9222a684..1ce6c937ad5e6 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -64,23 +64,28 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { } void SizeStatistics::Validate(const ColumnDescriptor* descr) const { - if (repetition_level_histogram.size() != - static_cast(descr->max_repetition_level() + 1)) { - throw ParquetException("Repetition level histogram size mismatch"); - } - if (definition_level_histogram.size() != - static_cast(descr->max_definition_level() + 1)) { - throw ParquetException("Definition level histogram size mismatch"); - } + auto validate_histogram = [](const std::vector& histogram, int16_t max_level, + const std::string& name) { + if (histogram.empty()) { + // A levels histogram is always allowed to be missing. + return; + } + if (histogram.size() != static_cast(max_level + 1)) { + std::stringstream ss; + ss << name << " level histogram size mismatch, size: " << histogram.size() + << ", expected: " << (max_level + 1); + throw ParquetException(ss.str()); + } + }; + validate_histogram(repetition_level_histogram, descr->max_repetition_level(), + "Repetition"); + validate_histogram(definition_level_histogram, descr->max_definition_level(), + "Definition"); if (unencoded_byte_array_data_bytes.has_value() && descr->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("Unencoded byte array data bytes does not support " + TypeToString(descr->physical_type())); } - if (!unencoded_byte_array_data_bytes.has_value() && - descr->physical_type() == Type::BYTE_ARRAY) { - throw ParquetException("Missing unencoded byte array data bytes"); - } } void SizeStatistics::Reset() { @@ -93,8 +98,15 @@ void SizeStatistics::Reset() { std::unique_ptr SizeStatistics::Make(const ColumnDescriptor* descr) { auto size_stats = std::make_unique(); - size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); - size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + // If the max level is 0, the level histogram can be omitted because it contains + // only single level (a.k.a. 0) and its count is equivalent to `num_values` of the + // column chunk or data page. + if (descr->max_repetition_level() != 0) { + size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); + } + if (descr->max_definition_level() != 0) { + size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + } if (descr->physical_type() == Type::BYTE_ARRAY) { size_stats->unencoded_byte_array_data_bytes = 0; } diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index 0958ae4dec2ca..90d6df57e7f43 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -216,6 +216,20 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { } } + void ReadData() { + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); + auto metadata = reader->metadata(); + for (int i = 0; i < metadata->num_row_groups(); ++i) { + int64_t num_rows = metadata->RowGroup(i)->num_rows(); + auto row_group_reader = reader->RowGroup(i); + for (int j = 0; j < metadata->num_columns(); ++j) { + auto column_reader = row_group_reader->RecordReader(j); + ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows); + } + } + } + void Reset() { buffer_.reset(); } protected: @@ -300,23 +314,23 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) { ReadSizeStatistics(); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/5}, SizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/1}, SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/4})); EXPECT_THAT(page_stats_, ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{5}}, PageSizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{1}}, PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{4}})); } @@ -368,12 +382,31 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) { ReadSizeStatistics(); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000}, - /*rep_levels=*/{90000}, + /*rep_levels=*/{}, /*byte_array_bytes=*/90000})); EXPECT_THAT(page_stats_, ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000}, - /*rep_levels=*/{90000}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{90000}})); } +TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { + auto schema = + ::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)}); + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, + ::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), + /*max_row_group_length=*/2, + /*page_size=*/1024); + ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics()); + ASSERT_NO_FATAL_FAILURE(ReadData()); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/6})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/{6}})); +} + } // namespace parquet