[scan-opt-3] Custom implementation for memcpy #159

Status: Open. Wants to merge 27 commits into base: arrow-4.0.0-oap.

Commits (27, changes shown from all commits):
baee5ad  Add parquet scan benchmark (zhixingheyi-tian, Jul 19, 2022)
7fdf914  Add Usage (zhixingheyi-tian, Jul 22, 2022)
3b8ffa3  perf report (zhixingheyi-tian, Jul 22, 2022)
a7c3879  Add Optimize append (zhixingheyi-tian, Aug 11, 2022)
642bfa4  Complete plaindecoder code and passed test (zhixingheyi-tian, Aug 16, 2022)
25ad799  Add code for DictDecoder (zhixingheyi-tian, Aug 18, 2022)
f3dce6b  Resume CMakeLists.txt (zhixingheyi-tian, Aug 19, 2022)
e6da89e  Merge remote-tracking branch 'upstream/arrow-4.0.0-oap' into parquet_… (zhixingheyi-tian, Aug 19, 2022)
5c4d253  Fix offset validate (zhixingheyi-tian, Aug 22, 2022)
64123b0  reduce buffer capacity (zhixingheyi-tian, Aug 23, 2022)
3bf58f8  Add Patch version (zhixingheyi-tian, Aug 29, 2022)
86cc395  Add Fix for write validate (zhixingheyi-tian, Aug 30, 2022)
145a562  Set false for resize (zhixingheyi-tian, Aug 30, 2022)
2b8aabf  Fix customer case issue (zhixingheyi-tian, Aug 30, 2022)
510acb0  Add patch version (zhixingheyi-tian, Aug 30, 2022)
972cc4e  Remove cout (zhixingheyi-tian, Aug 30, 2022)
1433cde  Add Function opt (zhixingheyi-tian, Sep 2, 2022)
f37f44a  Add to DecodeArrowDense_opt (zhixingheyi-tian, Sep 2, 2022)
2e2d39d  Merge remote-tracking branch 'upstream/arrow-4.0.0-oap' into Decode_f… (zhixingheyi-tian, Sep 6, 2022)
b9c2644  clean comment (zhixingheyi-tian, Sep 6, 2022)
710edb4  Clean unnecessary paras (zhixingheyi-tian, Sep 6, 2022)
a5e4080  Delete Patch version (zhixingheyi-tian, Sep 6, 2022)
b9e52c3  Modify Gbenchmark version to avoid conflict (zhixingheyi-tian, Sep 7, 2022)
541dd29  custom memcpy (zhixingheyi-tian, Sep 7, 2022)
3a08a63  add -march=native (zhixingheyi-tian, Sep 7, 2022)
2fbcdd3  Refine avx512 relation code (zhixingheyi-tian, Sep 14, 2022)
46a9666  comment cout (zhixingheyi-tian, Sep 15, 2022)
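
Taken together, the commits describe the shape of the change: ByteArray decoding writes offsets and values directly into preallocated buffers, the value buffer is sized from an observed per-row average instead of a fixed constant, and the hot copy path uses a hand-written memcpy built with -march=native so AVX-512 instructions are available. The kernel below is only a sketch of what such a "custom memcpy" can look like; the function name and structure are illustrative, not the PR's actual implementation, and it assumes a CPU with AVX-512F.

#include <immintrin.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy 64-byte blocks with 512-bit unaligned loads/stores, then fall back
// to std::memcpy for the tail. Requires AVX-512F (-mavx512f, or
// -march=native on a capable build host).
static inline void CustomMemcpyAvx512(void* dst, const void* src, size_t n) {
  auto* d = static_cast<uint8_t*>(dst);
  const auto* s = static_cast<const uint8_t*>(src);
  size_t i = 0;
  for (; i + 64 <= n; i += 64) {
    __m512i v = _mm512_loadu_si512(s + i);
    _mm512_storeu_si512(d + i, v);
  }
  if (i < n) std::memcpy(d + i, s + i, n - i);
}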
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -49,6 +49,7 @@ endif()
 
 set(ARROW_VERSION "4.0.0")
 #add_compile_options(-g -O0)
+add_compile_options(-g -march=native)
 
 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}")
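
One caveat for the flag above: -march=native tunes code generation to the build host, which is what lets the AVX-512 paths in this PR compile, but the resulting binary may not run on older CPUs. A hedged sketch of keeping such a path optional (CopyRow is a hypothetical wrapper; __AVX512F__ is the standard GCC/Clang predefine, and the vectorized call reuses the sketch shown earlier):

#include <cstddef>
#include <cstring>

void CopyRow(void* dst, const void* src, size_t n) {
#if defined(__AVX512F__)
  CustomMemcpyAvx512(dst, src, n);  // vectorized path, see the sketch above
#else
  std::memcpy(dst, src, n);  // portable fallback
#endif
}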
6 changes: 5 additions & 1 deletion cpp/src/parquet/arrow/parquet_scan_benchmark.cc
@@ -130,12 +130,16 @@ class GoogleBenchmarkColumnarToRow_CacheScan_Benchmark
         properties, &parquet_reader));
 
     std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
+    // ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
+    //     row_group_indices, local_column_indices, &record_batch_reader));
+    // need varify complex type, so remove local_column_indices
     ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
-        row_group_indices, local_column_indices, &record_batch_reader));
+        row_group_indices, &record_batch_reader));
     do {
       TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
 
       if (record_batch) {
+        // std::cout << " record_batch->ToString(): " << record_batch->ToString() << std::endl;
         // batches.push_back(record_batch);
         num_batches += 1;
         num_rows += record_batch->num_rows();
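
With local_column_indices dropped, the GetRecordBatchReader overload that takes only row-group indices is used, so every column (including complex types, per the added comment) is materialized. The loop below is a self-contained rendering of the benchmark's read pattern; a sketch only, since the real benchmark wraps ReadNext in TIME_NANO_OR_THROW to accumulate elapse_read:

#include <cstdint>
#include <memory>
#include "arrow/record_batch.h"
#include "arrow/status.h"

// Drain a RecordBatchReader, counting batches and rows as the benchmark does.
arrow::Status CountBatches(arrow::RecordBatchReader* reader,
                           int64_t* num_batches, int64_t* num_rows) {
  std::shared_ptr<arrow::RecordBatch> batch;
  do {
    ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));  // batch == nullptr at end
    if (batch) {
      *num_batches += 1;
      *num_rows += batch->num_rows();
    }
  } while (batch);
  return arrow::Status::OK();
}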
4 changes: 0 additions & 4 deletions cpp/src/parquet/arrow/reader.cc
@@ -142,10 +142,6 @@ class FileReaderImpl : public FileReader {
       : pool_(pool),
         reader_(std::move(reader)),
         reader_properties_(std::move(properties)) {}
-
-  ~FileReaderImpl() {
-    std::cout << "Patch version-0830" << std::endl;
-  }
 
   Status Init() {
     return SchemaManifest::Make(reader_->metadata()->schema(),
5 changes: 4 additions & 1 deletion cpp/src/parquet/arrow/reader_internal.cc
@@ -704,7 +704,10 @@ Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_
     case ::arrow::Type::DATE64:
       RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
       break;
-    case ::arrow::Type::FIXED_SIZE_BINARY:
+    case ::arrow::Type::FIXED_SIZE_BINARY: {
+      RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
+      result = chunked_result;
+    } break;
     case ::arrow::Type::BINARY:
     case ::arrow::Type::STRING:
     case ::arrow::Type::LARGE_BINARY:
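
FIXED_SIZE_BINARY now gets its own case block instead of falling through to the variable-length BINARY/STRING cases below: it is routed through TransferBinary, which yields a ChunkedArray (byte-array data can arrive in several chunks), and the chunked result is assigned to the Datum result directly. A minimal sketch of that wrap-up step (WrapChunks is illustrative, not a helper in the PR):

#include <memory>
#include "arrow/chunked_array.h"
#include "arrow/datum.h"

arrow::Datum WrapChunks(std::shared_ptr<arrow::ChunkedArray> chunks) {
  return arrow::Datum(std::move(chunks));  // Datum has a ChunkedArray overload
}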
28 changes: 9 additions & 19 deletions cpp/src/parquet/column_reader.cc
@@ -1570,39 +1570,27 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
   }
 
   void ReadValuesDense(int64_t values_to_read) override {
-    // int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
-    //     static_cast<int>(values_to_read), &accumulator_);
     int64_t num_decoded = this->current_decoder_->DecodeArrow_opt(
         static_cast<int>(values_to_read), 0,
         NULLPTR, (reinterpret_cast<int32_t *>(offset_->mutable_data()) + values_written_),
-        values_, 0, &accumulator_, &bianry_length_);
+        values_, 0, &bianry_length_);
     DCHECK_EQ(num_decoded, values_to_read);
-    // ResetValues();
   }
 
-  // void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
-  //   int64_t num_decoded = this->current_decoder_->DecodeArrow(
-  //       static_cast<int>(values_to_read), static_cast<int>(null_count),
-  //       valid_bits_->mutable_data(), values_written_, &accumulator_);
-  //   DCHECK_EQ(num_decoded, values_to_read - null_count);
-  //   ResetValues();
-  // }
-
   void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
     int64_t num_decoded = this->current_decoder_->DecodeArrow_opt(
         static_cast<int>(values_to_read), static_cast<int>(null_count),
         valid_bits_->mutable_data(), (reinterpret_cast<int32_t *>(offset_->mutable_data()) + values_written_),
-        values_, values_written_, &accumulator_, &bianry_length_);
+        values_, values_written_, &bianry_length_);
     DCHECK_EQ(num_decoded, values_to_read - null_count);
-    // ResetValues();
   }
 
   void ReserveValues(int64_t extra_values) {
     const int64_t new_values_capacity =
         UpdateCapacity(values_capacity_, values_written_, extra_values);
     if (new_values_capacity > values_capacity_) {
       PARQUET_THROW_NOT_OK(
-          values_->Resize(new_values_capacity * 20, false));
+          values_->Resize(new_values_capacity * binary_per_row_length_, false));
       PARQUET_THROW_NOT_OK(
           offset_->Resize((new_values_capacity+1) * 4, false));
 
@@ -1626,7 +1614,6 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
 
   std::shared_ptr<ResizableBuffer> ReleaseValues() override {
     auto result = values_;
-    // PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
     values_ = AllocateBuffer(this->pool_);
     values_capacity_ = 0;
     return result;
@@ -1639,8 +1626,13 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
     const auto first_offset = offsetArr[0];
     const auto last_offset = offsetArr[values_written_];
     int64_t binary_length = last_offset - first_offset;
-    // std::cout << "binary_length:" << binary_length << std::endl;
     values_->SetSize(binary_length);
 
+    if (ARROW_PREDICT_FALSE(!hasCal_average_len_)) {
+      binary_per_row_length_ = binary_length / values_written_ + 1;
+      // std::cout << "binary_per_row_length_:" << binary_per_row_length_ << std::endl;
+      hasCal_average_len_ = true;
+    }
+
     offset_ = AllocateBuffer(this->pool_);
     bianry_length_ = 0;
@@ -1667,9 +1659,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
 
   int32_t bianry_length_ = 0;
 
-  // std::shared_ptr<::arrow::ResizableBuffer> values_;
   std::shared_ptr<::arrow::ResizableBuffer> offset_;
-  // std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
 };
 
 class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
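
The key change in this file replaces the hard-coded 20-bytes-per-value reservation with binary_per_row_length_: once the first batch's offsets are finalized, the average value length is computed a single time (binary_length / values_written_ + 1) and reused for later ReserveValues calls. Below is a standalone sketch of that heuristic with illustrative names; the PR's version lives inside ByteArrayChunkedRecordReader and defaults to kDefaultBinaryPerRowSzie, declared in the header diff that follows.

#include <cstdint>

class BinaryReserveHint {
 public:
  // Record the observed average once, after the first batch is released.
  void Observe(int64_t total_binary_bytes, int64_t rows) {
    if (!has_average_ && rows > 0) {
      per_row_ = total_binary_bytes / rows + 1;  // round up
      has_average_ = true;
    }
  }

  // Bytes to reserve for `rows` values; stays at the default until an
  // average has been observed.
  int64_t BytesFor(int64_t rows) const { return rows * per_row_; }

 private:
  bool has_average_ = false;
  int64_t per_row_ = 20;  // mirrors kDefaultBinaryPerRowSzie
};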
5 changes: 5 additions & 0 deletions cpp/src/parquet/column_reader.h
@@ -54,6 +54,8 @@ static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
 // 16 KB is the default expected page header size
 static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
 
+static constexpr int32_t kDefaultBinaryPerRowSzie = 20;
+
 class PARQUET_EXPORT LevelDecoder {
  public:
   LevelDecoder();
@@ -301,6 +303,9 @@ class RecordReader {
   int64_t levels_position_;
   int64_t levels_capacity_;
 
+  bool hasCal_average_len_ = false;
+  int64_t binary_per_row_length_ = kDefaultBinaryPerRowSzie;
+
   std::shared_ptr<::arrow::ResizableBuffer> values_;
   // In the case of false, don't allocate the values buffer (when we directly read into
   // builder classes).