StarRocks · stephen-shelby · Oct 19, 2023 · starrocks-cr · Oct 19, 2023
@@ -143,6 +143,9 @@ set(EXEC_FILES
     schema_scanner/schema_helper.cpp
     schema_scanner/schema_routine_load_jobs_scanner.cpp
     schema_scanner/schema_stream_loads_scanner.cpp
+    schema_scanner/schema_iceberg_snapshots_scanner.cpp
+    schema_scanner/schema_iceberg_manifests_scanner.cpp
+    schema_scanner/schema_iceberg_files_scanner.cpp
     jdbc_scanner.cpp
     sorting/compare_column.cpp
     sorting/merge_column.cpp

@@ -115,7 +115,12 @@ Status SchemaScanContext::_prepare_params(RuntimeState* state) {
     if (_tnode.schema_scan_node.__isset.log_limit) {
         _param->log_limit = _tnode.schema_scan_node.log_limit;
     }
-
+    if (_tnode.schema_scan_node.__isset.origin_db_name) {
+        _param->origin_db = _obj_pool.add(new std::string(_tnode.schema_scan_node.origin_db_name));
+    }
+    if (_tnode.schema_scan_node.__isset.origin_table_name) {
+        _param->origin_table = _obj_pool.add(new std::string(_tnode.schema_scan_node.origin_table_name));
+    }
     return Status::OK();
 }
 

@@ -29,6 +29,9 @@
 #include "exec/schema_scanner/schema_columns_scanner.h"
 #include "exec/schema_scanner/schema_dummy_scanner.h"
 #include "exec/schema_scanner/schema_fe_tablet_schedules_scanner.h"
+#include "exec/schema_scanner/schema_iceberg_files_scanner.h"
+#include "exec/schema_scanner/schema_iceberg_manifests_scanner.h"
+#include "exec/schema_scanner/schema_iceberg_snapshots_scanner.h"
 #include "exec/schema_scanner/schema_load_tracking_logs_scanner.h"
 #include "exec/schema_scanner/schema_loads_scanner.h"
 #include "exec/schema_scanner/schema_materialized_views_scanner.h"
@@ -169,6 +172,12 @@ std::unique_ptr<SchemaScanner> SchemaScanner::create(TSchemaTableType::type type
         return std::make_unique<SchemaTablePipeFiles>();
     case TSchemaTableType::SCH_PIPES:
         return std::make_unique<SchemaTablePipes>();
+    case TSchemaTableType::SCH_ICEBERG_SNAPSHOTS:
+        return std::make_unique<SchemaIcebergSnapshotsScanner>();
+    case TSchemaTableType::SCH_ICEBERG_MANIFESTS:
+        return std::make_unique<SchemaIcebergManifestsScanner>();
+    case TSchemaTableType::SCH_ICEBERG_FILES:
+        return std::make_unique<SchemaIcebergFilesScanner>();
     default:
         return std::make_unique<SchemaDummyScanner>();
     }

@@ -65,6 +65,8 @@ struct SchemaScannerParam {
     const std::string* log_level{nullptr};
     const std::string* log_pattern{nullptr};
     int64_t log_limit{-1};
+    const std::string* origin_db{nullptr};
+    const std::string* origin_table{nullptr};
 
     RuntimeProfile::Counter* _rpc_timer = nullptr;
     RuntimeProfile::Counter* _fill_chunk_timer = nullptr;

@@ -213,6 +213,31 @@ Status SchemaHelper::get_grants_to(const std::string& ip, const int32_t port,
             timeout_ms);
 }
 
+// Invokes the frontend's getIcebergSnapshots thrift call at ip:port. The reply is written
+// into *response; the returned Status is whatever ThriftRpcHelper::rpc reports.
+Status SchemaHelper::get_iceberg_snapshots(const std::string& ip, const int32_t port,
+                                           const TGetIcebergSnapshotRequest& request,
+                                           TGetIcebergSnapshotsResponse* response) {
+    auto fetch_snapshots = [&request, response](FrontendServiceConnection& frontend) {
+        frontend->getIcebergSnapshots(*response, request);
+    };
+    return ThriftRpcHelper::rpc<FrontendServiceClient>(ip, port, fetch_snapshots);
+}
+
+// Invokes the frontend's getIcebergManifests thrift call at ip:port. The reply is written
+// into *response; the returned Status is whatever ThriftRpcHelper::rpc reports.
+Status SchemaHelper::get_iceberg_manifests(const std::string& ip, const int32_t port,
+                                           const TGetIcebergManifestsRequest& request,
+                                           TGetIcebergManifestsResponse* response) {
+    auto fetch_manifests = [&request, response](FrontendServiceConnection& frontend) {
+        frontend->getIcebergManifests(*response, request);
+    };
+    return ThriftRpcHelper::rpc<FrontendServiceClient>(ip, port, fetch_manifests);
+}
+
+// Invokes the frontend's getIcebergFiles thrift call at ip:port. The reply is written
+// into *response; the returned Status is whatever ThriftRpcHelper::rpc reports.
+Status SchemaHelper::get_iceberg_files(const std::string& ip, const int32_t port,
+                                       const TGetIcebergFilesRequest& request, TGetIcebergFilesResponse* response) {
+    auto fetch_files = [&request, response](FrontendServiceConnection& frontend) {
+        frontend->getIcebergFiles(*response, request);
+    };
+    return ThriftRpcHelper::rpc<FrontendServiceClient>(ip, port, fetch_files);
+}
+
 void fill_data_column_with_null(Column* data_column) {
     auto* nullable_column = down_cast<NullableColumn*>(data_column);
     nullable_column->append_nulls(1);

@@ -93,6 +93,15 @@ class SchemaHelper {
     static Status get_grants_to(const std::string& ip, const int32_t port,
                                 const TGetGrantsToRolesOrUserRequest& request,
                                 TGetGrantsToRolesOrUserResponse* response, int timeout_ms);
+    static Status get_iceberg_snapshots(const std::string& ip, const int32_t port,
+                                        const TGetIcebergSnapshotRequest& request,
+                                        TGetIcebergSnapshotsResponse* response); // result is written to *response
+
+    static Status get_iceberg_manifests(const std::string& ip, const int32_t port,
+                                        const TGetIcebergManifestsRequest& request,
+                                        TGetIcebergManifestsResponse* response); // result is written to *response
+    static Status get_iceberg_files(const std::string& ip, const int32_t port, const TGetIcebergFilesRequest& request,
+                                    TGetIcebergFilesResponse* response); // result is written to *response
 };
 
 template <LogicalType SlotType>

@@ -0,0 +1,206 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exec/schema_scanner/schema_iceberg_files_scanner.h"
+
+#include "exec/schema_scanner/schema_helper.h"
+#include "runtime/runtime_state.h"
+#include "runtime/string_value.h"
+
+namespace starrocks {
+
+SchemaScanner::ColumnDesc SchemaIcebergFilesScanner::_iceberg_files_columns[] = {
+        // Each entry is {name, type, size, is_null}; fill_chunk() handles slot ids 1..14 in this same order.
+        {"content", TYPE_INT, sizeof(int32_t), false},
+        {"file_path", TYPE_VARCHAR, sizeof(StringValue), false},
+        {"file_format", TYPE_VARCHAR, sizeof(StringValue), false},
+        {"spec_id", TYPE_INT, sizeof(int32_t), false},
+        {"record_count", TYPE_BIGINT, sizeof(int64_t), false},
+        {"file_size_in_bytes", TYPE_BIGINT, sizeof(int64_t), false},
+        {"column_sizes", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"value_counts", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"null_value_counts", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"nan_value_counts", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"lower_bounds", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"upper_bounds", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"split_offsets", TYPE_VARCHAR, sizeof(StringValue), true},
+        {"equality_ids", TYPE_VARCHAR, sizeof(StringValue), true}};
+
+// Hands the static column table and its element count to the SchemaScanner base constructor.
+SchemaIcebergFilesScanner::SchemaIcebergFilesScanner()
+        : SchemaScanner(_iceberg_files_columns,
+                        sizeof(_iceberg_files_columns) / sizeof(_iceberg_files_columns[0])) {}
+
+SchemaIcebergFilesScanner::~SchemaIcebergFilesScanner() = default; // defaulted: no hand-written cleanup here
+
+// Runs the base-class start(), then builds a TGetIcebergFilesRequest from whichever of
+// catalog / origin_db / origin_table are set in _param (plus the limit) and sends it to the
+// frontend at _param->ip:_param->port. The reply is kept in _files_res and the row cursor is
+// reset to 0. Returns InternalError when the ip is missing or the port is 0.
+Status SchemaIcebergFilesScanner::start(RuntimeState* state) {
+    RETURN_IF_ERROR(SchemaScanner::start(state));
+
+    TGetIcebergFilesRequest files_req;
+    if (_param->catalog != nullptr) {
+        files_req.__set_catalog_name(*_param->catalog);
+    }
+    if (_param->origin_db != nullptr) {
+        files_req.__set_db_name(*_param->origin_db);
+    }
+    if (_param->origin_table != nullptr) {
+        files_req.__set_table_name(*_param->origin_table);
+    }
+    files_req.__set_limit(_param->limit);
+
+    // Without a frontend address there is nobody to ask.
+    const bool has_frontend_addr = _param->ip != nullptr && _param->port != 0;
+    if (!has_frontend_addr) {
+        return Status::InternalError("IP or port doesn't exists");
+    }
+    RETURN_IF_ERROR(SchemaHelper::get_iceberg_files(*_param->ip, _param->port, files_req, &_files_res));
+
+    _files_index = 0;
+    return Status::OK();
+}
+
+// Appends one row to *chunk, taken from the file entry at _files_index, then advances
+// _files_index by one. Each slot id present in the chunk is matched against the 1-based
+// position of the corresponding column in _iceberg_files_columns; any other slot id is
+// left untouched. The caller must make sure _files_index is in range.
+Status SchemaIcebergFilesScanner::fill_chunk(ChunkPtr* chunk) {
+    const TIcebergFile& current = _files_res.iceberg_files[_files_index];
+    const auto& slots = (*chunk)->get_slot_id_to_index_map();
+    for (const auto& [slot_id, index] : slots) {
+        ColumnPtr column = (*chunk)->get_column_by_slot_id(slot_id);
+
+        // Shared path for every VARCHAR column: wrap the string in a Slice and append it.
+        const auto append_text = [&column](const std::string& text) {
+            Slice value(text.c_str(), text.length());
+            fill_column_with_slot<TYPE_VARCHAR>(column.get(), (void*)&value);
+        };
+
+        switch (slot_id) {
+        case 1: // content
+            fill_column_with_slot<TYPE_INT>(column.get(), (void*)&current.content);
+            break;
+        case 2: // file_path
+            append_text(current.file_path);
+            break;
+        case 3: // file_format
+            append_text(current.file_format);
+            break;
+        case 4: // spec_id
+            fill_column_with_slot<TYPE_INT>(column.get(), (void*)&current.spec_id);
+            break;
+        case 5: // record_count
+            fill_column_with_slot<TYPE_BIGINT>(column.get(), (void*)&current.record_count);
+            break;
+        case 6: // file_size_in_bytes
+            fill_column_with_slot<TYPE_BIGINT>(column.get(), (void*)&current.file_size_in_bytes);
+            break;
+        case 7: // column_sizes
+            append_text(current.column_sizes);
+            break;
+        case 8: // value_counts
+            append_text(current.value_counts);
+            break;
+        case 9: // null_value_counts
+            append_text(current.null_value_counts);
+            break;
+        case 10: // nan_value_counts
+            append_text(current.nan_value_counts);
+            break;
+        case 11: // lower_bounds
+            append_text(current.lower_bounds);
+            break;
+        case 12: // upper_bounds
+            append_text(current.upper_bounds);
+            break;
+        case 13: // split_offsets
+            append_text(current.split_offsets);
+            break;
+        case 14: // equality_ids
+            append_text(current.equality_ids);
+            break;
+        default:
+            break;
+        }
+    }
+    ++_files_index;
+    return Status::OK();
+}
+
+// Emits the next file row into *chunk.
+//
+// chunk: destination chunk; must not be null.
+// eos:   set to true once every entry of _files_res.iceberg_files has been emitted,
+//        false otherwise; must not be null.
+//
+// Returns InternalError if the scanner has not been initialised or if either pointer is
+// null, OK at end of stream, and otherwise the Status returned by fill_chunk().
+Status SchemaIcebergFilesScanner::get_next(ChunkPtr* chunk, bool* eos) {
+    if (!_is_init) {
+        return Status::InternalError("Used before initialized.");
+    }
+    // Validate the out-parameters before anything is written through them: the
+    // end-of-stream branch below stores to *eos.
+    if (nullptr == chunk || nullptr == eos) {
+        return Status::InternalError("input pointer is nullptr.");
+    }
+    // _files_index is an int; make the conversion for the size comparison explicit.
+    if (static_cast<size_t>(_files_index) >= _files_res.iceberg_files.size()) {
+        *eos = true;
+        return Status::OK();
+    }
+    *eos = false;
+    return fill_chunk(chunk);
+}
+
+} // namespace starrocks
@@ -0,0 +1,39 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "exec/schema_scanner.h"
+#include "gen_cpp/FrontendService_types.h"
+
+namespace starrocks {
+
+// Schema scanner for Iceberg file metadata. start() fetches the file list from the frontend
+// into _files_res; each successful get_next() call then emits one entry as a row.
+class SchemaIcebergFilesScanner : public SchemaScanner {
+public:
+    SchemaIcebergFilesScanner();
+    ~SchemaIcebergFilesScanner() override;
+
+    // Issues the frontend request and resets the row cursor.
+    Status start(RuntimeState* state) override;
+    // Emits the next row into *chunk, or sets *eos once all rows have been emitted.
+    Status get_next(ChunkPtr* chunk, bool* eos) override;
+
+private:
+    // Column layout handed to the SchemaScanner base class.
+    static SchemaScanner::ColumnDesc _iceberg_files_columns[];
+
+    // Appends the entry at _files_index to *chunk and advances the cursor.
+    Status fill_chunk(ChunkPtr* chunk);
+
+    // Index of the next entry of _files_res.iceberg_files to emit.
+    int _files_index = 0;
+    // Frontend reply filled in by start().
+    TGetIcebergFilesResponse _files_res;
+};
+
+} // namespace starrocks