From f5d7a957075f2dade99179a0ccef72c52bf14c10 Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Thu, 24 Oct 2024 16:14:18 +0200 Subject: [PATCH] Cleanup of external_array_data_creation (#251) --- test/arrow_array_schema_creation.hpp | 2 +- test/external_array_data_creation.cpp | 43 ++++- test/external_array_data_creation.hpp | 261 +++++++++++++------------- 3 files changed, 168 insertions(+), 138 deletions(-) diff --git a/test/arrow_array_schema_creation.hpp b/test/arrow_array_schema_creation.hpp index 76e9a42d..b4aea20d 100644 --- a/test/arrow_array_schema_creation.hpp +++ b/test/arrow_array_schema_creation.hpp @@ -27,7 +27,7 @@ inline std::pair make_external_arrow_schema_and_array() std::pair pair; constexpr size_t size = 10; constexpr size_t offset = 1; - sparrow::test::fill_schema_and_array(pair.second, pair.first, size, offset, {2, 3}); + sparrow::test::fill_external_schema_and_array(pair.second, pair.first, size, offset, {2, 3}); return pair; } diff --git a/test/external_array_data_creation.cpp b/test/external_array_data_creation.cpp index 256f82fd..ee328c9f 100644 --- a/test/external_array_data_creation.cpp +++ b/test/external_array_data_creation.cpp @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "sparrow/arrow_interface/arrow_array.hpp" -#include "sparrow/arrow_interface/arrow_schema.hpp" -#include "sparrow/arrow_interface/arrow_array_schema_factory.hpp" #include "external_array_data_creation.hpp" #ifdef __GNUC__ @@ -28,7 +25,7 @@ namespace sparrow::test namespace detail { template - void release_common_arrow(T* t) + void release_external_common_arrow(T* t) { if (t->dictionary) { @@ -48,12 +45,12 @@ namespace sparrow::test } } - void release_arrow_schema(ArrowSchema* schema) + void release_external_arrow_schema(ArrowSchema* schema) { - detail::release_common_arrow(schema); + detail::release_external_common_arrow(schema); } - void release_arrow_array(ArrowArray* arr) + void release_external_arrow_array(ArrowArray* arr) { for (std::int64_t i = 0; i < arr->n_buffers; ++i) { @@ -61,7 +58,33 @@ namespace sparrow::test } delete[] reinterpret_cast(arr->buffers); arr->buffers = nullptr; - detail::release_common_arrow(arr); + detail::release_external_common_arrow(arr); + } + + sparrow::buffer make_offset_buffer_from_sizes(const std::vector& sizes, bool big) + { + const auto n = sizes.size() + 1; + const auto buf_size = n * (big ? sizeof(std::uint64_t) : sizeof(std::uint32_t)); + auto buf = new std::uint8_t[buf_size]; + if (big) + { + auto* ptr = reinterpret_cast(buf); + ptr[0] = 0; + for (std::size_t i = 0; i < sizes.size(); ++i) + { + ptr[i + 1] = ptr[i] + static_cast(sizes[i]); + } + } + else + { + auto* ptr = reinterpret_cast(buf); + ptr[0] = 0; + for (std::size_t i = 0; i < sizes.size(); ++i) + { + ptr[i + 1] = ptr[i] + static_cast(sizes[i]); + } + } + return {buf, buf_size}; } sparrow::buffer make_size_buffer(const std::vector& sizes, bool big) @@ -113,7 +136,7 @@ namespace sparrow::test std::vector arr_buffs = { sparrow::make_bitmap_buffer(list_lengths.size(), false_positions), - make_offset_buffer_from_sizes2(list_lengths, big_list) + make_offset_buffer_from_sizes(list_lengths, big_list) }; ArrowArray** array_children = new ArrowArray*[1]; @@ -203,7 +226,7 @@ namespace sparrow::test std::vector arr_buffs = { sparrow::make_bitmap_buffer(list_lengths.size(), false_positions), - make_offset_buffer_from_sizes2(list_lengths, big_list), + make_offset_buffer_from_sizes(list_lengths, big_list), make_size_buffer(list_lengths, big_list) }; diff --git a/test/external_array_data_creation.hpp b/test/external_array_data_creation.hpp index 03cc929a..70924f72 100644 --- a/test/external_array_data_creation.hpp +++ b/test/external_array_data_creation.hpp @@ -16,7 +16,9 @@ #include #include - +#include "sparrow/arrow_interface/arrow_array.hpp" +#include "sparrow/arrow_interface/arrow_schema.hpp" +#include "sparrow/arrow_interface/arrow_array_schema_factory.hpp" #include "sparrow/arrow_array_schema_proxy.hpp" #include "sparrow/types/data_type.hpp" #include "sparrow/types/data_traits.hpp" @@ -24,75 +26,8 @@ namespace sparrow::test { - void release_arrow_schema(ArrowSchema* schema); - void release_arrow_array(ArrowArray* arr); - - inline std::uint8_t* make_offset_buffer_from_sizes(const std::vector& sizes, bool big) - { -// ignore -Werror=cast-align] -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wcast-align" -#endif - const auto n = sizes.size() + 1; - auto buf = new std::uint8_t[n * (big ? sizeof(std::uint64_t) : sizeof(std::uint32_t))]; - if (big) - { - auto* ptr = reinterpret_cast(buf); - ptr[0] = 0; - for (std::size_t i = 0; i < sizes.size(); ++i) - { - ptr[i + 1] = ptr[i] + static_cast(sizes[i]); - } - } - else - { - auto* ptr = reinterpret_cast(buf); - ptr[0] = 0; - for (std::size_t i = 0; i < sizes.size(); ++i) - { - ptr[i + 1] = ptr[i] + static_cast(sizes[i]); - } - } -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - return buf; - } - - inline sparrow::buffer make_offset_buffer_from_sizes2(const std::vector& sizes, bool big) - { -// ignore -Werror=cast-align] -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wcast-align" -#endif - const auto n = sizes.size() + 1; - const auto buf_size = n * (big ? sizeof(std::uint64_t) : sizeof(std::uint32_t)); - auto buf = new std::uint8_t[buf_size]; - if (big) - { - auto* ptr = reinterpret_cast(buf); - ptr[0] = 0; - for (std::size_t i = 0; i < sizes.size(); ++i) - { - ptr[i + 1] = ptr[i] + static_cast(sizes[i]); - } - } - else - { - auto* ptr = reinterpret_cast(buf); - ptr[0] = 0; - for (std::size_t i = 0; i < sizes.size(); ++i) - { - ptr[i + 1] = ptr[i] + static_cast(sizes[i]); - } - } -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - return {buf, buf_size}; - } + void release_external_arrow_schema(ArrowSchema* schema); + void release_external_arrow_array(ArrowArray* arr); inline std::uint8_t* make_bitmap_buffer(size_t n, const std::vector& false_bitmap) { @@ -111,8 +46,29 @@ namespace sparrow::test return buf; } + namespace detail + { + template + void fill_primitive_data_buffer(T* data_buf, std::size_t size) + { + if constexpr (std::same_as) + { + for (std::size_t i = 0; i < size; ++i) + { + data_buf[i] = (i % 2 == 0); + } + } + else + { + std::iota(data_buf, data_buf + size, T(0)); + } + } + } + + // We need to keep this one is for testing arrow_proxy + // on external (i.e. not allocated by sparrow) ArrowArray and ArrowProxy template - void fill_schema_and_array( + void fill_external_schema_and_array( ArrowSchema& schema, ArrowArray& arr, size_t size, @@ -126,7 +82,7 @@ namespace sparrow::test schema.n_children = 0; schema.children = nullptr; schema.dictionary = nullptr; - schema.release = &release_arrow_schema; + schema.release = &release_external_arrow_schema; arr.length = static_cast(size - offset); arr.null_count = static_cast(false_bitmap.size()); @@ -139,22 +95,54 @@ namespace sparrow::test buf[0] = make_bitmap_buffer(size, false_bitmap); T* data_buf = new T[size]; - if constexpr (std::same_as) - { - for (std::size_t i = 0; i < size; ++i) - { - data_buf[i] = (i % 2 == 0); - } - } - else - { - std::iota(data_buf, data_buf + size, T(0)); - } + detail::fill_primitive_data_buffer(data_buf, size); buf[1] = reinterpret_cast(data_buf); arr.children = nullptr; arr.dictionary = nullptr; - arr.release = &release_arrow_array; + arr.release = &release_external_arrow_array; + } + + template + void fill_schema_and_array( + ArrowSchema& schema, + ArrowArray& arr, + size_t size, + size_t offset, + const std::vector& false_bitmap + ) + { + sparrow::fill_arrow_schema( + schema, + sparrow::data_type_format_of(), + "test", + "test metadata", + std::nullopt, + 0, + nullptr, + nullptr + ); + + using buffer_type = sparrow::buffer; + buffer_type data_buf(size * sizeof(T)); + detail::fill_primitive_data_buffer(data_buf.data(), size); + + std::vector arr_buffs = + { + sparrow::make_bitmap_buffer(size, false_bitmap), + std::move(data_buf) + }; + + sparrow::fill_arrow_array( + arr, + static_cast(size - offset), + static_cast(false_bitmap.size()), + static_cast(offset), + std::move(arr_buffs), + 0u, + nullptr, + nullptr + ); } inline std::vector make_testing_words(std::size_t n) @@ -199,22 +187,18 @@ namespace sparrow::test const std::vector& false_bitmap ) { - schema.format = sparrow::data_type_format_of().data(); - schema.name = "test"; - schema.n_children = 0; - schema.children = nullptr; - schema.dictionary = nullptr; - schema.release = &release_arrow_schema; - - arr.length = static_cast(size - offset); - arr.null_count = static_cast(false_bitmap.size()); - arr.offset = static_cast(offset); - arr.n_buffers = 3; - arr.n_children = 0; - std::uint8_t** buf = new std::uint8_t*[3]; - arr.buffers = const_cast(reinterpret_cast(buf)); + sparrow::fill_arrow_schema( + schema, + std::string_view("u"), + "test", + "test metadata", + std::nullopt, + 0, + nullptr, + nullptr + ); - buf[0] = make_bitmap_buffer(size, false_bitmap); + using buffer_type = sparrow::buffer; auto words = make_testing_words(size); std::size_t value_size = std::accumulate( @@ -226,45 +210,68 @@ namespace sparrow::test return res + s.size(); } ); - auto offset_buf = new int32_t[size + 1]; - auto value_buf = new char[value_size]; - offset_buf[0] = 0; - char* ptr = value_buf; - for (std::size_t i = 0; i < size; ++i) + buffer_type offset_buf(sizeof(std::int32_t) * (size + 1)); + buffer_type value_buf(sizeof(char) * value_size); { - offset_buf[i + 1] = offset_buf[i] + static_cast(words[i].size()); - std::ranges::copy(words[i], ptr); - ptr += words[i].size(); + std::int32_t* offset_data = offset_buf.data(); + offset_data[0] = 0; + char* ptr = value_buf.data(); + for (std::size_t i = 0; i < size; ++i) + { + offset_data[i + 1] = offset_data[i] + static_cast(words[i].size()); + std::ranges::copy(words[i], ptr); + ptr += words[i].size(); + } } - buf[1] = reinterpret_cast(offset_buf); - buf[2] = reinterpret_cast(value_buf); - arr.children = nullptr; - arr.dictionary = nullptr; - arr.release = &release_arrow_array; + std::vector arr_buffs = + { + sparrow::make_bitmap_buffer(size, false_bitmap), + std::move(offset_buf), + std::move(value_buf) + }; + + sparrow::fill_arrow_array( + arr, + static_cast(size - offset), + static_cast(false_bitmap.size()), + static_cast(offset), + std::move(arr_buffs), + 0u, + nullptr, + nullptr + ); } template <> inline void fill_schema_and_array< sparrow::null_type>(ArrowSchema& schema, ArrowArray& arr, size_t size, size_t offset, const std::vector&) { - schema.format = sparrow::data_type_format_of().data(); - schema.name = "test"; - schema.n_children = 0; - schema.children = nullptr; - schema.dictionary = nullptr; - schema.release = &release_arrow_schema; + sparrow::fill_arrow_schema( + schema, + std::string_view("n"), + "test", + "test metadata", + std::nullopt, + 0, + nullptr, + nullptr + ); - arr.length = static_cast(size - offset); - arr.null_count = arr.length; - arr.offset = static_cast(offset); - arr.n_buffers = 0; - arr.n_children = 0; - arr.buffers = nullptr; - arr.children = nullptr; - arr.dictionary = nullptr; - arr.release = &release_arrow_array; + using buffer_type = sparrow::buffer; + std::vector arr_buffs = {}; + + sparrow::fill_arrow_array( + arr, + static_cast(size - offset), + static_cast(size - offset), + static_cast(offset), + std::move(arr_buffs), + 0u, + nullptr, + nullptr + ); } template