From 89b595a049a1247ef4ab2e53ede8e756e61102e4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 7 Sep 2023 17:59:57 -0400 Subject: [PATCH 1/4] Expose stream parameter in public nvtext ngram APIs --- cpp/include/nvtext/generate_ngrams.hpp | 26 ++++++++++++++++---------- cpp/include/nvtext/ngrams_tokenize.hpp | 12 +++++++----- cpp/src/text/generate_ngrams.cu | 9 ++++++--- cpp/src/text/jaccard.cu | 4 ++-- cpp/src/text/ngrams_tokenize.cu | 4 ++-- 5 files changed, 33 insertions(+), 22 deletions(-) diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5d66401df9d..ddcd63bcb4b 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -47,18 +47,20 @@ namespace nvtext { * @throw cudf::logic_error if `separator` is invalid * @throw cudf::logic_error if there are not enough strings to generate any ngrams * - * @param strings Strings column to tokenize and produce ngrams from. + * @param input Strings column to tokenize and produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. * @param separator The string to use for separating ngram tokens. * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, cudf::string_scalar const& separator = cudf::string_scalar{"_"}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,15 +81,17 @@ std::unique_ptr generate_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,14 +117,16 @@ std::unique_ptr generate_character_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values */ std::unique_ptr hash_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 17f20f7ea4c..cca287c8414 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,21 +66,23 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize and produce ngrams from. + * @param input Strings column to tokenize and produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. * @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. * @param separator The string to use for separating ngram tokens. * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr ngrams_tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, cudf::string_scalar const& separator = cudf::string_scalar{"_"}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 938fd45246d..5f2f4d021a4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -150,10 +150,11 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s std::unique_ptr generate_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr); + return detail::generate_ngrams(strings, ngrams, separator, stream, mr); } namespace detail { @@ -317,18 +318,20 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::generate_character_ngrams(strings, ngrams, stream, mr); } std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::hash_character_ngrams(strings, ngrams, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 5b55745c2c7..95324847ea0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -107,7 +107,7 @@ rmm::device_uvector compute_unique_counts(cudf::column_view con * * This is called with a warp per row */ -struct sorted_interset_fn { +struct sorted_intersect_fn { cudf::column_device_view const d_input1; cudf::column_device_view const d_input2; cudf::size_type* d_results; @@ -151,7 +151,7 @@ rmm::device_uvector compute_intersect_counts(cudf::column_view auto const d_input1 = cudf::column_device_view::create(input1, stream); auto const d_input2 = cudf::column_device_view::create(input2, stream); auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()}; + sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), input1.size() * cudf::detail::warp_size, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index fd1cbf99221..73d85513e95 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -265,11 +265,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize( - strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr); + return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr); } } // namespace nvtext From 3131dba2df60bd3c6eb00dd19b0612bb60fc4ec7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Sep 2023 17:24:57 -0400 Subject: [PATCH 2/4] add streams tests --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/text/ngrams_test.cpp | 55 ++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 cpp/tests/streams/text/ngrams_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a69dc9bf2f8..8a947faad62 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp new file mode 100644 index 00000000000..6e3ec87488a --- /dev/null +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +class TextNGramsTest : public cudf::test::BaseFixture {}; + +TEST_F(TextNGramsTest, GenerateNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const separator = cudf::string_scalar("_"); + nvtext::generate_ngrams( + cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, GenerateCharacterNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::generate_character_ngrams( + cudf::strings_column_view(input), 3, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, HashCharacterNgrams) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::hash_character_ngrams(cudf::strings_column_view(input)); +} + +TEST_F(TextNGramsTest, NgramsTokenize) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::ngrams_tokenize(cudf::strings_column_view(input)); +} From 585252483ea6bbfb65e393b0737f39bc2045c7ac Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Sep 2023 19:29:23 -0400 Subject: [PATCH 3/4] fix stream in default parameters --- cpp/include/nvtext/generate_ngrams.hpp | 2 +- cpp/include/nvtext/ngrams_tokenize.hpp | 4 ++-- cpp/tests/streams/text/ngrams_test.cpp | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index ddcd63bcb4b..5cb75d0fb89 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -59,7 +59,7 @@ namespace nvtext { std::unique_ptr generate_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, + cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index cca287c8414..4ac597f5db3 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -80,8 +80,8 @@ namespace nvtext { std::unique_ptr ngrams_tokenize( cudf::strings_column_view const& input, cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, + cudf::string_scalar const& delimiter = cudf::string_scalar{"", true, cudf::get_default_stream()}, + cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 6e3ec87488a..bce0d2b680b 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -27,7 +27,7 @@ TEST_F(TextNGramsTest, GenerateNgrams) { auto const input = cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); - auto const separator = cudf::string_scalar("_"); + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; nvtext::generate_ngrams( cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); } @@ -44,12 +44,16 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) { auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); - nvtext::hash_character_ngrams(cudf::strings_column_view(input)); + nvtext::hash_character_ngrams( + cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) { auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); - nvtext::ngrams_tokenize(cudf::strings_column_view(input)); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::ngrams_tokenize( + cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream()); } From b50f485288f196b2c0be96f904e8119549a2f08a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Sep 2023 13:44:25 -0400 Subject: [PATCH 4/4] remove parameter defaults --- cpp/benchmarks/text/ngrams.cpp | 3 ++- cpp/benchmarks/text/tokenize.cpp | 7 ++++-- cpp/include/nvtext/generate_ngrams.hpp | 14 +++++------- cpp/include/nvtext/ngrams_tokenize.hpp | 18 +++++++-------- cpp/tests/text/ngrams_tests.cpp | 28 ++++++++++++++---------- cpp/tests/text/ngrams_tokenize_tests.cpp | 11 ++++++---- 6 files changed, 44 insertions(+), 37 deletions(-) diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 0319577f6b9..f3fd5cc5729 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); + auto const separator = cudf::string_scalar("_"); for (auto _ : state) { cuda_event_timer raii(state, true); switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 423fe667b05..b556a84c541 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state) auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); }); } else if (tokenize_type == "ngrams") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + auto const delimiter = cudf::string_scalar(""); + auto const separator = cudf::string_scalar("_"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator); + }); } else if (tokenize_type == "characters") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5cb75d0fb89..46f2c0e7bc9 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -48,20 +48,18 @@ namespace nvtext { * @throw cudf::logic_error if there are not enough strings to generate any ngrams * * @param input Strings column to tokenize and produce ngrams from - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. + * @param ngrams The ngram number to generate + * @param separator The string to use for separating ngram tokens * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings columns of tokens */ std::unique_ptr generate_ngrams( cudf::strings_column_view const& input, - cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type ngrams, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Generates ngrams of characters within each string. diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 4ac597f5db3..9d76ef8689f 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -67,23 +67,21 @@ namespace nvtext { * All null row entries are ignored and the output contains all valid rows. * * @param input Strings column to tokenize and produce ngrams from - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. + * @param ngrams The ngram number to generate * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. + * An empty string will separate tokens using whitespace. + * @param separator The string to use for separating ngram tokens * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings columns of tokens */ std::unique_ptr ngrams_tokenize( cudf::strings_column_view const& input, - cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{"", true, cudf::get_default_stream()}, - cudf::string_scalar const& separator = cudf::string_scalar{"_", true, cudf::get_default_stream()}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type ngrams, + cudf::string_scalar const& delimiter, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 323b3eed3e2..7b179588385 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams) { cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"}; cudf::strings_column_view strings_view(strings); + auto const separator = cudf::string_scalar("_"); { cudf::test::strings_column_wrapper expected{ "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view); + auto const results = nvtext::generate_ngrams(strings_view, 2, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const separator = cudf::string_scalar("_"); cudf::strings_column_view strings_view(strings); { - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); + auto const separator = cudf::string_scalar("_"); + + auto results = + nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty) TEST_F(TextGenerateNgramsTest, Errors) { cudf::test::strings_column_wrapper strings{""}; + auto const separator = cudf::string_scalar("_"); // invalid parameter value - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); // not enough strings to generate ngrams - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); - std::vector h_strings{"", nullptr, "", nullptr}; - cudf::test::strings_column_wrapper strings_no_tokens( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)), + cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0}); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator), cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)), cudf::logic_error); diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp index 5879bec3e64..c6fb886f7e5 100644 --- a/cpp/tests/text/ngrams_tokenize_tests.cpp +++ b/cpp/tests/text/ngrams_tokenize_tests.cpp @@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize) "mousé_ate", "ate_the", "the_cheese"}; - auto results = nvtext::ngrams_tokenize(strings_view); + auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram) { cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"}; cudf::strings_column_view strings_view(strings); + auto const empty = cudf::string_scalar(""); cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"}; - auto results = nvtext::ngrams_tokenize(strings_view, 1); + auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto results = nvtext::ngrams_tokenize(strings_view); + auto const empty = cudf::string_scalar(""); + auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty); EXPECT_EQ(results->size(), 0); EXPECT_EQ(results->has_nulls(), false); } @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest) { cudf::test::strings_column_wrapper strings{"this column intentionally left blank"}; cudf::strings_column_view strings_view(strings); - EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error); + auto const empty = cudf::string_scalar(""); + EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error); }