Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose stream parameter in public nvtext ngram APIs #14061

Merged
merged 15 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/benchmarks/text/ngrams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt)
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::strings_column_view input(column->view());
auto const separator = cudf::string_scalar("_");

for (auto _ : state) {
cuda_event_timer raii(state, true);
switch (nt) {
case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
}
}
Expand Down
7 changes: 5 additions & 2 deletions cpp/benchmarks/text/tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state)
auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
});
} else if (tokenize_type == "ngrams") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
auto const delimiter = cudf::string_scalar("");
auto const separator = cudf::string_scalar("_");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator);
});
} else if (tokenize_type == "characters") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
Expand Down
38 changes: 21 additions & 17 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,19 @@ namespace nvtext {
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if there are not enough strings to generate any ngrams
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate
* @param separator The string to use for separating ngram tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_ngrams(
cudf::strings_column_view const& strings,
cudf::size_type ngrams = 2,
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::strings_column_view const& input,
cudf::size_type ngrams,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Generates ngrams of characters within each string.
Expand All @@ -79,15 +79,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 2,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -113,14 +115,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
28 changes: 14 additions & 14 deletions cpp/include/nvtext/ngrams_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,22 +66,22 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* An empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> ngrams_tokenize(
cudf::strings_column_view const& strings,
cudf::size_type ngrams = 2,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::strings_column_view const& input,
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace nvtext
9 changes: 6 additions & 3 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
}

namespace detail {
Expand Down Expand Up @@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::generate_character_ngrams(strings, ngrams, stream, mr);
}

std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::hash_character_ngrams(strings, ngrams, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/src/text/jaccard.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
*
* This is called with a warp per row
*/
struct sorted_interset_fn {
struct sorted_intersect_fn {
cudf::column_device_view const d_input1;
cudf::column_device_view const d_input2;
cudf::size_type* d_results;
Expand Down Expand Up @@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
auto const d_input1 = cudf::column_device_view::create(input1, stream);
auto const d_input2 = cudf::column_device_view::create(input2, stream);
auto d_results = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
input1.size() * cudf::detail::warp_size,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::ngrams_tokenize(
strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
}

} // namespace nvtext
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ ConfigureTest(
STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
testing
)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)

# ##################################################################################################
# Install tests ####################################################################################
Expand Down
59 changes: 59 additions & 0 deletions cpp/tests/streams/text/ngrams_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvtext/generate_ngrams.hpp>
#include <nvtext/ngrams_tokenize.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

// Fixture shared by the nvtext ngram stream tests below; it adds no state or
// behavior of its own on top of cudf::test::BaseFixture.
struct TextNGramsTest : cudf::test::BaseFixture {};

// Calls nvtext::generate_ngrams (trigrams, "_" separator) with the stream passed
// explicitly. The returned column is discarded; nothing is asserted on its contents.
// NOTE(review): presumably the STREAM_MODE testing build is what detects use of an
// unexpected stream -- confirm against the test utilities.
TEST_F(TextNGramsTest, GenerateNgrams)
{
  auto const words =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  // The separator scalar is built on the test stream as well.
  auto const joiner = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
  auto const words_view = cudf::strings_column_view(words);
  nvtext::generate_ngrams(words_view, 3, joiner, cudf::test::get_default_stream());
}

// Calls nvtext::generate_character_ngrams (3-character ngrams) with the stream
// passed explicitly. The returned column is discarded; nothing is asserted on it.
TEST_F(TextNGramsTest, GenerateCharacterNgrams)
{
  auto const words =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  auto const words_view = cudf::strings_column_view(words);
  nvtext::generate_character_ngrams(words_view, 3, cudf::test::get_default_stream());
}

// Calls nvtext::hash_character_ngrams (5-character ngrams) with the stream passed
// explicitly. The returned column is discarded; nothing is asserted on it.
TEST_F(TextNGramsTest, HashCharacterNgrams)
{
  auto sentences =
    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
  auto const sentences_view = cudf::strings_column_view(sentences);
  nvtext::hash_character_ngrams(sentences_view, 5, cudf::test::get_default_stream());
}

// Calls nvtext::ngrams_tokenize (bigrams, " " delimiter, "_" separator) with the
// stream passed explicitly. The returned column is discarded; nothing is asserted on it.
TEST_F(TextNGramsTest, NgramsTokenize)
{
  auto sentences =
    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
  // Both scalars are built on the test stream as well.
  auto const token_delim = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
  auto const ngram_sep   = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
  auto const sentences_view = cudf::strings_column_view(sentences);
  nvtext::ngrams_tokenize(
    sentences_view, 2, token_delim, ngram_sep, cudf::test::get_default_stream());
}
28 changes: 16 additions & 12 deletions cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
{
cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"};
cudf::strings_column_view strings_view(strings);
auto const separator = cudf::string_scalar("_");

{
cudf::test::strings_column_wrapper expected{
"the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"};
auto const results = nvtext::generate_ngrams(strings_view);
auto const results = nvtext::generate_ngrams(strings_view, 2, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

{
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"};
auto const results = nvtext::generate_ngrams(strings_view, 3);
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
Expand Down Expand Up @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
auto const separator = cudf::string_scalar("_");

cudf::strings_column_view strings_view(strings);
{
auto const results = nvtext::generate_ngrams(strings_view, 3);
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
Expand All @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty)
{
auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();

auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column));
auto const separator = cudf::string_scalar("_");

auto results =
nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator);
cudf::test::expect_column_empty(results->view());
results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column));
cudf::test::expect_column_empty(results->view());
Expand All @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty)
TEST_F(TextGenerateNgramsTest, Errors)
{
cudf::test::strings_column_wrapper strings{""};
auto const separator = cudf::string_scalar("_");
// invalid parameter value
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
cudf::logic_error);
// not enough strings to generate ngrams
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3),
cudf::logic_error);

std::vector<char const*> h_strings{"", nullptr, "", nullptr};
cudf::test::strings_column_wrapper strings_no_tokens(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)),
cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0});
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)),
cudf::logic_error);
Expand Down
11 changes: 7 additions & 4 deletions cpp/tests/text/ngrams_tokenize_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize)
"mousé_ate",
"ate_the",
"the_cheese"};
auto results = nvtext::ngrams_tokenize(strings_view);
auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_"));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
Expand Down Expand Up @@ -101,17 +101,19 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram)
{
cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"};
cudf::strings_column_view strings_view(strings);
auto const empty = cudf::string_scalar("");

cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"};
auto results = nvtext::ngrams_tokenize(strings_view, 1);
auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest)
{
auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
cudf::strings_column_view strings_view(strings->view());
auto results = nvtext::ngrams_tokenize(strings_view);
auto const empty = cudf::string_scalar("");
auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty);
EXPECT_EQ(results->size(), 0);
EXPECT_EQ(results->has_nulls(), false);
}
Expand All @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest)
{
cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
cudf::strings_column_view strings_view(strings);
EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error);
auto const empty = cudf::string_scalar("");
EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error);
}