Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into move-config
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora authored Nov 18, 2024
2 parents c2e77d8 + aeb6a30 commit c99fa6b
Show file tree
Hide file tree
Showing 15 changed files with 126 additions and 124 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,7 @@ ConfigureNVBench(
# ##################################################################################################
# * strings benchmark -------------------------------------------------------------------
ConfigureBench(
STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/replace.cpp string/translate.cpp
string/url_decode.cu
STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/replace.cpp string/url_decode.cu
)

ConfigureNVBench(
Expand Down Expand Up @@ -386,6 +385,7 @@ ConfigureNVBench(
string/slice.cpp
string/split.cpp
string/split_re.cpp
string/translate.cpp
)

# ##################################################################################################
Expand Down
66 changes: 25 additions & 41 deletions cpp/benchmarks/string/translate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,32 @@
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <nvbench/nvbench.cuh>

class StringTranslate : public cudf::benchmark {};
#include <algorithm>
#include <vector>

using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;

static void BM_translate(benchmark::State& state, int entry_count)
static void bench_translate(nvbench::state& state)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const entry_count = static_cast<cudf::size_type>(state.get_int64("entries"));

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::strings_column_view input(column->view());
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
auto const input = cudf::strings_column_view(column->view());

std::vector<entry_type> entries(entry_count);
std::transform(thrust::counting_iterator<int>(0),
Expand All @@ -51,33 +49,19 @@ static void BM_translate(benchmark::State& state, int entry_count)
return entry_type{'!' + idx, '~' - idx};
});

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
cudf::strings::translate(input, entries);
}
auto stream = cudf::get_default_stream();
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto chars_size = input.chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int8_t>(chars_size);

state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::translate(input, entries); });
}

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 4;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

#define STRINGS_BENCHMARK_DEFINE(name, entries) \
BENCHMARK_DEFINE_F(StringTranslate, name) \
(::benchmark::State & st) { BM_translate(st, entries); } \
BENCHMARK_REGISTER_F(StringTranslate, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

STRINGS_BENCHMARK_DEFINE(translate_small, 5)
STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
STRINGS_BENCHMARK_DEFINE(translate_large, 50)
NVBENCH_BENCH(bench_translate)
.set_name("translate")
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("entries", {5, 25, 50});
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ set(cython_sources
transform.pyx
transpose.pyx
types.pyx
unary.pyx
utils.pyx
)
set(linked_libraries cudf::cudf)
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
text,
timezone,
transpose,
unary,
)

MAX_COLUMN_SIZE = np.iinfo(np.int32).max
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

from typing import Literal

from typing_extensions import Self

import pylibcudf as plc

from cudf._typing import Dtype, DtypeObj, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
Expand Down Expand Up @@ -71,3 +75,8 @@ class Column:
# TODO: The val parameter should be Scalar, not ScalarLike
@staticmethod
def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
@staticmethod
def from_pylibcudf(
col: plc.Column, data_ptr_exposed: bool = False
) -> ColumnBase: ...
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
60 changes: 0 additions & 60 deletions python/cudf/cudf/_lib/unary.pyx

This file was deleted.

64 changes: 64 additions & 0 deletions python/cudf/cudf/core/_internals/unary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from __future__ import annotations

from typing import TYPE_CHECKING

import pylibcudf as plc

from cudf._lib.types import dtype_to_pylibcudf_type
from cudf.api.types import is_decimal_dtype
from cudf.core.buffer import acquire_spill_lock

if TYPE_CHECKING:
from cudf._typing import Dtype
from cudf.core.column import ColumnBase


@acquire_spill_lock()
def unary_operation(
    col: ColumnBase, op: plc.unary.UnaryOperator
) -> ColumnBase:
    """Apply the pylibcudf unary operator ``op`` to ``col``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.
    op : plc.unary.UnaryOperator
        Operator forwarded unchanged to ``plc.unary.unary_operation``.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.unary_operation(
        col.to_pylibcudf(mode="read"), op
    )
    return wrap_result(plc_result)


@acquire_spill_lock()
def is_null(col: ColumnBase) -> ColumnBase:
    """Run ``plc.unary.is_null`` over ``col``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.is_null(col.to_pylibcudf(mode="read"))
    return wrap_result(plc_result)


@acquire_spill_lock()
def is_valid(col: ColumnBase) -> ColumnBase:
    """Run ``plc.unary.is_valid`` over ``col``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.is_valid(col.to_pylibcudf(mode="read"))
    return wrap_result(plc_result)


@acquire_spill_lock()
def cast(col: ColumnBase, dtype: Dtype) -> ColumnBase:
    """Cast ``col`` to ``dtype`` through ``plc.unary.cast``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.
    dtype : Dtype
        Target dtype; converted with ``dtype_to_pylibcudf_type`` before
        being passed to pylibcudf.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
        When the result's dtype is a decimal dtype, its ``precision`` is
        overwritten with ``dtype.precision``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.cast(
        col.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
    )
    result = wrap_result(plc_result)

    if not is_decimal_dtype(result.dtype):
        return result

    # NOTE(review): presumably the pylibcudf round trip does not carry the
    # requested precision, so it is copied over from ``dtype`` -- confirm.
    result.dtype.precision = dtype.precision  # type: ignore[union-attr]
    return result


@acquire_spill_lock()
def is_nan(col: ColumnBase) -> ColumnBase:
    """Run ``plc.unary.is_nan`` over ``col``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.is_nan(col.to_pylibcudf(mode="read"))
    return wrap_result(plc_result)


@acquire_spill_lock()
def is_non_nan(col: ColumnBase) -> ColumnBase:
    """Run ``plc.unary.is_not_nan`` over ``col``.

    Note that the pylibcudf function is spelled ``is_not_nan`` while this
    wrapper is named ``is_non_nan``.

    Parameters
    ----------
    col : ColumnBase
        Input column; it is handed to pylibcudf via
        ``to_pylibcudf(mode="read")``.

    Returns
    -------
    ColumnBase
        The pylibcudf result wrapped by ``type(col).from_pylibcudf``.
    """
    wrap_result = type(col).from_pylibcudf
    plc_result = plc.unary.is_not_nan(col.to_pylibcudf(mode="read"))
    return wrap_result(plc_result)
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import cudf
from cudf import _lib as libcudf
from cudf._lib.transform import bools_to_mask
from cudf.core._internals import unary
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
Expand Down Expand Up @@ -1018,12 +1019,12 @@ def isnull(self) -> ColumnBase:
"""
Identify missing values in a CategoricalColumn.
"""
result = libcudf.unary.is_null(self)
result = unary.is_null(self)

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = libcudf.unary.is_nan(self.categories)
categories = unary.is_nan(self.categories)
if categories.any():
code = self._encode(np.nan)
result = result | (self.codes == cudf.Scalar(code))
Expand All @@ -1034,12 +1035,12 @@ def notnull(self) -> ColumnBase:
"""
Identify non-missing values in a CategoricalColumn.
"""
result = libcudf.unary.is_valid(self)
result = unary.is_valid(self)

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = libcudf.unary.is_nan(self.categories)
categories = unary.is_nan(self.categories)
if categories.any():
code = self._encode(np.nan)
result = result & (self.codes != cudf.Scalar(code))
Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
is_string_dtype,
)
from cudf.core._compat import PANDAS_GE_210
from cudf.core._internals import unary
from cudf.core._internals.timezones import get_compatible_timezone
from cudf.core.abc import Serializable
from cudf.core.buffer import (
Expand Down Expand Up @@ -713,12 +714,12 @@ def isnull(self) -> ColumnBase:
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(False, length=len(self))

result = libcudf.unary.is_null(self)
result = unary.is_null(self)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result | libcudf.unary.is_nan(self)
result = result | unary.is_nan(self)

return result

Expand All @@ -727,12 +728,12 @@ def notnull(self) -> ColumnBase:
if not self.has_nulls(include_nan=self.dtype.kind == "f"):
return as_column(True, length=len(self))

result = libcudf.unary.is_valid(self)
result = unary.is_valid(self)

if self.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result & libcudf.unary.is_non_nan(self)
result = result & unary.is_non_nan(self)

return result

Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import unary
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
Expand Down Expand Up @@ -490,7 +491,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
"Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
"Use tz_localize instead."
)
return libcudf.unary.cast(self, dtype=dtype)
return unary.cast(self, dtype=dtype) # type: ignore[return-value]

def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override]
raise TypeError(
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from_decimal as cpp_from_decimal,
)
from cudf.api.types import is_scalar
from cudf.core._internals import unary
from cudf.core.buffer import as_buffer
from cudf.core.column import ColumnBase
from cudf.core.dtypes import (
Expand Down Expand Up @@ -85,7 +86,7 @@ def as_decimal_column(

if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype)
return unary.cast(self, dtype) # type: ignore[return-value]

def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
Expand Down Expand Up @@ -232,7 +233,7 @@ def _decimal_quantile(
def as_numerical_column(
self, dtype: Dtype
) -> "cudf.core.column.NumericalColumn":
return libcudf.unary.cast(self, dtype)
return unary.cast(self, dtype) # type: ignore[return-value]


class Decimal32Column(DecimalBaseColumn):
Expand Down
Loading

0 comments on commit c99fa6b

Please sign in to comment.