rapidsai · rapids-bot · Jan 28, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 22, 2025
@@ -97,7 +97,6 @@ void BM_parquet_read_data(nvbench::state& state,
 void BM_parquet_read_long_strings(nvbench::state& state)
 {
   auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
-  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
 
   auto const d_type      = get_type_or_group(static_cast<int32_t>(data_type::STRING));
   auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
@@ -106,14 +105,15 @@ void BM_parquet_read_long_strings(nvbench::state& state)
 
   auto const avg_string_length = static_cast<cudf::size_type>(state.get_int64("avg_string_length"));
   // corresponds to 3 sigma (full width 6 sigma: 99.7% of range)
-  auto const half_width = static_cast<cudf::size_type>(state.get_int64("half_width_string_length"));
+  auto const half_width =
+    avg_string_length >> 3;  // 32 +/- 4, 128 +/- 16, 1024 +/- 128, 8k +/- 1k, etc.
   auto const length_min = avg_string_length - half_width;
   auto const length_max = avg_string_length + half_width;
 
   data_profile profile =
     data_profile_builder()
       .cardinality(cardinality)
-      .avg_run_length(run_length)
+      .avg_run_length(1)
       .distribution(data_type::STRING, distribution_id::NORMAL, length_min, length_max);
 
   auto const num_rows_written = [&]() {
@@ -409,6 +409,5 @@ NVBENCH_BENCH(BM_parquet_read_long_strings)
   .add_string_axis("io_type", {"DEVICE_BUFFER"})
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
-  .add_int64_axis("run_length", {1, 32})
-  .add_int64_axis("avg_string_length", {16, 48, 96})
-  .add_int64_axis("half_width_string_length", {16, 32, 64});  // length = avg +/- half_width
+  .add_int64_power_of_two_axis("avg_string_length",
+                               nvbench::range(4, 16, 2));  // 16, 64, ... -> 64k
@@ -115,39 +115,39 @@ __device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s)
   block_excl_sum<block_size>(offptr, value_count, initial_value);
 }
 
+template <int value>
+inline constexpr int log2_int()
+{
+  static_assert((value >= 1) && ((value & (value - 1)) == 0), "Only works for powers of 2!");
+  if constexpr (value == 1)
+    return 0;
+  else
+    return 1 + log2_int<value / 2>();
+}
+
 template <int block_size>
-__device__ inline int calc_threads_per_string_log2(int avg)
+__device__ inline int calc_threads_per_string_log2(int avg_string_length)  // returns log2(M)
 {
   // From testing, performance is best when copying an average of B = 4 bytes at once.
   // So #-threads-per-string M = avg_string_length / 4
   // Help the compiler make the code fast by keeping everything a power of 2
   // For avg length < 4/8/16/..., length power-of-2 = 2/3/4/.../. Divide by 4: 0/1/...
-  // This is the target (log2) for M, but we need to clamp its range
 
-  // Clamp M (#-threads-per-string):
-  // For T threads: clamp #-strings-at-once N btw T/32 (1/warp) & 32 (cache miss if larger)
-  // So, clamp #-threads-per-string M = T / N between 32 (all in warp) & T/32 (cache miss)
-  // Writing an equation M(T) is slower than just handling each T case separately
-  auto caster = [](int value) { return static_cast<int>(value != 0); };  // branchless
-
-  if constexpr (block_size > 512) {
-    return 5;  // max of 32 strings at a time, no matter what
-  } else if constexpr (block_size > 256) {
-    return (avg < 64) ? 4 : 5;
-  } else if constexpr (block_size > 128) {
-    //(avg < 32) ? 3 : ((avg < 64) ? 4 : 5);
-    return 3 + caster(avg >> 5) + caster(avg >> 6);
-  } else if constexpr (block_size > 64) {
-    //(avg < 16) ? 2 : ((avg < 32) ? 3 : ((avg < 64) ? 4 : 5));
-    return 2 + caster(avg >> 4) + caster(avg >> 5) + caster(avg >> 6);
-  } else if constexpr (block_size > 32) {
-    //(avg < 8) ? 1 : ((avg < 16) ? 2 : ((avg < 32) ? 3 : ((avg < 64) ? 4 : 5)));
-    return 1 + caster(avg >> 3) + caster(avg >> 4) + caster(avg >> 5) + caster(avg >> 6);
-  } else {  // One warp
-    //(avg<4) ? 0 : ((avg<8) ? 1 : ((avg<16) ? 2 : ((avg<32) ? 3 : ((avg<64) ? 4 : 5))));
-    return caster(avg >> 2) + caster(avg >> 3) + caster(avg >> 4) + caster(avg >> 5) +
-           caster(avg >> 6);
-  }
+  // avg - 1: Don't want extra thread at powers of 2 (e.g. 32 (0b100000 -> 0b11111 -> 5)
+  int const avg_log2     = 32 - __clz(avg_string_length - 1);
+  int const threads_log2 = avg_log2 - 2;  // Target 4 bytes / thread at once (log2(4) = 2)
+
+  // This is the target (log2) for M, but we need to clamp its range
+  // First clamp #-strings-at-once (N) btw 1 (all threads (T)) & 32 (cache miss if larger)
+  // So, clamp #-threads-per-string M = T / N between: T (all) & T/32 (cache miss)
+  // So, clamp log2(#-threads-per-string) between log2(T) & log2(T) - 5 (min 1)
+  static constexpr int block_size_log2  = log2_int<block_size>();  // 7 for block_size = 128
+  static constexpr int min_threads_log2 = block_size_log2 > 5 ? block_size_log2 - 5 : 1;
+
+  // Clamp log2(M) (between 2 and 7 for block_size = 128)
+  return (threads_log2 <= min_threads_log2)
+           ? min_threads_log2
+           : ((threads_log2 >= block_size_log2) ? block_size_log2 : threads_log2);
 }
 
 /**
@@ -287,10 +287,27 @@ __device__ size_t gpuDecodeString(
         auto output_string = outputs[str_idx];
         int const length   = lengths[str_idx];
 
-        // One-shot N chars per thread
-        int const chars_at_once    = (length + threads_per_string - 1) >> threads_per_string_log2;
-        int const start_index      = string_lane * chars_at_once;
-        int const substring_length = min(chars_at_once, length - start_index);
+        // Max 8 chars at once per thread, else perf degrades dramatically
+        // Loop, copying 8 chars at a time, until <= 8 chars per thread left
+        static constexpr int max_chars_at_once = 8;
+        int chars_remaining_per_thread =
+          (length + threads_per_string - 1) >> threads_per_string_log2;
+        int group_offset = 0;
+        if (chars_remaining_per_thread > max_chars_at_once) {
+          int const max_chars_copied_string = max_chars_at_once * threads_per_string;
+          int start_index                   = string_lane * max_chars_at_once;
+          do {
+            memcpy(&(output_string[start_index]), &(input_string[start_index]), max_chars_at_once);
+
+            chars_remaining_per_thread -= max_chars_at_once;
+            start_index += max_chars_copied_string;
+            group_offset += max_chars_copied_string;
+          } while (chars_remaining_per_thread > max_chars_at_once);
+        }
+
+        // Final copy of remaining chars
+        int const start_index      = group_offset + string_lane * chars_remaining_per_thread;
+        int const substring_length = min(chars_remaining_per_thread, length - start_index);
         if (substring_length > 0) {
           memcpy(&(output_string[start_index]), &(input_string[start_index]), substring_length);
         }