chfast · rodiazet · Feb 6, 2022 · Feb 6, 2022 · Feb 17, 2022 · Feb 17, 2022
diff --git a/test/benchmarks/benchmarks.cpp b/test/benchmarks/benchmarks.cpp
@@ -3,6 +3,7 @@
 // Licensed under the Apache License, Version 2.0.
 
 #include "../experimental/addmod.hpp"
+#include "../experimental/shift.hpp"
 #include <benchmark/benchmark.h>
 #include <intx/intx.hpp>
 #include <test/utils/gmp.hpp>
@@ -198,8 +199,7 @@ BENCHMARK_TEMPLATE(binop, uint512, uint512, public_mul);
 BENCHMARK_TEMPLATE(binop, uint512, uint512, gmp::mul);
 
 template <unsigned N>
-[[gnu::noinline]] static intx::uint<N> shl_public(
-    const intx::uint<N>& x, const uint64_t& y) noexcept
+[[gnu::noinline]] static intx::uint<N> shl_public(const intx::uint<N>& x, uint64_t y) noexcept
 {
     return x << y;
 }
@@ -211,35 +211,52 @@ template <unsigned N>
     return x << y;
 }
 
-[[gnu::noinline]] static intx::uint256 shl_halves(
-    const intx::uint256& x, const uint64_t& shift) noexcept
+template <unsigned N>
+[[gnu::noinline]] static intx::uint<N> shl_generic(const intx::uint<N>& x, uint64_t y) noexcept
+{
+    return intx::operator<<<N>(x, y);
+}
+
+template <unsigned N>
+[[gnu::noinline]] static intx::uint<N> shl_generic(
+    const intx::uint<N>& x, const intx::uint<N>& y) noexcept
+{
+    return intx::operator<<<N>(x, y);
+}
+
+[[gnu::noinline]] static intx::uint256 shl_halves(const intx::uint256& x, uint64_t shift) noexcept
 {
     constexpr auto num_bits = 256;
     constexpr auto half_bits = num_bits / 2;
 
-    const auto xlo = uint128{x[0], x[1]};
+    const auto xlo_ = uint128{x[0], x[1]};
+    const auto xlo = static_cast<builtin_uint128>(xlo_);
 
     if (shift < half_bits)
     {
-        const auto lo = xlo << shift;
+        const auto lo_ = xlo << shift;
+        const auto lo = uint128{lo_};
 
-        const auto xhi = uint128{x[2], x[3]};
+        const auto xhi_ = uint128{x[2], x[3]};
+        const auto xhi = static_cast<builtin_uint128>(xhi_);
 
         // Find the part moved from lo to hi.
         // The shift right here can be invalid:
         // for shift == 0 => lshift == half_bits.
         // Split it into 2 valid shifts by (rshift - 1) and 1.
         const auto rshift = half_bits - shift;
         const auto lo_overflow = (xlo >> (rshift - 1)) >> 1;
-        const auto hi = (xhi << shift) | lo_overflow;
+        const auto hi_ = (xhi << shift) | lo_overflow;
+        const auto hi = uint128{hi_};
         return {lo[0], lo[1], hi[0], hi[1]};
     }
 
     // This check is only needed if we want "defined" behavior for shifts
     // larger than size of the Int.
     if (shift < num_bits)
     {
-        const auto hi = xlo << (shift - half_bits);
+        const auto hi_ = xlo << (shift - half_bits);
+        const auto hi = uint128{hi_};
         return {0, 0, hi[0], hi[1]};
     }
 
@@ -288,7 +305,7 @@ template <unsigned N>
 }
 
 #if INTX_HAS_EXTINT
-[[gnu::noinline]] static intx::uint256 shl_llvm(const intx::uint256& x, const uint64_t& y) noexcept
+[[gnu::noinline]] static intx::uint256 shl_llvm(const intx::uint256& x, uint64_t y) noexcept
 {
     unsigned _ExtInt(256) a;
     std::memcpy(&a, &x, sizeof(a));
@@ -300,7 +317,7 @@ template <unsigned N>
 #endif
 
 
-template <typename ArgT, typename ShiftT, ArgT ShiftFn(const ArgT&, const ShiftT&)>
+template <typename ArgT, typename ShiftT, ArgT ShiftFn(const ArgT&, ShiftT)>
 static void shift(benchmark::State& state)
 {
     const auto& shift_samples_id = [&state]() noexcept {
@@ -324,7 +341,7 @@ static void shift(benchmark::State& state)
 
     const auto& xs = test::get_samples<ArgT>(sizeof(ArgT) == sizeof(uint256) ? x_256 : x_512);
     const auto& raw_shifts = test::get_samples<uint64_t>(shift_samples_id);
-    std::array<ShiftT, test::num_samples> shifts;
+    std::array<std::decay_t<ShiftT>, test::num_samples> shifts;
     std::copy(std::cbegin(raw_shifts), std::cend(raw_shifts), std::begin(shifts));
 
     while (state.KeepRunningBatch(xs.size()))
@@ -336,14 +353,24 @@ static void shift(benchmark::State& state)
         }
     }
 }
-BENCHMARK_TEMPLATE(shift, uint256, uint256, shl_public)->DenseRange(-1, 3);
-BENCHMARK_TEMPLATE(shift, uint256, uint256, shl_halves)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, const uint256&, shl_public)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, const uint256&, shl_generic)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, const uint256&, shl_halves)->DenseRange(-1, 3);
 BENCHMARK_TEMPLATE(shift, uint256, uint64_t, shl_public)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, shl_generic)->DenseRange(-1, 3);
 BENCHMARK_TEMPLATE(shift, uint256, uint64_t, shl_halves)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_c)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_e)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_w)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_avx)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_1)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_2)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_3)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint256, uint64_t, experimental::shl_bits_4)->DenseRange(-1, 3);
 #if INTX_HAS_EXTINT
 BENCHMARK_TEMPLATE(shift, uint256, uint64_t, shl_llvm)->DenseRange(-1, 3);
 #endif
-BENCHMARK_TEMPLATE(shift, uint512, uint512, shl_public)->DenseRange(-1, 3);
+BENCHMARK_TEMPLATE(shift, uint512, const uint512&, shl_public)->DenseRange(-1, 3);
 BENCHMARK_TEMPLATE(shift, uint512, uint64_t, shl_public)->DenseRange(-1, 3);
 
 [[gnu::noinline]] static bool lt_public(const uint256& x, const uint256& y) noexcept

diff --git a/test/experimental/CMakeLists.txt b/test/experimental/CMakeLists.txt
@@ -1,8 +1,8 @@
 # intx: extended precision integer library.
-# Copyright 2019-2020 Pawel Bylica.
+# Copyright 2019 Pawel Bylica.
 # Licensed under the Apache License, Version 2.0.
 
-add_library(experimental STATIC add.cpp add.hpp)
+add_library(experimental STATIC add.cpp add.hpp shift.hpp)
 target_compile_definitions(experimental PRIVATE INTX_EXPERIMENTAL)
 target_include_directories(experimental PUBLIC ${PROJECT_SOURCE_DIR}/test)
 target_link_libraries(experimental PUBLIC intx::intx)