diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index b9528ce45a..657a9f2286 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -21,8 +21,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) extern \"C\" { void F77_BLAS_MANGLE(zdotc,ZDOTC)( - std::complex* result, const int* n, - const std::complex x[], const int* incx, + std::complex* result, const int* n, + const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -49,9 +49,9 @@ int main() { CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) - SET(VARNAME OFF) + SET(${VARNAME} OFF PARENT_SCOPE) ELSE() - SET(VARNAME ON) + SET(${VARNAME} ON PARENT_SCOPE) ENDIF() ENDFUNCTION() diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index ebccce7d7c..aa995836eb 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -96,25 +96,37 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { Kokkos::View>; - result_type result{}; - RVector_Result R = RVector_Result(&result); XVector_Internal X = x; YVector_Internal Y = y; - // Even though RVector is the template parameter, Dot::dot has an overload - // that accepts RVector_Internal (with the special accumulator, if dot_type is - // 32-bit precision). Impl::Dot needs to support both cases, and it's easier - // to do this with overloading than by extending the ETI to deal with two - // different scalar types. - Impl::DotSpecialAccumulator::dot(space, R, - X, Y); - space.fence(); - // mfh 22 Jan 2020: We need the line below because - // Kokkos::complex lacks a constructor that takes a - // Kokkos::complex with U != T. - return Kokkos::Details::CastPossiblyComplex::cast( - result); + bool useFallback = false; + if (useFallback) { + // Even though RVector is the template parameter, Dot::dot has an overload + // that accepts RVector_Internal (with the special accumulator, if dot_type + // is 32-bit precision). Impl::Dot needs to support both cases, and it's + // easier to do this with overloading than by extending the ETI to deal with + // two different scalar types. + result_type result{}; + RVector_Result R = RVector_Result(&result); + Impl::DotSpecialAccumulator::dot(space, + R, X, + Y); + space.fence(); + // mfh 22 Jan 2020: We need the line below because + // Kokkos::complex lacks a constructor that takes a + // Kokkos::complex with U != T. + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } else { + dot_type result{}; + RVector_Internal R = RVector_Internal(&result); + Impl::Dot::dot(space, R, X, Y); + space.fence(); + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } } /// \brief Return the dot product of the two vectors x and y. diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index ca2139980d..3ba8f063b4 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -52,18 +52,22 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) + +// TODO: we met difficuties in FindTPLMKL.cmake to set the BLAS library properly +// such that the test in CheckHostBlasReturnComplex.cmake could not be +// compiled and run to give a correct answer on KK_BLAS_RESULT_AS_POINTER_ARG. +// This resulted in segfault in dot() with MKL and complex. +// So we just temporarily disable it until FindTPLMKL.cmake is fixed. +#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct dot_tpl_spec_avail< \ EXECSPACE, \ @@ -77,19 +81,27 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 718e32f14c..ace26ebdbd 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -39,71 +39,40 @@ inline void dot_print_specialization() { namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ dot_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ + R() = HostBlas::dot( \ + N, reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -112,105 +81,22 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif // cuBLAS @@ -219,38 +105,48 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ + const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -259,81 +155,73 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -342,72 +230,100 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_sdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCdotc(s.handle, N, reinterpret_cast(X.data()), \ - one, reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R())); \ } else { \ - Dot::dot(space, R, \ + Dot::dot(exec, R, \ X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif #endif diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 71e22a690c..ec739aa98a 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -89,22 +89,39 @@ double F77_BLAS_MANGLE(ddot, DDOT)(const int* N, const double* x, const int* x_inc, const double* y, const int* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) -std::complex F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotu, ZDOTU)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); -std::complex F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotc, ZDOTC)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); +// clang-format off +// For the return type, don't use std::complex, otherwise compiler will complain +// error: 'cdotu_' has C-linkage specified, but returns user-defined type 'std::complex' which is incompatible with C [-Werror,-Wreturn-type-c-linkage]" +// But with float _Complex, I got error: '_Complex' is a C99 extension [-Werror,-Wc99-extensions]. +// So I just use a C struct. +// clang-format on +typedef struct { + float vals[2]; +} _kk_float2; +typedef struct { + double vals[2]; +} _kk_double2; + +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); #else void F77_BLAS_MANGLE(cdotu, CDOTU)(std::complex* res, const int* N, @@ -795,7 +812,8 @@ std::complex HostBlas >::dot( int n, const std::complex* x, int x_inc, const std::complex* y, int y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_CDOTC(&res, &n, x, &x_inc, y, &y_inc); @@ -967,7 +985,8 @@ std::complex HostBlas >::dot( int n, const std::complex* x, int x_inc, const std::complex* y, int y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_ZDOTC(&res, &n, x, &x_inc, y, &y_inc);