Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/raja vec #83

Open
wants to merge 15 commits into
base: develop
Choose a base branch
from
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@
[submodule "tpl/RAJA"]
path = tpl/RAJA
url = https://github.com/LLNL/RAJA.git
[submodule "tpl/RAJAvec"]
path = tpl/RAJAvec
url = https://github.com/LLNL/RAJA.git
branch = feature/kunen1/vector
24 changes: 17 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,21 @@ cmake_minimum_required(VERSION 3.9)
option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable
this, and all other variants, to run _only_ raw C loops." On)

option(ENABLE_RAJA_VECTORIZATION "Run vectorized variants of RAJA kernels. Disable
this, and all other variants, to run _only_ raw C loops." Off)

#
# Initialize the BLT build system
#

message(STATUS ${ENABLE_RAJA_SEQUENTIAL})
message(STATUS ${ENABLE_RAJA_VECTORIZATION})

if (PERFSUITE_ENABLE_WARNINGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This flag should go in the scripts/lc-builds/XXX files. Also, I think there is an architecture-agnostic flag (at least for GNU and Clang), like -march=native or something similar, so that whether the machine has SSE, AVX, AVX2, or AVX512, the compiler will pick the best one.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I saw the lc-builds file, and that is where I have it now. Not sure how this slipped in, but thanks for the feedback!

set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests")
set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples")
set(ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises")
Expand Down Expand Up @@ -48,12 +55,6 @@ set(RAJA_DATA_ALIGN 64)


# exclude RAJA make targets from top-level build...
add_subdirectory(tpl/RAJA)

get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES)
include_directories(${RAJA_INCLUDE_DIRS})


#
# Setup variables to pass to Perf suite
#
Expand All @@ -63,14 +64,23 @@ include_directories(${RAJA_INCLUDE_DIRS})
# performance issues in the xl compiler.
#
if (ENABLE_RAJA_SEQUENTIAL)
add_subdirectory(tpl/RAJAvec)
get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJAvec PROPERTY INCLUDE_DIRECTORIES)
include_directories(${RAJA_INCLUDE_DIRS})
add_definitions(-DRUN_RAJA_SEQ)
endif ()
#if(ENABLE_RAJA_VECTORIZATION)
# add_subdirectory(tpl/RAJAvec)
# get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJAvec PROPERTY INCLUDE_DIRECTORIES)
# include_directories(${RAJA_INCLUDE_DIRS})
# add_definitions(-DRUN_RAJA_VEC)
#endif ()
if (ENABLE_OPENMP)
add_definitions(-DRUN_OPENMP)
endif ()

set(RAJA_PERFSUITE_VERSION_MAJOR 0)
set(RAJA_PERFSUITE_VERSION_MINOR 8)
set(RAJA_PERFSUITE_VERSION_MINOR 9)
set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0)

set(RAJA_PERFSUITE_DEPENDS RAJA)
Expand Down
15 changes: 13 additions & 2 deletions scripts/lc-builds/blueos_xl-2020.09.17.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,27 @@
BUILD_SUFFIX=lc_blueos-xl_2020.09.17
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/xl_X.cmake

rm -rf build_${BUILD_SUFFIX} 2>/dev/null
mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
rm -rf build_${BUILD_SUFFIX}_$1 2>/dev/null
mkdir build_${BUILD_SUFFIX}_$1 && cd build_${BUILD_SUFFIX}_$1

module load cmake/3.14.5

if [ "$1" == "seq" ]; then
argS="On"
argV="Off"
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
else
argS="On"
argV="On"
RAJA_HOSTCONFIG=../tpl/RAJAvec/host-configs/lc-builds/toss3/clang_X.cmake
fi

cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-2020.09.17/bin/xlc++_r \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DENABLE_RAJA_VECTORIZATION=$argV \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
18 changes: 15 additions & 3 deletions scripts/lc-builds/toss3_clang10.0.1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,30 @@
###############################################################################

BUILD_SUFFIX=lc_toss3-clang-10.0.1
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake

rm -rf build_${BUILD_SUFFIX} 2>/dev/null
mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
rm -rf build_${BUILD_SUFFIX}_$1 2>/dev/null
mkdir build_${BUILD_SUFFIX}_$1 && cd build_${BUILD_SUFFIX}_$1

module load cmake/3.14.5

if [ "$1" == "seq" ]; then
argS="On"
argV="Off"
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
else
argS="On"
argV="On"
RAJA_HOSTCONFIG=../tpl/RAJAvec/host-configs/lc-builds/toss3/clang_X.cmake
fi


cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-10.0.1/bin/clang++ \
-DCMAKE_CXX_FLAGS=-ffp-contract=fast \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DENABLE_RAJA_VECTORIZATION=$argV \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
15 changes: 13 additions & 2 deletions scripts/lc-builds/toss3_clang9.0.0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,27 @@
BUILD_SUFFIX=lc_toss3-clang-9.0.0
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake

rm -rf build_${BUILD_SUFFIX} 2>/dev/null
mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
rm -rf build_${BUILD_SUFFIX}_$1 2>/dev/null
mkdir build_${BUILD_SUFFIX}_$1 && cd build_${BUILD_SUFFIX}_$1

module load cmake/3.14.5

if [ "$1" == "seq" ]; then
argS="On"
argV="Off"
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
else
argS="On"
argV="On"
RAJA_HOSTCONFIG=../tpl/RAJAvec/host-configs/lc-builds/toss3/clang_X.cmake
fi

cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-9.0.0/bin/clang++ \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DENABLE_RAJA_VECTORIZATION=$argV \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
15 changes: 13 additions & 2 deletions scripts/lc-builds/toss3_gcc8.1.0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,27 @@
BUILD_SUFFIX=lc_toss3-gcc-8.1.0
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake

rm -rf build_${BUILD_SUFFIX} 2>/dev/null
mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
rm -rf build_${BUILD_SUFFIX}_$1 2>/dev/null
mkdir build_${BUILD_SUFFIX}_$1 && cd build_${BUILD_SUFFIX}_$1

module load cmake/3.14.5

if [ "$1" == "seq" ]; then
argS="On"
argV="Off"
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
else
argS="On"
argV="On"
RAJA_HOSTCONFIG=../tpl/RAJAvec/host-configs/lc-builds/toss3/clang_X.cmake
fi

cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++ \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DENABLE_RAJA_VECTORIZATION=$argV \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
16 changes: 14 additions & 2 deletions scripts/lc-builds/toss3_icpc19.1.0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,28 @@
BUILD_SUFFIX=lc_toss3-icpc-19.1.0
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake

rm -rf build_${BUILD_SUFFIX} 2>/dev/null
mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
rm -rf build_${BUILD_SUFFIX}_$1 2>/dev/null
mkdir build_${BUILD_SUFFIX}_$1 && cd build_${BUILD_SUFFIX}_$1

module load cmake/3.14.5

if [ "$1" == "seq" ]; then
argS="On"
argV="Off"
RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
else
argS="On"
argV="On"
RAJA_HOSTCONFIG=../tpl/RAJAvec/host-configs/lc-builds/toss3/clang_X.cmake
fi

cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-19.1.0/bin/icpc \
-DCMAKE_CXX_FLAGS="-xCORE-AVX2" \
-C ${RAJA_HOSTCONFIG} \
-DENABLE_OPENMP=On \
-DENABLE_RAJA_VECTORIZATION=$argV \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
43 changes: 43 additions & 0 deletions src/basic/DAXPY-Seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,50 @@ void DAXPY::runSeqVariant(VariantID vid)

break;
}

case RAJA_Vec : {

#if(0)
DAXPY_DATA_VEC_SETUP;

auto daxpy_vec_lam = [=](RAJA::VectorIndex<I, vector_t> i) {
DAXPY_VEC_BODY;
};
startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
RAJA::forall<RAJA::vector_exec<vector_t>>(
RAJA::TypedRangeSegment<I>(ibegin, iend), daxpy_vec_lam);
}
stopTimer();
#endif

#if(0)
DAXPY_DATA_VEC_SETUP2;

auto daxpy_vec_lam = [=](RAJA::VectorIndex<I, vector_t> i) {
DAXPY_VEC_BODY2;
};

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
RAJA::forall<RAJA::vector_exec<vector_t>>(
RAJA::TypedRangeSegment<I>(ibegin, iend), daxpy_vec_lam);
}
stopTimer();
#endif

#if(1)
DAXPY_DATA_VEC_SETUP3;

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
DAXPY_VEC_BODY3;
}
stopTimer();
#endif
break;
}
#endif //RUN_RAJA_VEC

default : {
std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl;
Expand Down
1 change: 1 addition & 0 deletions src/basic/DAXPY.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ DAXPY::DAXPY(const RunParams& params)
setVariantDefined( Base_Seq );
setVariantDefined( Lambda_Seq );
setVariantDefined( RAJA_Seq );
setVariantDefined( RAJA_Vec );

setVariantDefined( Base_OpenMP );
setVariantDefined( Lambda_OpenMP );
Expand Down
44 changes: 43 additions & 1 deletion src/basic/DAXPY.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,53 @@
#define DAXPY_DATA_SETUP \
Real_ptr x = m_x; \
Real_ptr y = m_y; \
Real_type a = m_a;
const Real_type a = m_a;

#define DAXPY_BODY \
y[i] += a * x[i] ;

// Setup for the view-based vector variant of DAXPY.
// Declares the index type I (via RAJA_INDEX_VALUE_T), aliases vector_t to
// RAJA::StreamVector<Real_type, 2>, and wraps the raw pointers x and y in
// one-dimensional RAJA::TypedView objects X and Y of extent iend, indexed by I.
// Expects x, y, and iend to already be in scope at the expansion site.
#define DAXPY_DATA_VEC_SETUP \
RAJA_INDEX_VALUE_T(I, Int_type, "I");\
using vector_t = RAJA::StreamVector<Real_type, 2>; \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> X(x, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> Y(y, iend);

// Alternative setup for the vector variant of DAXPY.
// Declares the index type I and vector_t, wraps x and y in the views Xview and
// Yview (extent iend), and then runs a RAJA::forall with vector_exec<vector_t>
// over [ibegin, iend). For each vector index i, the lambda fills the local
// vectors X and Y with i.size() elements read from x and y starting at offset
// **i, and assigns them to Xview(i) and Yview(i).
// Expects x, y, ibegin, and iend to be in scope at the expansion site.
// NOTE(review): Xview/Yview wrap the same x/y pointers the elements are read
// from, so this pass appears to store each value back to where it was loaded —
// confirm that this is the intended initialization.
#define DAXPY_DATA_VEC_SETUP2 \
RAJA_INDEX_VALUE_T(I, Int_type, "I"); \
using vector_t = RAJA::StreamVector<Real_type,2>; \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> Xview(x, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> Yview(y, iend); \
RAJA::forall<RAJA::vector_exec<vector_t>> (RAJA::TypedRangeSegment<I>(ibegin, iend),\
[=](RAJA::VectorIndex<I, vector_t> i) { \
vector_t X(0), Y(0); \
for(int j = 0; j < i.size(); ++j) { \
X.set(j, *(x + (**i) + j)); \
Y.set(j, *(y + (**i) + j)); \
} \
Xview(i) = X; \
Yview(i) = Y; \
});

#define DAXPY_DATA_VEC_SETUP3 \
RAJA_INDEX_VALUE_T(I, Int_type, "I");\
using element_t = RAJA::StreamVector<Real_type,2>::element_type; \
element_t X[iend], Y[iend]; \
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This really isn't the intended use. You are basically telling the compiler to hold the entire X and Y arrays in registers at the same time.
You want to keep the arrays as "Real_type X[iend], Y[iend]", and load vector-sized chunks of those arrays using element_t. You can do the loads/stores either with Views + VectorIndex objects, or by using the load() and store() functions in the vector or register classes.

for(int i = 0; i < iend; ++i) { \
X[i] = x[i]; \
Y[i] = y[i]; \
}

// Vector form of DAXPY_BODY: updates Y(i) in place with a * X(i).
// Uses the views X and Y declared by DAXPY_DATA_VEC_SETUP; a and the index i
// must be in scope at the expansion site.
#define DAXPY_VEC_BODY \
Y(i) += a * X(i);

// Same update as DAXPY_VEC_BODY, written against the views Xview and Yview
// declared by DAXPY_DATA_VEC_SETUP2; a and the index i must be in scope at the
// expansion site.
#define DAXPY_VEC_BODY2 \
Yview(i) += a*Xview(i);

#define DAXPY_VEC_BODY3 \
for(int i = 0;i < iend; ++i){ \
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Going along with the last comment: the "i" index here should be over Real_types, so it should increment by the vector width.

Y[i] += a * X[i]; \
y[i] = Y[i]; \
}

#include "common/KernelBase.hpp"

Expand Down
19 changes: 19 additions & 0 deletions src/basic/INIT3-Seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,25 @@ void INIT3::runSeqVariant(VariantID vid)

break;
}

case RAJA_Vec : {
INIT3_VEC_SETUP;

auto init3_vec_lam = [=](RAJA::VectorIndex<I, vector_t> i) {
INIT3_VEC_BODY;
};
startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

RAJA::forall<RAJA::vector_exec<vector_t>>(
RAJA::TypedRangeSegment<I>(ibegin, iend), init3_vec_lam);

}
stopTimer();

break;
}

#endif // RUN_RAJA_SEQ

default : {
Expand Down
1 change: 1 addition & 0 deletions src/basic/INIT3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ INIT3::INIT3(const RunParams& params)
setVariantDefined( Base_Seq );
setVariantDefined( Lambda_Seq );
setVariantDefined( RAJA_Seq );
setVariantDefined( RAJA_Vec );

setVariantDefined( Base_OpenMP );
setVariantDefined( Lambda_OpenMP );
Expand Down
11 changes: 11 additions & 0 deletions src/basic/INIT3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,20 @@
Real_ptr in1 = m_in1; \
Real_ptr in2 = m_in2;

// Setup for the vector variant of INIT3.
// Declares the index type I (via RAJA_INDEX_VALUE_T), aliases vector_t to
// RAJA::StreamVector<Real_type,2>, and wraps the three output pointers
// (out1, out2, out3) and two input pointers (in1, in2) in one-dimensional
// RAJA::TypedView objects O1, O2, O3, I1, and I2 of extent iend, indexed by I.
// Expects out1, out2, out3, in1, in2, and iend to be in scope at the
// expansion site.
#define INIT3_VEC_SETUP \
RAJA_INDEX_VALUE_T(I, Int_type, "I"); \
using vector_t = RAJA::StreamVector<Real_type,2>; \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> O1(out1, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> O2(out2, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> O3(out3, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> I1(in1, iend); \
RAJA::TypedView<Real_type, RAJA::Layout<1, Int_type, 0>, I> I2(in2, iend);

#define INIT3_BODY \
out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ;

// Vector form of INIT3_BODY: evaluates -1 * I1(i) - I2(i) and stores the result
// into O3(i), O2(i), and O1(i) through a chained assignment.
// Uses the views declared by INIT3_VEC_SETUP; the index i must be in scope at
// the expansion site.
#define INIT3_VEC_BODY \
O1(i) = O2(i) = O3(i) = -1 * I1(i) - I2(i);

#include "common/KernelBase.hpp"

Expand Down
Loading