Skip to content

Commit

Permalink
Merge pull request CNugteren#216 from CNugteren/integrated_tuner
Browse files Browse the repository at this point in the history
Integrated tuner
  • Loading branch information
CNugteren authored Nov 19, 2017
2 parents c41d219 + defad3d commit da76d7a
Show file tree
Hide file tree
Showing 36 changed files with 1,126 additions and 676 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

Development (next version)
- Re-designed and integrated the auto-tuner, no more dependency on CLTune
- Added tuned parameters for various devices (see README)

Version 1.2.0
Expand Down
55 changes: 36 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ set(clblast_VERSION_PATCH 0)
# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" OFF)
option(TUNERS "Enable compilation of the tuners" ON)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
Expand Down Expand Up @@ -156,15 +156,6 @@ elseif(CUDA)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
endif()

# Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
if(TUNERS)
find_package(CLTune)
if(NOT CLTUNE_FOUND)
message(STATUS "Could NOT find CLTune, disabling the compilation of the tuners")
set(TUNERS OFF)
endif()
endif()

# Don't search for system libraries when cross-compiling
if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
if(TESTS)
Expand Down Expand Up @@ -233,7 +224,9 @@ endif()
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/utilities/compile.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
src/api_common.cpp
src/cache.cpp
Expand All @@ -252,6 +245,7 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
src/routines/common.hpp
src/routines/routines.hpp
src/utilities/buffer_test.hpp
src/utilities/compile.hpp
src/utilities/clblast_exceptions.hpp
src/utilities/device_mapping.hpp
src/utilities/msvc.hpp
Expand Down Expand Up @@ -373,27 +367,50 @@ endif()

# ==================================================================================================

# This section contains all the code related to the tuners. These tuners require the presence of
# the CLTune library (not included as part of the source).
# This section contains all the code related to the tuners
if(TUNERS)

# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON src/tuning/tuning.hpp)
set(TUNERS_COMMON
src/utilities/compile.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
src/tuning/configurations.cpp
src/tuning/tuning.cpp)
set(TUNERS_HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
src/utilities/compile.hpp
src/utilities/clblast_exceptions.hpp
src/utilities/timing.hpp
src/utilities/utilities.hpp
src/tuning/configurations.hpp
src/tuning/tuning.hpp)
set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS})

# Creates a library with common sources for all tuners
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
# Visual Studio requires the sources of non-exported objects/libraries
else()
# Creates the common tuner objects (requires CMake 2.8.8)
add_library(tuners_common_library OBJECT ${TUNERS_COMMON})

# Adds CLBlast's interface include paths because we can't link to CLBlast here
target_include_directories(tuners_common_library PRIVATE
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR} ${API_INCLUDE_DIRS})
set(TUNERS_COMMON $<TARGET_OBJECTS:tuners_common_library>)
endif()

# Adds tuning executables
foreach(KERNEL ${KERNELS})
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC ${CLTUNE_INCLUDE_DIRS})
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()

Expand Down
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,6 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s

cmake -DTUNERS=ON ..

Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.6.0 or higher).

Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables.

The tuners output a JSON file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
Expand Down Expand Up @@ -416,7 +414,7 @@ More information
Further information on CLBlast is available through the following links:

* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.


Support us
Expand Down
68 changes: 0 additions & 68 deletions cmake/Modules/FindCLTune.cmake

This file was deleted.

2 changes: 1 addition & 1 deletion scripts/database/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def main(argv):
# Removes database entries before continuing
if cl_args.remove_device is not None:
print("[database] Removing all results for device '%s'" % cl_args.remove_device)
remove_database_entries(database, {"clblast_device": cl_args.remove_device})
remove_database_entries(database, {"clblast_device_name": cl_args.remove_device})
io.save_database(database, database_filename)

# Retrieves the best performing results
Expand Down
1 change: 0 additions & 1 deletion scripts/database/database/clblast.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ def print_cpp_database(database, output_dir):
kernels = sorted(set([s["kernel"] for s in device_database]))
for kernel in kernels:
kernel_database = [s for s in device_database if s["kernel"] == kernel]

assert len(kernel_database) == 1
results = kernel_database[0]["results"]

Expand Down
8 changes: 8 additions & 0 deletions scripts/database/database/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ def load_tuning_results(filename):
# Removes the numbering following the kernel family name
json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"])

# Removes unnecessary data
if json_data["best_kernel"]:
del json_data["best_kernel"]
if json_data["best_time"]:
del json_data["best_time"]
if json_data["best_parameters"]:
del json_data["best_parameters"]

# Adds the kernel name to the section instead of to the individual results
assert len(json_data["results"]) > 0
json_data["kernel"] = json_data["results"][0]["kernel"]
Expand Down
7 changes: 7 additions & 0 deletions src/clpp11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,13 @@ class Device {
std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
}

// Retrieves the above extra information (if present)
std::string GetExtraInfo() const {
  // A device advertising the AMD attribute-query extension reports its board name
  if (HasExtension("cl_amd_device_attribute_query")) {
    return AMDBoardName();
  }
  // A device advertising the NVIDIA attribute-query extension reports its compute capability
  if (HasExtension("cl_nv_device_attribute_query")) {
    return NVIDIAComputeCapability();
  }
  // Neither extension is advertised: there is no extra information, so return an empty string
  return std::string{};
}

// Accessor to the private data-member
const RawDeviceID& operator()() const { return device_; }
private:
Expand Down
3 changes: 3 additions & 0 deletions src/cupp11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ class Device {
std::string AMDBoardName() const { return ""; }
std::string NVIDIAComputeCapability() const { return Capabilities(); }

// Retrieves the above extra information
std::string GetExtraInfo() const {
  // No extension check is made here: the result of NVIDIAComputeCapability() is always returned
  return NVIDIAComputeCapability();
}

// Accessor to the private data-member
const RawDeviceID& operator()() const { return device_; }
private:
Expand Down
68 changes: 4 additions & 64 deletions src/routine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,88 +135,28 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}

// Collects the parameters for this device in the form of defines, and adds the precision
// Collects the parameters for this device in the form of defines
auto source_string = std::string{""};
for (const auto &kernel_name : kernel_names_) {
source_string += db_(kernel_name).GetDefines();
}
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

// Adds the name of the routine as a define
source_string += "#define ROUTINE_"+routine_name_+"\n";

// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
// which it is known to work with all OpenCL platforms.
if (device_.IsNVIDIA() || device_.IsARM()) {
source_string += "#define USE_INLINE_KEYWORD 1\n";
}

// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
source_string += "#define USE_CL_MAD 1\n";
}

// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
source_string += "#define USE_STAGGERED_INDICES 1\n";
}

// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}

// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
source_string +=
#include "kernels/opencl_to_cuda.h"
;
#endif

// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"
;

// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}

// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
const auto start_time = std::chrono::steady_clock::now();
#endif
// Completes the source and compiles the kernel
program_ = CompileFromSource(source_string, precision_, routine_name_,
device_, context_, options);

// Compiles the kernel
program_ = Program(context_, source_string);
try {
program_.Build(device_, options);
} catch (const CLCudaAPIBuildError &e) {
if (program_.StatusIsCompilationWarningOrError(e.status())) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program_.GetBuildInfo(device_).c_str());
}
throw;
}

// Store the compiled binary and program in the cache
BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
program_.GetIR());

ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
Program{ program_ });

// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
}

// =================================================================================================
Expand Down
1 change: 1 addition & 0 deletions src/routines/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <vector>

#include "utilities/utilities.hpp"
#include "utilities/compile.hpp"
#include "database/database.hpp"

namespace clblast {
Expand Down
Loading

0 comments on commit da76d7a

Please sign in to comment.