Skip to content

Commit

Permalink
Modified the kernel tuners to use the newly integrated auto-tuner
Browse files Browse the repository at this point in the history
  • Loading branch information
CNugteren committed Nov 19, 2017
1 parent 8a5a5e0 commit 7a54494
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 294 deletions.
26 changes: 12 additions & 14 deletions src/tuning/kernels/copy_fast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
// This file uses the auto-tuner to tune the copy OpenCL kernels.
//
// =================================================================================================

Expand Down Expand Up @@ -42,7 +42,6 @@ class TuneCopy {
settings.kernel_family = "copy";
settings.kernel_name = "CopyMatrixFast";
settings.sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/copy_fast.opencl"
;
Expand All @@ -51,6 +50,10 @@ class TuneCopy {
settings.size_a = args.m * args.n;
settings.size_b = args.m * args.n;

// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {2, 3};
settings.outputs = {3};

// Sets the base thread configuration
settings.global_size = {args.m, args.n};
settings.global_size_ref = settings.global_size;
Expand Down Expand Up @@ -78,20 +81,15 @@ class TuneCopy {

// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) {
  // Nothing is checked here: the argument is unnamed and the body is empty.
}

// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
static std::vector<Constraint> SetConstraints() {
  // No constraints are defined for this kernel: the returned list is empty.
  return {};
}

// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
static void SetArguments(Kernel &kernel, const Arguments<T> &args,
                         std::vector<Buffer<T>>& buffers) {
  // Positions of the matrices within 'buffers' (A lives at 2, B at 3)
  constexpr size_t kMatrixA = 2;
  constexpr size_t kMatrixB = 3;

  // The kernel receives, in order: m (as int), buffer A, buffer B, and alpha
  const auto m = static_cast<int>(args.m);
  kernel.SetArgument(0, m);
  kernel.SetArgument(1, buffers[kMatrixA]());
  kernel.SetArgument(2, buffers[kMatrixB]());
  kernel.SetArgument(3, GetRealArg(args.alpha));
}
};

Expand Down
42 changes: 20 additions & 22 deletions src/tuning/kernels/copy_pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
// This file uses the auto-tuner to tune the pad OpenCL kernels.
//
// =================================================================================================

Expand Down Expand Up @@ -42,7 +42,6 @@ class TunePad {
settings.kernel_family = "pad";
settings.kernel_name = "CopyPadMatrix";
settings.sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/copy_pad.opencl"
;
Expand All @@ -51,6 +50,10 @@ class TunePad {
settings.size_a = args.m * args.n;
settings.size_b = args.m * args.n;

// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {2, 3};
settings.outputs = {3};

// Sets the base thread configuration
settings.global_size = {args.m, args.n};
settings.global_size_ref = settings.global_size;
Expand Down Expand Up @@ -78,28 +81,23 @@ class TunePad {

// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) {
  // Nothing is checked here: the argument is unnamed and the body is empty.
}

// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
static std::vector<Constraint> SetConstraints() {
  // No constraints are defined for this kernel: the returned list is empty.
  return {};
}

// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(0);
static void SetArguments(Kernel &kernel, const Arguments<T> &args,
                         std::vector<Buffer<T>>& buffers) {
  // Positions of the matrices within 'buffers' (A lives at 2, B at 3)
  constexpr size_t kMatrixA = 2;
  constexpr size_t kMatrixB = 3;

  // Both sizes are handed to the kernel as plain ints
  const auto m = static_cast<int>(args.m);
  const auto n = static_cast<int>(args.n);

  // Arguments 0-4: (m, n, m, 0) followed by buffer A.
  // NOTE(review): presumably sizes, leading dimension and offset of the source
  // matrix -- confirm against the kernel signature in copy_pad.opencl.
  kernel.SetArgument(0, m);
  kernel.SetArgument(1, n);
  kernel.SetArgument(2, m);
  kernel.SetArgument(3, 0);
  kernel.SetArgument(4, buffers[kMatrixA]());

  // Arguments 5-9: the same (m, n, m, 0) group followed by buffer B
  kernel.SetArgument(5, m);
  kernel.SetArgument(6, n);
  kernel.SetArgument(7, m);
  kernel.SetArgument(8, 0);
  kernel.SetArgument(9, buffers[kMatrixB]());

  // Arguments 10-11: alpha and a trailing integer zero
  kernel.SetArgument(10, GetRealArg(args.alpha));
  kernel.SetArgument(11, 0);
}
};

Expand Down
31 changes: 12 additions & 19 deletions src/tuning/kernels/transpose_fast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
// This file uses the auto-tuner to tune the transpose OpenCL kernels.
//
// =================================================================================================

Expand Down Expand Up @@ -42,7 +42,6 @@ class TuneTranspose {
settings.kernel_family = "transpose";
settings.kernel_name = "TransposeMatrixFast";
settings.sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/transpose_fast.opencl"
;
Expand All @@ -51,6 +50,10 @@ class TuneTranspose {
settings.size_a = args.m * args.n;
settings.size_b = args.m * args.n;

// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {2, 3};
settings.outputs = {3};

// Sets the base thread configuration
settings.global_size = {args.m, args.n};
settings.global_size_ref = settings.global_size;
Expand Down Expand Up @@ -78,25 +81,15 @@ class TuneTranspose {

// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) {
  // Nothing is checked here: the argument is unnamed and the body is empty.
}

// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) {
return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
};
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
}
static std::vector<Constraint> SetConstraints() {
  // No constraints are defined for this kernel: the returned list is empty.
  return {};
}

// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
static void SetArguments(Kernel &kernel, const Arguments<T> &args,
                         std::vector<Buffer<T>>& buffers) {
  // Positions of the matrices within 'buffers' (A lives at 2, B at 3)
  constexpr size_t kMatrixA = 2;
  constexpr size_t kMatrixB = 3;

  // The kernel receives, in order: m (as int), buffer A, buffer B, and alpha
  const auto m = static_cast<int>(args.m);
  kernel.SetArgument(0, m);
  kernel.SetArgument(1, buffers[kMatrixA]());
  kernel.SetArgument(2, buffers[kMatrixB]());
  kernel.SetArgument(3, GetRealArg(args.alpha));
}
};

Expand Down
47 changes: 20 additions & 27 deletions src/tuning/kernels/transpose_pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
//
// =================================================================================================

Expand Down Expand Up @@ -42,7 +42,6 @@ class TunePadTranspose {
settings.kernel_family = "padtranspose";
settings.kernel_name = "TransposePadMatrix";
settings.sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/transpose_pad.opencl"
;
Expand All @@ -51,6 +50,10 @@ class TunePadTranspose {
settings.size_a = args.m * args.n;
settings.size_b = args.m * args.n;

// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {2, 3};
settings.outputs = {3};

// Sets the base thread configuration
settings.global_size = {args.m, args.n};
settings.global_size_ref = settings.global_size;
Expand All @@ -77,33 +80,23 @@ class TunePadTranspose {

// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) {
  // Nothing is checked here: the argument is unnamed and the body is empty.
}

// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) {
return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
};
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
}
static std::vector<Constraint> SetConstraints() {
  // No constraints are defined for this kernel: the returned list is empty.
  return {};
}

// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(0);
static void SetArguments(Kernel &kernel, const Arguments<T> &args,
                         std::vector<Buffer<T>>& buffers) {
  // Positions of the matrices within 'buffers' (A lives at 2, B at 3)
  constexpr size_t kMatrixA = 2;
  constexpr size_t kMatrixB = 3;

  // Both sizes are handed to the kernel as plain ints
  const auto m = static_cast<int>(args.m);
  const auto n = static_cast<int>(args.n);

  // Arguments 0-4: (m, n, m, 0) followed by buffer A.
  // NOTE(review): presumably sizes, leading dimension and offset of the source
  // matrix -- confirm against the kernel signature in transpose_pad.opencl.
  kernel.SetArgument(0, m);
  kernel.SetArgument(1, n);
  kernel.SetArgument(2, m);
  kernel.SetArgument(3, 0);
  kernel.SetArgument(4, buffers[kMatrixA]());

  // Arguments 5-9: the group with m and n swapped, (n, m, n, 0), followed by
  // buffer B
  kernel.SetArgument(5, n);
  kernel.SetArgument(6, m);
  kernel.SetArgument(7, n);
  kernel.SetArgument(8, 0);
  kernel.SetArgument(9, buffers[kMatrixB]());

  // Arguments 10-11: alpha and a trailing integer zero
  kernel.SetArgument(10, GetRealArg(args.alpha));
  kernel.SetArgument(11, 0);
}
};

Expand Down
26 changes: 12 additions & 14 deletions src/tuning/kernels/xaxpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels.
// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
//
// =================================================================================================

Expand Down Expand Up @@ -41,7 +41,6 @@ class TuneXaxpy {
settings.kernel_family = "xaxpy";
settings.kernel_name = "XaxpyFastest";
settings.sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/level1/level1.opencl"
#include "../src/kernels/level1/xaxpy.opencl"
;
Expand All @@ -50,6 +49,10 @@ class TuneXaxpy {
settings.size_x = args.n;
settings.size_y = args.n;

// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {0, 1};
settings.outputs = {1};

// Sets the base thread configuration
settings.global_size = {args.n};
settings.global_size_ref = settings.global_size;
Expand Down Expand Up @@ -80,20 +83,15 @@ class TuneXaxpy {
throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
}
}

// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
static std::vector<Constraint> SetConstraints() {
  // No constraints are defined for this kernel: the returned list is empty.
  return {};
}

// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentOutput(y_vec);
static void SetArguments(Kernel &kernel, const Arguments<T> &args,
                         std::vector<Buffer<T>>& buffers) {
  // Positions of the vectors within 'buffers' (X lives at 0, Y at 1)
  constexpr size_t kVectorX = 0;
  constexpr size_t kVectorY = 1;

  // The kernel receives, in order: n (as int), alpha, buffer X, and buffer Y
  const auto n = static_cast<int>(args.n);
  kernel.SetArgument(0, n);
  kernel.SetArgument(1, GetRealArg(args.alpha));
  kernel.SetArgument(2, buffers[kVectorX]());
  kernel.SetArgument(3, buffers[kVectorY]());
}
};

Expand Down
Loading

0 comments on commit 7a54494

Please sign in to comment.