Skip to content

Commit

Permalink
rewrite test_select to run in a few seconds. (#1665)
Browse files Browse the repository at this point in the history
* rewrite test_select to run in a few seconds.

* removing the threading; reverting to the original method.

* Merge from Master, remove all suggested changes and start with a simple change report on each change's cost savings:
 BEFORE:
        real    47m8.497s
        user    48m8.860s
        sys     0m14.952s
AFTER:
        real    17m53.383s
        user    18m53.342s
        sys     0m13.297s

initSrcBuffer generates the same random noise on every iteration through the loop. There is no change to the arguments, and the host data itself doesn't need to be rewritten. Profiling shows a 2x speedup from simply relying on the buffer to remain randomized at the next loop iteration.

* BEFORE:
        real    17m53.383s
        user    18m53.342s
        sys     0m13.297s
AFTER:
real    12m26.035s
user    13m15.505s
sys     0m15.414s

Rearrange a few things in the loops to allow for vectorized / interleaved loop traversal.  NB: not all loops are vectorizable, obviously, but this addresses the worst offenders.  Also note: to enable the compiler to generate vectorized and interleaved loop traversal, build with -O3.

* address the CI format requirements.

* address the CI format requirements.

* address the CI format requirements.
  • Loading branch information
gpx1000 authored Oct 3, 2023
1 parent f396850 commit 43d6886
Showing 1 changed file with 38 additions and 18 deletions.
56 changes: 38 additions & 18 deletions test_conformance/select/test_select.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,14 @@ static void initSrcBuffer(void* src1, Type stype, MTdata);

// initialize the values used to compare with in the select with
// values [start, count)
static void initCmpBuffer(void* cmp, Type cmptype, uint64_t start, size_t count);
static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
const size_t count);

// make a program that uses select for the given stype (src/dest type),
// ctype (comparison type), veclen (vector length)
static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type stype, Type ctype, size_t veclen );
static cl_program makeSelectProgram(cl_kernel *kernel_ptr, cl_context context,
Type stype, Type ctype,
const size_t veclen);

// Creates and execute the select test for the given device, context,
// stype (source/dest type), cmptype (comparison type), using max_tg_size
Expand Down Expand Up @@ -121,7 +124,9 @@ static void initSrcBuffer(void* src1, Type stype, MTdata d)
s1[i] = genrand_int32(d);
}

static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
const size_t count)

{
assert(cmptype != kfloat);
switch (type_size[cmptype]) {
Expand All @@ -144,11 +149,12 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// The short test doesn't iterate over the entire 32 bit space so
// we alternate between positive and negative values
int32_t* ui = (int32_t *)cmp;
int32_t sign = 1;
for (size_t i = 0; i < count; ++i, ++start)
int32_t neg_start = (int32_t)start * -1;
for (size_t i = 0; i < count; i++)
{
ui[i] = (int32_t)start*sign;
sign = sign * -1;
++start;
--neg_start;
ui[i] = (int32_t)((i % 2) ? start : neg_start);
}
}
break;
Expand All @@ -157,11 +163,12 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// We don't iterate over the entire space of 64 bit so for the
// selects, we want to test positive and negative values
int64_t* ll = (int64_t *)cmp;
int64_t sign = 1;
for (size_t i = 0; i < count; ++i, ++start)
int64_t neg_start = (int64_t)start * -1;
for (size_t i = 0; i < count; i++)
{
ll[i] = start*sign;
sign = sign * -1;
++start;
--neg_start;
ll[i] = (int64_t)((i % 2) ? start : neg_start);
}
break;
}
Expand All @@ -173,7 +180,9 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// Make the various incarnations of the program we want to run
// stype: source and destination type for the select
// ctype: compare type
static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type srctype, Type cmptype, size_t vec_len)
static cl_program makeSelectProgram(cl_kernel *kernel_ptr,
const cl_context context, Type srctype,
Type cmptype, const size_t vec_len)
{
char testname[256];
char stypename[32];
Expand Down Expand Up @@ -309,7 +318,7 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
clMemWrapper src1, src2, cmp, dest;

cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
size_t block_elements = BUFFER_SIZE / type_size[stype];
const size_t block_elements = BUFFER_SIZE / type_size[stype];
size_t step = s_wimpy_mode ? s_wimpy_reduction_factor : 1;
cl_ulong cmp_stride = block_elements * step;

Expand Down Expand Up @@ -355,10 +364,21 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
test_error_count(err, "Error: could not allocate dest buffer\n");

for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
programs[0] = makeSelectProgram(&kernels[0], context, stype, cmptype,
element_count[0]);
programs[1] = makeSelectProgram(&kernels[1], context, stype, cmptype,
element_count[1]);
programs[2] = makeSelectProgram(&kernels[2], context, stype, cmptype,
element_count[2]);
programs[3] = makeSelectProgram(&kernels[3], context, stype, cmptype,
element_count[3]);
programs[4] = makeSelectProgram(&kernels[4], context, stype, cmptype,
element_count[4]);
programs[5] = makeSelectProgram(&kernels[5], context, stype, cmptype,
element_count[5]);

for (size_t vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
{
programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype,
cmptype, element_count[vecsize]);
if (!programs[vecsize] || !kernels[vecsize])
{
return -1;
Expand Down Expand Up @@ -391,10 +411,10 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
log_info("Testing...");
uint64_t i;

initSrcBuffer(src1_host.data(), stype, d);
initSrcBuffer(src2_host.data(), stype, d);
for (i=0; i < blocks; i+=step)
{
initSrcBuffer(src1_host.data(), stype, d);
initSrcBuffer(src2_host.data(), stype, d);
initCmpBuffer(cmp_host.data(), cmptype, i * cmp_stride, block_elements);

err = clEnqueueWriteBuffer(queue, src1, CL_FALSE, 0, BUFFER_SIZE,
Expand Down

0 comments on commit 43d6886

Please sign in to comment.