From 337b2dbae96cd4e2cf943d9d3440521e6d51421b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 10:57:03 +0200 Subject: [PATCH 01/36] Format change. --- examples/directives/matrix_multiply_c_openacc.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index d8bf7cc4..5914a0a2 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -2,12 +2,7 @@ """This is an example tuning a naive matrix multiplication using the simplified directives interface""" from kernel_tuner import tune_kernel -from kernel_tuner.utils.directives import ( - Code, - OpenACC, - Cxx, - process_directives -) +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives N = 4096 From 852f19875f6baf934be9ef9e838fbb883030c905 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 11:00:11 +0200 Subject: [PATCH 02/36] First very draft attempt at feature parity OpenACC/OpenMP. --- examples/directives/vector_add_c_openmp.py | 57 ++++++++++ kernel_tuner/utils/directives.py | 122 +++++++++++++++++++-- 2 files changed, 169 insertions(+), 10 deletions(-) create mode 100644 examples/directives/vector_add_c_openmp.py diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py new file mode 100644 index 00000000..b21c5b75 --- /dev/null +++ b/examples/directives/vector_add_c_openmp.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenACC code with the kernel tuner""" + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +code = """ +#include + +#define VECTOR_SIZE 1000000 + +int main(void) { + int size = VECTOR_SIZE; + float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma omp target teams num_threads(nthreads) + #pragma omp distribute parallel for + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["vector_add"][0])) + (4 * len(kernel_args["vector_add"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +answer = [None, None, kernel_args["vector_add"][0] + kernel_args["vector_add"][1], None] + +tune_kernel( + "vector_add", + kernel_string["vector_add"], + 0, + kernel_args["vector_add"], + tune_params, + metrics=metrics, + answer=answer, + compiler_options=["-fast", "-mp=gpu"], + compiler="nvc++", +) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ded8c57f..08503f56 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -56,6 +56,13 @@ def get(self) -> str: return "openacc" +class OpenMP(Directive): + """Class to represent OpenMP""" + + def get(self) -> str: + return "openmp" + + class Cxx(Language): """Class to represent C++ code""" @@ -131,6 +138,11 @@ def is_openacc(directive: 
Directive) -> bool: return isinstance(directive, OpenACC) +def is_openmp(directive: Directive) -> bool: + """Check if a directive is OpenMP""" + return isinstance(directive, OpenMP) + + def is_cxx(lang: Language) -> bool: """Check if language is C++""" return isinstance(lang, Cxx) @@ -141,6 +153,19 @@ def is_fortran(lang: Language) -> bool: return isinstance(lang, Fortran) +def line_contains(line: str, target: str) -> bool: + """Generic helper to check if a line contains the target""" + return target in line + + +def directive_contains_clause(line: str, clauses: list) -> bool: + """Check if a directive contains one clause from a list""" + for clause in clauses: + if clause in line: + return True + return False + + def line_contains_openacc_directive(line: str, lang: Language) -> bool: """Check if line contains an OpenACC directive or not""" if is_cxx(lang): @@ -160,6 +185,25 @@ def line_contains_openacc_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc") +def line_contains_openmp_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP directive or not""" + if is_cxx(lang): + return line_contains_openmp_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_directive_fortran(line) + return False + + +def line_contains_openmp_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP directive or not""" + return line_contains(line, "#pragma omp") + + +def line_contains_openmp_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP directive or not""" + return line_contains(line, "!$omp") + + def line_contains_openacc_parallel_directive(line: str, lang: Language) -> bool: """Check if line contains an OpenACC parallel directive or not""" if is_cxx(lang): @@ -179,23 +223,35 @@ def line_contains_openacc_parallel_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc parallel") -def line_contains(line: str, target: str) -> bool: - """Generic helper to check if a line contains the target""" - return target in line +def line_contains_openmp_target_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP target directive or not""" + if is_cxx(lang): + return line_contains_openmp_target_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_target_directive_fortran(line) + return False -def openacc_directive_contains_clause(line: str, clauses: list) -> bool: - """Check if an OpenACC directive contains one clause from a list""" - for clause in clauses: - if clause in line: - return True - return False +def line_contains_openmp_target_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP target directive or not""" + return line_contains(line, "#pragma omp target") + + +def line_contains_openmp_target_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP target directive or not""" + return line_contains(line, "!$omp target") def openacc_directive_contains_data_clause(line: str) -> bool: """Check if an OpenACC directive contains one data clause""" data_clauses = ["copy", "copyin", "copyout", "create", "no_create", "present", "device_ptr", "attach"] - return openacc_directive_contains_clause(line, data_clauses) + return directive_contains_clause(line, data_clauses) + + +def openmp_directive_contains_data_clause(line: str) -> bool: + """Check if an OpenMP directive contains one data clause""" + data_clauses = ["map"] + return 
directive_contains_clause(line, data_clauses) def create_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: @@ -223,6 +279,29 @@ def create_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: ) +def create_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create a data directive for a given language""" + if is_cxx(lang): + return create_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return create_data_directive_openmp_fortran(name, size) + return "" + + +def create_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to allocate and copy data""" + return f"#pragma omp target enter data map(to: {name}[:{size.get()}])\n" + + +def create_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to allocate and copy data""" + if len(size) == 1: + return f"!omp target enter data map(to: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target enter data map(to: {name}({','.join(md_size)}))\n" + + def exit_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: """Create code to copy data back for a given language""" if is_cxx(lang): @@ -246,6 +325,29 @@ def exit_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: return f"!$acc exit data copyout({name}({','.join(md_size)}))\n" +def exit_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create code to copy data back for a given language""" + if is_cxx(lang): + return exit_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return exit_data_directive_openmp_fortran(name, size) + return "" + + +def exit_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to copy back data""" + return f"#pragma omp target exit data map(from: {name}[:{size.get()}])\n" + + +def exit_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to copy back data""" + if len(size) == 1: + return f"!$omp target exit data map(from: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target exit data map(from: {name}({','.join(md_size)}))\n" + + def correct_kernel(kernel_name: str, line: str) -> bool: """Checks if the line contains the correct kernel name""" return f" {kernel_name} " in line or (kernel_name in line and len(line.partition(kernel_name)[2]) == 0) From a2cee376e1f94004964ed952eeeb969794e95051 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 11:48:16 +0200 Subject: [PATCH 03/36] Added OpenMP tests. 
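Extend the unit tests to cover the OpenMP helpers introduced in the
previous commit, mirroring the existing OpenACC tests: directive and
target detection, data-clause detection, and enter/exit data directive
generation for both C++ and Fortran. wrap_data() is also refactored to
dispatch on the directive type.

A sketch of the behavior being pinned down (using only names added in
this series):

    from kernel_tuner.utils.directives import Cxx, OpenMP, is_openmp, line_contains_openmp_directive

    assert is_openmp(OpenMP())
    assert line_contains_openmp_directive("#pragma omp target teams", Cxx())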
--- kernel_tuner/utils/directives.py | 21 ++++++++++----- test/utils/test_directives.py | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 08503f56..df8e8a32 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -481,12 +481,21 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - if is_openacc(langs.directive) and is_cxx(langs.language): - intro += create_data_directive_openacc_cxx(name, size) - outro += exit_data_directive_openacc_cxx(name, size) - elif is_openacc(langs.directive) and is_fortran(langs.language): - intro += create_data_directive_openacc_fortran(name, size) - outro += exit_data_directive_openacc_fortran(name, size) + if is_openacc(langs.directive): + if is_cxx(langs.language): + intro += create_data_directive_openacc_cxx(name, size) + outro += exit_data_directive_openacc_cxx(name, size) + elif is_fortran(langs.language): + intro += create_data_directive_openacc_fortran(name, size) + outro += exit_data_directive_openacc_fortran(name, size) + elif is_openmp(langs.directive): + if is_cxx(langs.language): + intro += create_data_directive_openmp_cxx(name, size) + outro += exit_data_directive_openmp_cxx(name, size) + elif is_fortran(langs.language): + intro += create_data_directive_openmp_fortran(name, size) + outro += exit_data_directive_openmp_fortran(name, size) + return "\n".join([intro, code, outro]) diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index d58a4442..fec225ff 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -4,6 +4,13 @@ def test_is_openacc(): assert is_openacc(OpenACC()) assert not is_openacc(None) + assert not is_openacc(OpenMP()) + + +def test_is_openmp(): + assert is_openmp(OpenMP()) + assert not is_openmp(None) + assert not is_openmp(OpenACC()) def test_is_cxx(): @@ -28,6 +35,16 @@ def test_line_contains_openacc_directive(): assert not line_contains_openacc_directive(cxx_code, None) +def test_line_contains_openmp_directive(): + cxx_code = "int main(void) {\n#pragma omp target}" + f90_code = "!$omp target" + assert line_contains_openmp_directive(cxx_code, Cxx()) + assert not line_contains_openmp_directive(f90_code, Cxx()) + assert line_contains_openmp_directive(f90_code, Fortran()) + assert not line_contains_openmp_directive(cxx_code, Fortran()) + assert not line_contains_openmp_directive(cxx_code, None) + + def test_line_contains_openacc_parallel_directive(): assert line_contains_openacc_parallel_directive("#pragma acc parallel wait", Cxx()) assert line_contains_openacc_parallel_directive("!$acc parallel", Fortran()) @@ -36,11 +53,24 @@ def test_line_contains_openacc_parallel_directive(): assert not line_contains_openacc_parallel_directive("!$acc parallel", None) +def test_line_contains_openmp_target_directive(): + assert line_contains_openmp_target_directive("#pragma omp target teams", Cxx()) + assert line_contains_openmp_target_directive("!$omp target", Fortran()) + assert not line_contains_openmp_target_directive("#pragma omp loop", Cxx()) + assert not line_contains_openmp_target_directive("!$omp loop", Fortran()) + assert not line_contains_openmp_target_directive("!$omp parallel", None) + + def test_openacc_directive_contains_data_clause(): assert 
openacc_directive_contains_data_clause("#pragma acc parallel present(A[:1089])") assert not openacc_directive_contains_data_clause("#pragma acc parallel for") +def test_openmp_directive_contains_data_clause(): + assert openacc_directive_contains_data_clause("#pragma omp target teams map(tofrom: A[:1089])") + assert not openacc_directive_contains_data_clause("#pragma omp target") + + def test_create_data_directive(): size = ArraySize() size.add(1024) @@ -48,6 +78,7 @@ def test_create_data_directive(): create_data_directive_openacc("array", size, Cxx()) == "#pragma acc enter data create(array[:1024])\n#pragma acc update device(array[:1024])\n" ) + assert create_data_directive_openmp("array", size, Cxx()) == "#pragma omp target enter data map(to: array[:1024])\n" size.clear() size.add(35) size.add(16) @@ -55,18 +86,27 @@ def test_create_data_directive(): create_data_directive_openacc("matrix", size, Fortran()) == "!$acc enter data create(matrix(:35,:16))\n!$acc update device(matrix(:35,:16))\n" ) + assert ( + create_data_directive_openmp("matrix", size, Fortran()) == "!$omp target enter data map(to: matrix(:35,:16))\n" + ) assert create_data_directive_openacc("array", size, None) == "" + assert create_data_directive_openmp("array", size, None) == "" def test_exit_data_directive(): size = ArraySize() size.add(1024) assert exit_data_directive_openacc("array", size, Cxx()) == "#pragma acc exit data copyout(array[:1024])\n" + assert exit_data_directive_openmp("array", size, Cxx()) == "#pragma omp target exit data map(from: array[:1024])\n" size.clear() size.add(35) size.add(16) assert exit_data_directive_openacc("matrix", size, Fortran()) == "!$acc exit data copyout(matrix(:35,:16))\n" + assert ( + exit_data_directive_openmp("matrix", size, Fortran()) == "!$omp target exit data map(from: matrix(:35,:16))\n" + ) assert exit_data_directive_openacc("matrix", size, None) == "" + assert exit_data_directive_openmp("matrix", size, None) == "" def test_correct_kernel(): @@ -101,14 +141,20 @@ def test_wrap_timing(): def test_wrap_data(): acc_cxx = Code(OpenACC(), Cxx()) acc_f90 = Code(OpenACC(), Fortran()) + omp_cxx = Code(OpenMP(), Cxx()) + omp_f90 = Code(OpenMP(), Fortran()) code_cxx = "// this is a comment\n" code_f90 = "! this is a comment\n" data = {"array": ["int*", "size"]} preprocessor = ["#define size 42"] expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n" assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx + expected_cxx = "#pragma omp target enter data map(to: array[:42])\n\n// this is a comment\n\n#pragma omp target exit data map(from: array[:42])\n" + assert wrap_data(code_cxx, omp_cxx, data, preprocessor, None) == expected_cxx expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n" assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 + expected_f90 = "!$omp target enter data map(to: array(:42))\n\n! this is a comment\n\n!$omp target exit data map(from: array(:42))\n" + assert wrap_data(code_f90, omp_f90, data, preprocessor, None) == expected_f90 data = {"matrix": ["float*", "rows,cols"]} preprocessor = ["#define rows 42", "#define cols 84"] expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! 
this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n" From 0067fd5851066e26ecd5526b595017de573fa3cb Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 11:53:02 +0200 Subject: [PATCH 04/36] Fixed two failing tests. --- kernel_tuner/utils/directives.py | 2 +- test/utils/test_directives.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index df8e8a32..e9bac153 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -296,7 +296,7 @@ def create_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: def create_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: """Create Fortran OpenMP code to allocate and copy data""" if len(size) == 1: - return f"!omp target enter data map(to: {name}(:{size.get()}))\n" + return f"!$omp target enter data map(to: {name}(:{size.get()}))\n" else: md_size = fortran_md_size(size) return f"!$omp target enter data map(to: {name}({','.join(md_size)}))\n" diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index fec225ff..b7fd3735 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -67,8 +67,8 @@ def test_openacc_directive_contains_data_clause(): def test_openmp_directive_contains_data_clause(): - assert openacc_directive_contains_data_clause("#pragma omp target teams map(tofrom: A[:1089])") - assert not openacc_directive_contains_data_clause("#pragma omp target") + assert openmp_directive_contains_data_clause("#pragma omp target teams map(tofrom: A[:1089])") + assert not openmp_directive_contains_data_clause("#pragma omp target") def test_create_data_directive(): From debcb6470497dad6fa6efc3679553ecd1af0b19f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 14:29:02 +0200 Subject: [PATCH 05/36] A fix was necessary in the Compiler backend to have OpenMP working. 
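The Compiler backend decided whether a kernel uses OpenMP by searching
the source for the <omp.h> include (or "use omp_lib" for Fortran) and
then silently appended -mp or -fopenmp to the compiler options, which
could clash with the offload flags a user passes explicitly, such as
-mp=gpu for nvc++. Detection now keys on the directives themselves
("#pragma omp" / "!$omp") and no flag is injected implicitly; callers
pass the appropriate option, e.g. (a sketch of the calling convention
the examples use from here on):

    tune_kernel(
        "vector_add",
        kernel_string["vector_add"],
        0,
        kernel_args["vector_add"],
        tune_params,
        compiler="nvc++",
        compiler_options=["-fast", "-mp=gpu"],
    )

The OpenMP vector add example is also reduced to a single combined
target teams construct, and its docstring corrected from OpenACC to
OpenMP.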
---
 examples/directives/vector_add_c_openmp.py | 5 ++---
 kernel_tuner/backends/compiler.py          | 7 ++-----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py
index b21c5b75..011459b2 100644
--- a/examples/directives/vector_add_c_openmp.py
+++ b/examples/directives/vector_add_c_openmp.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
+"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
 
 from kernel_tuner import tune_kernel
 from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives
@@ -16,8 +16,7 @@ float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
 
     #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
-    #pragma omp target teams num_threads(nthreads)
-    #pragma omp distribute parallel for
+    #pragma omp target teams parallel for num_threads(nthreads)
     for ( int i = 0; i < size; i++ ) {
         c[i] = a[i] + b[i];
     }
diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
index b5724a1a..df52a0fa 100644
--- a/kernel_tuner/backends/compiler.py
+++ b/kernel_tuner/backends/compiler.py
@@ -186,16 +186,13 @@ def compile(self, kernel_instance):
         compiler_options = ["-fPIC"]
 
         # detect openmp
-        if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
+        if "#pragma omp" in kernel_string or "!$omp" in kernel_string:
             logging.debug("set using_openmp to true")
             self.using_openmp = True
-            if self.compiler in ["nvc", "nvc++", "nvfortran"]:
-                compiler_options.append("-mp")
-            else:
-                compiler_options.append("-fopenmp")
 
         # detect openacc
         if "#pragma acc" in kernel_string or "!$acc" in kernel_string:
+            logging.debug("set using_openacc to true")
             self.using_openacc = True
 
         # if filename is known, use that one

From bb95d7bcafe03ce368d482dc65544652539924ab Mon Sep 17 00:00:00 2001
From: Alessio Sclocco
Date: Wed, 21 Aug 2024 14:34:04 +0200
Subject: [PATCH 06/36] Fixed test.

---
 test/test_compiler_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
index 0c9d7f86..b27e33ec 100644
--- a/test/test_compiler_functions.py
+++ b/test/test_compiler_functions.py
@@ -427,7 +427,7 @@ def env():
 @skip_if_no_openmp
 @skip_if_no_gcc
 def test_benchmark(env):
-    results, _ = kernel_tuner.tune_kernel(*env, block_size_names=["nthreads"])
+    results, _ = kernel_tuner.tune_kernel(*env, compiler_options=["-fopenmp"], block_size_names=["nthreads"])
     assert len(results) == 3
     assert all(["nthreads" in result for result in results])
     assert all(["time" in result for result in results])

From ba02b31977d4ec082d9dfdc7f4461b3d4041f623 Mon Sep 17 00:00:00 2001
From: Alessio Sclocco
Date: Wed, 21 Aug 2024 14:48:37 +0200
Subject: [PATCH 07/36] Fixing the examples.
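Follow-up to the backend change: OpenMP flags are no longer added
implicitly, so the plain C and Fortran vector add examples now pass
-fopenmp themselves, and the Fortran directives are lowercased to
!$omp so that the new substring-based detection still recognizes them.
The stand-alone OpenACC Fortran example and the Fortran test script
are removed; that use case is covered by the directives interface
examples. The C example's call now reads:

    tune_kernel("vector_add", kernel_string, size, args, tune_params,
                answer=answer, compiler_options=["-fopenmp", "-O3"])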
--- examples/c/vector_add.py | 7 ++- examples/fortran/test_fortran_vector_add.py | 42 ---------------- examples/fortran/vector_add.F90 | 4 +- examples/fortran/vector_add.py | 2 +- examples/fortran/vector_add_acc.F90 | 53 --------------------- examples/fortran/vector_add_acc.py | 37 -------------- test/test_compiler_functions.py | 16 ++----- 7 files changed, 10 insertions(+), 151 deletions(-) delete mode 100755 examples/fortran/test_fortran_vector_add.py delete mode 100644 examples/fortran/vector_add_acc.F90 delete mode 100755 examples/fortran/vector_add_acc.py diff --git a/examples/c/vector_add.py b/examples/c/vector_add.py index c5956f2a..38122b49 100755 --- a/examples/c/vector_add.py +++ b/examples/c/vector_add.py @@ -26,7 +26,7 @@ } """ -size = 72*1024*1024 +size = 72 * 1024 * 1024 a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) @@ -39,7 +39,6 @@ tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32] tune_params["vecsize"] = [1, 2, 4, 8, 16] -answer = [a+b, None, None, None] +answer = [a + b, None, None, None] -tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=['-O3']) +tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"]) diff --git a/examples/fortran/test_fortran_vector_add.py b/examples/fortran/test_fortran_vector_add.py deleted file mode 100755 index 41564702..00000000 --- a/examples/fortran/test_fortran_vector_add.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import json -from pathlib import Path -import numpy as np -from kernel_tuner import run_kernel - - -def test(): - filename = Path(__file__).parent / "vector_add.F90" - with open(filename, "r") as f: - kernel_string = f.read() - - size = 10000000 - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = size - tune_params["NTHREADS"] = 4 - - answer = run_kernel( - "vector_add", - kernel_string, - size, - args, - tune_params, - lang="fortran", - compiler="gfortran", - ) - - assert np.allclose(answer[0], a + b, atol=1e-8) - - -if __name__ == "__main__": - test() diff --git a/examples/fortran/vector_add.F90 b/examples/fortran/vector_add.F90 index b5541b59..6d39752b 100644 --- a/examples/fortran/vector_add.F90 +++ b/examples/fortran/vector_add.F90 @@ -17,11 +17,11 @@ subroutine vector_add(C, A, B, n) real (c_float), intent(in), dimension(N) :: A, B integer (c_int), intent(in) :: n - !$OMP parallel do + !$omp parallel do do i = 1, N C(i) = A(i) + B(i) end do - !$OMP end parallel do + !$omp end parallel do end subroutine vector_add diff --git a/examples/fortran/vector_add.py b/examples/fortran/vector_add.py index 5c1a5476..813f6671 100755 --- a/examples/fortran/vector_add.py +++ b/examples/fortran/vector_add.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """This is a minimal example for calling Fortran functions""" -from __future__ import print_function import numpy as np from kernel_tuner import tune_kernel @@ -29,6 +28,7 @@ def tune(): tune_params, lang="C", compiler="gfortran", + compiler_options=["-fopenmp"], ) return result diff --git a/examples/fortran/vector_add_acc.F90 b/examples/fortran/vector_add_acc.F90 deleted file mode 100644 index 9188e74c..00000000 --- a/examples/fortran/vector_add_acc.F90 +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef N 
-#define N 1024 -#endif -#ifndef block_size_x -#define block_size_x 256 -#endif - -module vector -use iso_c_binding -use omp_lib - -contains - -subroutine vector_add(C, A, B, n) - use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - - !$acc data copyin(A, B) copyout(C) - - !$acc parallel loop device_type(nvidia) vector_length(block_size_x) - do i = 1, n - C(i) = A(i) + B(i) - end do - !$acc end parallel loop - - !$acc end data - -end subroutine vector_add - - - -function time_vector_add(C, A, B, n) result(time) - use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - real (c_float) :: time - real (c_double) start_time, end_time - - start_time = omp_get_wtime() - - call vector_add(C, A, B, n) - - end_time = omp_get_wtime() - time = (end_time - start_time)*1e3 - -end function time_vector_add - - - -end module vector diff --git a/examples/fortran/vector_add_acc.py b/examples/fortran/vector_add_acc.py deleted file mode 100755 index ff84c8b6..00000000 --- a/examples/fortran/vector_add_acc.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import numpy as np -from kernel_tuner import tune_kernel - - -def tune(): - size = int(72 * 1024 * 1024) - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = [size] - tune_params["block_size_x"] = [32, 64, 128, 256, 512] - - result, env = tune_kernel( - "time_vector_add", - "vector_add_acc.F90", - size, - args, - tune_params, - lang="C", - compiler="nvfortran", - compiler_options=["-fast", "-acc=gpu"], - ) - - return result - - -if __name__ == "__main__": - tune() diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index b27e33ec..f227eef9 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -159,9 +159,7 @@ def test_compile(npct, subprocess): kernel_string = "this is a fake C program" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) @@ -191,9 +189,7 @@ def test_compile_detects_device_code(npct, subprocess): kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; }" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() cfunc.compile(kernel_instance) @@ -347,9 +343,7 @@ def test_complies_fortran_function_no_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = 
CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -378,9 +372,7 @@ def test_complies_fortran_function_with_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) try: cfunc = CompilerFunctions(compiler="gfortran") From 828d4c6df29bb85b71bc7b92bbfd9deac30fcdf0 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 21 Aug 2024 16:18:12 +0200 Subject: [PATCH 08/36] Matrix multiply example for OpenMP. --- .../directives/matrix_multiply_c_openmp.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 examples/directives/matrix_multiply_c_openmp.py diff --git a/examples/directives/matrix_multiply_c_openmp.py b/examples/directives/matrix_multiply_c_openmp.py new file mode 100644 index 00000000..fde833cc --- /dev/null +++ b/examples/directives/matrix_multiply_c_openmp.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +"""This is an example tuning a naive matrix multiplication using the simplified directives interface""" + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +N = 4096 + +code = """ +#define N 4096 + +void matrix_multiply(float *A, float *B, float *C) { + #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) + float temp_sum = 0.0f; + #pragma omp target + #pragma omp teams collapse(2) + for ( int i = 0; i < N; i++) { + for ( int j = 0; j < N; j++ ) { + temp_sum = 0.0f; + #pragma omp distribute parallel for num_threads(nthreads) reduction(+:temp_sum) + for ( int k = 0; k < N; k++ ) { + temp_sum += A[(i * N) + k] * B[(k * N) + j]; + } + C[(i * N) + j] = temp_sum; + } + } + #pragma tuner stop +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +dims = {"NN": N**2} +kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims) + +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["time_s"] = lambda x: x["time"] / 10**3 +metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 +metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 + +tune_kernel( + "mm", + kernel_string["mm"], + 0, + kernel_args["mm"], + tune_params, + metrics=metrics, + compiler_options=["-fast", "-mp=gpu"], + compiler="nvc++", +) From 24857ccbe9e331812b3dbaa0bd34240015e33e4d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 10:35:39 +0200 Subject: [PATCH 09/36] Adding correctness check. 
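Check the output of the tuned OpenACC matrix multiply against a
reference. The reference C matrix is produced by running the extracted
kernel once on the host via run_kernel, single-threaded and without
the GPU offload flag, and is then handed to tune_kernel as the
expected answer; None entries mark arguments that are not checked:

    results = run_kernel(
        "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1},
        compiler="nvc++", compiler_options=["-fast"]
    )
    answer = [None, None, results[2]]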
--- examples/directives/matrix_multiply_c_openacc.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index 5914a0a2..96b94a1d 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """This is an example tuning a naive matrix multiplication using the simplified directives interface""" -from kernel_tuner import tune_kernel +from kernel_tuner import tune_kernel, run_kernel from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives N = 4096 @@ -40,6 +40,12 @@ metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + tune_kernel( "mm", kernel_string["mm"], @@ -47,6 +53,7 @@ kernel_args["mm"], tune_params, metrics=metrics, - compiler_options=["-fast", "-acc=gpu"], + answer=answer, compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) From 3d32c4738fd71c3a08de31137e6311ca41a584db Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 10:44:19 +0200 Subject: [PATCH 10/36] Refactor function. --- kernel_tuner/utils/directives.py | 40 +++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index e9bac153..3e3358c3 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -481,24 +481,38 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) + temp = None if is_openacc(langs.directive): - if is_cxx(langs.language): - intro += create_data_directive_openacc_cxx(name, size) - outro += exit_data_directive_openacc_cxx(name, size) - elif is_fortran(langs.language): - intro += create_data_directive_openacc_fortran(name, size) - outro += exit_data_directive_openacc_fortran(name, size) + temp = wrap_data_openacc(name, size) elif is_openmp(langs.directive): - if is_cxx(langs.language): - intro += create_data_directive_openmp_cxx(name, size) - outro += exit_data_directive_openmp_cxx(name, size) - elif is_fortran(langs.language): - intro += create_data_directive_openmp_fortran(name, size) - outro += exit_data_directive_openmp_fortran(name, size) - + temp = wrap_data_openmp(name, size) + intro += temp[0] + outro += temp[1] return "\n".join([intro, code, outro]) +def wrap_data_openacc(name: str, size: int) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro = create_data_directive_openacc_cxx(name, size) + outro = exit_data_directive_openacc_cxx(name, size) + elif is_fortran(langs.language): + intro = create_data_directive_openacc_fortran(name, size) + outro = exit_data_directive_openacc_fortran(name, size) + return intro, outro + + +def wrap_data_openmp(name: str, size: int) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro += create_data_directive_openmp_cxx(name, size) + outro += exit_data_directive_openmp_cxx(name, size) + elif 
is_fortran(langs.language): + intro += create_data_directive_openmp_fortran(name, size) + outro += exit_data_directive_openmp_fortran(name, size) + return intro, outro + + def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" if is_cxx(langs.language): From 21e56afe9bd062693f20a460a36d8b272dc068a8 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 10:47:38 +0200 Subject: [PATCH 11/36] Bug fixed. --- kernel_tuner/utils/directives.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 3e3358c3..d0aee899 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -481,7 +481,7 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - temp = None + temp = [] if is_openacc(langs.directive): temp = wrap_data_openacc(name, size) elif is_openmp(langs.directive): @@ -491,7 +491,7 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use return "\n".join([intro, code, outro]) -def wrap_data_openacc(name: str, size: int) -> Tuple[str, str]: +def wrap_data_openacc(name: str, size: int, langs: Code) -> Tuple[str, str]: """Create language specific data directives""" if is_cxx(langs.language): intro = create_data_directive_openacc_cxx(name, size) @@ -502,7 +502,7 @@ def wrap_data_openacc(name: str, size: int) -> Tuple[str, str]: return intro, outro -def wrap_data_openmp(name: str, size: int) -> Tuple[str, str]: +def wrap_data_openmp(name: str, size: int, langs: Code) -> Tuple[str, str]: """Create language specific data directives""" if is_cxx(langs.language): intro += create_data_directive_openmp_cxx(name, size) From 1f815e287c90655d68dd9e06bf68dfc99bcc84a9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 10:49:06 +0200 Subject: [PATCH 12/36] Adding missing parameter. --- kernel_tuner/utils/directives.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index d0aee899..c367b373 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -483,9 +483,9 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) temp = [] if is_openacc(langs.directive): - temp = wrap_data_openacc(name, size) + temp = wrap_data_openacc(name, size, langs) elif is_openmp(langs.directive): - temp = wrap_data_openmp(name, size) + temp = wrap_data_openmp(name, size, langs) intro += temp[0] outro += temp[1] return "\n".join([intro, code, outro]) From ddf501f58648f3592a13e5e35d0443b44965aee5 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 11:08:49 +0200 Subject: [PATCH 13/36] Another bug fixed. 
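wrap_data_openmp() used augmented assignment on intro and outro even
though those locals had no prior binding, so the first call would fail
before producing any directive. A minimal, illustrative reproduction
of the failure mode:

    def broken():
        intro += "#pragma omp target enter data\n"  # raises UnboundLocalError when called

Plain assignment, as wrap_data_openacc() already does, fixes it.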
--- kernel_tuner/utils/directives.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c367b373..3d3a07fa 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -505,11 +505,11 @@ def wrap_data_openacc(name: str, size: int, langs: Code) -> Tuple[str, str]: def wrap_data_openmp(name: str, size: int, langs: Code) -> Tuple[str, str]: """Create language specific data directives""" if is_cxx(langs.language): - intro += create_data_directive_openmp_cxx(name, size) - outro += exit_data_directive_openmp_cxx(name, size) + intro = create_data_directive_openmp_cxx(name, size) + outro = exit_data_directive_openmp_cxx(name, size) elif is_fortran(langs.language): - intro += create_data_directive_openmp_fortran(name, size) - outro += exit_data_directive_openmp_fortran(name, size) + intro = create_data_directive_openmp_fortran(name, size) + outro = exit_data_directive_openmp_fortran(name, size) return intro, outro From 1698a9b5924df8f182eabc06a2ecb1c1e499164b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Aug 2024 13:02:10 +0200 Subject: [PATCH 14/36] Updated the OpenMP matrix multiply. --- examples/directives/matrix_multiply_c_openmp.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/directives/matrix_multiply_c_openmp.py b/examples/directives/matrix_multiply_c_openmp.py index fde833cc..3298ffb7 100644 --- a/examples/directives/matrix_multiply_c_openmp.py +++ b/examples/directives/matrix_multiply_c_openmp.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """This is an example tuning a naive matrix multiplication using the simplified directives interface""" -from kernel_tuner import tune_kernel +from kernel_tuner import tune_kernel, run_kernel from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives N = 4096 @@ -13,11 +13,11 @@ #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) float temp_sum = 0.0f; #pragma omp target - #pragma omp teams collapse(2) + #pragma omp teams distribute collapse(2) for ( int i = 0; i < N; i++) { for ( int j = 0; j < N; j++ ) { temp_sum = 0.0f; - #pragma omp distribute parallel for num_threads(nthreads) reduction(+:temp_sum) + #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum) for ( int k = 0; k < N; k++ ) { temp_sum += A[(i * N) + k] * B[(k * N) + j]; } @@ -40,6 +40,12 @@ metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + tune_kernel( "mm", kernel_string["mm"], @@ -47,6 +53,7 @@ kernel_args["mm"], tune_params, metrics=metrics, - compiler_options=["-fast", "-mp=gpu"], + answer=answer, compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], ) From 2fffcf0c828123fe5cef20bccbbe7908eb5e6b79 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Sep 2024 13:39:46 +0200 Subject: [PATCH 15/36] Update vector add OpenMP code. 
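Expose the number of teams as a second tunable next to the number of
threads per team. The directive becomes

    #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)

with the corresponding search space

    tune_params["nteams"] = [2**i for i in range(1, 11)]
    tune_params["nthreads"] = [32 * i for i in range(1, 33)]

i.e. 10 * 32 = 320 configurations.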
--- examples/directives/vector_add_c_openmp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py index 011459b2..f58988e8 100644 --- a/examples/directives/vector_add_c_openmp.py +++ b/examples/directives/vector_add_c_openmp.py @@ -16,7 +16,7 @@ float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) - #pragma omp target teams parallel for num_threads(nthreads) + #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) for ( int i = 0; i < size; i++ ) { c[i] = a[i] + b[i]; } @@ -33,6 +33,7 @@ kernel_string, kernel_args = process_directives(app, code) tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1,11)] tune_params["nthreads"] = [32 * i for i in range(1, 33)] metrics = dict() metrics["GB/s"] = ( From c0d240ab891294ca642c342aef0c5cd86fa587be Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 5 Sep 2024 16:02:51 +0200 Subject: [PATCH 16/36] Using "restrict" is too compiler specific. --- kernel_tuner/utils/directives.py | 2 +- test/utils/test_directives.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 3d3a07fa..36cd219b 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -597,7 +597,7 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) p_type = param[1:-1] p_type = p_type.split(":")[0] if "*" in p_type: - p_type = p_type.replace("*", " * restrict") + p_type = p_type.replace("*", " *") if is_cxx(langs.language): params.append(f"{p_type} {p_name}") elif is_fortran(langs.language): diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index b7fd3735..759a20a5 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -284,13 +284,13 @@ def test_extract_directive_signature(): signatures = extract_directive_signature(code, acc_cxx) assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add") assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add_ext") From 62809fab8c900c22915290696474fc2020aeaf40 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 13:47:43 +0200 Subject: [PATCH 17/36] Reorder parameters. 
--- examples/directives/vector_add_c_openacc.py | 2 +- examples/directives/vector_add_c_openmp.py | 2 +- examples/directives/vector_add_fortran_openacc.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/directives/vector_add_c_openacc.py b/examples/directives/vector_add_c_openacc.py index c062f2b1..4730f3fd 100644 --- a/examples/directives/vector_add_c_openacc.py +++ b/examples/directives/vector_add_c_openacc.py @@ -67,6 +67,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py index f58988e8..e212532c 100644 --- a/examples/directives/vector_add_c_openmp.py +++ b/examples/directives/vector_add_c_openmp.py @@ -52,6 +52,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-mp=gpu"], compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], ) diff --git a/examples/directives/vector_add_fortran_openacc.py b/examples/directives/vector_add_fortran_openacc.py index 29e94646..503a4538 100644 --- a/examples/directives/vector_add_fortran_openacc.py +++ b/examples/directives/vector_add_fortran_openacc.py @@ -62,6 +62,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvfortran", + compiler_options=["-fast", "-acc=gpu"], ) From fbdadd71024bbeffff96a95d22e4d9fdc3093815 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 13:54:58 +0200 Subject: [PATCH 18/36] Draft example of a histogram. --- examples/directives/histogram_c_openacc.py | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 examples/directives/histogram_c_openacc.py diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py new file mode 100644 index 00000000..ad6a1bb5 --- /dev/null +++ b/examples/directives/histogram_c_openacc.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenACC code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives + + +def histogram(vector, hist): + for i in range(0, len(vector)): + item = vector[i] + hist[item] = hist[item] + 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +int main(void) { + int * vector = (int *) malloc(VECTOR_SIZE * sizeof(int)); + int * hist = (int *) malloc(HIST_SIZE * sizeof(int)); + + #pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) + #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist) + for ( int i = 0; i < VECTOR_SIZE; i++ ) { + int item = vector[i]; + hist[item] = hist[item] + 1; + } + #pragma tuner stop + + free(vector); + free(hist); +} +""" + +# Extract tunable directive +app = Code(OpenACC(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["ngangs"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) 
+reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["vector_add"], + tune_params, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], +) From d844e97f1fa5aa716cac03e5c7f681ac795a12a1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 14:45:09 +0200 Subject: [PATCH 19/36] Bound the values inside the array. --- examples/directives/histogram_c_openacc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index ad6a1bb5..61b01487 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -50,6 +50,7 @@ def histogram(vector, hist): / 10**9 ) +kernel_args["histogram"][0] = np.random.randint(0, 255, len(kernel_args["histogram"][0]), dtype=np.int32) kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) reference_hist = histogram(kernel_args["histogram"][0], reference_hist) From ab9d6b5ee9849d0b25be35291519315e8aef6bcc Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 14:45:52 +0200 Subject: [PATCH 20/36] Typo. --- examples/directives/histogram_c_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index 61b01487..b78e54f5 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -60,7 +60,7 @@ def histogram(vector, hist): "histogram", kernel_string["histogram"], 0, - kernel_args["vector_add"], + kernel_args["histogram"], tune_params, metrics=metrics, answer=answer, From e7fd411d1db1d443be8084e81b8a11a03cb98c90 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 15:31:00 +0200 Subject: [PATCH 21/36] Fixing a bug in the correctness check. 
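Two issues. First, the reduction(+:hist) clause over the output array
is dropped in favor of an independent loop with an atomic update per
bin:

    #pragma acc loop independent
    for ( int i = 0; i < VECTOR_SIZE; i++ ) {
        #pragma acc atomic update
        hist[vector[i]] += 1;
    }

Second, even a correct kernel fails verification over repeated
benchmark iterations, because the histogram keeps accumulating into
the same output buffer. A small BenchmarkObserver (MemoryReset) is
added that copies the saved host arrays back to the device allocations
in before_start(), so every run starts from a zeroed histogram; it is
wired into tune_kernel in the next commit. The input is also drawn
with randint(0, 256) so the last bin (255) can be hit as well.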
--- examples/directives/histogram_c_openacc.py | 28 +++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index b78e54f5..bf6fb29c 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -4,15 +4,30 @@ from kernel_tuner import tune_kernel from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives +from kernel_tuner.observers import BenchmarkObserver +# Naive Python histogram implementation def histogram(vector, hist): for i in range(0, len(vector)): - item = vector[i] - hist[item] = hist[item] + 1 + hist[vector[i]] += 1 return hist +# We use this observer to clean output memory in between kernel executions +class MemoryReset(BenchmarkObserver): + def __init__(self, args): + self.args = args + + def before_start(self): + for i, arg in enumerate(self.args): + if not arg is None: + self.dev.memcpy_htod(self.dev.allocations[i], arg) + + def get_results(self): + return {} + + code = """ #include @@ -24,10 +39,11 @@ def histogram(vector, hist): int * hist = (int *) malloc(HIST_SIZE * sizeof(int)); #pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) - #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist) + #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) + #pragma acc loop independent for ( int i = 0; i < VECTOR_SIZE; i++ ) { - int item = vector[i]; - hist[item] = hist[item] + 1; + #pragma acc atomic update + hist[vector[i]] += 1; } #pragma tuner stop @@ -50,7 +66,7 @@ def histogram(vector, hist): / 10**9 ) -kernel_args["histogram"][0] = np.random.randint(0, 255, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) reference_hist = histogram(kernel_args["histogram"][0], reference_hist) From 939f0c3adb8062822c24325a4ab77ad73378f542 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 15:33:28 +0200 Subject: [PATCH 22/36] Use the cleaning observer. --- examples/directives/histogram_c_openacc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index bf6fb29c..a677d42d 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -72,6 +72,8 @@ def get_results(self): reference_hist = histogram(kernel_args["histogram"][0], reference_hist) answer = [None, reference_hist] +mem_cleaner = MemoryReset([None, kernel_args["histogram"][1]]) + tune_kernel( "histogram", kernel_string["histogram"], @@ -80,6 +82,7 @@ def get_results(self): tune_params, metrics=metrics, answer=answer, + observers=[mem_cleaner], compiler="nvc++", compiler_options=["-fast", "-acc=gpu"], ) From 06c60741ff51a876f5235554ab36742b0026aa84 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 17 Sep 2024 16:46:44 +0200 Subject: [PATCH 23/36] Fixing what is (probably) a long standing bug, observers were ignored for the Compiler backend. 
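CompilerFunctions overwrote self.observers with a list containing only
its own CompilerRuntimeObserver, and DeviceInterface did not forward
the observers argument when constructing the Compiler backend, so
user-supplied observers were silently dropped for lang="C". The
hard-coded assignment is removed and observers is now passed through,
which the histogram example depends on:

    tune_kernel(
        "histogram", kernel_string["histogram"], 0,
        kernel_args["histogram"], tune_params,
        answer=answer, observers=[mem_cleaner],
        compiler="nvc++", compiler_options=["-fast", "-acc=gpu"],
    )

The remaining core.py changes in this commit are mechanical
reformatting.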
--- kernel_tuner/backends/compiler.py | 1 - kernel_tuner/core.py | 150 +++++++++--------------------- 2 files changed, 43 insertions(+), 108 deletions(-) diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index df52a0fa..c95a2c89 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -93,7 +93,6 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers self.lib = None self.using_openmp = False self.using_openacc = False - self.observers = [CompilerRuntimeObserver(self)] self.last_result = None if self.compiler == "g++": diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index c8f1ed0f..4b51d57e 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -80,9 +80,7 @@ def __init__(self, kernel_name, kernel_sources, lang, defines=None): self.defines = defines if lang is None: if callable(self.kernel_sources[0]): - raise TypeError( - "Please specify language when using a code generator function" - ) + raise TypeError("Please specify language when using a code generator function") kernel_string = self.get_kernel_string(0) lang = util.detect_language(kernel_string) @@ -109,9 +107,7 @@ def get_kernel_string(self, index=0, params=None): kernel_source = self.kernel_sources[index] return util.get_kernel_string(kernel_source, params) - def prepare_list_of_files( - self, kernel_name, params, grid, threads, block_size_names - ): + def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_names): """prepare the kernel string along with any additional files The first file in the list is allowed to include or read in the others @@ -147,9 +143,7 @@ def prepare_list_of_files( for i, f in enumerate(self.kernel_sources): if i > 0 and not util.looks_like_a_filename(f): - raise ValueError( - "When passing multiple kernel sources, the secondary entries must be filenames" - ) + raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") ks = self.get_kernel_string(i, params) # add preprocessor statements @@ -183,9 +177,7 @@ def prepare_list_of_files( def get_user_suffix(self, index=0): """Get the suffix of the kernel filename, if the user specified one. Return None otherwise.""" - if util.looks_like_a_filename(self.kernel_sources[index]) and ( - "." in self.kernel_sources[index] - ): + if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): return "." + self.kernel_sources[index].split(".")[-1] return None @@ -214,13 +206,9 @@ def check_argument_lists(self, kernel_name, arguments): """ for i, f in enumerate(self.kernel_sources): if not callable(f): - util.check_argument_list( - kernel_name, self.get_kernel_string(i), arguments - ) + util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) else: - logging.debug( - "Checking of arguments list not supported yet for code generators." 
- ) + logging.debug("Checking of arguments list not supported yet for code generators.") class DeviceInterface(object): @@ -304,6 +292,7 @@ def __init__( compiler=compiler, compiler_options=compiler_options, iterations=iterations, + observers=observers, ) elif lang.upper() == "HIP": dev = HipFunctions( @@ -313,7 +302,9 @@ def __init__( observers=observers, ) else: - raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet") + raise ValueError( + "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" + ) self.dev = dev # look for NVMLObserver in observers, if present, enable special tunable parameters through nvml @@ -443,9 +434,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett obs.results = result duration = max(duration, obs.continuous_duration) - self.benchmark_continuous( - func, gpu_args, instance.threads, instance.grid, result, duration - ) + self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration) except Exception as e: # some launches may fail because too many registers are required @@ -458,9 +447,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett "INVALID_WORK_GROUP_SIZE", ] if any([skip_str in str(e) for skip_str in skippable_exceptions]): - logging.debug( - "benchmark fails due to runtime failure too many resources required" - ) + logging.debug("benchmark fails due to runtime failure too many resources required") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" @@ -472,13 +459,11 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett raise e return result - def check_kernel_output( - self, func, gpu_args, instance, answer, atol, verify, verbose - ): + def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose): """runs the kernel once and checks the result against answer""" logging.debug("check_kernel_output") - #if not using custom verify function, check if the length is the same + # if not using custom verify function, check if the length is the same if answer: if len(instance.arguments) != len(answer): raise TypeError("The length of argument list and provided results do not match.") @@ -507,7 +492,7 @@ def check_kernel_output( self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well + # if the answer is on the host, copy gpu output to host as well result_host.append(torch.zeros_like(answer[i])) self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) else: @@ -535,10 +520,7 @@ def check_kernel_output( correct = True if not correct: - raise RuntimeError( - "Kernel result verification failed for: " - + util.get_config_string(instance.params) - ) + raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers @@ -552,7 +534,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Compile and benchmark a kernel instance based on kernel strings and parameters instance_string = util.get_instance_string(params) - logging.debug('compile_and_benchmark ' + instance_string) + 
logging.debug("compile_and_benchmark " + instance_string) instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): @@ -570,9 +552,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, else: # add shared memory arguments to compiled module if kernel_options.smem_args is not None: - self.dev.copy_shared_memory_args( - util.get_smem_args(kernel_options.smem_args, params) - ) + self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) # add constant memory arguments to compiled module if kernel_options.cmem_args is not None: self.dev.copy_constant_memory_args(kernel_options.cmem_args) @@ -586,12 +566,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # test kernel for correctness if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() - self.check_kernel_output( - func, gpu_args, instance, to.answer, to.atol, to.verify, verbose - ) - last_verification_time = 1000 * ( - time.perf_counter() - start_verification - ) + self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) + last_verification_time = 1000 * (time.perf_counter() - start_verification) # benchmark if func: @@ -607,10 +583,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, except Exception as e: # dump kernel sources to temp file temp_filenames = instance.prepare_temp_files_for_error_msg() - print( - "Error while compiling or benchmarking, see source files: " - + " ".join(temp_filenames) - ) + print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) raise e # clean up any temporary files, if no error occured @@ -639,9 +612,7 @@ def compile_kernel(self, instance, verbose): "local memory limit exceeded", ] if any(msg in str(e) for msg in shared_mem_error_messages): - logging.debug( - "compile_kernel failed due to kernel using too much shared memory" - ) + logging.debug("compile_kernel failed due to kernel using too much shared memory") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used" @@ -654,7 +625,7 @@ def compile_kernel(self, instance, verbose): @staticmethod def preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" return _preprocess_gpu_arguments(old_arguments, params) def copy_shared_memory_args(self, smem_args): @@ -690,9 +661,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) ) if np.prod(threads) > self.dev.max_threads: if verbose: - print( - f"skipping config {util.get_instance_string(params)} reason: too many threads per block" - ) + print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block") return util.InvalidConfig() # obtain the kernel_string and prepare additional files, if any @@ -711,7 +680,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) # Preprocess GPU arguments. 
Require for handling `Tunable` arguments arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) - #collect everything we know about this instance and return it + # collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): @@ -758,12 +727,8 @@ def run_kernel(self, func, gpu_args, instance): try: self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid) except Exception as e: - if "too many resources requested for launch" in str( - e - ) or "OUT_OF_RESOURCES" in str(e): - logging.debug( - "ignoring runtime failure due to too many resources required" - ) + if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e): + logging.debug("ignoring runtime failure due to too many resources required") return False else: logging.debug("encountered unexpected runtime failure: " + str(e)) @@ -772,7 +737,7 @@ def run_kernel(self, func, gpu_args, instance): def _preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" new_arguments = [] for argument in old_arguments: @@ -789,15 +754,11 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): # first check if the length is the same if len(instance.arguments) != len(answer): - raise TypeError( - "The length of argument list and provided results do not match." - ) + raise TypeError("The length of argument list and provided results do not match.") # for each element in the argument list, check if the types match for i, arg in enumerate(instance.arguments): if answer[i] is not None: # skip None elements in the answer list - if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance( - arg, (np.ndarray, cp.ndarray) - ): + if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)): if answer[i].dtype != arg.dtype: raise TypeError( f"Element {i} of the expected results list is not of the same dtype as the kernel output: " @@ -845,16 +806,14 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): ) else: # either answer[i] and argument have different types or answer[i] is not a numpy type - if not isinstance( - answer[i], (np.ndarray, cp.ndarray, torch.Tensor) - ) or not isinstance(answer[i], np.number): + if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance( + answer[i], np.number + ): raise TypeError( f"Element {i} of expected results list is not a numpy/cupy ndarray, torch Tensor or numpy scalar." ) else: - raise TypeError( - f"Element {i} of expected results list and kernel arguments have different types." 
- ) + raise TypeError(f"Element {i} of expected results list and kernel arguments have different types.") def _ravel(a): if hasattr(a, "ravel") and len(a.shape) > 1: @@ -874,26 +833,15 @@ def _flatten(a): expected = _flatten(expected) if any([isinstance(array, cp.ndarray) for array in [expected, result]]): output_test = cp.allclose(expected, result, atol=atol) - elif isinstance(expected, torch.Tensor) and isinstance( - result, torch.Tensor - ): + elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor): output_test = torch.allclose(expected, result, atol=atol) else: output_test = np.allclose(expected, result, atol=atol) if not output_test and verbose: - print( - "Error: " - + util.get_config_string(instance.params) - + " detected during correctness check" - ) - print( - "this error occured when checking value of the %oth kernel argument" - % (i,) - ) - print( - "Printing kernel output and expected result, set verbose=False to suppress this debug print" - ) + print("Error: " + util.get_config_string(instance.params) + " detected during correctness check") + print("this error occured when checking value of the %oth kernel argument" % (i,)) + print("Printing kernel output and expected result, set verbose=False to suppress this debug print") np.set_printoptions(edgeitems=50) print("Kernel output:") print(result) @@ -928,11 +876,7 @@ def apply_template_typenames(type_list, templated_typenames): def replace_typename_token(matchobj): """function for a whitespace preserving token regex replace""" # replace only the match, leaving the whitespace around it as is - return ( - matchobj.group(1) - + templated_typenames[matchobj.group(2)] - + matchobj.group(3) - ) + return matchobj.group(1) + templated_typenames[matchobj.group(2)] + matchobj.group(3) for i, arg_type in enumerate(type_list): for k, v in templated_typenames.items(): @@ -963,9 +907,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): # relatively strict regex that does not allow nested template parameters like vector # within the template parameter list regex = ( - r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" - + name - + r"\s*\((.*?)\)\s*\{" + r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" 
+ name + r"\s*\((.*?)\)\s*\{"
     )
     match = re.search(regex, kernel_string, re.S)
     if not match:
@@ -973,15 +915,11 @@
     template_parameters = match.group(1).split(",")
     argument_list = match.group(3).split(",")
-    argument_list = [
-        s.strip() for s in argument_list
-    ]  # remove extra whitespace around 'type name' strings
+    argument_list = [s.strip() for s in argument_list]  # remove extra whitespace around 'type name' strings

     type_list, name_list = split_argument_list(argument_list)

-    templated_typenames = get_templated_typenames(
-        template_parameters, template_arguments
-    )
+    templated_typenames = get_templated_typenames(template_parameters, template_arguments)
     apply_template_typenames(type_list, templated_typenames)

     # replace __global__ with __device__ in the templated kernel definition
@@ -995,9 +933,7 @@
     launch_bounds = match.group(2)

     # generate code for the compile-time template instantiation
-    template_instantiation = (
-        f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n"
-    )
+    template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n"

     # generate code for the wrapper kernel
     new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)])

From 9dc47a4a01a826fdc965fde885cffb9da0ef0618 Mon Sep 17 00:00:00 2001
From: Alessio Sclocco
Date: Wed, 18 Sep 2024 13:25:53 +0200
Subject: [PATCH 24/36] Fixing allocation bugs in the Compiler Backend.

---
 kernel_tuner/backends/compiler.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
index c95a2c89..2ace61e3 100644
--- a/kernel_tuner/backends/compiler.py
+++ b/kernel_tuner/backends/compiler.py
@@ -18,7 +18,6 @@
     get_temp_filename,
     delete_temp_file,
     write_file,
-    SkippableFailure,
 )

 try:
@@ -82,8 +81,11 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers
         :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
         :type iterations: int
         """
+        self.allocations = []
         self.observers = observers or []
         self.observers.append(CompilerRuntimeObserver(self))
+        for obs in self.observers:
+            obs.register_device(self)

         self.iterations = iterations
         self.max_threads = 1024
@@ -162,6 +164,7 @@ def ready_argument_list(self, arguments):
             elif is_cupy_array(arg):
                 data_ctypes = C.c_void_p(arg.data.ptr)
                 ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes)
+            self.allocations.append(ctype_args[i])
         return ctype_args

     def compile(self, kernel_instance):

From c7bc0c871e21877959cb97d9b4f55b046a847dc4 Mon Sep 17 00:00:00 2001
From: Alessio Sclocco
Date: Thu, 19 Sep 2024 09:22:21 +0200
Subject: [PATCH 25/36] Move this trailing comment to its own line above.
--- kernel_tuner/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 4b51d57e..1e31e1dc 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -915,7 +915,8 @@ def wrap_templated_kernel(kernel_string, kernel_name): template_parameters = match.group(1).split(",") argument_list = match.group(3).split(",") - argument_list = [s.strip() for s in argument_list] # remove extra whitespace around 'type name' strings + # remove extra whitespace around 'type name' strings + argument_list = [s.strip() for s in argument_list] type_list, name_list = split_argument_list(argument_list) From ee7432a8d7e20e1034eced45ab46553f9311f493 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 09:35:32 +0200 Subject: [PATCH 26/36] Fixing, for good, a bug that prevented cleaning up the output memory when using the compiler backend. --- kernel_tuner/backends/backend.py | 11 +++++++++++ kernel_tuner/backends/compiler.py | 16 +++++++++------- kernel_tuner/core.py | 7 +++---- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index a37c9d6e..6191485e 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -57,6 +57,12 @@ def memcpy_htod(self, dest, src): """This method must implement a host to device copy.""" pass + def reset(self, arguments, should_sync): + """Copy the original content of the output memory to device memory.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_htod(self.allocations[i], arg) + class GPUBackend(Backend): """Base class for GPU backends""" @@ -87,3 +93,8 @@ class CompilerBackend(Backend): @abstractmethod def __init__(self, iterations, compiler_options, compiler): pass + + @abstractmethod + def cleanup_lib(self): + """Unload the previously loaded shared library""" + pass diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 2ace61e3..28bbf8fd 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -81,6 +81,7 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + # allocations contains a clean copy of the memory self.allocations = [] self.observers = observers or [] self.observers.append(CompilerRuntimeObserver(self)) @@ -151,11 +152,6 @@ def ready_argument_list(self, arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference - # to its underlying array, so we need to store a reference to arg.copy() - # in the Argument object manually to avoid it being deleted. - # (This changed in numpy > 1.15.) 
- # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) else: raise TypeError("unknown dtype for ndarray") @@ -164,7 +160,7 @@ def ready_argument_list(self, arguments): elif is_cupy_array(arg): data_ctypes = C.c_void_p(arg.data.ptr) ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) - self.allocations.append(ctype_args[i]) + self.allocations.append(Argument(numpy=arg.copy(), ctypes=data_ctypes)) return ctype_args def compile(self, kernel_instance): @@ -393,8 +389,14 @@ def memcpy_htod(self, dest, src): xp = get_array_module(dest.numpy) dest.numpy[:] = xp.asarray(value) + def reset(self, arguments, should_sync): + """Copy the preserved content of the output memory to device pointers.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_htod(arg, self.allocations[i]) + def cleanup_lib(self): - """unload the previously loaded shared library""" + """Unload the previously loaded shared library""" if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1e31e1dc..6de24d13 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -474,14 +474,13 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve # re-copy original contents of output arguments to GPU memory, to overwrite any changes # by earlier kernel runs - for i, arg in enumerate(instance.arguments): - if should_sync[i]: - self.dev.memcpy_htod(gpu_args[i], arg) + self.dev.reset(instance.arguments, should_sync) # run the kernel check = self.run_kernel(func, gpu_args, instance) if not check: - return # runtime failure occured that should be ignored, skip correctness check + # runtime failure occured that should be ignored, skip correctness check + return # retrieve gpu results to host memory result_host = [] From e2c6a094908a4e0594e383966cf9a0c11b51867a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 09:35:48 +0200 Subject: [PATCH 27/36] The example should now work. 
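
The custom MemoryReset observer is no longer needed: since the previous commit
the backend keeps a clean copy of every kernel argument in self.allocations,
and the device interface restores the output buffers itself before each
verification run. Roughly, the call sequence in core.py is now (a simplified
sketch, not the verbatim implementation):

    # before every verification run of a kernel configuration:
    dev.reset(instance.arguments, should_sync)  # re-copy clean host data over outputs
    dev.run_kernel(func, gpu_args, instance.threads, instance.grid)
    # results are then fetched with dev.memcpy_dtoh(...) as before

This is also why the example below can drop both the observer and the explicit
main() scaffolding around the tuned region.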
--- examples/directives/histogram_c_openacc.py | 41 +++++----------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index a677d42d..9ca02737 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -4,7 +4,7 @@ from kernel_tuner import tune_kernel from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives -from kernel_tuner.observers import BenchmarkObserver +from kernel_tuner.observers.observer import BenchmarkObserver # Naive Python histogram implementation @@ -14,42 +14,20 @@ def histogram(vector, hist): return hist -# We use this observer to clean output memory in between kernel executions -class MemoryReset(BenchmarkObserver): - def __init__(self, args): - self.args = args - - def before_start(self): - for i, arg in enumerate(self.args): - if not arg is None: - self.dev.memcpy_htod(self.dev.allocations[i], arg) - - def get_results(self): - return {} - - code = """ #include #define HIST_SIZE 256 #define VECTOR_SIZE 1000000 -int main(void) { - int * vector = (int *) malloc(VECTOR_SIZE * sizeof(int)); - int * hist = (int *) malloc(HIST_SIZE * sizeof(int)); - - #pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) - #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) - #pragma acc loop independent - for ( int i = 0; i < VECTOR_SIZE; i++ ) { - #pragma acc atomic update - hist[vector[i]] += 1; - } - #pragma tuner stop - - free(vector); - free(hist); +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) +#pragma acc loop independent +for ( int i = 0; i < VECTOR_SIZE; i++ ) { + #pragma acc atomic update + hist[vector[i]] += 1; } +#pragma tuner stop """ # Extract tunable directive @@ -72,8 +50,6 @@ def get_results(self): reference_hist = histogram(kernel_args["histogram"][0], reference_hist) answer = [None, reference_hist] -mem_cleaner = MemoryReset([None, kernel_args["histogram"][1]]) - tune_kernel( "histogram", kernel_string["histogram"], @@ -82,7 +58,6 @@ def get_results(self): tune_params, metrics=metrics, answer=answer, - observers=[mem_cleaner], compiler="nvc++", compiler_options=["-fast", "-acc=gpu"], ) From fb9033a9b002e5692dd052504d332006219dee3e Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 10:10:15 +0200 Subject: [PATCH 28/36] Fix the test to use the new method. --- test/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_core.py b/test/test_core.py index a8624470..bdf53d9a 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -107,7 +107,7 @@ def test_check_kernel_output(dev_func_interface): dev.check_kernel_output('func', answer, instance, answer, atol, None, True) - dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0]) + dfi.reset.assert_called() dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1)) print(dfi.mock_calls) From f9808e17e4372bd0780e7232e817a0058196821e Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 13:31:10 +0200 Subject: [PATCH 29/36] Some refactoring necessary for everything to work. The Compiler backend will not provide memcpy as they do not really do anything for this backend. 
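
Concretely: for this backend "device" memory is ordinary host memory, so
memcpy_dtoh and memcpy_htod have nothing to copy and become no-ops, while the
renamed refresh_memory (previously reset) restores each tracked array from the
clean copy that ready_argument_list stored in self.allocations. A condensed
sketch of the new semantics (simplified; the real code below also handles cupy
arrays):

    def refresh_memory(self, arguments, should_sync):
        for i, arg in enumerate(arguments):
            if should_sync[i]:
                arg[:] = self.allocations[i].numpy  # in-place restore of the snapshot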
--- kernel_tuner/backends/backend.py | 2 +- kernel_tuner/backends/compiler.py | 44 +++++++++---------------------- kernel_tuner/core.py | 2 +- test/test_core.py | 3 ++- 4 files changed, 16 insertions(+), 35 deletions(-) diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index 6191485e..c458789e 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -57,7 +57,7 @@ def memcpy_htod(self, dest, src): """This method must implement a host to device copy.""" pass - def reset(self, arguments, should_sync): + def refresh_memory(self, arguments, should_sync): """Copy the original content of the output memory to device memory.""" for i, arg in enumerate(arguments): if should_sync[i]: diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 28bbf8fd..68453d00 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -356,44 +356,24 @@ def memset(self, allocation, value, size): C.memset(allocation.ctypes, value, size) def memcpy_dtoh(self, dest, src): - """a simple memcpy copying from an Argument to a numpy array - - :param dest: A numpy or cupy array to store the data - :type dest: np.ndarray or cupy.ndarray - - :param src: An Argument for some memory allocation - :type src: Argument - """ - if isinstance(dest, np.ndarray) and is_cupy_array(src.numpy): - # Implicit conversion to a NumPy array is not allowed. - value = src.numpy.get() - else: - value = src.numpy - xp = get_array_module(dest) - dest[:] = xp.asarray(value) + """There is no memcpy_dtoh for the compiler backend.""" + pass def memcpy_htod(self, dest, src): - """a simple memcpy copying from a numpy array to an Argument - - :param dest: An Argument for some memory allocation - :type dest: Argument - - :param src: A numpy or cupy array containing the source data - :type src: np.ndarray or cupy.ndarray - """ - if isinstance(dest.numpy, np.ndarray) and is_cupy_array(src): - # Implicit conversion to a NumPy array is not allowed. - value = src.get() - else: - value = src - xp = get_array_module(dest.numpy) - dest.numpy[:] = xp.asarray(value) + """There is no memcpy_htod for the compiler backend.""" + pass - def reset(self, arguments, should_sync): + def refresh_memory(self, arguments, should_sync): """Copy the preserved content of the output memory to device pointers.""" for i, arg in enumerate(arguments): if should_sync[i]: - self.memcpy_htod(arg, self.allocations[i]) + if isinstance(arg, np.ndarray) and is_cupy_array(self.allocations[i].numpy): + # Implicit conversion to a NumPy array is not allowed. 
+ value = self.allocations[i].numpy.get() + else: + value = self.allocations[i].numpy + xp = get_array_module(arg) + arg[:] = xp.asarray(value) def cleanup_lib(self): """Unload the previously loaded shared library""" diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 6de24d13..7ba5b1d7 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -474,7 +474,7 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve # re-copy original contents of output arguments to GPU memory, to overwrite any changes # by earlier kernel runs - self.dev.reset(instance.arguments, should_sync) + self.dev.refresh_memory(instance.arguments, should_sync) # run the kernel check = self.run_kernel(func, gpu_args, instance) diff --git a/test/test_core.py b/test/test_core.py index bdf53d9a..39597b86 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -107,11 +107,12 @@ def test_check_kernel_output(dev_func_interface): dev.check_kernel_output('func', answer, instance, answer, atol, None, True) - dfi.reset.assert_called() + dfi.refresh_memory.assert_called() dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1)) print(dfi.mock_calls) + assert dfi.refresh_memory.called == 1 assert dfi.memcpy_dtoh.called == 1 for name, args, _ in dfi.mock_calls: From 2333916ab41072957fc6715ae48f1eb474ab04d4 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 13:44:14 +0200 Subject: [PATCH 30/36] Remove old tests. --- test/test_compiler_functions.py | 111 -------------------------------- 1 file changed, 111 deletions(-) diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index f227eef9..708b5ed6 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -146,12 +146,6 @@ def test_byte_array_arguments(): assert all(output_arg1 == arg1) - dest = np.zeros_like(arg1) - - cfunc.memcpy_dtoh(dest, output[0]) - - assert all(dest == arg1) - @patch("kernel_tuner.backends.compiler.subprocess") @patch("kernel_tuner.backends.compiler.numpy.ctypeslib") @@ -226,111 +220,6 @@ def test_memset(): assert all(x == np.zeros(4)) -@skip_if_no_cupy -def test_memcpy_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros(len(x), dtype=x.dtype) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output == a) - assert all(x.get() == a) - - -@skip_if_no_gcc -def test_memcpy_host_dtoh(): - a = [1, 2, 3, 4] - x = np.array(a).astype(np.float32) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(a) - print(output) - - assert all(output == a) - assert all(x == a) - - -@skip_if_no_cupy -def test_memcpy_device_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = cp.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output.get() == a) - assert all(x.get() == a) - - -@skip_if_no_cupy -def test_memcpy_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = np.array(a, dtype=np.float32) - x = cp.zeros(len(src), 
dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - -def test_memcpy_host_htod(): - a = [1, 2, 3, 4] - src = np.array(a).astype(np.float32) - x = np.zeros_like(src) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy == a) - - -@skip_if_no_cupy -def test_memcpy_device_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = cp.array(a, dtype=np.float32) - x = cp.zeros(len(src), dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - @skip_if_no_gfortran def test_complies_fortran_function_no_module(): kernel_string = """ From aa3cadbc9edb3b5fe7f9332c1da0b347828bd22c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 13:50:56 +0200 Subject: [PATCH 31/36] Added test for the compiler memory refresh. --- kernel_tuner/backends/compiler.py | 2 +- test/test_compiler_functions.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 68453d00..e0c72f09 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -364,7 +364,7 @@ def memcpy_htod(self, dest, src): pass def refresh_memory(self, arguments, should_sync): - """Copy the preserved content of the output memory to device pointers.""" + """Copy the preserved content of the output memory to used arrays.""" for i, arg in enumerate(arguments): if should_sync[i]: if isinstance(arg, np.ndarray) and is_cupy_array(self.allocations[i].numpy): diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 708b5ed6..763555f2 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -368,3 +368,14 @@ def test_run_kernel(): lang="C", ) assert cp.all((a + b) == c) + + +def test_refresh_memory(): + arg1 = np.array([1, 2, 3]).astype(np.int8) + cfunc = CompilerFunctions() + output = cfunc.ready_argument_list([arg1]) + assert np.all(output == arg1) + arg1 = np.array([0, 0, 0]).astype(np.int8) + assert np.all(arg1 == [0, 0, 0]) + cfunc.refresh_memory(arg1, [True]) + assert np.all(arg1 == [1, 2, 3]) From 4587539c7a644d74aa55e6d66738b35dbfe31cd3 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 14:01:49 +0200 Subject: [PATCH 32/36] Update the test. 
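
The previous version passed a bare array, but refresh_memory expects the full
argument list plus a per-argument sync mask, and it restores the data in
place. The pattern the test exercises is (illustrative only):

    arguments = [np.array([1, 2, 3]).astype(np.int32)]
    cfunc = CompilerFunctions()
    cfunc.ready_argument_list(arguments)     # snapshots a clean copy in cfunc.allocations
    arguments[0][:] = 0                      # simulate a kernel overwriting the output
    cfunc.refresh_memory(arguments, [True])  # restores the snapshot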
---
 test/test_compiler_functions.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
index 763555f2..8dd3c66f 100644
--- a/test/test_compiler_functions.py
+++ b/test/test_compiler_functions.py
@@ -371,11 +371,12 @@ def test_run_kernel():


 def test_refresh_memory():
-    arg1 = np.array([1, 2, 3]).astype(np.int8)
+    arg1 = np.array([1, 2, 3]).astype(np.int32)
+    arguments = [arg1]
     cfunc = CompilerFunctions()
-    output = cfunc.ready_argument_list([arg1])
-    assert np.all(output == arg1)
-    arg1 = np.array([0, 0, 0]).astype(np.int8)
-    assert np.all(arg1 == [0, 0, 0])
-    cfunc.refresh_memory(arg1, [True])
-    assert np.all(arg1 == [1, 2, 3])
+    _ = cfunc.ready_argument_list(arguments)
+    assert np.all(arguments[0] == [1, 2, 3])
+    arguments[0] = np.array([0, 0, 0]).astype(np.int8)
+    assert np.all(arguments[0] == [0, 0, 0])
+    cfunc.refresh_memory(arguments, [True])
+    assert np.all(arguments[0] == [1, 2, 3])

From 4c77414666c6f3951f70ec023d8f5c01a887adb3 Mon Sep 17 00:00:00 2001
From: Alessio Sclocco
Date: Thu, 19 Sep 2024 16:03:15 +0200
Subject: [PATCH 33/36] Although semantically there is no dtoh copy in the compiler backend, a copy of some kind is still needed. Plus a test.

---
 kernel_tuner/backends/compiler.py | 29 +++++++++++++++++++----------
 test/test_compiler_functions.py | 11 +++++++++++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
index e0c72f09..cf24698e 100644
--- a/kernel_tuner/backends/compiler.py
+++ b/kernel_tuner/backends/compiler.py
@@ -356,24 +356,33 @@ def memset(self, allocation, value, size):
         C.memset(allocation.ctypes, value, size)

     def memcpy_dtoh(self, dest, src):
-        """There is no memcpy_dtoh for the compiler backend."""
-        pass
+        """This method implements the semantics of a device-to-host copy for the Compiler backend.
+
+        No actual copy from device to host takes place; the data is copied from host to host.
+
+        :param dest: A numpy or cupy array to store the data
+        :type dest: np.ndarray or cupy.ndarray
+
+        :param src: An Argument for some memory allocation
+        :type src: Argument
+        """
+        # there is no real device to host copy here, only host to host
+        if isinstance(dest, np.ndarray) and is_cupy_array(src.numpy):
+            # Implicit conversion to a NumPy array is not allowed.
+            value = src.numpy.get()
+        else:
+            value = src.numpy
+        xp = get_array_module(dest)
+        dest[:] = xp.asarray(value)

     def memcpy_htod(self, dest, src):
-        """There is no memcpy_htod for the compiler backend."""
+        """There is no memcpy_htod implemented for the compiler backend."""
         pass

     def refresh_memory(self, arguments, should_sync):
         """Copy the preserved content of the output memory to used arrays."""
         for i, arg in enumerate(arguments):
             if should_sync[i]:
-                if isinstance(arg, np.ndarray) and is_cupy_array(self.allocations[i].numpy):
-                    # Implicit conversion to a NumPy array is not allowed.
- value = self.allocations[i].numpy.get() - else: - value = self.allocations[i].numpy - xp = get_array_module(arg) - arg[:] = xp.asarray(value) + self.memcpy_dtoh(arg, self.allocations[i]) def cleanup_lib(self): """Unload the previously loaded shared library""" diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 8dd3c66f..da7596aa 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -380,3 +380,14 @@ def test_refresh_memory(): assert np.all(arguments[0] == [0, 0, 0]) cfunc.refresh_memory(arguments, [True]) assert np.all(arguments[0] == [1, 2, 3]) + + +def test_memcpy_dtoh(): + arg1 = np.array([0, 5, 0, 7]).astype(np.int32) + arguments = [arg1] + cfunc = CompilerFunctions() + ready_arguments = cfunc.ready_argument_list(arguments) + expected = np.array([0, 0, 0, 0]).astype(np.float32) + assert np.all(ready_arguments.numpy != expected) + cfunc.memcpy_dtoh(expected, ready_arguments) + assert np.all(ready_arguments.numpy == expected) From da2fe0563e47e9e638a24a79dcc5d77a1a5986d5 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 16:16:02 +0200 Subject: [PATCH 34/36] Test fixed. --- test/test_compiler_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index da7596aa..65b02a6e 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -387,7 +387,7 @@ def test_memcpy_dtoh(): arguments = [arg1] cfunc = CompilerFunctions() ready_arguments = cfunc.ready_argument_list(arguments) - expected = np.array([0, 0, 0, 0]).astype(np.float32) - assert np.all(ready_arguments.numpy != expected) - cfunc.memcpy_dtoh(expected, ready_arguments) - assert np.all(ready_arguments.numpy == expected) + output = np.array([5, 0, 7, 0]).astype(np.int32) + assert np.all(ready_arguments[0].numpy != output) + cfunc.memcpy_dtoh(output, ready_arguments[0]) + assert np.all(ready_arguments[0].numpy == output) From 47898d986c003cf61a276e3a0f7fe11fb15dcfb0 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 16:49:45 +0200 Subject: [PATCH 35/36] Added tunable parameters to the example. 
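
The two new parameters select mutually exclusive strategies for updating the
histogram: a reduction over private copies, or atomic updates. Exactly one of
the two must be enabled, which is what the restriction in the diff below
enforces, so the tuner only benchmarks valid configurations:

    tune_params["enable_reduction"] = [0, 1]
    tune_params["enable_atomic"] = [0, 1]
    constraints = ["enable_reduction != enable_atomic"]
    tune_kernel(..., restrictions=constraints, ...)

The choice between the two strategies is then made at compile time through
the #if preprocessor guards around the pragmas.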
--- examples/directives/histogram_c_openacc.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index 9ca02737..cb56dddf 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -21,10 +21,16 @@ def histogram(vector, hist): #define VECTOR_SIZE 1000000 #pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE]) +#else #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) +#endif #pragma acc loop independent for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 #pragma acc atomic update +#endif hist[vector[i]] += 1; } #pragma tuner stop @@ -37,6 +43,9 @@ def histogram(vector, hist): tune_params = dict() tune_params["ngangs"] = [2**i for i in range(1, 11)] tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] metrics = dict() metrics["GB/s"] = ( lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) @@ -56,6 +65,7 @@ def histogram(vector, hist): 0, kernel_args["histogram"], tune_params, + restrictions=constraints, metrics=metrics, answer=answer, compiler="nvc++", From 760462c1a332f03c6f3d593cc188e0b01e272c14 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 19 Sep 2024 16:54:48 +0200 Subject: [PATCH 36/36] OpenMP version of the histogram example. --- examples/directives/histogram_c_openacc.py | 1 - examples/directives/histogram_c_openmp.py | 71 ++++++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 examples/directives/histogram_c_openmp.py diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py index cb56dddf..d6fc1fba 100644 --- a/examples/directives/histogram_c_openacc.py +++ b/examples/directives/histogram_c_openacc.py @@ -4,7 +4,6 @@ from kernel_tuner import tune_kernel from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives -from kernel_tuner.observers.observer import BenchmarkObserver # Naive Python histogram implementation diff --git a/examples/directives/histogram_c_openmp.py b/examples/directives/histogram_c_openmp.py new file mode 100644 index 00000000..190c3578 --- /dev/null +++ b/examples/directives/histogram_c_openmp.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + + +# Naive Python histogram implementation +def histogram(vector, hist): + for i in range(0, len(vector)): + hist[vector[i]] += 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE]) +#else +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) +#endif +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 + #pragma omp atomic update +#endif + hist[vector[i]] += 1; +} +#pragma tuner stop +""" + +# Extract 
tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) +reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["histogram"], + tune_params, + restrictions=constraints, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +)
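
# Side note on the GB/s metric used in these histogram examples: it counts
# 12 bytes of memory traffic per input element, presumably one 4-byte load
# of vector[i] plus a 4-byte load and a 4-byte store of the updated
# histogram bin, with `time` reported by the tuner in milliseconds.
# Equivalently (illustrative sketch, `time_ms` is an assumed name):
#
#     N = len(kernel_args["histogram"][0])
#     bytes_moved = (2 * 4 * N) + (4 * N)             # 12 * N bytes
#     gbps = bytes_moved / (time_ms / 10**3) / 10**9  # ms -> s, B/s -> GB/s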