diff --git a/ci_config.json b/ci_config.json index 8ef616fe8..25aa4f325 100644 --- a/ci_config.json +++ b/ci_config.json @@ -800,6 +800,25 @@ "nowide:tests=true" ] }, + "openblas": { + "_comment": "Doesn't work with windows native compilers", + "build_on": { + "windows": false + }, + "alpine_packages": [ + "gfortran" + ], + "brew_packages": [ + "coreutils" + ], + "debian_packages": [ + "libtinfo5", + "gfortran" + ], + "msys_packages": [ + "mingw-w64-ucrt-x86_64-fc" + ] + }, "openh264": { "alpine_packages": [ "nasm" diff --git a/releases.json b/releases.json index 6afd068aa..bee5f2247 100644 --- a/releases.json +++ b/releases.json @@ -2693,6 +2693,14 @@ "1.21.0-1" ] }, + "openblas": { + "dependency_names": [ + "openblas" + ], + "versions": [ + "0.3.28-1" + ] + }, "opencl-headers": { "dependency_names": [ "opencl-headers" diff --git a/subprojects/openblas.wrap b/subprojects/openblas.wrap new file mode 100644 index 000000000..38463a3d2 --- /dev/null +++ b/subprojects/openblas.wrap @@ -0,0 +1,9 @@ +[wrap-file] +directory = OpenBLAS-0.3.28 +source_url = https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.28/OpenBLAS-0.3.28.tar.gz +source_filename = OpenBLAS-0.3.28.tar.gz +source_hash = f1003466ad074e9b0c8d421a204121100b0751c96fc6fcf3d1456bd12f8a00a1 +patch_directory = openblas + +[provide] +openblas = openblas_dep diff --git a/subprojects/packagefiles/openblas/ctest/meson.build b/subprojects/packagefiles/openblas/ctest/meson.build new file mode 100644 index 000000000..c7da76cdf --- /dev/null +++ b/subprojects/packagefiles/openblas/ctest/meson.build @@ -0,0 +1,90 @@ +testl1_src = ['c_?blat1.f', 'c_?blas1.c'] +testl2_src = ['c_?blat2.f', 'c_?blas2.c', 'c_?2chke.c', 'auxiliary.c', 'constant.c', 'c_xerbla.c'] +testl3_src = ['c_?blat3.f', 'c_?blas3.c', 'c_?3chke.c', 'auxiliary.c', 'constant.c', 'c_xerbla.c'] +testl3_3m_src = ['c_?blat3_3m.f', 'c_?blas3_3m.c', 'c_?3chke_3m.c', 'auxiliary.c', 'constant.c', 'c_xerbla.c'] + +_test_input_array = { + 'l1': { + 'base': 
'x?blat1', + 'has_dat': false, + 'types': ['s', 'd', 'c', 'z'], + 'sources': testl1_src, + }, + 'l2': { + 'base': 'x?cblat2', + 'has_dat': true, + 'types': ['s', 'd', 'c', 'z'], + 'sources': testl2_src, + 'input_file': '?in2', + }, + 'l3': { + 'base': 'x?cblat3', + 'has_dat': true, + 'types': ['s', 'd', 'c', 'z'], + 'sources': testl3_src, + 'input_file': '?in3', + }, +} + +lvls = ['l1', 'l3'] +# TODO(rg) : Times out.. +if not is_win + lvls += ['l2'] +endif + +if conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86') + _test_input_array += { + 'l3_3m': { + 'base': 'x?cblat3_3m', + 'has_dat': true, + 'types': ['c', 'z'], + 'sources': testl3_3m_src, + 'input_file': '?in3_3m', + } + } + lvls += 'l3_3m' +endif + +_test_runner = executable('test_runner', sources: ['test_runner.c'], install: false) +ctest_inc = _inc + [include_directories('.')] + +foreach lvl : lvls + details = _test_input_array[lvl] + + foreach type : details['types'] + op_name = details['base'].replace('?', type) + + mapped_sources = [] + foreach source : details['sources'] + mapped_sources += source.replace('?', type) + endforeach + + executable( + op_name, + sources: mapped_sources, + link_with: [openblas_static], + dependencies: [dependency('threads')], + include_directories: ctest_inc, + c_args: ['-DADD_', '-DCBLAS'], + ) + + if is_win + obj_name = op_name + else + obj_name = f'./@op_name@' + endif + + _args = [obj_name] + if details.has_key('input_file') + _args += [meson.current_source_dir() / details['input_file'].replace('?', type)] + endif + + test( + op_name, + _test_runner, + args: _args, + workdir: meson.current_build_dir(), + ) # TODO: add OPENBLAS_NUM_THREADS=2 + + endforeach +endforeach diff --git a/subprojects/packagefiles/openblas/ctest/test_runner.c b/subprojects/packagefiles/openblas/ctest/test_runner.c new file mode 100644 index 000000000..749a4f929 --- /dev/null +++ b/subprojects/packagefiles/openblas/ctest/test_runner.c @@ -0,0 +1,31 @@ +#include <stdio.h> +#include <stdlib.h> + +/* Runs argv[1] through the shell, redirecting its stdin from argv[2] when given. */ +int main(int argc, 
char *argv[]) { + if (argc != 2 && argc != 3) { + fprintf(stderr, "Usage: %s <executable> [<input_file>]\n", argv[0]); + return EXIT_FAILURE; + } + + char command[1024]; + int len; + if (argc == 2) { + len = snprintf(command, sizeof(command), "%s", argv[1]); + } else { + len = snprintf(command, sizeof(command), "%s < %s", argv[1], argv[2]); + } + /* snprintf truncates silently; never hand a cut-off command line to the shell. */ + if (len < 0 || (size_t)len >= sizeof(command)) { + fprintf(stderr, "Error: command line is too long.\n"); + return EXIT_FAILURE; + } + + int result = system(command); + if (result != EXIT_SUCCESS) { + fprintf(stderr, "Error: Command '%s' failed with return code %d.\n", command, result); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/subprojects/packagefiles/openblas/driver/level2/meson.build b/subprojects/packagefiles/openblas/driver/level2/meson.build new file mode 100644 index 000000000..4a9653c80 --- /dev/null +++ b/subprojects/packagefiles/openblas/driver/level2/meson.build @@ -0,0 +1,556 @@ +# Naming conventions can be read from here: +# https://click.rgoswami.me/intel_blas_names +# Ported in order from the Makefile +# TODO(rg): Add the CONJ flags +driver_kops = [ + { 'base': '?gbmv', + 'sources': { + 'gbmv_k.c': {'mode': ['s', 'd', 'q'], 'exts': ['_n', '_t']}, + 'zgbmv_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_n', '_t', '_r', '_c', + '_o', '_u', '_s', '_d']}, + } + }, + { 'base': '?gbmv_thread', + 'sources': { + 'gbmv_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_n', '_t', '_r', '_c', + '_o', '_u', '_s', '_d']}, + } + }, + { 'base': '?gemv_thread', + 'sources': { + 'gemv_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_n', '_t', '_r', '_c', + '_o', '_u', '_s', '_d']}, + } + }, + { 'base': '?ger_thread', + 'sources': { + 'ger_thread.c': {'mode': ['s', 'd', 'q'], 'exts': ['']}, + } + }, + { 'base': '?ger_thread', + 'sources': { + 'ger_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_C', + '_V', '_D']}, + } + }, + { 'base': '?symv_thread', + 'sources': { + 'symv_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?hemv_thread', + 'sources': { + 'symv_thread.c': {'mode': ['c', 'z', 
'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?syr_thread', + 'sources': { + 'syr_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + }, + }, + { 'base': '?her_thread', + 'sources': { + 'syr_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?syr2_thread', + 'sources': { + 'syr2_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + }, + }, + { 'base': '?her2_thread', + 'sources': { + 'syr2_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hbmv', + 'sources': { + 'zhbmv_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hbmv_thread', + 'sources': { + 'sbmv_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?her', + 'sources': { + 'zher_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?her2', + 'sources': { + 'zher2_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpmv', + 'sources': { + 'zhpmv_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpmv_thread', + 'sources': { + 'spmv_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpr', + 'sources': { + 'zhpr_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpr_thread', + 'sources': { + 'spr_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpr2', + 'sources': { + 'zhpr2_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?hpr2_thread', + 'sources': { + 'spr2_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L', + '_V', '_M']}, + }, + }, + { 'base': '?sbmv', + 'sources': { + 'sbmv_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', 
'_L']}, + 'zsbmv_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?sbmv_thread', + 'sources': { + 'sbmv_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spmv', + 'sources': { + 'spmv_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', '_L']}, + 'zspmv_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spmv_thread', + 'sources': { + 'spmv_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spr', + 'sources': { + 'spr_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', '_L']}, + 'zspr_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spr_thread', + 'sources': { + 'spr_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spr2', + 'sources': { + 'spr2_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', '_L']}, + 'zspr2_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?spr2_thread', + 'sources': { + 'spr2_thread.c': {'mode': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?syr', + 'sources': { + 'syr_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', '_L']}, + 'zsyr_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?syr2', + 'sources': { + 'syr2_k.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_U', '_L']}, + 'zsyr2_k.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?tbmv', + 'sources': { + 'tbmv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'tbmv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 'ztbmv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztbmv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + { 'base': 
'?tbmv_thread', + 'sources': { + 'tbmv_thread.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_NLU', '_NLN', + '_TUU', '_TUN']}, + } + }, + { 'base': '?tbmv_thread', + 'sources': { + 'tbmv_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_RLU', '_RLN', + '_CLU', '_CLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_RUU', '_RUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?tbsv', + 'sources': { + 'tbsv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'tbsv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 'ztbsv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztbsv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?tpmv', + 'sources': { + 'tpmv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'tpmv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 'ztpmv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztpmv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?tpmv_thread', + 'sources': { + 'tpmv_thread.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_NLU', '_NLN', + '_TUU', '_TUN']}, + } + }, + { 'base': '?tpmv_thread', + 'sources': { + 'tpmv_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_RLU', '_RLN', + '_CLU', '_CLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_RUU', '_RUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?tpsv', + 'sources': { + 'tpsv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'tpsv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 
'ztpsv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztpsv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?trmv', + 'sources': { + 'trmv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'trmv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 'ztrmv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztrmv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?trmv_thread', + 'sources': { + 'trmv_thread.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_NLU', '_NLN', + '_TUU', '_TUN']}, + } + }, + { 'base': '?trmv_thread', + 'sources': { + 'trmv_thread.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_RLU', '_RLN', + '_CLU', '_CLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_RUU', '_RUN', + '_CUU', '_CUN']}, + } + }, + { 'base': '?trsv', + 'sources': { + 'trsv_U.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN']}, + 'trsv_L.c': {'mode': ['s', 'd', 'q'], + 'exts': ['_NLU', '_NLN', + '_TUU', '_TUN']}, + 'ztrsv_U.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_NUU', '_NUN', + '_TLU', '_TLN', + '_CLU', '_CLN', + '_RUU', '_RUN']}, + 'ztrsv_L.c': {'mode': ['c', 'z', 'x'], + 'exts': ['_RLU', '_RLN', + '_NLU', '_NLN', + '_TUU', '_TUN', + '_CUU', '_CUN']}, + } + }, + # TODO(rg): Add the bfloat conditionals from Makefile:3709 +] + +# Initialize kernel configurations list +kernel_confs = [] + +# Iterate through each kernel operation +foreach _kop : driver_kops + base = _kop['base'] + sources = _kop['sources'] + + # Generate the symbol flags + _ckop_args = [] + if symb_defs.has_key(base) + symb_base = symb_defs[base] + if 
symb_base.has_key('def') + foreach _d : symb_base['def'] + _ckop_args += ['-D' + _d] + endforeach + endif + if symb_base.has_key('undef') + foreach _u : symb_base['undef'] + _ckop_args += ['-U' + _u] + endforeach + endif + endif + + # Iterate through each source file and its details + foreach fname, details : sources + modes = details['mode'] + exts = details['exts'] + + # Iterate through each mode + foreach mode : modes + if is_win # extensions are not present on Windows + if mode in ['x', 'q'] + continue + endif + endif + # Generate the mapping for the type + __cargs = _cargs + _ckop_args + prec_mode = precision_mappings[mode] + + # Add precision-specific definitions + if prec_mode.has_key('def') + foreach _d : prec_mode['def'] + __cargs += ['-D' + _d] + endforeach + endif + if prec_mode.has_key('undef') + foreach _u : prec_mode['undef'] + __cargs += ['-U' + _u] + endforeach + endif + + # Iterate through each extension + foreach ext : exts + _ext_cargs = [] + + # Check ext_mappings first + if ext_mappings.has_key(ext) and (not ext_mappings[ext].has_key('except') or base not in ext_mappings[ext]['except']) + extmap = ext_mappings[ext] + if extmap.has_key('def') + foreach _d : extmap['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if extmap.has_key('undef') + foreach _u : extmap['undef'] + _ext_cargs += ['-U' + _u] + endforeach + endif + else + # Fallback to ext_mappings_l2 + foreach ext_map : ext_mappings_l2 + ext_mappings_l3 + if ext_map['ext'] == ext and mode in ext_map['for'] + if ext_map.has_key('def') + foreach _d : ext_map['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if ext_map.has_key('undef') + foreach _u : ext_map['undef'] + _ext_cargs += ['-U' + _u] + endforeach + endif + break + endif + endforeach + endif + + # Construct the final paths + src = fname + + # Add additional flags if present + if sources[fname].has_key('addl') + _ext_cargs += sources[fname]['addl'] + endif + + # Generate the symbol name + sym_name = base.replace('?', 
mode) + ext + sym_underscored = f'@sym_name@_' + + # Add standard flags for naming conventions + _ext_cargs += [ + f'-DASMNAME=@asm_name_prefix@@sym_name@', + f'-DASMFNAME=@asm_name_prefix@@sym_underscored@', + f'-DNAME=@sym_underscored@', + f'-DCNAME=@sym_name@', + f'-DCHAR_NAME="@sym_underscored@"', + f'-DCHAR_CNAME="@sym_name@"', + ] + + # Append the current configuration + current_def = { + 'c_args': __cargs + _ext_cargs, + 'name': sym_name, + 'src': src, + } + kernel_confs += [current_def] + endforeach + endforeach + endforeach +endforeach + +# Create the static libraries from the configurations +_kern_libs = [] +foreach conf : kernel_confs + # message(conf['name']) + # message(conf) + _kern_libs += [static_library( + conf['name'], + conf['src'], + include_directories: _inc, + c_args: conf['c_args'], + )] +endforeach + +# Create the final kernel library +_l2_driver = static_library('l2_driver', + link_whole: _kern_libs) diff --git a/subprojects/packagefiles/openblas/driver/level3/meson.build b/subprojects/packagefiles/openblas/driver/level3/meson.build new file mode 100644 index 000000000..298a8e085 --- /dev/null +++ b/subprojects/packagefiles/openblas/driver/level3/meson.build @@ -0,0 +1,476 @@ +# Naming conventions can be read from here: +# https://click.rgoswami.me/intel_blas_names +# Ported in order from the Makefile +driver_kops = [ + # { 'base': '?bgemm', + # 'sources': { + # 'gemm.c': {'mode': ['s'], 'srcs': ['level3.c'], + # 'exts': ['_nn', '_nt', + # '_tn', '_tt']}, + # } + # }, + { 'base': '?gemm', + 'sources': { + 'gemm.c': {'mode': ['s', 'd',], #'q'], + 'exts': ['_nn', '_nt', + '_tn', '_tt']}, + } + }, + { 'base': '?gemm', + 'sources': { + 'gemm.c': {'mode': ['c', 'z',],# 'x'], + 'srcs': ['level3.c'], + 'exts': ['_nn', '_nt', + '_nr', '_nc', + '_tn', '_tt', + '_tr', '_tc', + '_rn', '_rt', + '_rr', '_rc', + '_cn', '_ct', + '_cr', '_cc']}, + } + }, + { 'base': '?gemm_thread', + 'sources': { + 'gemm_thread_m.c': {'mode': [''], + 'exts': ['_m']}, + 
'gemm_thread_n.c': {'mode': [''], + 'exts': ['_n']}, + 'gemm_thread_mn.c': {'mode': [''], + 'exts': ['_mn']}, + 'gemm_thread_variable.c': {'mode': [''], + 'exts': ['_variable']}, + } + }, + # { 'base': '?bgemm_thread', + # 'sources': { + # 'gemm.c': {'mode': ['s'], 'srcs': ['level3.c'], + # 'exts': ['_nn', '_nt', + # '_tn', '_tt']}, + # } + # }, + { 'base': '?gemm_thread', + 'sources': { + 'gemm.c': {'mode': ['s', 'd'], #'q'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_nn', '_nt', + '_tn', '_tt']}, + } + }, + { 'base': '?gemm_thread', + 'sources': { + 'gemm.c': {'mode': ['c', 'z'],# 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_nn', '_nt', + '_nr', '_nc', + '_tn', '_tt', + '_tr', '_tc', + '_rn', '_rt', + '_rr', '_rc', + '_cn', '_ct', + '_cr', '_cc']}, + } + }, + { 'base': '?trmm', + 'sources': { + 'trmm_L.c': {'mode': ['s', 'd'],# 'q'], + 'exts': ['_LNUU', '_LNUN', + '_LNLU', '_LNLN', + '_LTUU', '_LTUN', + '_LTLU', '_LTLN']}, + 'trmm_R.c': {'mode': ['s', 'd'],# 'q'], + 'exts': ['_RNUU', '_RNUN', + '_RNLU', '_RNLN', + '_RTUU', '_RTUN', + '_RTLU', '_RTLN']}, + } + }, + { 'base': '?trmm', + 'sources': { + 'trmm_L.c': {'mode': ['c', 'z'],# 'x'], + 'exts': ['_LNUU', '_LNUN', + '_LNLU', '_LNLN', + '_LTUU', '_LTUN', + '_LTLU', '_LTLN', + '_LRUU', '_LRUN', + '_LRLU', '_LRLN', + '_LCUU', '_LCUN', + '_LCLU', '_LCLN', + ]}, + 'trmm_R.c': {'mode': ['c', 'z'],# 'x'], + 'exts': ['_RNUU', '_RNUN', + '_RNLU', '_RNLN', + '_RTUU', '_RTUN', + '_RTLU', '_RTLN', + '_RRUU', '_RRUN', + '_RRLU', '_RRLN', + '_RCUU', '_RCUN', + '_RCLU', '_RCLN', + ]}, + } + }, + { 'base': '?symm', + 'sources': { + 'symm_k.c': {'mode': ['s', 'd', # 'q'], + 'c', 'z'], # 'x'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?symm_thread', + 'sources': { + 'symm_k.c': {'mode': ['s', 'd', # 'q'], + 'c', 'z'], # 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?syrk', + 'sources': { + 'syrk_k.c': {'mode': ['s', 'd', # 'q'], + 'c', 
'z'], # 'x'], + 'exts': ['_UN', '_UT', + '_LN', '_LT']}, + } + }, + { 'base': '?syrk_thread', + 'sources': { + 'syrk_k.c': {'mode': ['s', 'd', # 'q'], + 'c', 'z'], # 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_UN', '_UT', + '_LN', '_LT', '']}, + } + }, + { 'base': 'syrk_thread', + 'sources': { + 'syrk_thread.c': {'mode': [''], 'exts': ['']} + } + }, + { 'base': '?syrk_kernel', + 'sources': { + 'syrk_kernel.c': {'mode': ['s', 'd', # 'q'], + 'c', 'z'], # 'x'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?syr2k', + 'sources': { + 'syr2k_k.c': {'mode': ['s', 'd', # 'q'], + 'c', 'z'], # 'x'], + 'exts': ['_UN', '_UT', + '_LN', '_LT']}, + } + }, + { 'base': '?syr2k_kernel', + 'sources': { + 'syr2k_kernel.c': {'mode': ['s', 'd', 'c', 'z'], + 'exts': ['_U', '_L']}, + } + }, + { 'base': '?hemm', + 'sources': { + 'zhemm_k.c': {'mode': ['c', 'z'], # 'x'], + # TODO(rg): Do we need ../../param.h ? + # See Makefile:1612 + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?hemm_thread', + 'sources': { + 'zhemm_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?herk', + 'sources': { + 'zherk_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DHERK'], + # TODO(rg): Do we need ../../common.h ? + # See Makefile:1684 + 'exts': ['_UN', '_UC', + '_LN', '_LC']}, + } + }, + { 'base': '?herk_kernel', + 'sources': { + 'zherk_kernel.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DHERK'], + 'exts': ['_UN', '_UC', + '_LN', '_LC']}, + } + }, + { 'base': '?herk_thread', + 'sources': { + 'zherk_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DHERK', '-DTHREADED_LEVEL3'], + 'exts': ['_UN', '_UC', + '_LN', '_LC']}, + } + }, + { 'base': '?her2k', + 'sources': { + 'zher2k_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DHER2K'], + # TODO(rg): Do we need ../../common.h ? 
+ # See Makefile:1793 + 'exts': ['_UN', '_UC', + '_LN', '_LC']}, + } + }, + { 'base': '?her2k_kernel', + 'sources': { + 'zher2k_kernel.c': {'mode': ['c', 'z'], # 'x'], + # TODO(rg): Do we need ../../common.h ? + # See Makefile:1793 + 'exts': ['_UN', '_UC', + '_LN', '_LC']}, + } + }, + { 'base': '?gemm3m', + 'sources': { + 'gemm3m.c': {'mode': ['c', 'z',],# 'x'], + 'srcs': ['level3.c'], + 'exts': ['_nn', '_nt', + '_nr', '_nc', + '_tn', '_tt', + '_tr', '_tc', + '_rn', '_rt', + '_rr', '_rc', + '_cn', '_ct', + '_cr', '_cc']}, + } + }, + # { 'base': '?gemmf', + # 'sources': { + # # TODO(rg): This in the makefile:4401 but the file isn't there.. + # 'zgemmf.c': {'mode': ['c', 'z',],# 'x'], + # 'srcs': ['level3.c'], + # 'exts': ['']}, + # } + # }, + { 'base': '?gemm3m_thread', + 'sources': { + 'gemm3m.c': {'mode': ['c', 'z',],# 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'srcs': ['level3.c'], + 'exts': ['_nn', '_nt', + '_nr', '_nc', + '_tn', '_tt', + '_tr', '_tc', + '_rn', '_rt', + '_rr', '_rc', + '_cn', '_ct', + '_cr', '_cc']}, + } + }, + { 'base': '?symm3m', + 'sources': { + 'symm3m_k.c': {'mode': ['c', 'z'], # 'x'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?symm3m_thread', + 'sources': { + 'symm3m_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?hemm3m', + 'sources': { + 'hemm3m_k.c': {'mode': ['c', 'z'], # 'x'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?hemm3m_thread', + 'sources': { + 'hemm3m_k.c': {'mode': ['c', 'z'], # 'x'], + 'addl': ['-DTHREADED_LEVEL3'], + 'exts': ['_LU', '_LL', + '_RU', '_RL']}, + } + }, + { 'base': '?trsm', + 'sources': { + 'trsm_L.c': {'mode': ['s', 'd'],# 'q'], + 'exts': ['_LNUU', '_LNUN', + '_LNLU', '_LNLN', + '_LTUU', '_LTUN', + '_LTLU', '_LTLN']}, + 'trsm_R.c': {'mode': ['s', 'd'],# 'q'], + 'exts': ['_RNUU', '_RNUN', + '_RNLU', '_RNLN', + '_RTUU', '_RTUN', + '_RTLU', '_RTLN']}, + } + }, + { 'base': '?trsm', + 
'sources': { + 'trsm_L.c': {'mode': ['c', 'z'],# 'x'], + 'exts': ['_LNUU', '_LNUN', + '_LNLU', '_LNLN', + '_LTUU', '_LTUN', + '_LTLU', '_LTLN', + '_LRUU', '_LRUN', + '_LRLU', '_LRLN', + '_LCUU', '_LCUN', + '_LCLU', '_LCLN', + ]}, + 'trsm_R.c': {'mode': ['c', 'z'],# 'x'], + 'exts': ['_RNUU', '_RNUN', + '_RNLU', '_RNLN', + '_RTUU', '_RTUN', + '_RTLU', '_RTLN', + '_RRUU', '_RRUN', + '_RRLU', '_RRLN', + '_RCUU', '_RCUN', + '_RCLU', '_RCLN', + ]}, + } + }, +] + +# Initialize kernel configurations list +kernel_confs = [] + +# Iterate through each kernel operation +foreach _kop : driver_kops + base = _kop['base'] + sources = _kop['sources'] + + # Generate the symbol flags + _ckop_args = [] + if symb_defs.has_key(base) + symb_base = symb_defs[base] + if symb_base.has_key('def') + foreach _d : symb_base['def'] + _ckop_args += ['-D' + _d] + endforeach + endif + if symb_base.has_key('undef') + foreach _u : symb_base['undef'] + _ckop_args += ['-U' + _u] + endforeach + endif + endif + + # Iterate through each source file and its details + foreach fname, details : sources + modes = details['mode'] + exts = details['exts'] + + # Iterate through each mode + foreach mode : modes + # Generate the mapping for the type + __cargs = _cargs + _ckop_args + prec_mode = precision_mappings[mode] + + # Add precision-specific definitions + if prec_mode.has_key('def') + foreach _d : prec_mode['def'] + __cargs += ['-D' + _d] + endforeach + endif + if prec_mode.has_key('undef') + foreach _u : prec_mode['undef'] + __cargs += ['-U' + _u] + endforeach + endif + + # Iterate through each extension + foreach ext : exts + _ext_cargs = [] + + # Check ext_mappings first + if ext_mappings.has_key(ext) and not (ext_mappings[ext].has_key('except') and base in ext_mappings[ext]['except']) + extmap = ext_mappings[ext] + if extmap.has_key('def') + foreach _d : extmap['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if extmap.has_key('undef') + foreach _u : extmap['undef'] + _ext_cargs += ['-U' + _u] + 
endforeach + endif + else + # Fallback to ext_mappings_l2 + foreach ext_map : ext_mappings_l2 + ext_mappings_l3 + if ext_map['ext'] == ext and mode in ext_map['for'] + if ext_map.has_key('def') + foreach _d : ext_map['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if ext_map.has_key('undef') + foreach _u : ext_map['undef'] + _ext_cargs += ['-U' + _u] + endforeach + endif + break + endif + endforeach + endif + + # Construct the final paths + src = [ fname ] + # if sources[fname].has_key('srcs') + # src += sources[fname]['srcs'] + # endif + + # Add additional flags if present + if sources[fname].has_key('addl') + _ext_cargs += sources[fname]['addl'] + endif + + # Generate the symbol name + sym_name = base.replace('?', mode) + ext + sym_underscored = f'@sym_name@_' + + # Add standard flags for naming conventions + _ext_cargs += [ + f'-DASMNAME=@asm_name_prefix@@sym_name@', + f'-DASMFNAME=@asm_name_prefix@@sym_underscored@', + f'-DNAME=@sym_underscored@', + f'-DCNAME=@sym_name@', + f'-DCHAR_NAME="@sym_underscored@"', + f'-DCHAR_CNAME="@sym_name@"', + ] + + # Append the current configuration + current_def = { + 'c_args': __cargs + _ext_cargs, + 'name': sym_name, + 'src': src, + } + kernel_confs += [current_def] + endforeach + endforeach + endforeach +endforeach + +# Create the static libraries from the configurations +_kern_libs = [] +foreach conf : kernel_confs + # message(conf['name']) + # message(conf) + _kern_libs += [static_library( + conf['name'], + conf['src'], + include_directories: _inc, + c_args: conf['c_args'], + )] +endforeach + +# Create the final kernel library +_l3_driver = static_library('l3_driver', + link_whole: _kern_libs) diff --git a/subprojects/packagefiles/openblas/driver/others/meson.build b/subprojects/packagefiles/openblas/driver/others/meson.build new file mode 100644 index 000000000..d3d7d6924 --- /dev/null +++ b/subprojects/packagefiles/openblas/driver/others/meson.build @@ -0,0 +1,84 @@ +# Define the operations and their sources 
+others_ops = [ + { 'base': 'memory', 'source': 'memory.c' }, + { 'base': 'xerbla', 'source': 'xerbla.c' }, + { 'base': 'openblas_set_num_threads', 'source': 'openblas_set_num_threads.c' }, + { 'base': 'openblas_get_num_threads', 'source': 'openblas_get_num_threads.c', 'addl': [ + '-DSMALL_MATRIX_OPT', '-Wall', '-DF_OTHERS_GFORT', + '-DSMP_SERVER', '-DNO_WARMUP', '-DMAX_CPU_NUMBER=12', + ]}, + { 'base': 'openblas_get_num_procs', 'source': 'openblas_get_num_procs.c' }, + { 'base': 'openblas_get_parallel', 'source': 'openblas_get_parallel.c' }, + { 'base': 'openblas_error_handle', 'source': 'openblas_error_handle.c' }, + { 'base': 'openblas_env', 'source': 'openblas_env.c' }, + { 'base': 'blas_server', 'source': blas_server_src }, + { 'base': 'divtable', 'source': 'divtable.c', 'addl': ['-UDOUBLE'] }, + { 'base': 'blasL1thread', 'source': 'blas_l1_thread.c' }, + { 'base': 'servercallback', 'source': 'blas_server_callback.c' }, + { 'base': 'parameter', 'source': 'parameter.c' } +] + +# Initialize configurations list +others_confs = [] + +# Iterate through each operation +foreach op : others_ops + base = op['base'] + source = op['source'] + addl = op.get('addl', []) + + # Generate the symbol flags + ckop_args = [] + if symb_defs.has_key(base) + symb_base = symb_defs[base] + if symb_base.has_key('def') + foreach d : symb_base['def'] + ckop_args += ['-D' + d] + endforeach + endif + if symb_base.has_key('undef') + foreach u : symb_base['undef'] + ckop_args += ['-U' + u] + endforeach + endif + endif + + # Default compilation arguments + c_args = _cargs + ckop_args + addl + '-DMAX_PARALLEL_NUMBER=1' + + # Generate the symbol name + sym_name = base + sym_underscored = '@0@_'.format(sym_name) + + # Add standard flags for naming conventions + c_args += [ + f'-DASMNAME=@asm_name_prefix@@sym_name@', + f'-DASMFNAME=@asm_name_prefix@@sym_name@_', + f'-DNAME=@sym_name@_', + f'-DCNAME=@sym_name@', + f'-DCHAR_NAME="@sym_name@_"', + f'-DCHAR_CNAME="@sym_name@"', + ] + + # Append the 
current configuration + current_def = { + 'c_args': c_args, + 'name': sym_name, + 'src': [source] + } + others_confs += [current_def] +endforeach + +# Create the static libraries from the configurations +others_libs = [] +foreach conf : others_confs + others_libs += [static_library( + conf['name'], + conf['src'], + include_directories: _inc, + c_args: conf['c_args'] + )] +endforeach + +# Create the final interface library +_others = static_library('_interface', link_whole: others_libs) diff --git a/subprojects/packagefiles/openblas/gen_install_headers.py b/subprojects/packagefiles/openblas/gen_install_headers.py new file mode 100644 index 000000000..6e9607879 --- /dev/null +++ b/subprojects/packagefiles/openblas/gen_install_headers.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +import argparse +import re +from pathlib import Path + +def write_openblas_config_header(dest_dir, version, config_last_path, template_path): + config_h_path = dest_dir / "openblas_config.h" + with config_h_path.open('w') as f: + f.write("#ifndef OPENBLAS_CONFIG_H\n") + f.write("#define OPENBLAS_CONFIG_H\n") + + with config_last_path.open('r') as config_last: + for line in config_last: + if line.strip(): + defines = line.split('#define ') + for define in defines: + if define.strip(): + parts = define.split(maxsplit=1) + if len(parts) > 0: + macro_name = parts[0] + rest_of_line = " ".join(parts[1:]) if len(parts) > 1 else "" + line_to_write = f"#define OPENBLAS_{macro_name} {rest_of_line}" + f.write(f"{line_to_write.strip()}\n") + + + f.write(f'#define OPENBLAS_VERSION " OpenBLAS {version} "\n') + + with template_path.open('r') as template: + f.write(template.read()) + + f.write("#endif /* OPENBLAS_CONFIG_H */\n") + print(f"Generated openblas_config.h in {dest_dir}") + +def write_f77blas_header(dest_dir, common_interface_path): + f77blas_h_path = dest_dir / "f77blas.h" + with f77blas_h_path.open('w') as f: + f.write("#ifndef OPENBLAS_F77BLAS_H\n") + f.write("#define OPENBLAS_F77BLAS_H\n") + 
f.write('#include "openblas_config.h"\n') + + with common_interface_path.open('r') as common_interface: + f.write(common_interface.read()) + + f.write("#endif\n") + print(f"Generated f77blas.h in {dest_dir}") + +def write_cblas_header(dest_dir, cblas_path, symbol_prefix, symbol_suffix): + cblas_h_path = dest_dir / "cblas.h" + + with cblas_path.open('r') as cblas_file: + content = cblas_file.read() + + if symbol_prefix: + content = re.sub(r'\bcblas', f'{symbol_prefix}cblas', content) + content = re.sub(r'\bopenblas', f'{symbol_prefix}openblas', content) + content = re.sub(f'{symbol_prefix}openblas_complex', 'openblas_complex', content) + content = re.sub(r'\bgoto', f'{symbol_prefix}goto', content) + + if symbol_suffix: + content = re.sub(r'\bcblas(\w*)', r'cblas\1' + symbol_suffix, content) + content = re.sub(r'\bopenblas(\w*)', r'openblas\1' + symbol_suffix, content) + content = re.sub(r'\bgoto(\w*)', r'goto\1' + symbol_suffix, content) + content = re.sub(r'openblas_complex_(\w*)' + symbol_suffix, r'openblas_complex_\1', content) + + content = content.replace('common', 'openblas_config') + + with cblas_h_path.open('w') as f: + f.write(content) + + print(f"Generated cblas.h in {dest_dir}") + +def main(): + parser = argparse.ArgumentParser(description="Generate OpenBLAS headers") + parser.add_argument('--dest-dir', required=True, help="Destination directory for headers") + parser.add_argument('--version', required=True, help="OpenBLAS version") + parser.add_argument('--config-last', required=True, help="Path to config_last.h") + parser.add_argument('--template', required=True, help="Path to openblas_config_template.h") + parser.add_argument('--common-interface', required=True, help="Path to common_interface.h") + parser.add_argument('--cblas', required=True, help="Path to cblas.h") + parser.add_argument('--symbol-prefix', default="", help="Symbol prefix for cblas.h") + parser.add_argument('--symbol-suffix', default="", help="Symbol suffix for cblas.h") + 
parser.add_argument('--generate-f77blas', action='store_true', help="Generate f77blas.h") + parser.add_argument('--generate-cblas', action='store_true', help="Generate cblas.h") + + args = parser.parse_args() + + dest_dir = Path(args.dest_dir) + dest_dir.mkdir(parents=True, exist_ok=True) + + config_last_path = Path(args.config_last) + template_path = Path(args.template) + common_interface_path = Path(args.common_interface) + cblas_path = Path(args.cblas) + + write_openblas_config_header(dest_dir, args.version, config_last_path, template_path) + + if args.generate_f77blas: + write_f77blas_header(dest_dir, common_interface_path) + + if args.generate_cblas: + write_cblas_header(dest_dir, cblas_path, args.symbol_prefix, args.symbol_suffix) + +if __name__ == "__main__": + main() diff --git a/subprojects/packagefiles/openblas/interface/lapack/meson.build b/subprojects/packagefiles/openblas/interface/lapack/meson.build new file mode 100644 index 000000000..10aa91a81 --- /dev/null +++ b/subprojects/packagefiles/openblas/interface/lapack/meson.build @@ -0,0 +1,152 @@ +_lapack_interface_roots = [ + { 'base': '?getf2', '_types': ['s', 'd'], + 'fname': 'lapack/getf2.c', + }, + # TODO: Makefile: 2074:2076 but doesn't exist + # { 'base': '?getf2', '_types': ['q'], + # 'fname': 'getf2.c', + # }, + { 'base': '?getf2', '_types': ['c', 'z'], + 'fname': 'lapack/zgetf2.c', + }, + # TODO: Makefile: 2083:2085 but doesn't exist + # { 'base': '?getf2', '_types': ['x'], + # 'fname': 'zgetf2.c', + # }, + { 'base': '?getrf', '_types': ['s', 'd'], + 'fname': 'lapack/getrf.c', + }, + # TODO: Not built, as are the other commented xdouble ones + # { 'base': '?getrf', '_types': ['q'], + # 'fname': 'getrf.c', + # }, + { 'base': '?getrf', '_types': ['c', 'z'], + 'fname': 'lapack/zgetrf.c', + }, + # { 'base': '?getrf', '_types': ['x'], + # 'fname': 'zgetrf.c', + # }, + { 'base': '?lauu2', '_types': ['s', 'd'], + 'fname': 'lapack/lauu2.c', + }, + # { 'base': '?lauu2', '_types': ['q'], + # 'fname': 
'lauu2.c', + # }, + { 'base': '?lauu2', '_types': ['c', 'z'], + 'fname': 'lapack/zlauu2.c', + }, + # { 'base': '?lauu2', '_types': ['x'], + # 'fname': 'zlauu2.c', + # }, + { 'base': '?lauum', '_types': ['s', 'd'], + 'fname': 'lapack/lauum.c', + }, + # { 'base': '?lauum', '_types': ['q'], + # 'fname': 'lauum.c', + # }, + { 'base': '?lauum', '_types': ['c', 'z'], + 'fname': 'lapack/zlauum.c', + }, + # { 'base': '?lauum', '_types': ['x'], + # 'fname': 'zlauum.c', + # }, + { 'base': '?potf2', '_types': ['s', 'd'], + 'fname': 'lapack/potf2.c', + }, + # { 'base': '?potf2', '_types': ['q'], + # 'fname': 'potf2.c', + # }, + { 'base': '?potf2', '_types': ['c', 'z'], + 'fname': 'lapack/zpotf2.c', + }, + # { 'base': '?potf2', '_types': ['x'], + # 'fname': 'zpotf2.c', + # }, + { 'base': '?potrf', '_types': ['s', 'd'], + 'fname': 'lapack/potrf.c', + }, + # { 'base': '?potrf', '_types': ['q'], + # 'fname': 'potrf.c', + # }, + { 'base': '?potrf', '_types': ['c', 'z'], + 'fname': 'lapack/zpotrf.c', + }, + # { 'base': '?potrf', '_types': ['x'], + # 'fname': 'zpotrf.c', + # }, + { 'base': '?trti2', '_types': ['s', 'd'], + 'fname': 'lapack/trti2.c', + }, + # { 'base': '?trti2', '_types': ['q'], + # 'fname': 'trti2.c', + # }, + { 'base': '?trti2', '_types': ['c', 'z'], + 'fname': 'lapack/ztrti2.c', + }, + # { 'base': '?trti2', '_types': ['x'], + # 'fname': 'ztrti2.c', + # }, + # { 'base': '?trti', '_types': ['s', 'd'], + # 'fname': 'lapack/trti.c', + # }, + # { 'base': '?trti', '_types': ['q'], + # 'fname': 'trti.c', + # }, + # { 'base': '?trti', '_types': ['c', 'z'], + # 'fname': 'lapack/ztrti.c', + # }, + # { 'base': '?trti', '_types': ['x'], + # 'fname': 'ztrti.c', + # }, + { 'base': '?laswp', '_types': ['s', 'd'], + 'fname': 'lapack/laswp.c', + }, + # { 'base': '?laswp', '_types': ['q'], + # 'fname': 'laswp.c', + # }, + { 'base': '?laswp', '_types': ['c', 'z'], + 'fname': 'lapack/zlaswp.c', + }, + # { 'base': '?laswp', '_types': ['x'], + # 'fname': 'zlaswp.c', + # }, + { 'base': 
'?getrs', '_types': ['s', 'd', 'q'], + 'fname': 'lapack/getrs.c', + }, + { 'base': '?getrs', '_types': ['c', 'z', 'x'], + 'fname': 'lapack/zgetrs.c', + }, + { 'base': '?trtrs', '_types': ['s', 'd', 'q'], + 'fname': 'lapack/trtrs.c', + }, + { 'base': '?trtrs', '_types': ['c', 'z', 'x'], + 'fname': 'lapack/ztrtrs.c', + }, + { 'base': '?gesv', '_types': ['s', 'd', 'c', 'z'], + 'fname': 'lapack/gesv.c', + }, + # { 'base': '?gesv', '_types': ['q'], + # 'fname': 'gesv.c', + # }, + # { 'base': '?gesv', '_types': ['x'], + # 'fname': 'zgesv.c', + # }, + { 'base': '?potri', '_types': ['s', 'd'], + 'fname': 'lapack/potri.c', + }, + # { 'base': '?potri', '_types': ['q'], + # 'fname': 'potri.c', + # }, + # BUG: potri is broken + # { 'base': '?potri', '_types': ['c', 'z'], + # 'fname': 'lapack/zpotri.c', + # }, + # { 'base': '?potri', '_types': ['x'], + # 'fname': 'zpotri.c', + # }, + # NOTE: Where are these? + # { 'base': '?larf', '_types': ['s', 'd', 'q', + # 'c', 'z', 'x'], + # 'fname': 'larf.c', + # }, + ] diff --git a/subprojects/packagefiles/openblas/interface/meson.build b/subprojects/packagefiles/openblas/interface/meson.build new file mode 100644 index 000000000..4c2aaedfa --- /dev/null +++ b/subprojects/packagefiles/openblas/interface/meson.build @@ -0,0 +1,523 @@ +interface_args = cc.get_supported_arguments( + f'-include @_mbproot@/simd_conf.h', +) +# if not build_without_cblas +# interface_args += '-DCBLAS' +# endif +_blas_roots = [ + # NOTE: q, qx, x, xq do not have cblas_ rules in the Makefile + # NOTE: https://developer.arm.com/documentation/101004/2310/BLAS-Basic-Linear-Algebra-Subprograms/CBLAS-functions?lang=en + # Level 1 + { 'base': '?asum', '_types': ['s', 'd', 'q', + 'sc', 'dz', 'qx'], + 'fname': 'asum.c', + 'cblas': true, + }, + { 'base': '?sum', '_types': ['s', 'd', 'q', + 'sc', 'dz', 'qx'], + 'fname': 'sum.c', + 'cblas': true, + }, + { 'base': '?amax', '_types': ['s', 'd', 'q', + 'sc', 'dz', 'qx'], + 'fname': 'max.c', + 'cblas': true, + }, + { 'base': 
'?amin', '_types': ['s', 'd', 'q', + 'sc', 'dz', 'qx'], + 'fname': 'max.c', + 'cblas': true, + }, + { 'base': '?max', '_types': ['s', 'd', 'q'], + 'fname': 'max.c', + }, + { 'base': '?min', '_types': ['s', 'd', 'q'], + 'fname': 'max.c', + }, + { 'base': '?axpy', '_types': ['s', 'd', 'q'], + 'fname': 'axpy.c', + 'cblas': true, + }, + { 'base': '?axpy', '_types': ['c', 'z', 'x'], + 'fname': 'zaxpy.c', + 'cblas': true, + }, + { 'base': '?axpyc', '_types': ['c', 'z', 'x'], + 'fname': 'zaxpy.c', + 'cblas': true, + }, + { 'base': '?copy', '_types': ['s', 'd', 'q', 'c', 'z', 'x'], + 'fname': 'copy.c', + 'cblas': true, + }, + { 'base': '?dot', '_types': ['s', 'd', 'q'], + 'fname': 'dot.c', + 'cblas': true, + }, + { 'base': '?dotu', '_types': ['c', 'z', 'x'], + 'fname': 'zdot.c', + 'cblas': true, + }, + { 'base': '?dotc', '_types': ['c', 'z', 'x'], + 'fname': 'zdot.c', + 'cblas': true, + }, + { 'base': 'cblas_?dotu_sub', '_types': ['c', 'z'], + 'fname': 'zdot.c', + 'cblas': false, # These don't non-cblas rules Makefile:1623:1627 + }, + { 'base': 'cblas_?dotc_sub', '_types': ['c', 'z'], + 'fname': 'zdot.c', + 'cblas': false, # These don't non-cblas rules Makefile:1623:1627 + }, + { 'base': '?nrm2', '_types': ['s', 'd', 'q', + 'sc', 'dz', 'qx'], + 'fname': 'nrm2.c', + 'cblas': true, # q, qx don't have cblas_ rules Makefile:1635,1645 + }, + { 'base': '?rot', '_types': ['s', 'd', 'q'], + 'fname': 'rot.c', + 'cblas': true, # q doesn't have cblas_ + }, + { 'base': '?rot', '_types': ['cs', 'zd', 'xq'], + 'fname': 'zrot.c', + 'cblas': true, + }, + { 'base': '?rotm', '_types': ['s', 'd', 'q'], + 'fname': 'rotm.c', + 'cblas': true, + }, + { 'base': '?rotmg', '_types': ['s', 'd', 'q'], + 'fname': 'rotmg.c', + 'cblas': true, + }, + { 'base': '?rotg', '_types': ['s', 'd', 'q'], + 'fname': 'rotg.c', + 'cblas': true, + }, + { 'base': '?rotg', '_types': ['c', 'z', 'x'], + 'fname': 'zrotg.c', + 'cblas': true, + }, + { 'base': '?scal', '_types': ['s', 'd', 'q'], + 'fname': 'scal.c', + 
'cblas': true, + }, + { 'base': '?scal', '_types': ['c', 'z', 'x'], + 'fname': 'zscal.c', + 'cblas': true, # x doesn't have cblas_ + }, + { 'base': '?scal', '_types': ['cs', 'zd', 'xq'], + 'fname': 'zscal.c', 'def': [ 'SSCAL' ], 'undef': [ ], + 'cblas': true, + }, + { 'base': '?swap', '_types': ['s', 'd', 'q'], + 'fname': 'swap.c', + 'cblas': true, + }, + { 'base': '?swap', '_types': ['c', 'z', 'x'], + 'fname': 'zswap.c', + 'cblas': true, + }, + { 'base': '?dsdot', '_types': ['s'], + 'fname': 'sdsdot.c', + 'cblas': true, + }, + { 'base': '?dsdot', '_types': [''], + 'fname': 'dsdot.c', + 'cblas': true, + }, + # TODO: Handle BFLOAT16 Makefile 784:795, 1592:1603 + { 'base': 'i?max', '_types': ['s', 'd', 'q', 'c', 'z'], # NOTE: c,z only for cblas, Makefile:1526 + 'fname': 'imax.c', + 'cblas': true, # TODO: Maybe 'cblas_only': ['c', 'z'] + }, + { 'base': 'i?amax', '_types': ['s', 'd', 'q', 'c', 'z', 'x'], + 'fname': 'imax.c', + 'cblas': true, + }, + { 'base': 'i?amin', '_types': ['s', 'd', 'c', 'z', 'x'], + 'fname': 'imax.c', + 'cblas': true, + }, + { 'base': 'i?min', '_types': ['s', 'd', 'q', 'c', 'z'], # NOTE: c,z only for cblas, Makefile:1532 + 'fname': 'imax.c', + 'cblas': true, + }, + # Level 2 + { 'base': '?ger', '_types': ['s', 'd', 'q'], + 'fname': 'ger.c', + 'cblas': true, + }, + { 'base': '?geru', '_types': ['c', 'z', 'x'], + 'fname': 'zger.c', + 'cblas': true, + }, + { 'base': '?gerc', '_types': ['c', 'z', 'x'], + 'fname': 'zger.c', + 'cblas': true, + }, + # TODO: Handle BFLOAT16 Makefile 941:944 + # TODO: Handle Netlib_gemv Makefile 946:958 + { 'base': '?gemv', '_types': ['s', 'd', 'q'], + 'fname': 'gemv.c', + 'cblas': true, + }, + { 'base': '?gemv', '_types': ['c', 'z', 'x'], + 'fname': 'zgemv.c', + 'cblas': true, + }, + { 'base': '?trmv', '_types': ['s', 'd', 'q'], + 'fname': 'trmv.c', + 'cblas': true, + }, + { 'base': '?trmv', '_types': ['c', 'z', 'x'], + 'fname': 'ztrmv.c', + 'cblas': true, + }, + { 'base': '?trsv', '_types': ['s', 'd', 'q'], + 'fname': 
'trsv.c', + 'cblas': true, + }, + { 'base': '?trsv', '_types': ['c', 'z', 'x'], + 'fname': 'ztrsv.c', + 'cblas': true, + }, + # TODO: Handle NO_LAPACK + { 'base': '?symv', '_types': ['s', 'd', 'q'], + 'fname': 'symv.c', + 'cblas': true, + }, + { 'base': '?symv', '_types': ['c', 'z', 'x'], + 'fname': 'zsymv.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?syr', '_types': ['s', 'd', 'q'], + 'fname': 'syr.c', + 'cblas': true, + }, + { 'base': '?syr', '_types': ['c', 'z', 'x'], + 'fname': 'zsyr.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?syr2', '_types': ['s', 'd', 'q'], + 'fname': 'syr2.c', + 'cblas': true, + }, + { 'base': '?syr2', '_types': ['c', 'z', 'x'], + 'fname': 'zsyr2.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?gbmv', '_types': ['s', 'd', 'q'], + 'fname': 'gbmv.c', + 'cblas': true, + }, + { 'base': '?gbmv', '_types': ['c', 'z', 'x'], + 'fname': 'zgbmv.c', + 'cblas': true, + }, + { 'base': '?sbmv', '_types': ['s', 'd', 'q'], + 'fname': 'sbmv.c', + 'cblas': true, + }, + { 'base': '?sbmv', '_types': ['c', 'z', 'x'], + 'fname': 'zsbmv.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?spmv', '_types': ['s', 'd', 'q'], + 'fname': 'spmv.c', + 'cblas': true, + }, + # TODO: NO_LAPACK + { 'base': '?spmv', '_types': ['c', 'z', 'x'], + 'fname': 'zspmv.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?spr', '_types': ['s', 'd', 'q'], + 'fname': 'spr.c', + 'cblas': true, + }, + { 'base': '?spr', '_types': ['c', 'z', 'x'], + 'fname': 'zspr.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. + }, + { 'base': '?spr2', '_types': ['s', 'd', 'q'], + 'fname': 'spr2.c', + 'cblas': true, + }, + { 'base': '?spr2', '_types': ['c', 'z', 'x'], + 'fname': 'zspr2.c', + 'cblas': false, # NOTE: This oddly has no cblas set.. 
+ }, + { 'base': '?tbmv', '_types': ['s', 'd', 'q'], + 'fname': 'tbmv.c', + 'cblas': true, + }, + { 'base': '?tbmv', '_types': ['c', 'z', 'x'], + 'fname': 'ztbmv.c', + 'cblas': true, + }, + { 'base': '?tbsv', '_types': ['s', 'd', 'q'], + 'fname': 'tbsv.c', + 'cblas': true, + }, + { 'base': '?tbsv', '_types': ['c', 'z', 'x'], + 'fname': 'ztbsv.c', + 'cblas': true, + }, + { 'base': '?tpsv', '_types': ['s', 'd', 'q'], + 'fname': 'tpsv.c', + 'cblas': true, + }, + { 'base': '?tpsv', '_types': ['c', 'z', 'x'], + 'fname': 'ztpsv.c', + 'cblas': true, + }, + { 'base': '?tpmv', '_types': ['s', 'd', 'q'], + 'fname': 'tpmv.c', + 'cblas': true, + }, + { 'base': '?tpmv', '_types': ['c', 'z', 'x'], + 'fname': 'ztpmv.c', + 'cblas': true, + }, + { 'base': '?hemv', '_types': ['c', 'z', 'x'], + 'fname': 'zhemv.c', + 'cblas': true, + }, + { 'base': '?hbmv', '_types': ['c', 'z', 'x'], + 'fname': 'zhbmv.c', + 'cblas': true, + }, + { 'base': '?her', '_types': ['c', 'z', 'x'], + 'fname': 'zher.c', + 'cblas': true, + }, + { 'base': '?her2', '_types': ['c', 'z', 'x'], + 'fname': 'zher2.c', + 'cblas': true, + }, + { 'base': '?hpmv', '_types': ['c', 'z', 'x'], + 'fname': 'zhpmv.c', + 'cblas': true, + }, + { 'base': '?hpr', '_types': ['c', 'z', 'x'], + 'fname': 'zhpr.c', + 'cblas': true, + }, + { 'base': '?hpr2', '_types': ['c', 'z', 'x'], + 'fname': 'zhpr2.c', + 'cblas': true, + }, + # TODO: BUILD_BFLOAT16 here, Makefile:1303 + { 'base': '?gemm', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': ['gemm.c', '../param.h'], # TODO: Will this work? 
+ 'cblas': true, + }, + { 'base': '?gemmt', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': ['gemmt.c', '../param.h'], + 'cblas': true, + }, + { 'base': '?symm', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': 'symm.c', + 'cblas': true, + }, + { 'base': '?trmm', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': 'trsm.c', 'def': [ 'TRMM' ], 'undef': [ ], + 'cblas': true, + }, + { 'base': '?trsm', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': 'trsm.c', + 'cblas': true, + }, + { 'base': '?syrk', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': 'syrk.c', + 'cblas': true, + }, + { 'base': '?syr2k', '_types': ['s', 'd', 'q', + 'c', 'z', 'x'], + 'fname': 'syr2k.c', + 'cblas': true, + }, + { 'base': '?hemm', '_types': ['c', 'z', 'x'], + 'fname': 'symm.c', + 'cblas': true, + }, + { 'base': '?herk', '_types': ['c', 'z', 'x'], + 'fname': 'syrk.c', + 'cblas': true, + }, + { 'base': '?her2k', '_types': ['c', 'z', 'x'], + 'fname': 'syr2k.c', + 'cblas': true, + }, + { 'base': '?gemm3m', '_types': ['c', 'z', 'x'], + 'fname': 'gemm.c', + 'cblas': true, + }, + { 'base': '?symm3m', '_types': ['c', 'z', 'x'], + 'fname': 'symm.c', + }, + { 'base': '?hemm3m', '_types': ['c', 'z', 'x'], + 'fname': 'symm.c', + }, + # BLAS Extensions + { 'base': '?axpby', '_types': ['s', 'd'], + 'fname': 'axpby.c', + 'cblas': true, + }, + { 'base': '?axpby', '_types': ['c', 'z'], + 'fname': 'zaxpby.c', + 'cblas': true, + }, + { 'base': '?omatcopy', '_types': ['s', 'd'], + 'fname': 'omatcopy.c', + 'cblas': true, + }, + { 'base': '?omatcopy', '_types': ['c', 'z'], + 'fname': 'zomatcopy.c', + 'cblas': true, + }, + { 'base': '?imatcopy', '_types': ['s', 'd'], + 'fname': 'imatcopy.c', + 'cblas': true, + }, + { 'base': '?imatcopy', '_types': ['c', 'z'], + 'fname': 'zimatcopy.c', + 'cblas': true, + }, + { 'base': '?geadd', '_types': ['s', 'd'], + 'fname': 'geadd.c', + 'cblas': true, + }, + { 'base': '?geadd', '_types': ['c', 'z'], + 'fname': 'zgeadd.c', + 'cblas': true, + }, + { 
'base': '?xerbla', '_types': [''], + 'fname': 'xerbla.c', + 'cblas': true, + }, +] + +if not is_win + subdir('lapack') + _blas_roots += _lapack_interface_roots +endif + +_interface_libs = [] +foreach conf : _blas_roots + foreach type : conf['_types'] + if 'q' in type or 'x' in type + # TODO: Figure out when to build these + # These are the XDOUBLE symbols + continue + endif + # Seed with common args + compiler_args = _cargs + interface_args + if fc_id == 'gcc' + compiler_args += ['-DF_INTERFACE_GFORT'] + endif + # Generate the symbol flags + base = conf['base'] + if symb_defs.has_key(base) + symb_base = symb_defs[base] + if symb_base.has_key('def') + foreach _d : symb_base['def'] + compiler_args += ('-D' + _d) + endforeach + endif + if symb_base.has_key('undef') + foreach _u : symb_base['undef'] + compiler_args += ('-U' + _u) + endforeach + endif + endif + # Set the type arguments + if precision_mappings.get(type).has_key('def') + foreach d : precision_mappings[type]['def'] + compiler_args += ['-D' + d] + endforeach + endif + if precision_mappings.get(type).has_key('undef') + foreach u : precision_mappings[type]['undef'] + compiler_args += ['-U' + u] + endforeach + endif + + if conf.has_key('addl') + compiler_args += conf['addl'] + endif + + # Construct the actual symbol names, and mangled symbols + # TODO: This might be conditional on other options + sym_name = conf['base'].replace('?', type) + sym_underscored = f'@sym_name@_' + if conf.get('cblas', false) + cblas_sym_name = 'cblas_' + sym_name + cblas_sym_underscored = f'@cblas_sym_name@_' + endif + + # Construct conditionals + if conf.has_key('def') + foreach d : conf['def'] + compiler_args += ['-D' + d] + endforeach + foreach u : conf['undef'] + compiler_args += ['-U' + u] + endforeach + endif + + # Make mangled symbols + # TODO: This might be conditional on other options + + # Create the static library for each symbol + lib = static_library( + sym_name, + sources: conf['fname'], + include_directories: _inc, + 
c_args: compiler_args + [ + f'-DASMNAME=@asm_name_prefix@@sym_name@', + f'-DASMFNAME=@asm_name_prefix@@sym_underscored@', + f'-DNAME=@sym_underscored@', + f'-DCNAME=@sym_name@', + f'-DCHAR_NAME="@sym_underscored@"', + f'-DCHAR_CNAME="@sym_name@"', + ] + ) + _interface_libs += lib + + # If it's a CBLAS symbol, also create that + if conf.get('cblas', false) + if 'q' in type or 'x' in type + # There are no cblas_q symbols + # TODO: Handle edge cases around dz zd sc + continue + endif + cblas_lib = static_library( + cblas_sym_name, + sources: conf['fname'], + include_directories: _inc, + c_args: compiler_args + [ + '-DCBLAS', + f'-DASMNAME=@asm_name_prefix@@cblas_sym_name@', + f'-DASMFNAME=@asm_name_prefix@@cblas_sym_underscored@', + f'-DNAME=@cblas_sym_underscored@', + f'-DCNAME=@cblas_sym_name@', + f'-DCHAR_NAME="@cblas_sym_underscored@"', + f'-DCHAR_CNAME="@cblas_sym_name@"', + ] + ) + _interface_libs += cblas_lib + endif + endforeach +endforeach + +# Create a combined static library linking all individual static libraries +_interface = static_library('_interface', link_whole: _interface_libs) diff --git a/subprojects/packagefiles/openblas/interface/meson_options.txt b/subprojects/packagefiles/openblas/interface/meson_options.txt new file mode 100644 index 000000000..8ac8331e7 --- /dev/null +++ b/subprojects/packagefiles/openblas/interface/meson_options.txt @@ -0,0 +1,4 @@ +option('no_fblas', type: 'boolean', value: false, description: 'Disable FBLAS') +option('build_bfloat16', type: 'boolean', value: false, description: 'Enable BFloat16 support') +option('exprecision', type: 'boolean', value: false, description: 'Enable extended precision') +option('quad_precision', type: 'boolean', value: false, description: 'Enable quad precision') diff --git a/subprojects/packagefiles/openblas/kernel/arm64/meson.build b/subprojects/packagefiles/openblas/kernel/arm64/meson.build new file mode 100644 index 000000000..80208c6a6 --- /dev/null +++ 
b/subprojects/packagefiles/openblas/kernel/arm64/meson.build @@ -0,0 +1,59 @@ +arm64_base_dict = { + '?sum': { + 's': { + '_k': 'arm64/sum.S', + }, + 'd': { + '_k': 'arm64/sum.S', + }, + 'c': { + '_k': 'arm64/csum.S' + }, + 'z': { + '_k': 'arm64/zsum.S', + }, + }, + '?nrm2': { + 's': { + '_k': 'arm/nrm2.c', + }, + 'd': { + '_k': 'arm/nrm2.c', + }, + 'c': { + '_k': 'arm/znrm2.c', + }, + 'z': { + '_k': 'arm/znrm2.c', + }, + }, + '?cabs': { + 's': { + '1': 'generic/cabs.c', + }, + 'd': { + '1': 'generic/cabs.c', + }, + }, + '?lsame': { + '': { + '': 'generic/lsame.c', + } + }, + '?gemm': { + 's': { + '_beta': 'generic/gemm_beta.c', + }, + 'd': { + '_beta': 'generic/gemm_beta.c', + }, + 'c': { + '_beta': 'generic/zgemm_beta.c', + }, + 'z': { + '_beta': 'generic/zgemm_beta.c', + }, + }, +} + +subdir('meson_armv8') diff --git a/subprojects/packagefiles/openblas/kernel/arm64/meson_armv8/meson.build b/subprojects/packagefiles/openblas/kernel/arm64/meson_armv8/meson.build new file mode 100644 index 000000000..8760be9f4 --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/arm64/meson_armv8/meson.build @@ -0,0 +1,374 @@ +arm64_armv8_dict = { + '?amin': { + 's': { + '_k': 'arm/amin.c', + }, + 'd': { + '_k': 'arm/amin.c', + }, + 'c': { + '_k': 'arm/zamin.c', + }, + 'z': { + '_k': 'arm/zamin.c', + }, + }, + '?max': { + 's': { + '_k': 'arm/max.c', + }, + 'd': { + '_k': 'arm/max.c', + }, + }, + '?min': { + 's': { + '_k': 'arm/min.c', + }, + 'd': { + '_k': 'arm/min.c', + }, + }, + 'i?amin': { + 's': { + '_k': 'arm/iamin.c', + }, + 'd': { + '_k': 'arm/iamin.c', + }, + 'c': { + '_k': 'arm/izamin.c', + }, + 'z': { + '_k': 'arm/izamin.c', + }, + }, + 'i?max': { + 's': { + '_k': 'arm/imax.c', + }, + 'd': { + '_k': 'arm/imax.c', + }, + }, + 'i?min': { + 's': { + '_k': 'arm/imin.c', + }, + 'd': { + '_k': 'arm/imin.c', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 
'generic/trsm_kernel_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?amax': { + 's': { + '_k': 'arm64/amax.S', + }, + 'd': { + '_k': 'arm64/amax.S', + }, + 'c': { + '_k': 'arm64/zamax.S', + }, + 'z': { + '_k': 'arm64/zamax.S', + }, + }, + '?axpy': { + 's': { + '_k': 'arm64/axpy.S', + }, + 'd': { + '_k': 'arm64/axpy.S', + }, + 'c': { + '_k': 'arm64/zaxpy.S', + }, + 'z': { + '_k': 'arm64/zaxpy.S', + }, + }, + '?axpyc': { + 'c': { + '_k': 'arm64/zaxpy.S', + }, + 'z': { + '_k': 'arm64/zaxpy.S', + }, + }, + '?rot': { + 's': { + '_k': 'arm64/rot.S', + }, + 'd': { + '_k': 'arm64/rot.S', + }, + 'cs': { + '_k': 'arm64/zrot.S', + }, + 'zd': { + '_k': 'arm64/zrot.S', + } + }, + '?scal': { + 's': { + '_k': 'arm64/scal.S', + }, + 'd': { + '_k': 'arm64/scal.S', + }, + 'c': { + '_k': 'arm64/zscal.S', + }, + 'z': { + '_k': 'arm64/zscal.S', + }, + }, + '?gemv': { + 's': { + '_n': 'arm64/gemv_n.S', + '_t': 'arm64/gemv_t.S', + }, + 'd': { + '_n': 'arm64/gemv_n.S', + '_t': 'arm64/gemv_t.S', + }, + 'c': { + '_n': 'arm64/zgemv_n.S', + '_t': 'arm64/zgemv_t.S', + '_r': 'arm64/zgemv_n.S', + '_c': 'arm64/zgemv_t.S', + '_o': 'arm64/zgemv_n.S', + '_u': 'arm64/zgemv_t.S', + '_s': 'arm64/zgemv_n.S', + '_d': 'arm64/zgemv_t.S', + }, + 'z': { + '_n': 
'arm64/zgemv_n.S', + '_t': 'arm64/zgemv_t.S', + '_r': 'arm64/zgemv_n.S', + '_c': 'arm64/zgemv_t.S', + '_o': 'arm64/zgemv_n.S', + '_u': 'arm64/zgemv_t.S', + '_s': 'arm64/zgemv_n.S', + '_d': 'arm64/zgemv_t.S', + }, + }, + '?asum': { + 's': { + '_k': 'arm64/asum.S', + }, + 'd': { + '_k': 'arm64/asum.S', + }, + 'c': { + '_k': 'arm64/casum.S', + }, + 'z': { + '_k': 'arm64/zasum.S', + }, + }, + '?copy': { + 's': { + '_k': 'arm64/copy.S', + }, + 'd': { + '_k': 'arm64/copy.S', + }, + 'c': { + '_k': 'arm64/copy.S', + }, + 'z': { + '_k': 'arm64/copy.S', + }, + }, + '?swap': { + 's': { + '_k': 'arm64/swap.S', + }, + 'd': { + '_k': 'arm64/swap.S', + }, + 'c': { + '_k': 'arm64/swap.S', + }, + 'z': { + '_k': 'arm64/swap.S', + }, + }, + 'i?amax': { + 's': { + '_k': 'arm64/iamax.S', + }, + 'd': { + '_k': 'arm64/iamax.S', + }, + 'c': { + '_k': 'arm64/izamax.S', + }, + 'z': { + '_k': 'arm64/izamax.S', + }, + }, + '?nrm2': { + 's': { + '_k': 'arm64/nrm2.S', + }, + 'd': { + '_k': 'arm64/nrm2.S', + }, + 'c': { + '_k': 'arm64/znrm2.S', + }, + 'z': { + '_k': 'arm64/znrm2.S', + }, + }, + '?dot': { + 's': { + '_k': 'generic/dot.c', + }, + 'd': { + '_k': 'arm64/dot.S', + }, + }, + '?dotc': { + 'c': { + '_k': 'arm64/zdot.S', + }, + 'z': { + '_k': 'arm64/zdot.S', + }, + }, + '?dotu': { + 'c': { + '_k': 'arm64/zdot.S', + }, + 'z': { + '_k': 'arm64/zdot.S', + }, + }, + '?dsdot': { + 's': { + '_k': 'arm64/dot.S', + }, + '': { + '_k': 'arm64/dot.S', + }, + }, + '?gemm': { + 's': { + '_beta': 'arm64/sgemm_beta.S', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'arm64/sgemm_tcopy_16.S', + '_oncopy': 'arm64/sgemm_ncopy_4.S', + '_otcopy': 'generic/gemm_tcopy_4.c', + }, + 'd': { + '_beta': 'arm64/dgemm_beta.S', + '_incopy': 'arm64/dgemm_ncopy_8.S', + '_itcopy': 'arm64/dgemm_tcopy_8.S', + '_oncopy': 'arm64/dgemm_ncopy_4.S', + '_otcopy': 'arm64/dgemm_tcopy_4.S', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_4.c', + 
'_otcopy': 'generic/zgemm_tcopy_4.c' + }, + 'z': { + '_incopy': 'generic/zgemm_ncopy_4.c', + '_itcopy': 'generic/zgemm_tcopy_4.c', + '_oncopy': 'generic/zgemm_ncopy_4.c', + '_otcopy': 'generic/zgemm_tcopy_4.c' + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'arm64/strmm_kernel_16x4.S', + '_LT': 'arm64/strmm_kernel_16x4.S', + '_RN': 'arm64/strmm_kernel_16x4.S', + '_RT': 'arm64/strmm_kernel_16x4.S', + }, + 'd': { + '_LN': 'arm64/dtrmm_kernel_8x4.S', + '_LT': 'arm64/dtrmm_kernel_8x4.S', + '_RN': 'arm64/dtrmm_kernel_8x4.S', + '_RT': 'arm64/dtrmm_kernel_8x4.S', + }, + 'c': { + '_LN': 'arm64/ctrmm_kernel_8x4.S', + '_LT': 'arm64/ctrmm_kernel_8x4.S', + '_LR': 'arm64/ctrmm_kernel_8x4.S', + '_LC': 'arm64/ctrmm_kernel_8x4.S', + '_RN': 'arm64/ctrmm_kernel_8x4.S', + '_RT': 'arm64/ctrmm_kernel_8x4.S', + '_RR': 'arm64/ctrmm_kernel_8x4.S', + '_RC': 'arm64/ctrmm_kernel_8x4.S', + }, + 'z': { + '_LN': 'arm64/ztrmm_kernel_4x4.S', + '_LT': 'arm64/ztrmm_kernel_4x4.S', + '_LR': 'arm64/ztrmm_kernel_4x4.S', + '_LC': 'arm64/ztrmm_kernel_4x4.S', + '_RN': 'arm64/ztrmm_kernel_4x4.S', + '_RT': 'arm64/ztrmm_kernel_4x4.S', + '_RR': 'arm64/ztrmm_kernel_4x4.S', + '_RC': 'arm64/ztrmm_kernel_4x4.S', + }, + }, + '?gemm_kernel': { + 's': { + '': 'arm64/sgemm_kernel_16x4.S', + }, + 'd': { + '': 'arm64/dgemm_kernel_8x4.S', + }, + 'c': { + '_n': 'arm64/cgemm_kernel_8x4.S', + '_l': 'arm64/cgemm_kernel_8x4.S', + '_r': 'arm64/cgemm_kernel_8x4.S', + '_b': 'arm64/cgemm_kernel_8x4.S', + }, + 'z': { + '_n': 'arm64/zgemm_kernel_4x4.S', + '_l': 'arm64/zgemm_kernel_4x4.S', + '_r': 'arm64/zgemm_kernel_4x4.S', + '_b': 'arm64/zgemm_kernel_4x4.S', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/kernel/generic/meson.build b/subprojects/packagefiles/openblas/kernel/generic/meson.build new file mode 100644 index 000000000..8b8c28148 --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/generic/meson.build @@ -0,0 +1 @@ +# Generic diff --git a/subprojects/packagefiles/openblas/kernel/meson.build 
b/subprojects/packagefiles/openblas/kernel/meson.build new file mode 100644 index 000000000..dd5e71d1a --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/meson.build @@ -0,0 +1,1247 @@ +# Ordered As per https://netlib.org/blas/blasqr.pdf +# NOTE: xROTG xROTMG xROTM have no kernels? +# TODO: Actually test and set this +fma3_flag = [] +if conf_hdat.has('HAVE_FMA3') + fma3_flag += '-mfma' +endif + +SGEMM_UNROLL_N = makefile_conf_dat.get('SGEMM_UNROLL_N') +SGEMM_UNROLL_M = makefile_conf_dat.get('SGEMM_UNROLL_M') +DGEMM_UNROLL_N = makefile_conf_dat.get('DGEMM_UNROLL_N') +DGEMM_UNROLL_M = makefile_conf_dat.get('DGEMM_UNROLL_M') +CGEMM_UNROLL_N = makefile_conf_dat.get('CGEMM_UNROLL_N') +CGEMM_UNROLL_M = makefile_conf_dat.get('CGEMM_UNROLL_M') +ZGEMM_UNROLL_N = makefile_conf_dat.get('ZGEMM_UNROLL_N') +ZGEMM_UNROLL_M = makefile_conf_dat.get('ZGEMM_UNROLL_M') +CGEMM3M_UNROLL_N = makefile_conf_dat.get('CGEMM3M_UNROLL_N') +CGEMM3M_UNROLL_M = makefile_conf_dat.get('CGEMM3M_UNROLL_M') +ZGEMM3M_UNROLL_N = makefile_conf_dat.get('ZGEMM3M_UNROLL_N') +ZGEMM3M_UNROLL_M = makefile_conf_dat.get('ZGEMM3M_UNROLL_M') + +# TODO(rg): Poor detection?!? -_- +# Sanity check, we don't have anything greater than 16 for some of the UNROLL_N +# options; e.g generic/zhemm3m_ucopy_16.c doesn't exist (attempted on macos); so +# we opt for the conservative option.. 
+if ZGEMM3M_UNROLL_N.to_int() > 8 + ZGEMM3M_UNROLL_N = 8 +endif +if ZGEMM3M_UNROLL_M.to_int() > 8 + ZGEMM3M_UNROLL_M = 8 +endif +if CGEMM3M_UNROLL_N.to_int() > 8 + CGEMM3M_UNROLL_N = 8 +endif +if CGEMM3M_UNROLL_M.to_int() > 8 + CGEMM3M_UNROLL_M = 8 +endif + +subdir('meson_base') +subdir('x86_64') +subdir('arm64') + +# TODO: This is currently following x86_64 generic for src and dir, but it needs +# to diversify +# NOTE: The def and undefs are from Makefile.L1 +# Construct all ModesymbKERNEL from src and dir via files(dir + src) +# For the modes array, the following mapping is used for c_args: +# addl --> passed AS IS +base_kops = [ + # Level 1 BLAS + { 'base': '?rot', + 'modes': { + 's': {'exts': {'_k': {'addl': fma3_flag}}}, + 'd': {'exts': {'_k': {'addl': fma3_flag}}}, + # 'q': {'exts': {'_k': {}}}, + 'cs': {'exts': {'_k': {}}}, + 'zd': {'exts': {'_k': {}}}, + # 'xq': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?swap', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?scal', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?copy', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?axpy', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?axpyc', + 'modes': { + 'c': {'exts': {'_k': {'addl': ['-DCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DCONJ']}}}, + }, + }, + { 'base': '?dot', + 
'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?dotc', + 'modes': { + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?dotu', + 'modes': { + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + }, + }, + # TODO(rg): Check? + { 'base': '?dsdot', + 'modes': { + 's': {'exts': {'_k': {}}}, + '': {'exts': {'_k': {'addl': ['-DDSDOT']}}}, + }, + }, + # TODO(rg): Add dsdotkernel conditionals + # xDOTU xDOTC xxDOT aren't present + { 'base': '?nrm2', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?asum', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?amax', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?sum', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?amin', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': 'i?amax', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': 'i?amin', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + }, + }, + { 'base': 'i?max', + 'modes': { + 's': {'exts': {'_k': {}}}, 
+ 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + }, + }, + { 'base': 'i?min', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?max', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?min', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?axpby', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + 'c': {'exts': {'_k': {}}}, + 'z': {'exts': {'_k': {}}}, + }, + }, + # Level 2 BLAS + # There are additional sources so now we have srcs + # Ordered as per KERNEL.generic and Makefile.L2 + # exts are used to find the flags for each modality + # ext is attached to base (only useful from Level 2) + # i.e. baseext (e.g., gemv_n, gemv_t, cgeru_k, cgerc_k) + { 'base': '?gemv', + 'modes': { + 's': { + 'exts': { + # TODO(rg): Where are these coming from?? + # Most of these have both generic defines and also per-folder defines.. 
+ # Makefile lists sgemv_n_4.c as the source, though there is a sgemv_n.c + '_n': {}, + '_t': {}, + } + }, + 'd': { + 'exts': { + '_n': {}, + '_t': {}, + } + }, + # 'q': { + # 'exts': { + # '_n': {}, + # '_t': {}, + # } + # }, + 'c': { + 'exts': { + '_n': {}, + '_t': {}, + '_r': {}, + '_c': {}, + '_o': {}, + '_u': {}, + '_s': {}, + '_d': {}, + } + }, + 'z': { + 'exts': { + '_n': {}, + '_t': {}, + '_r': {}, + '_c': {}, + '_o': {}, + '_u': {}, + '_s': {}, + '_d': {}, + } + }, + # 'x': { + # 'exts': { + # '_n': {}, + # '_t': {}, + # '_r': {}, + # '_c': {}, + # '_o': {}, + # '_u': {}, + # '_s': {}, + # '_d': {}, + # } + # }, + }, + }, + { 'base': '?symv', + 'modes': { + 's': { + 'exts': { + '_U': {}, + '_L': {}, + } + }, + 'd': { + 'exts': { + '_U': {}, + '_L': {}, + } + }, + 'c': { + 'exts': { + '_U': {}, + '_L': {}, + } + }, + 'z': { + 'exts': { + '_U': {}, + '_L': {}, + } + }, + # 'q': { + # 'exts': { + # '_U': {}, + # '_L': {}, + # } + # }, + # 'x': { + # 'exts': { + # '_U': {}, + # '_L': {}, + # } + # }, + }, + }, + { 'base': '?lsame', + 'modes': { + '': {'exts': {'': {}}}, + }, + }, + { 'base': '?cabs', + 'modes': { + 's': {'exts': {'1': {}}}, + 'd': {'exts': {'1': {}}}, + # 'q': {'exts': {'': {}}}, + }, + }, + { 'base': '?gemm3m', + 'modes': { + 'c': {'exts': { + '_kernel': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_oncopyb': {'addl': ['-DUSE_ALPHA']}, + '_otcopyb': {'addl': ['-DUSE_ALPHA']}, + '_itcopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_itcopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_itcopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_incopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_incopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_oncopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_otcopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_incopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_oncopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_otcopyi': {'addl': ['-DUSE_ALPHA', 
'-DIMAGE_ONLY']}, + }}, + 'z': {'exts': { + '_kernel': {'addl': ['-DNN']}, + '_oncopyb': {'addl': ['-DUSE_ALPHA']}, + '_otcopyb': {'addl': ['-DUSE_ALPHA']}, + '_itcopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_itcopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_itcopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_incopyb': {'addl': ['-DICOPY', '-UUSE_ALPHA']}, + '_incopyr': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DREAL_ONLY']}, + '_oncopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_otcopyr': {'addl': ['-DUSE_ALPHA', '-DREAL_ONLY']}, + '_incopyi': {'addl': ['-DICOPY', '-UUSE_ALPHA', '-DIMAGE_ONLY']}, + '_oncopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + '_otcopyi': {'addl': ['-DUSE_ALPHA', '-DIMAGE_ONLY']}, + }}, + }, + }, + { 'base': '?ger', + 'modes': { + 's': {'exts': {'_k': {}}}, + 'd': {'exts': {'_k': {}}}, + # 'q': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?geru', + 'modes': { + 'c': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-UCONJ']}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?gerc', + 'modes': { + 'c': {'exts': {'_k': {'addl': ['-DCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-DCONJ']}}}, + # 'x': {'exts': {'_k': {}}}, + }, + }, + { 'base': '?gerv', + 'modes': { + 'c': {'exts': {'_k': {'addl': ['-UCONJ', '-DXCONJ']}}}, + 'z': {'exts': {'_k': {'addl': ['-UCONJ', '-DXCONJ']}}}, + }, + }, + { 'base': '?hemv', + 'modes': { + 'c': { + 'exts': { + '_U': {'addl': ['-DHEMV']}, + '_L': {'addl': ['-DHEMV']}, + '_V': {'addl': ['-DHEMV', '-DHEMVREV']}, + '_M': {'addl': ['-DHEMV', '-DHEMVREV']}, + } + }, + 'z': { + 'exts': { + '_U': {'addl': ['-DHEMV']}, + '_L': {'addl': ['-DHEMV']}, + '_V': {'addl': ['-DHEMV', '-DHEMVREV']}, + '_M': {'addl': ['-DHEMV', '-DHEMVREV']}, + } + }, + # 'x': { + # 'exts': { + # '_U': {}, + # '_L': {}, + # '_V': {}, + # '_M': {}, + # } + # }, + }, + }, + # { 'base': '?bgemv', + # 'modes': { + # 's': { + # 'exts': { + # '_n': {}, + # '_t': {}, + # } + # } + 
# }, + # }, + # Level 3 symbols + { 'base': '?gemm_kernel', + 'modes': { + 's': {'exts': {'': {}}}, + 'd': {'exts': {'': {}}}, + 'c': { + 'exts': { + '_n': {'addl': ['-DNN']}, + '_l': {'addl': ['-DCN']}, + # TODO(rg): What about _r conditionals? Makefile.L3:2969 + '_r': {'addl': ['-DNC']}, + '_b': {'addl': ['-DCC']}, + } + }, + 'z': { + 'exts': { + '_n': {'addl': ['-DNN']}, + '_l': {'addl': ['-DCN']}, + '_r': {'addl': ['-DNC']}, + '_b': {'addl': ['-DCC']}, + } + } + # 'q': {'exts': {'': {}}}, + # 'x': {'exts': {'': {}}}, + }, + }, + { 'base': '?trmm_kernel', + 'modes': { + 's': { + 'exts': { + '_LN': {}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA']}, + '_RN': {}, + '_RT': {}, + } + }, + 'd': { + 'exts': { + '_LN': {}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA']}, + '_RN': {}, + '_RT': {}, + } + }, + 'c': { + 'exts': { + '_LN': {'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_LR': {'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, + '_LC': {'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, + '_RN': {'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_RT': {'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_RR': {'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, + '_RC': {'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, + } + }, + 'z': { + 'exts': { + '_LN': {'addl': ['-DLEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_LT': {'addl': ['-DLEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_LR': {'addl': ['-DLEFT', '-UTRANSA', '-DCONJ', '-DCN']}, + '_LC': {'addl': ['-DLEFT', '-DTRANSA', '-DCONJ', '-DCN']}, + '_RN': {'addl': ['-ULEFT', '-UTRANSA', '-UCONJ', '-DNN']}, + '_RT': {'addl': ['-ULEFT', '-DTRANSA', '-UCONJ', '-DNN']}, + '_RR': {'addl': ['-ULEFT', '-UTRANSA', '-DCONJ', '-DNC']}, + '_RC': {'addl': ['-ULEFT', '-DTRANSA', '-DCONJ', '-DNC']}, + }, + }, + }, + }, + { 'base': '?trsm_kernel', + 'modes': { + 's': { + 'exts': { + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', 
'-UCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + }, + }, + 'd': { + 'exts': { + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + }, + }, + 'c': { + 'exts': { + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_LR': {'addl': ['-DLN', '-DUPPER', '-DCONJ']}, + '_LC': {'addl': ['-DLT', '-UUPPER', '-DCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_RR': {'addl': ['-DRN', '-DUPPER', '-DCONJ']}, + '_RC': {'addl': ['-DRT', '-UUPPER', '-DCONJ']}, + }, + }, + 'z': { + 'exts': { + '_LN': {'addl': ['-DLN', '-DUPPER', '-UCONJ']}, + '_LT': {'addl': ['-DLT', '-UUPPER', '-UCONJ']}, + '_LR': {'addl': ['-DLN', '-DUPPER', '-DCONJ']}, + '_LC': {'addl': ['-DLT', '-UUPPER', '-DCONJ']}, + '_RN': {'addl': ['-DRN', '-DUPPER', '-UCONJ']}, + '_RT': {'addl': ['-DRT', '-UUPPER', '-UCONJ']}, + '_RR': {'addl': ['-DRN', '-DUPPER', '-DCONJ']}, + '_RC': {'addl': ['-DRT', '-UUPPER', '-DCONJ']}, + }, + }, + }, + }, + { 'base': '?gemm', + 'modes': { + 's': {'exts': { + '_beta': {}, + '_small_matrix_permit': {}, + # TODO(rg): the _NUM prefixes are arch dependent + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, + # TODO(rg): direct and direct_performant are built only conditionally + '_direct': {}, + '_direct_performant': {}, + }}, + 'd': {'exts': { + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, + }}, + 'c': {'exts': { + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, + }}, + 'z': {'exts': { + '_beta': {}, + '_small_matrix_permit': {}, + '_incopy': {}, + '_itcopy': {}, + '_oncopy': {}, + '_otcopy': {}, + }}, + }, + }, + { 'base': 
'?trmm', + 'modes': { + 's': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'd': {'exts': { + '_iunucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': 
{'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'c': {'exts': { + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'z': {'exts': { + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + 
'_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + }, + }, + { 'base': '?hemm', + 'modes': { + 'c': {'exts': { + '_iutcopy': {'addl': ['-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DOUTER', '-DLOWER']}, + '3m_iucopyb': {}, + '3m_ilcopyb': {}, + '3m_olcopyb': {}, + '3m_oucopyb': {}, + '3m_olcopyr': {}, + '3m_oucopyr': {}, + '3m_iucopyr': {}, + '3m_ilcopyr': {}, + '3m_oucopyi': {}, + '3m_olcopyi': {}, + '3m_iucopyi': {}, + '3m_ilcopyi': {}, + }}, + 'z': {'exts': { + '_iutcopy': {'addl': ['-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DOUTER', '-DLOWER']}, + '3m_iucopyb': {}, + '3m_ilcopyb': {}, + '3m_olcopyb': {}, + '3m_oucopyb': {}, + '3m_olcopyr': {}, + '3m_oucopyr': {}, + '3m_iucopyr': {}, + '3m_ilcopyr': {}, + '3m_oucopyi': {}, + '3m_olcopyi': {}, + '3m_iucopyi': {}, + '3m_ilcopyi': {}, + }}, + }, + }, + { 'base': '?trsm', + 'modes': { + 's': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': 
{'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'd': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'q': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', 
'-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'c': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + 'z': {'exts': { + # TODO(rg): These actually use 
$(SGEMM_UNROLL_M) to choose the size + '_iunucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iunncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_ilnucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_ilnncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_iutucopy': {'addl': ['-UOUTER', '-ULOWER', '-DUNIT']}, + '_iutncopy': {'addl': ['-UOUTER', '-ULOWER', '-UUNIT']}, + '_iltucopy': {'addl': ['-UOUTER', '-DLOWER', '-DUNIT']}, + '_iltncopy': {'addl': ['-UOUTER', '-DLOWER', '-UUNIT']}, + '_ounucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_ounncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_olnucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_olnncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + '_outucopy': {'addl': ['-DOUTER', '-ULOWER', '-DUNIT']}, + '_outncopy': {'addl': ['-DOUTER', '-ULOWER', '-UUNIT']}, + '_oltucopy': {'addl': ['-DOUTER', '-DLOWER', '-DUNIT']}, + '_oltncopy': {'addl': ['-DOUTER', '-DLOWER', '-UUNIT']}, + }}, + }, + }, + { 'base': '?symm', + 'modes': { + 's': {'exts': { + # TODO(rg): These actually use $(SGEMM_UNROLL_M) to choose the size + '_iutcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-UDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, + }}, + 'd': {'exts': { + '_iutcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DDOUBLE', '-UCOMPLEX', '-DOUTER', '-DLOWER']}, + }}, + # NOTE(rg): for c,z; there are a bunch of 3m_i?copy* symbols where there + # may be a compulsion to reduce this into a series of loops; resist + # this, since there's no good way to extend the exts keys alone without + # replacing the whole dictionary. 
+ 'c': {'exts': { + '_iutcopy': {'addl': ['-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DOUTER', '-DLOWER']}, + '3m_iucopyb': {}, + '3m_ilcopyb': {}, + '3m_olcopyb': {}, + '3m_oucopyb': {}, + '3m_olcopyr': {}, + '3m_oucopyr': {}, + '3m_iucopyr': {}, + '3m_ilcopyr': {}, + '3m_oucopyi': {}, + '3m_olcopyi': {}, + '3m_iucopyi': {}, + '3m_ilcopyi': {}, + }}, + 'z': {'exts': { + '_iutcopy': {'addl': ['-UOUTER', '-ULOWER']}, + '_iltcopy': {'addl': ['-UOUTER', '-DLOWER']}, + '_outcopy': {'addl': ['-DOUTER', '-ULOWER']}, + '_oltcopy': {'addl': ['-DOUTER', '-DLOWER']}, + '3m_iucopyb': {}, + '3m_ilcopyb': {}, + '3m_olcopyb': {}, + '3m_oucopyb': {}, + '3m_olcopyr': {}, + '3m_oucopyr': {}, + '3m_iucopyr': {}, + '3m_ilcopyr': {}, + '3m_oucopyi': {}, + '3m_olcopyi': {}, + '3m_iucopyi': {}, + '3m_ilcopyi': {}, + }}, + }, + }, + { 'base': '?omatcopy_k', + 'modes': { + 's': {'exts': { + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, + }}, + 'd': {'exts': { + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, + }}, + 'c': {'exts': { + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, + }}, + 'z': {'exts': { + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, + }}, + }, + }, + { 'base': 
'?imatcopy_k', + 'modes': { + 's': {'exts': { + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, + }}, + 'd': {'exts': { + '_cn': {'addl': ['-UROWM']}, + '_rn': {'addl': ['-DROWM']}, + '_ct': {'addl': ['-UROWM']}, + '_rt': {'addl': ['-DROWM']}, + }}, + 'c': {'exts': { + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, + }}, + 'z': {'exts': { + '_cn': {'addl': ['-UROWM', '-UCONJ']}, + '_rn': {'addl': ['-DROWM', '-UCONJ']}, + '_ct': {'addl': ['-UROWM', '-UCONJ']}, + '_rt': {'addl': ['-DROWM', '-UCONJ']}, + '_cnc': {'addl': ['-UROWM', '-DCONJ']}, + '_rnc': {'addl': ['-DROWM', '-DCONJ']}, + '_ctc': {'addl': ['-UROWM', '-DCONJ']}, + '_rtc': {'addl': ['-DROWM', '-DCONJ']}, + }}, + }, + }, + { + 'base': '?geadd', + 'modes': { + 's': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'd': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'c': {'exts': {'_k': {'addl': ['-UROWM']}}}, + 'z': {'exts': {'_k': {'addl': ['-UROWM']}}}, + }, + }, + { 'base': '?gemm_small_kernel', + 'modes': { + 's': { + 'exts': { + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, + # '_b0_nn': {'addl': ['-DB0']}, + # '_b0_nt': {'addl': ['-DB0']}, + # '_b0_tn': {'addl': ['-DB0']}, + # '_b0_tt': {'addl': ['-DB0']}, + } + }, + 'd': { + 'exts': { + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, + } + }, + 'c': { + 'exts': { + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, + } + }, + 'z': { + 'exts': { + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': 
{}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, + } + }, + }, + }, + { 'base': '?gemm_small_kernel_b0', + 'modes': { + 's': { + 'exts': { + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, + } + }, + 'd': { + 'exts': { + '_nn': {}, + '_nt': {}, + '_tn': {}, + '_tt': {}, + } + }, + 'c': { + 'exts': { + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, + } + }, + 'z': { + 'exts': { + '_nn': {}, + '_nr': {}, + '_rn': {}, + '_rr': {}, + '_nt': {}, + '_nc': {}, + '_rt': {}, + '_rc': {}, + '_tn': {}, + '_tr': {}, + '_cn': {}, + '_cr': {}, + '_tt': {}, + '_tc': {}, + '_ct': {}, + '_cc': {}, + } + }, + }, + }, +] + +search_order = [base_dict] + +if conf_hdat.has('ARCH_X86_64') + search_order = [x86_64_base_dict] + search_order + if conf_hdat.has('HASWELL') + search_order = [x86_64_haswell_dict] + search_order + elif conf_hdat.has('SKYLAKEX') + search_order = [x86_64_skylakex_dict, x86_64_haswell_dict] + search_order + elif conf_hdat.has('ZEN') + search_order = [x86_64_zen_dict] + search_order + elif conf_hdat.has('SANDYBRIDGE') + search_order = [x86_64_sandybridge_dict] + search_order + endif +elif conf_hdat.has('ARCH_ARM64') + search_order = [arm64_base_dict] + search_order + if conf_hdat.has('ARMV8') + search_order = [arm64_armv8_dict] + search_order + endif +endif + +kernel_confs = [] +foreach _kop : base_kops + base = _kop['base'] + modes = _kop['modes'] + # Generate the symbol flags + _ckop_args = [] + if symb_defs.has_key(base) + symb_base = symb_defs[base] + if symb_base.has_key('def') + foreach _d : symb_base['def'] + _ckop_args += ('-D' + _d) + endforeach + endif + if symb_base.has_key('undef') + foreach _u : symb_base['undef'] + _ckop_args += ('-U' + _u) + endforeach + endif + endif + foreach mode, details : modes + if mode == 'x' or mode == 'q' + continue + 
endif + # Start a fresh per-mode argument list from _cargs plus the per-base symbol flags + __cargs = _cargs + _ckop_args + prec_mode = precision_mappings[mode] + # Append the -D/-U flags listed for this precision mode in precision_mappings + if prec_mode.has_key('def') + foreach _d : prec_mode['def'] + __cargs += ('-D' + _d) + endforeach + endif + if prec_mode.has_key('undef') + foreach _u : prec_mode['undef'] + __cargs += ('-U' + _u) + endforeach + endif + # One pass per ext; each pass produces the flags and source for one final symbol + foreach ext, extdat : details['exts'] + _ext_cargs = [] # Reset for every ext so flags never carry over between symbols + # Prefer the keyed ext_mappings entry. NOTE(review): 'except' is looked up on ext_mappings itself, not on ext_mappings[ext]; confirm this is intended + if ext_mappings.has_key(ext) and (not ext_mappings.has_key('except') or base not in ext_mappings['except']) + extmap = ext_mappings[ext] + if extmap.has_key('def') + foreach _d : extmap['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if extmap.has_key('undef') + foreach _u : extmap['undef'] + _ext_cargs += ['-U' + _u] + endforeach + endif + else + # Otherwise take the first entry of ext_mappings_l2 followed by ext_mappings_l3 that matches ext, mode and base + foreach ext_map : ext_mappings_l2 + ext_mappings_l3 + if ext_map['ext'] == ext and mode in ext_map['for'] and (not ext_map.has_key('except') or base not in ext_map['except']) + if ext_map.has_key('def') + foreach _d : ext_map['def'] + _ext_cargs += ['-D' + _d] + endforeach + endif + if ext_map.has_key('undef') + foreach _u : ext_map['undef'] + _ext_cargs += ['-U' + _u] + endforeach + endif + break + endif + endforeach + endif + + # ?{hemm,symm}3m_* rules: add the flags keyed on the prefix and on the suffix of ext + if '3m_' in ext + foreach sw_key, sw_val : m3_startswith + if ext.startswith(sw_key) + _ext_cargs += sw_val + endif + endforeach + foreach ew_key, ew_val : m3_endswith + if ext.endswith(ew_key) + _ext_cargs += ew_val + endif + endforeach + endif + if base == '?gemm3m' and conf_hdat.has('ARCH_ARM64') + continue # no ?gemm3m symbol is generated when ARCH_ARM64 is set + endif + + src = '' + foreach dict : search_order + if dict.has_key(base) and dict[base].has_key(mode) and dict[base][mode].has_key(ext) + src = dict[base][mode][ext] + break + endif + endforeach + if src == '' + error(f'Missing src file 
for @base@ @mode@ @ext@', search_order, conf_hdat.keys()) + endif + + if extdat.has_key('addl') + _ext_cargs += extdat['addl'] + endif + sym_name = base.replace('?', mode) + ext + sym_underscored = f'@sym_name@_' + _ext_cargs += [ + f'-DASMNAME=@asm_name_prefix@@sym_name@', + f'-DASMFNAME=@asm_name_prefix@@sym_underscored@', + f'-DNAME=@sym_underscored@', + f'-DCNAME=@sym_name@', + f'-DCHAR_NAME="@sym_underscored@"', + f'-DCHAR_CNAME="@sym_name@"', + ] + current_def = { + 'c_args': __cargs + _ext_cargs, + 'name': sym_name, + 'src': src + } + kernel_confs += current_def + endforeach + endforeach +endforeach + +_kern_libs = [] +_kern_deps = [] +_is_asm = false +foreach conf: kernel_confs + _kern_libs += static_library( + conf['name'], + conf['src'], + include_directories: _inc, + c_args: conf['c_args'], + # See gh discussion 13374 for why, basically .S are coded as fortran.. + fortran_args: conf['c_args'], + ) +endforeach + +_kern = static_library('_kern', + link_whole: _kern_libs, + dependencies: _kern_deps) diff --git a/subprojects/packagefiles/openblas/kernel/meson_base/meson.build b/subprojects/packagefiles/openblas/kernel/meson_base/meson.build new file mode 100644 index 000000000..7055fa591 --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/meson_base/meson.build @@ -0,0 +1,561 @@ +base_dict = { + # Level 1 BLAS + '?axpy': { + 's': { + '_k': 'arm/axpy.c', + }, + 'd': { + '_k': 'arm/axpy.c', + }, + 'c': { + '_k': 'arm/zaxpy.c', + }, + 'z': { + '_k': 'arm/zaxpy.c', + }, + }, + '?axpby': { + 's': { + '_k': 'arm/axpby.c', + }, + 'd': { + '_k': 'arm/axpby.c', + }, + 'c': { + '_k': 'arm/zaxpby.c', + }, + 'z': { + '_k': 'arm/zaxpby.c', + }, + }, + # Level 2 BLAS + '?symv': { + 's': { + '_U': 'generic/symv_k.c', + '_L': 'generic/symv_k.c', + }, + 'd': { + '_U': 'generic/symv_k.c', + '_L': 'generic/symv_k.c', + }, + 'c': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + 'z': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + }, 
+ '?ger': { + 's': { + '_k': 'generic/ger.c', + }, + 'd': { + '_k': 'generic/ger.c', + }, + }, + '?geru': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?gerc': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?gerv': { + 'c': { + '_k': 'generic/zger.c', + }, + 'z': { + '_k': 'generic/zger.c', + }, + }, + '?hemv': { + 'c': { + '_U': 'generic/zhemv_k.c', + '_L': 'generic/zhemv_k.c', + '_V': 'generic/zhemv_k.c', + '_M': 'generic/zhemv_k.c', + }, + 'z': { + '_U': 'generic/zhemv_k.c', + '_L': 'generic/zhemv_k.c', + '_V': 'generic/zhemv_k.c', + '_M': 'generic/zhemv_k.c', + }, + }, + # Level 3 BLAS + '?gemm': { + 's': { + '_direct': 'x86_64/sgemm_direct_skylakex.c', + '_direct_performant': 'x86_64/sgemm_direct_performant.c', + '_small_matrix_permit': 'generic/gemm_small_matrix_permit.c', + }, + 'd': { + '_small_matrix_permit': 'generic/gemm_small_matrix_permit.c', + }, + 'c': { + '_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c', + }, + 'z': { + '_small_matrix_permit': 'generic/zgemm_small_matrix_permit.c', + }, + }, + '?gemm3m': { + 'c': { + '_oncopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_oncopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_oncopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_otcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_N@.c', + '_incopyb': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_incopyr': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_incopyi': f'generic/zgemm3m_ncopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyb': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyr': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + '_itcopyi': f'generic/zgemm3m_tcopy_@CGEMM3M_UNROLL_M@.c', + }, + 'z': { + '_oncopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_oncopyi': 
f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_oncopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_otcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_N@.c', + '_incopyb': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_incopyi': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_incopyr': f'generic/zgemm3m_ncopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyb': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyi': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + '_itcopyr': f'generic/zgemm3m_tcopy_@ZGEMM3M_UNROLL_M@.c', + } + }, + '?trmm': { + 's': { + '_iunucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trmm_uncopy_@SGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trmm_lncopy_@SGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trmm_utcopy_@SGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trmm_ltcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iunucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_M@.c', + '_iltucopy': 
f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trmm_uncopy_@DGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trmm_lncopy_@DGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trmm_utcopy_@DGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trmm_ltcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iunucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrmm_uncopy_@CGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrmm_lncopy_@CGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrmm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrmm_ltcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iunucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_iltncopy': 
f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrmm_uncopy_@ZGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrmm_lncopy_@ZGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrmm_utcopy_@ZGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrmm_ltcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?hemm': { + 'c': { + '_iutcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zhemm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zhemm_ltcopy_@CGEMM_UNROLL_N@.c', + '3m_oucopyb': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_olcopyb': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + '3m_oucopyr': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_olcopyr': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + '3m_oucopyi': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_olcopyi': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + '3m_iucopyb': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_ilcopyb': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + '3m_iucopyr': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_ilcopyr': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + '3m_iucopyi': f'generic/zhemm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_ilcopyi': f'generic/zhemm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + }, + 'z': { + '_iutcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zhemm_utcopy_@ZGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zhemm_ltcopy_@ZGEMM_UNROLL_N@.c', + '3m_oucopyb': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_olcopyb': f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + '3m_oucopyr': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_olcopyr': 
f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + '3m_oucopyi': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_olcopyi': f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + '3m_iucopyb': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_ilcopyb': f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + '3m_iucopyr': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_ilcopyr': f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + '3m_iucopyi': f'generic/zhemm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_ilcopyi': f'generic/zhemm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + }, + }, + '?trsm': { + 's': { + '_iunucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trsm_uncopy_@SGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trsm_lncopy_@SGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trsm_utcopy_@SGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trsm_ltcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iunucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_M@.c', + 
'_ounucopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/trsm_uncopy_@DGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/trsm_lncopy_@DGEMM_UNROLL_N@.c', + '_outucopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c', + '_outncopy': f'generic/trsm_utcopy_@DGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/trsm_ltcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iunucopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c', + '_ounncopy': f'generic/ztrsm_uncopy_@CGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrsm_lncopy_@CGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrsm_utcopy_@CGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrsm_ltcopy_@CGEMM_UNROLL_N@.c', + }, + 'z': { + '_iunucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c', + '_iunncopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_M@.c', + '_ilnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c', + '_ilnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_M@.c', + '_iutucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iutncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_M@.c', + '_iltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_iltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_M@.c', + '_ounucopy': f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c', + '_ounncopy': 
f'generic/ztrsm_uncopy_@ZGEMM_UNROLL_N@.c', + '_olnucopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c', + '_olnncopy': f'generic/ztrsm_lncopy_@ZGEMM_UNROLL_N@.c', + '_outucopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c', + '_outncopy': f'generic/ztrsm_utcopy_@ZGEMM_UNROLL_N@.c', + '_oltucopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c', + '_oltncopy': f'generic/ztrsm_ltcopy_@ZGEMM_UNROLL_N@.c', + }, + }, + '?symm': { + 's': { + '_iutcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/symm_lcopy_@SGEMM_UNROLL_M@.c', + '_outcopy': f'generic/symm_ucopy_@SGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/symm_lcopy_@SGEMM_UNROLL_N@.c', + }, + 'd': { + '_iutcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_M@.c', + '_outcopy': f'generic/symm_ucopy_@DGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/symm_lcopy_@DGEMM_UNROLL_N@.c', + }, + 'c': { + '_iutcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_M@.c', + '_iltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zsymm_ucopy_@CGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zsymm_lcopy_@CGEMM_UNROLL_N@.c', + '3m_iucopyb': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_oucopyb': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_iucopyr': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_oucopyr': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_iucopyi': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_M@.c', + '3m_oucopyi': f'generic/zsymm3m_ucopy_@CGEMM3M_UNROLL_N@.c', + '3m_ilcopyb': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + '3m_olcopyb': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + '3m_ilcopyr': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + '3m_olcopyr': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + '3m_ilcopyi': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_M@.c', + '3m_olcopyi': f'generic/zsymm3m_lcopy_@CGEMM3M_UNROLL_N@.c', + }, + 'z': { + '_iutcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_M@.c', + '_iltcopy': 
f'generic/zsymm_lcopy_@ZGEMM_UNROLL_M@.c', + '_outcopy': f'generic/zsymm_ucopy_@ZGEMM_UNROLL_N@.c', + '_oltcopy': f'generic/zsymm_lcopy_@ZGEMM_UNROLL_N@.c', + '3m_iucopyb': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_oucopyb': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_iucopyr': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_oucopyr': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_iucopyi': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_M@.c', + '3m_oucopyi': f'generic/zsymm3m_ucopy_@ZGEMM3M_UNROLL_N@.c', + '3m_ilcopyb': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + '3m_olcopyb': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + '3m_ilcopyr': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + '3m_olcopyr': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + '3m_ilcopyi': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_M@.c', + '3m_olcopyi': f'generic/zsymm3m_lcopy_@ZGEMM3M_UNROLL_N@.c', + }, + }, + '?omatcopy_k': { + 's': { + '_cn': 'arm/omatcopy_cn.c', + '_rn': 'arm/omatcopy_rn.c', + '_ct': 'arm/omatcopy_ct.c', + '_rt': 'arm/omatcopy_rt.c', + }, + 'd': { + '_cn': 'arm/omatcopy_cn.c', + '_rn': 'arm/omatcopy_rn.c', + '_ct': 'arm/omatcopy_ct.c', + '_rt': 'arm/omatcopy_rt.c', + }, + 'c': { + '_cn': 'arm/zomatcopy_cn.c', + '_rn': 'arm/zomatcopy_rn.c', + '_ct': 'arm/zomatcopy_ct.c', + '_rt': 'arm/zomatcopy_rt.c', + '_cnc': 'arm/zomatcopy_cnc.c', + '_rnc': 'arm/zomatcopy_rnc.c', + '_ctc': 'arm/zomatcopy_ctc.c', + '_rtc': 'arm/zomatcopy_rtc.c', + }, + 'z': { + '_cn': 'arm/zomatcopy_cn.c', + '_rn': 'arm/zomatcopy_rn.c', + '_ct': 'arm/zomatcopy_ct.c', + '_rt': 'arm/zomatcopy_rt.c', + '_cnc': 'arm/zomatcopy_cnc.c', + '_rnc': 'arm/zomatcopy_rnc.c', + '_ctc': 'arm/zomatcopy_ctc.c', + '_rtc': 'arm/zomatcopy_rtc.c', + }, + }, + '?imatcopy_k': { + 's': { + '_cn': 'generic/imatcopy_cn.c', + '_rn': 'generic/imatcopy_rn.c', + '_ct': 'generic/imatcopy_ct.c', + '_rt': 'generic/imatcopy_rt.c', + }, + 'd': { + '_cn': 'generic/imatcopy_cn.c', + '_rn': 
'generic/imatcopy_rn.c', + '_ct': 'generic/imatcopy_ct.c', + '_rt': 'generic/imatcopy_rt.c', + }, + 'c': { + '_cn': 'generic/zimatcopy_cn.c', + '_rn': 'generic/zimatcopy_rn.c', + '_ct': 'generic/zimatcopy_ct.c', + '_rt': 'generic/zimatcopy_rt.c', + '_cnc': 'generic/zimatcopy_cnc.c', + '_rnc': 'generic/zimatcopy_rnc.c', + '_ctc': 'generic/zimatcopy_ctc.c', + '_rtc': 'generic/zimatcopy_rtc.c', + }, + 'z': { + '_cn': 'generic/zimatcopy_cn.c', + '_rn': 'generic/zimatcopy_rn.c', + '_ct': 'generic/zimatcopy_ct.c', + '_rt': 'generic/zimatcopy_rt.c', + '_cnc': 'generic/zimatcopy_cnc.c', + '_rnc': 'generic/zimatcopy_rnc.c', + '_ctc': 'generic/zimatcopy_ctc.c', + '_rtc': 'generic/zimatcopy_rtc.c', + }, + }, + '?geadd': { + 's': { + '_k': 'generic/geadd.c', + }, + 'd': { + '_k': 'generic/geadd.c', + }, + 'c': { + '_k': 'generic/zgeadd.c', + }, + 'z': { + '_k': 'generic/zgeadd.c', + }, + }, + '?gemm_small_kernel': { + 's': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'd': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'c': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 
'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + 'z': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + }, + '?gemm_small_kernel_b0': { + 's': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'd': { + '_nn': 'generic/gemm_small_matrix_kernel_nn.c', + '_nt': 'generic/gemm_small_matrix_kernel_nt.c', + '_tn': 'generic/gemm_small_matrix_kernel_tn.c', + '_tt': 'generic/gemm_small_matrix_kernel_tt.c', + }, + 'c': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 
'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + 'z': { + '_nn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rn': 'generic/zgemm_small_matrix_kernel_nn.c', + '_rr': 'generic/zgemm_small_matrix_kernel_nn.c', + '_nt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_nc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rt': 'generic/zgemm_small_matrix_kernel_nt.c', + '_rc': 'generic/zgemm_small_matrix_kernel_nt.c', + '_tn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cn': 'generic/zgemm_small_matrix_kernel_tn.c', + '_cr': 'generic/zgemm_small_matrix_kernel_tn.c', + '_tt': 'generic/zgemm_small_matrix_kernel_tt.c', + '_tc': 'generic/zgemm_small_matrix_kernel_tt.c', + '_ct': 'generic/zgemm_small_matrix_kernel_tt.c', + '_cc': 'generic/zgemm_small_matrix_kernel_tt.c', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/kernel/x86_64/meson.build b/subprojects/packagefiles/openblas/kernel/x86_64/meson.build new file mode 100644 index 000000000..ff1180c7d --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/x86_64/meson.build @@ -0,0 +1,385 @@ +x86_64_base_dict = { + # Level 1 BLAS + '?rot': { + 's': { + '_k': 'x86_64/rot_sse.S', + }, + 'd': { + '_k': 'x86_64/rot_sse2.S', + }, + 'cs': { + '_k': 'x86_64/zrot_sse.S', + }, + 'zd': { + '_k': 'x86_64/zrot_sse2.S', + } + }, + '?swap': { + 's': { + '_k': 'x86_64/swap_sse.S', + }, + 'd': { + '_k': 'x86_64/swap_sse2.S', + }, + 'c': { + '_k': 'x86_64/zswap_sse.S', + }, + 'z': { + '_k': 'x86_64/zswap_sse2.S', + }, + }, + '?scal': { + 's': { + '_k': 'x86_64/scal_sse.S', + }, + 'd': { + '_k': 'x86_64/scal_sse2.S', + }, + 
'c': { + '_k': 'x86_64/zscal_sse.S', + }, + 'z': { + '_k': 'x86_64/zscal_sse2.S', + }, + }, + '?copy': { + 's': { + '_k': 'x86_64/copy_sse.S', + }, + 'd': { + '_k': 'x86_64/copy_sse2.S', + }, + 'c': { + '_k': 'x86_64/zcopy_sse.S', + }, + 'z': { + '_k': 'x86_64/zcopy_sse2.S', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/axpy_sse.S', + }, + 'd': { + '_k': 'x86_64/axpy_sse2.S', + }, + 'c': { + '_k': 'x86_64/zaxpy_sse.S', + }, + 'z': { + '_k': 'x86_64/zaxpy_sse2.S', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/zaxpy_sse.S', + }, + 'z': { + '_k': 'x86_64/zaxpy_sse2.S', + }, + }, + '?dot': { + 's': { + '_k': 'generic/dot.c', + }, + 'd': { + '_k': 'x86_64/dot_sse2.S', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/zdot_sse.S', + }, + 'z': { + '_k': 'x86_64/zdot_sse2.S', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/zdot_sse.S', + }, + 'z': { + '_k': 'x86_64/zdot_sse2.S', + }, + }, + '?dsdot': { + 's': { + '_k': 'generic/dot.c', + }, + '': { + '_k': 'generic/dot.c', + }, + }, + '?nrm2': { + 's': { + '_k': 'x86_64/nrm2_sse.S', + }, + 'd': { + '_k': 'x86_64/nrm2.S', + }, + 'c': { + '_k': 'x86_64/znrm2_sse.S', + }, + 'z': { + '_k': 'x86_64/znrm2.S', + }, + }, + '?asum': { + 's': { + '_k': 'x86_64/asum_sse.S', + }, + 'd': { + '_k': 'x86_64/asum_sse2.S', + }, + 'c': { + '_k': 'x86_64/zasum_sse.S', + }, + 'z': { + '_k': 'x86_64/zasum_sse2.S', + }, + }, + '?amax': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + 'c': { + '_k': 'x86_64/zamax_sse.S', + }, + 'z': { + '_k': 'x86_64/zamax_sse2.S', + }, + }, + '?sum': { + 's': { + '_k': 'arm/sum.c', + }, + 'd': { + '_k': 'arm/sum.c', + }, + 'c': { + '_k': 'x86_64/zsum_sse.S', + }, + 'z': { + '_k': 'x86_64/zsum_sse2.S', + }, + }, + '?amin': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + 'c': { + '_k': 'x86_64/zamax_sse.S', + }, + 'z': { + '_k': 'x86_64/zamax_sse2.S', + }, + }, + 'i?amax': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { 
+ '_k': 'x86_64/iamax_sse2.S', + }, + 'c': { + '_k': 'x86_64/izamax_sse.S', + }, + 'z': { + '_k': 'x86_64/izamax_sse2.S', + }, + }, + 'i?amin': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + 'c': { + '_k': 'x86_64/izamax_sse.S', + }, + 'z': { + '_k': 'x86_64/izamax_sse2.S', + }, + }, + 'i?max': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + }, + 'i?min': { + 's': { + '_k': 'x86_64/iamax_sse.S', + }, + 'd': { + '_k': 'x86_64/iamax_sse2.S', + }, + }, + '?max': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + }, + '?min': { + 's': { + '_k': 'x86_64/amax_sse.S', + }, + 'd': { + '_k': 'x86_64/amax_sse2.S', + }, + }, + '?axpby': { + 's': { + '_k': 'arm/axpby.c', + }, + 'd': { + '_k': 'arm/axpby.c', + }, + 'c': { + '_k': 'arm/zaxpby.c', + }, + 'z': { + '_k': 'arm/zaxpby.c', + }, + }, + # Level 2 BLAS + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n.c', + '_t': 'x86_64/sgemv_t.c', + }, + 'd': { + '_n': 'x86_64/dgemv_n.S', + '_t': 'x86_64/dgemv_t_4.c', + }, + 'c': { + '_n': 'x86_64/cgemv_n_4.c', + '_t': 'x86_64/cgemv_t_4.c', + '_r': 'x86_64/cgemv_n_4.c', + '_c': 'x86_64/cgemv_t_4.c', + '_o': 'x86_64/cgemv_n_4.c', + '_u': 'x86_64/cgemv_t_4.c', + '_s': 'x86_64/cgemv_n_4.c', + '_d': 'x86_64/cgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + '_t': 'x86_64/zgemv_t_4.c', + '_r': 'x86_64/zgemv_n_4.c', + '_c': 'x86_64/zgemv_t_4.c', + '_o': 'x86_64/zgemv_n_4.c', + '_u': 'x86_64/zgemv_t_4.c', + '_s': 'x86_64/zgemv_n_4.c', + '_d': 'x86_64/zgemv_t_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/symv_U_sse.S', + '_L': 'x86_64/symv_L_sse.S', + }, + 'd': { + '_U': 'x86_64/symv_U_sse2.S', + '_L': 'x86_64/symv_L_sse2.S', + }, + 'c': { + '_U': 'generic/zsymv_k.c', + '_L': 'generic/zsymv_k.c', + }, + 'z': { + '_U': 'x86_64/zsymv_U_sse2.S', + '_L': 'x86_64/zsymv_L_sse2.S', + }, + }, + '?lsame': { + '': { + '': 'x86_64/lsame.S', + } + }, + '?cabs': { 
+ 's': { + '1': 'x86_64/cabs.S', + }, + 'd': { + '1': 'x86_64/cabs.S', + }, + }, + '?gemm3m': { + + }, + '?hemv': { + 'z': { + '_U': 'x86_64/zsymv_U_sse2.S', + '_L': 'x86_64/zsymv_L_sse2.S', + }, + }, + # Level 3 BLAS + '?gemm_kernel': { + # done + }, + '?trmm_kernel': { + # done + }, + '?trsm_kernel': { + # done + }, + '?gemm': { + 's': { + '_beta': 'x86_64/gemm_beta.S', + '_small_matrix_permit': 'generic/gemm_small_matrix_permit.c', + }, + 'd': { + '_beta': 'x86_64/gemm_beta.S', + }, + 'c': { + '_beta': 'x86_64/zgemm_beta.S', + }, + 'z': { + '_beta': 'x86_64/zgemm_beta.S', + }, + }, + '?trmm': { + + }, + '?hemm': { + + }, + '?trsm': { + + }, + '?symm': { + + }, + '?omatcopy_k': { + + }, + '?imatcopy_k': { + + }, + '?geadd': { + + }, + '?gemm_small_kernel': { + + }, + '?gemm_small_kernel_b0': { + + }, +} + +subdir('meson_haswell') +subdir('meson_skylakex') +subdir('meson_zen') +subdir('meson_sandybridge') diff --git a/subprojects/packagefiles/openblas/kernel/x86_64/meson_haswell/meson.build b/subprojects/packagefiles/openblas/kernel/x86_64/meson_haswell/meson.build new file mode 100644 index 000000000..45ccce05c --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/x86_64/meson_haswell/meson.build @@ -0,0 +1,238 @@ +x86_64_haswell_dict = { + '?scal': { + 's': { + '_k': 'x86_64/sscal.c', + }, + 'd': { + '_k': 'x86_64/dscal.c', + }, + 'c': { + '_k': 'x86_64/cscal.c', + }, + 'z': { + '_k': 'x86_64/zscal.c', + }, + }, + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n_4.c', + '_t': 'x86_64/sgemv_t_4.c', + }, + 'd': { + '_n': 'x86_64/dgemv_n_4.c', + '_t': 'x86_64/dgemv_t_4.c', + }, + 'c': { + '_n': 'x86_64/cgemv_n_4.c', + '_t': 'x86_64/cgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + '_t': 'x86_64/zgemv_t_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/ssymv_U.c', + '_L': 'x86_64/ssymv_L.c', + }, + 'd': { + '_U': 'x86_64/dsymv_U.c', + '_L': 'x86_64/dsymv_L.c', + }, + }, + '?dot': { + 's': { + '_k': 'x86_64/sdot.c', + }, + 'd': { + '_k': 
'x86_64/ddot.c', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dsdot': { + '': { + '_k': 'x86_64/sdot.c', + }, + 's': { + '_k': 'x86_64/sdot.c', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/saxpy.c', + }, + 'd': { + '_k': 'x86_64/daxpy.c', + }, + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_8x4_haswell_2.c', + }, + 'd': { + '': 'x86_64/dgemm_kernel_4x8_haswell.S', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_l': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_r': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_b': 'x86_64/cgemm_kernel_8x2_haswell.c', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_l': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_r': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_b': 'x86_64/zgemm_kernel_4x2_haswell.c', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_LT': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RT': 'x86_64/sgemm_kernel_8x4_haswell.c', + }, + 'd': { + '_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + }, + 'c': { + '_LN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LC': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RC': 'x86_64/cgemm_kernel_8x2_haswell.S', + }, + 'z': { + '_LN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LT': 
'x86_64/zgemm_kernel_4x2_haswell.S', + '_LR': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LC': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RT': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RR': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RC': 'x86_64/zgemm_kernel_4x2_haswell.S', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c', + '_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c', + '_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c', + '_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'x86_64/dtrsm_kernel_RN_haswell.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?gemm': { + 's': { + '_beta': 'x86_64/sgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_8.c', + '_itcopy': 'generic/gemm_tcopy_8.c', + '_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_beta': 'x86_64/dgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_4.c', + '_itcopy': 'generic/gemm_tcopy_4.c', + '_oncopy': 'x86_64/dgemm_ncopy_8_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_8.c', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + 'z': { + '_incopy': 
'generic/zgemm_ncopy_4.c', + '_itcopy': 'generic/zgemm_tcopy_4.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + }, + '?gemm3m': { + 'c': { + '_kernel': 'x86_64/cgemm3m_kernel_8x4_haswell.c', + }, + 'z': { + '_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c', + }, + }, + '?asum': { + 's': { + '_k': 'x86_64/sasum.c', + }, + 'd': { + '_k': 'x86_64/dasum.c', + }, + }, + '?rot': { + 's': { + '_k': 'x86_64/srot.c', + }, + 'd': { + '_k': 'x86_64/drot.c', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/kernel/x86_64/meson_sandybridge/meson.build b/subprojects/packagefiles/openblas/kernel/x86_64/meson_sandybridge/meson.build new file mode 100644 index 000000000..1c83a58fc --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/x86_64/meson_sandybridge/meson.build @@ -0,0 +1,213 @@ +x86_64_sandybridge_dict = { + '?scal': { + 'd': { + '_k': 'x86_64/dscal.c', + }, + 'c': { + '_k': 'x86_64/cscal.c', + }, + }, + '?ger': { + 's': { + '_k': 'x86_64/sger.c', + }, + 'd': { + '_k': 'x86_64/dger.c', + }, + }, + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n_4.c', + '_t': 'x86_64/sgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/ssymv_U.c', + '_L': 'x86_64/ssymv_L.c', + }, + 'd': { + '_U': 'x86_64/dsymv_U.c', + '_L': 'x86_64/dsymv_L.c', + }, + }, + '?dot': { + 's': { + '_k': 'x86_64/sdot.c', + }, + 'd': { + '_k': 'x86_64/ddot.c', + }, + }, + '?dotc': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dsdot': { + '': { + '_k': 'x86_64/sdot.c', + }, + 's': { + '_k': 'x86_64/sdot.c', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/saxpy.c', + }, + 'd': { + '_k': 'x86_64/daxpy.c', + }, + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 
'x86_64/zaxpy.c', + }, + }, + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_16x4_sandy.S', + }, + 'd': { + '': 'x86_64/dgemm_kernel_4x8_sandy.S', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_l': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_r': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_b': 'x86_64/cgemm_kernel_8x2_sandy.S', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_l': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_r': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_b': 'x86_64/zgemm_kernel_1x4_nehalem.S', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_LT': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_RN': 'x86_64/sgemm_kernel_16x4_sandy.S', + '_RT': 'x86_64/sgemm_kernel_16x4_sandy.S', + }, + 'd': { + '_LN': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_LT': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_RN': 'x86_64/dgemm_kernel_4x8_sandy.S', + '_RT': 'x86_64/dgemm_kernel_4x8_sandy.S', + }, + 'c': { + '_LN': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LT': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LR': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_LC': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RN': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RT': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RR': 'x86_64/cgemm_kernel_8x2_sandy.S', + '_RC': 'x86_64/cgemm_kernel_8x2_sandy.S', + }, + 'z': { + '_LN': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LT': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LR': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_LC': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RN': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RT': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RR': 'x86_64/zgemm_kernel_1x4_nehalem.S', + '_RC': 'x86_64/zgemm_kernel_1x4_nehalem.S', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 
'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?gemm': { + 's': { + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'generic/gemm_tcopy_16.c', + '_oncopy': 'generic/gemm_ncopy_4.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_incopy': 'generic/gemm_ncopy_8.c', + '_itcopy': 'generic/gemm_tcopy_8.c', + '_oncopy': 'generic/gemm_ncopy_4.c', + '_otcopy': 'generic/gemm_tcopy_4.c', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + 'z': { + '_incopy': 'x86_64/zgemm_ncopy_1.S', + '_itcopy': 'x86_64/zgemm_tcopy_1.S', + '_oncopy': 'generic/zgemm_ncopy_4.c', + '_otcopy': 'generic/zgemm_tcopy_4.c' + }, + }, + '?gemm3m': { + 'c': { + '_kernel': 'x86_64/zgemm3m_kernel_4x8_nehalem.S', + }, + 'z': { + '_kernel': 'x86_64/zgemm3m_kernel_2x8_nehalem.S', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/kernel/x86_64/meson_skylakex/meson.build b/subprojects/packagefiles/openblas/kernel/x86_64/meson_skylakex/meson.build new file mode 100644 index 000000000..8f28e6929 --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/x86_64/meson_skylakex/meson.build @@ -0,0 +1,109 @@ +x86_64_skylakex_dict = { + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_16x4_skylakex_3.c', + }, + 'd': { + 
'': 'x86_64/dgemm_kernel_16x2_skylakex.c', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_l': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_r': 'x86_64/cgemm_kernel_8x2_skylakex.c', + '_b': 'x86_64/cgemm_kernel_8x2_skylakex.c', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_l': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_r': 'x86_64/zgemm_kernel_4x2_skylakex.c', + '_b': 'x86_64/zgemm_kernel_4x2_skylakex.c', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_LT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_RN': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + '_RT': 'x86_64/sgemm_kernel_16x4_skylakex_2.c', + }, + 'd': { + '_LN': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_LT': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_RN': 'x86_64/dgemm_kernel_16x2_skylakex.c', + '_RT': 'x86_64/dgemm_kernel_16x2_skylakex.c', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'd': { + '_RN': 'generic/trsm_kernel_RN.c', + }, + }, + '?gemm_small_kernel': { + 's': { + '_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c', + }, + 'd': { + '_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/dgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c', + }, + }, + '?gemm_small_kernel_b0': { + 's': { + '_nn': 'x86_64/sgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/sgemm_small_kernel_nt_skylakex.c', + '_tn': 'x86_64/sgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/sgemm_small_kernel_tt_skylakex.c', + }, + 'd': { + '_nn': 'x86_64/dgemm_small_kernel_nn_skylakex.c', + '_nt': 'x86_64/dgemm_small_kernel_nt_skylakex.c', + '_tn': 
'x86_64/dgemm_small_kernel_tn_skylakex.c', + '_tt': 'x86_64/dgemm_small_kernel_tt_skylakex.c', + }, + }, + '?gemm': { + 's': { + '_small_matrix_permit': 'x86_64/sgemm_small_kernel_permit_skylakex.c', + '_beta': 'x86_64/sgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'x86_64/sgemm_tcopy_16_skylakex.c', + '_oncopy': 'x86_64/sgemm_ncopy_4_skylakex.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_small_matrix_permit': 'x86_64/dgemm_small_kernel_permit_skylakex.c', + '_beta': 'x86_64/dgemm_beta_skylakex.c', + '_incopy': 'generic/gemm_ncopy_16.c', + '_itcopy': 'x86_64/dgemm_tcopy_16_skylakex.c', + '_oncopy': 'generic/gemm_ncopy_2.c', + '_otcopy': 'generic/gemm_tcopy_2.c', + }, + }, + '?asum': { + 'c': { + '_k': 'x86_64/casum.c', + }, + 'z': { + '_k': 'x86_64/zasum.c', + }, + }, + '?sum': { + 'c': { + '_k': 'x86_64/csum.c', + }, + 'z': { + '_k': 'x86_64/zsum.c', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/kernel/x86_64/meson_zen/meson.build b/subprojects/packagefiles/openblas/kernel/x86_64/meson_zen/meson.build new file mode 100644 index 000000000..ff4ce2f7a --- /dev/null +++ b/subprojects/packagefiles/openblas/kernel/x86_64/meson_zen/meson.build @@ -0,0 +1,228 @@ +x86_64_zen_dict = { + '?scal': { + 's': { + '_k': 'x86_64/sscal.c', + }, + 'd': { + '_k': 'x86_64/dscal.c', + }, + 'c': { + '_k': 'x86_64/cscal.c', + }, + 'z': { + '_k': 'x86_64/zscal.c', + }, + }, + '?gemv': { + 's': { + '_n': 'x86_64/sgemv_n_4.c', + '_t': 'x86_64/sgemv_t_4.c', + }, + 'd': { + '_n': 'x86_64/dgemv_n_4.c', + '_t': 'x86_64/dgemv_t_4.c', + }, + 'c': { + '_n': 'x86_64/cgemv_n_4.c', + '_t': 'x86_64/cgemv_t_4.c', + }, + 'z': { + '_n': 'x86_64/zgemv_n_4.c', + '_t': 'x86_64/zgemv_t_4.c', + }, + }, + '?symv': { + 's': { + '_U': 'x86_64/ssymv_U.c', + '_L': 'x86_64/ssymv_L.c', + }, + 'd': { + '_U': 'x86_64/dsymv_U.c', + '_L': 'x86_64/dsymv_L.c', + }, + }, + '?dot': { + 's': { + '_k': 'x86_64/sdot.c', + }, + 'd': { + '_k': 'x86_64/ddot.c', + }, + }, + 
'?dotc': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dotu': { + 'c': { + '_k': 'x86_64/cdot.c', + }, + 'z': { + '_k': 'x86_64/zdot.c', + }, + }, + '?dsdot': { + '': { + '_k': 'x86_64/sdot.c', + }, + 's': { + '_k': 'x86_64/sdot.c', + }, + }, + '?axpy': { + 's': { + '_k': 'x86_64/saxpy.c', + }, + 'd': { + '_k': 'x86_64/daxpy.c', + }, + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?axpyc': { + 'c': { + '_k': 'x86_64/caxpy.c', + }, + 'z': { + '_k': 'x86_64/zaxpy.c', + }, + }, + '?gemm_kernel': { + 's': { + '': 'x86_64/sgemm_kernel_8x4_haswell_2.c', + }, + 'd': { + '': 'x86_64/dgemm_kernel_4x8_haswell.S', + }, + 'c': { + '_n': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_l': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_r': 'x86_64/cgemm_kernel_8x2_haswell.c', + '_b': 'x86_64/cgemm_kernel_8x2_haswell.c', + }, + 'z': { + '_n': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_l': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_r': 'x86_64/zgemm_kernel_4x2_haswell.c', + '_b': 'x86_64/zgemm_kernel_4x2_haswell.c', + }, + }, + '?trmm_kernel': { + 's': { + '_LN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_LT': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RN': 'x86_64/sgemm_kernel_8x4_haswell.c', + '_RT': 'x86_64/sgemm_kernel_8x4_haswell.c', + }, + 'd': { + '_LN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_LT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RN': 'x86_64/dtrmm_kernel_4x8_haswell.c', + '_RT': 'x86_64/dtrmm_kernel_4x8_haswell.c', + }, + 'c': { + '_LN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_LC': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RN': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RT': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RR': 'x86_64/cgemm_kernel_8x2_haswell.S', + '_RC': 'x86_64/cgemm_kernel_8x2_haswell.S', + }, + 'z': { + '_LN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LT': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_LR': 
'x86_64/zgemm_kernel_4x2_haswell.S', + '_LC': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RN': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RT': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RR': 'x86_64/zgemm_kernel_4x2_haswell.S', + '_RC': 'x86_64/zgemm_kernel_4x2_haswell.S', + }, + }, + '?trsm_kernel': { + 's': { + '_LN': 'x86_64/strsm_kernel_8x4_haswell_LN.c', + '_LT': 'x86_64/strsm_kernel_8x4_haswell_LT.c', + '_RN': 'x86_64/strsm_kernel_8x4_haswell_RN.c', + '_RT': 'x86_64/strsm_kernel_8x4_haswell_RT.c', + }, + 'd': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_RN': 'x86_64/dtrsm_kernel_RN_haswell.c', + '_RT': 'generic/trsm_kernel_RT.c', + }, + 'c': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + 'z': { + '_LN': 'generic/trsm_kernel_LN.c', + '_LT': 'generic/trsm_kernel_LT.c', + '_LR': 'generic/trsm_kernel_LN.c', + '_LC': 'generic/trsm_kernel_LT.c', + '_RN': 'generic/trsm_kernel_RN.c', + '_RT': 'generic/trsm_kernel_RT.c', + '_RR': 'generic/trsm_kernel_RN.c', + '_RC': 'generic/trsm_kernel_RT.c', + }, + }, + '?gemm': { + 's': { + '_incopy': 'generic/gemm_ncopy_8.c', + '_itcopy': 'generic/gemm_tcopy_8.c', + '_oncopy': 'generic/gemm_ncopy_4.c', + '_otcopy': 'generic/gemm_tcopy_4.c' + }, + 'd': { + '_incopy': 'generic/gemm_ncopy_4.c', + '_itcopy': 'generic/gemm_tcopy_4.c', + '_oncopy': 'generic/gemm_ncopy_8.c', + '_otcopy': 'generic/gemm_tcopy_8.c', + }, + 'c': { + '_incopy': 'generic/zgemm_ncopy_8.c', + '_itcopy': 'generic/zgemm_tcopy_8.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, + 'z': { + '_incopy': 'generic/zgemm_ncopy_4.c', + '_itcopy': 'generic/zgemm_tcopy_4.c', + '_oncopy': 'generic/zgemm_ncopy_2.c', + '_otcopy': 'generic/zgemm_tcopy_2.c' + }, 
+ }, + '?gemm3m': { + 'c': { + '_kernel': 'x86_64/cgemm3m_kernel_8x4_haswell.c', + }, + 'z': { + '_kernel': 'x86_64/zgemm3m_kernel_4x4_haswell.c', + }, + }, + '?rot': { + 's': { + '_k': 'x86_64/srot.c', + }, + 'd': { + '_k': 'x86_64/drot.c', + }, + }, +} diff --git a/subprojects/packagefiles/openblas/lapack-netlib/BLAS/SRC/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/SRC/meson.build new file mode 100644 index 000000000..920c06e91 --- /dev/null +++ b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/SRC/meson.build @@ -0,0 +1,284 @@ +# Derived from the CMakeLists.txt +# +# Relevant groups and variables: +# +# _allblas -- Auxiliary routines for Level 2 and 3 BLAS +# +# Level 1 BLAS # +# +# _dblas1 -- Double precision real BLAS 1 routines +# _zblas1 -- Double precision complex BLAS 1 routines +# _zb1aux -- D.P. real BLAS 1 routines called by d.p. complex routines +# +# _sblas1 -- Single precision real BLAS routines +# _cblas1 -- Single precision complex BLAS routines +# _cb1aux -- Real BLAS routines called by complex routines +# +# Level 2 BLAS # +# +# _dblas2 -- Double precision real BLAS 2 routines +# _zblas2 -- Double precision complex BLAS 2 routines +# +# _sblas2 -- Single precision real BLAS2 routines +# _cblas2 -- Single precision complex BLAS2 routines +# +# Level 3 BLAS # +# +# _dblas3 -- Double precision real BLAS 3 routines +# _zblas3 -- Double precision complex BLAS 3 routines +# +# _sblas3 -- Single precision real BLAS3 routines +# _cblas3 -- Single precision complex BLAS3 routines + +# _allblas -- Auxiliary routines for Level 2 and 3 BLAS +_allblas = library('_allblas', + sources: [ 'lsame.f', 'xerbla.f', 'xerbla_array.f' ]) + + +# All other sources +_blas_netlib_srcs = [] + +# Level 1 BLAS +# _dblas1 -- Double precision real BLAS 1 routines +_dblas1 = [ + 'idamax.f', + 'dasum.f', + 'daxpy.f', + 'dcopy.f', + 'ddot.f', + 'dnrm2.f', + 'drot.f', + 'drotg.f', + 'dscal.f', + 'dsdot.f', + 'dswap.f', + 'drotmg.f', + 'drotm.f', +] + +# 
_zblas1 -- Double precision complex BLAS 1 routines
+_zblas1 = [
+  'dcabs1.f',
+  'dzasum.f',
+  'dznrm2.f',
+  'izamax.f',
+  'zaxpy.f',
+  'zcopy.f',
+  'zdotc.f',
+  'zdotu.f',
+  'zdscal.f',
+  'zrotg.f',
+  'zscal.f',
+  'zswap.f',
+  'zdrot.f',
+]
+
+# _zb1aux -- double precision real BLAS routines the double complex routines call
+_zb1aux = [
+  'idamax.f',
+  'dasum.f',
+  'daxpy.f',
+  'dcopy.f',
+  'dnrm2.f',
+  'dscal.f',
+]
+
+# _sblas1 -- single precision real BLAS routines
+_sblas1 = [
+  'isamax.f',
+  'sasum.f',
+  'saxpy.f',
+  'scopy.f',
+  'sdot.f',
+  'snrm2.f',
+  'srot.f',
+  'srotg.f',
+  'sscal.f',
+  'sswap.f',
+  'sdsdot.f',
+  'srotmg.f',
+  'srotm.f',
+]
+
+# _cblas1 -- single precision complex BLAS routines
+_cblas1 = [
+  'scabs1.f',
+  'scasum.f',
+  'scnrm2.f',
+  'icamax.f',
+  'caxpy.f',
+  'ccopy.f',
+  'cdotc.f',
+  'cdotu.f',
+  'csscal.f',
+  'crotg.f',
+  'cscal.f',
+  'cswap.f',
+  'csrot.f',
+]
+
+# _cb1aux -- single precision real BLAS routines the complex routines call
+_cb1aux = [
+  'isamax.f',
+  'sasum.f',
+  'saxpy.f',
+  'scopy.f',
+  'snrm2.f',
+  'sscal.f',
+]
+
+# Level 2 BLAS
+# _dblas2 -- double precision real BLAS 2 routines
+_dblas2 = [
+  'dgemv.f',
+  'dgbmv.f',
+  'dsymv.f',
+  'dsbmv.f',
+  'dspmv.f',
+  'dtrmv.f',
+  'dtbmv.f',
+  'dtpmv.f',
+  'dtrsv.f',
+  'dtbsv.f',
+  'dtpsv.f',
+  'dger.f',
+  'dsyr.f',
+  'dspr.f',
+  'dsyr2.f',
+  'dspr2.f',
+]
+
+# _zblas2 -- double precision complex BLAS 2 routines
+_zblas2 = [
+  'zgemv.f',
+  'zgbmv.f',
+  'zhemv.f',
+  'zhbmv.f',
+  'zhpmv.f',
+  'ztrmv.f',
+  'ztbmv.f',
+  'ztpmv.f',
+  'ztrsv.f',
+  'ztbsv.f',
+  'ztpsv.f',
+  'zgerc.f',
+  'zgeru.f',
+  'zher.f',
+  'zhpr.f',
+  'zher2.f',
+  'zhpr2.f',
+]
+
+# _sblas2 -- single precision real BLAS 2 routines
+_sblas2 = [
+  'sgemv.f',
+  'sgbmv.f',
+  'ssymv.f',
+  'ssbmv.f',
+  'sspmv.f',
+  'strmv.f',
+  'stbmv.f',
+  'stpmv.f',
+  'strsv.f',
+  'stbsv.f',
+  'stpsv.f',
+  'sger.f',
+  'ssyr.f',
+  'sspr.f',
+  'ssyr2.f',
+  'sspr2.f',
+]
+
+# _cblas2 -- single precision complex BLAS 2 routines
+_cblas2 = [
+  'cgemv.f',
+  'cgbmv.f',
+  'chemv.f',
+  'chbmv.f',
+  'chpmv.f',
+  'ctrmv.f',
+  'ctbmv.f',
+  'ctpmv.f',
+  'ctrsv.f',
+  'ctbsv.f',
+  'ctpsv.f',
+  'cgerc.f',
+  'cgeru.f',
+  'cher.f',
+  'chpr.f',
+  'cher2.f',
+  'chpr2.f',
+]
+
+# Level 3 BLAS
+
+# _dblas3 -- double precision real BLAS 3 routines
+_dblas3 = [
+  'dgemm.f',
+  'dsymm.f',
+  'dsyrk.f',
+  'dsyr2k.f',
+  'dtrmm.f',
+  'dtrsm.f',
+]
+
+# _zblas3 -- double precision complex BLAS 3 routines
+_zblas3 = [
+  'zgemm.f',
+  'zsymm.f',
+  'zsyrk.f',
+  'zsyr2k.f',
+  'ztrmm.f',
+  'ztrsm.f',
+  'zhemm.f',
+  'zherk.f',
+  'zher2k.f',
+]
+
+# _sblas3 -- single precision real BLAS 3 routines
+_sblas3 = [
+  'sgemm.f',
+  'ssymm.f',
+  'ssyrk.f',
+  'ssyr2k.f',
+  'strmm.f',
+  'strsm.f',
+]
+
+# _cblas3 -- single precision complex BLAS 3 routines
+_cblas3 = [
+  'cgemm.f',
+  'csymm.f',
+  'csyrk.f',
+  'csyr2k.f',
+  'ctrmm.f',
+  'ctrsm.f',
+  'chemm.f',
+  'cherk.f',
+  'cher2k.f',
+]
+
+# Gather the sources of every enabled precision (order: c, z, d, s)
+if build_all_prec or build_complex or prec == 'c'
+  _blas_netlib_srcs += _cblas1 + _cb1aux + _cblas2 + _cblas3
+endif
+
+if build_all_prec or build_complex16 or prec == 'z'
+  _blas_netlib_srcs += _zblas1 + _zb1aux + _zblas2 + _zblas3
+endif
+
+if build_all_prec or build_double or prec == 'd'
+  _blas_netlib_srcs += _dblas1 + _dblas2 + _dblas3
+endif
+
+if build_all_prec or build_single or prec == 's'
+  _blas_netlib_srcs += _sblas1 + _sblas2 + _sblas3
+endif
+
+# The reference BLAS library itself, linked against the _allblas helpers
+netlib_blas = library(_netlib_blas_name,
+  install: true,
+  link_with: _allblas,
+  soversion: lapack_major_version,
+  sources: _blas_netlib_srcs,
+  version: lapack_version)
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/meson.build
new file mode 100644
index 000000000..63c3cbf66
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/meson.build
@@ -0,0 +1,73 @@
+_blas_noinput_test_array = [#
+  # ['Pretty name', 'binary_name', 'BlahTest.cpp']
+]
+_blas_input_test_array = [#
+  # ['Pretty name', 'binary_name', 'BlahTest.cpp', 'inputfile.in']
+]
+
+if prec == 's' or build_single or build_all_prec
+  _blas_noinput_test_array += [
+    ['Test REAL Level 1 BLAS', 'xblat1s', 'sblat1.f'],
+  ]
+  _blas_input_test_array += [
+    ['Test REAL Level 2 BLAS', 'xblat2s', 'sblat2.f', 'sblat2.in'],
+    ['Test REAL Level 3 BLAS', 'xblat3s', 'sblat3.f', 'sblat3.in'],
+  ]
+endif
+
+if prec == 'd' or build_double or build_all_prec
+  _blas_noinput_test_array += [
+    ['Test DOUBLE PRECISION Level 1 BLAS', 'xblat1d', 'dblat1.f'],
+  ]
+  _blas_input_test_array += [
+    ['Test DOUBLE PRECISION Level 2 BLAS', 'xblat2d', 'dblat2.f', 'dblat2.in'],
+    ['Test DOUBLE PRECISION Level 3 BLAS', 'xblat3d', 'dblat3.f', 'dblat3.in'],
+  ]
+endif
+
+
+if prec == 'c' or build_complex or build_all_prec
+  _blas_noinput_test_array += [
+    ['Test COMPLEX Level 1 BLAS', 'xblat1c', 'cblat1.f'],
+  ]
+  _blas_input_test_array += [
+    ['Test COMPLEX Level 2 BLAS', 'xblat2c', 'cblat2.f', 'cblat2.in'],
+    ['Test COMPLEX Level 3 BLAS', 'xblat3c', 'cblat3.f', 'cblat3.in'],
+  ]
+endif
+
+if prec == 'z' or build_complex16 or build_all_prec
+  _blas_noinput_test_array += [
+    ['Test COMPLEX*16 Level 1 BLAS', 'xblat1z', 'zblat1.f'],
+  ]
+  _blas_input_test_array += [
+    ['Test COMPLEX*16 Level 2 BLAS', 'xblat2z', 'zblat2.f', 'zblat2.in'],
+    ['Test COMPLEX*16 Level 3 BLAS', 'xblat3z', 'zblat3.f', 'zblat3.in'],
+  ]
+endif
+
+foreach _test : _blas_noinput_test_array
+  test(_test.get(0),
+    executable(_test.get(1),
+      sources : _test.get(2),
+      link_with : netlib_blas,
+    ),
+  )
+endforeach
+
+fortran_test_runner = executable('run_fortran_test',
+  sources: ['run_fortran.c'],
+  install: false)
+
+# NOTE: the runner receives the test executable as a plain path, so it has to be built before the test runs
+foreach _test : _blas_input_test_array
+  executable(_test.get(1),
+    sources : _test.get(2),
+    link_with : netlib_blas,
+  )
+  test_exe = meson.current_build_dir() / _test.get(1)
+  input_file = meson.current_source_dir() / _test.get(3) # input files sit next to this meson.build; also correct when built as a subproject
+  test(_test.get(0), fortran_test_runner,
+    args: [test_exe, input_file],
+    workdir: meson.current_build_dir())
+endforeach
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/run_fortran.c b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/run_fortran.c
new file mode 100644
index 000000000..28dbe64e4
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/TESTING/run_fortran.c
@@ -0,0 +1,27 @@
+/* Runs the program given as argv[1] with the file argv[2] redirected to its standard input. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(int argc, char *argv[]) {
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s <test_executable> <input_file>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    char command[1024];
+    int len = snprintf(command, sizeof(command), "%s < %s", argv[1], argv[2]);
+    /* Refuse to run a truncated command line instead of executing the wrong thing. */
+    if (len < 0 || (size_t)len >= sizeof(command)) {
+        fprintf(stderr, "Error: Command line for '%s' is too long.\n", argv[1]);
+        return EXIT_FAILURE;
+    }
+
+    int result = system(command);
+    if (result != 0) {
+        fprintf(stderr, "Error: Command '%s' failed with return code %d.\n", command, result);
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/BLAS/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/meson.build
new file mode 100644
index 000000000..6a3f776b0
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/BLAS/meson.build
@@ -0,0 +1,12 @@
+_netlib_blas_name = get_option('netlib_blas_name')
+_netlib_cblas_name = get_option('netlib_cblas_name')
+_netlib_lapack_name = get_option('netlib_lapack_name')
+_netlib_tmglib_name = get_option('netlib_tmglib_name')
+_netlib_lapacke_name = get_option('netlib_lapacke_name')
+
+subdir('SRC')
+
+build_testing = get_option('build_testing')
+if build_testing and not is_win
+  subdir('TESTING')
+endif
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/INSTALL/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/INSTALL/meson.build
new file mode 100644
index 000000000..f474e5761
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/INSTALL/meson.build
@@ -0,0 +1,66 @@
+# Relevant groups and variables:
+#
+# _install_aux -- 
Auxiliary function library
+# _int_cpu -- Timing library (only built from the Fortran sources)
+#
+# Source lists for the C / Fortran variants
+_install_aux_srcs = []
+_ia_linkwith = []
+
+if not use_c_lapack
+  _int_cpu_srcs = [] # timing sources, Fortran variant only
+  _install_aux_srcs += ['ilaver.f']
+  _ia_linkwith = _allblas # _allblas already provides lsame.f
+  # TODO: Drop the lsame.f shipped in INSTALL in favour of the one from BLAS;
+  # the meson build uses the BLAS variant anyway.
+  # Apart from formatting, that lsame.f matches the one in blas/src/lsame.f:
+  # it claims to be from lapack 3.7.0 and has more whitespace, but is
+  # otherwise identical to the reference from blas 3.1
+
+  if build_all_prec or build_double or prec == 'd'
+    _int_cpu_srcs += [
+      'dsecnd_INT_CPU_TIME.f',
+    ]
+    _install_aux_srcs += [
+      'dlamch.f',
+      'droundup_lwork.f',
+    ]
+  endif
+
+  if build_all_prec or build_single or prec == 's'
+    _int_cpu_srcs += [
+      'second_INT_CPU_TIME.f',
+    ]
+    _install_aux_srcs += [
+      'slamch.f',
+      'sroundup_lwork.f',
+    ]
+  endif
+  _int_cpu = library('_int_cpu',
+    install: true,
+    soversion: lapack_major_version,
+    sources: _int_cpu_srcs,
+    version: lapack_version)
+else
+  _install_aux_srcs += ['ilaver.c', 'lsame.c']
+
+  if build_all_prec or build_double or prec == 'd'
+    _install_aux_srcs += [
+      'dlamch.c',
+    ]
+  endif
+
+  if build_all_prec or build_single or prec == 's'
+    _install_aux_srcs += [
+      'slamch.c',
+    ]
+  endif
+
+endif
+
+_install_aux = library('_install_aux',
+  install: true,
+  link_with: _ia_linkwith,
+  soversion: lapack_major_version,
+  sources: _install_aux_srcs,
+  version: lapack_version)
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/SRC/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/SRC/meson.build
new file mode 100644
index 000000000..1e9a120be
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/SRC/meson.build
@@ -0,0 +1,512 @@
+# Relevant groups and variables:
+#
+# _allaux -- Auxiliary routines called from all precisions
+#
+# _sclaux -- Auxiliary routines called from single precision
+# 
_dzlaux -- Auxiliary routines called from double precision +# +# _dslasrc -- Double-single mixed precision real routines called from single, single-extra, and double precision real LAPACK routines (SLASRC, SXLASRC, DLASRC) +# _zclasrc -- Double-single mixed precision complex routines called from single, single-extra, and double precision complex LAPACK routines (CLASRC, CXLASRC, ZLASRC) +# +# _slasrc -- Single precision real LAPACK routines +# _sxlasrc -- Single precision real LAPACK routines using extra precision +# _clasrc -- Single precision complex LAPACK routines +# _cxlasrc -- Single precision complex LAPACK routines using extra precision +# +# _dlasrc -- Double precision real LAPACK routines +# _dxlasrc -- Double precision real LAPACK routines using extra precision +# _zlasrc -- Double precision complex LAPACK routines +# _zxlasrc -- Double precision complex LAPACK routines using extra precision +# +# TODO: meson doesn't support the deprecated routines yet +# _deprecated -- Deprecated routines in all precisions + +# _allaux -- Auxiliary routines called from all precisions +# Needs _install_aux +# _allblas contains xerbla and xerbla_array already +_allaux = [ + 'ilaenv.f', 'ilaenv2stage.f', 'ieeeck.f', 'lsamen.f', 'iparmq.f', + 'iparam2stage.F', 'ilaprec.f', 'ilatrans.f', 'ilauplo.f', 'iladiag.f', + 'chla_transtype.f', 'la_xisnan.F90', 'la_constants.f90', +] + +_deps = [] +_libs = [ _allblas, _install_aux ] +# _auxlapack -- Auxiliary routines for lapack +_auxlapack = library('_auxlapack', + sources: _allaux, + link_with: _libs, + ) +_libs += _auxlapack + + +# All other sources +_lapack_netlib_srcs = [] + + +# _slaux -- Auxiliary routines called from single precision +# Needs _int_cpu and _install_aux +# For sroundup_lwork.f +# TODO: Handle SECOND_SRC +_sclaux = [ + 'sbdsvdx.f', 'sbdsdc.f', 'sbdsqr.f', 'sdisna.f', 'slabad.f', 'slacpy.f', + 'sladiv.f', 'slae2.f', 'slaebz.f', 'slaed0.f', 'slaed1.f', 'slaed2.f', + 'slaed3.f', 'slaed4.f', 'slaed5.f', 'slaed6.f', 
'slaed7.f', 'slaed8.f', + 'slaed9.f', 'slaeda.f', 'slaev2.f', 'slagtf.f', 'slagts.f', 'slamrg.f', + 'slanst.f', 'slapy2.f', 'slapy3.f', 'slarnv.f', 'slarra.f', 'slarrb.f', + 'slarrc.f', 'slarrd.f', 'slarre.f', 'slarrf.f', 'slarrj.f', 'slarrk.f', + 'slarrr.f', 'slaneg.f', 'slartg.f90', 'slaruv.f', 'slas2.f', 'slascl.f', + 'slasd0.f', 'slasd1.f', 'slasd2.f', 'slasd3.f', 'slasd4.f', 'slasd5.f', + 'slasd6.f', 'slasd7.f', 'slasd8.f', 'slasda.f', 'slasdq.f', 'slasdt.f', + 'slaset.f', 'slasq1.f', 'slasq2.f', 'slasq3.f', 'slasq4.f', 'slasq5.f', + 'slasq6.f', 'slasr.f', 'slasrt.f', 'slassq.f90', 'slasv2.f', 'spttrf.f', + 'sstebz.f', 'sstedc.f', 'sstein.f', 'ssteqr.f', 'ssterf.f', 'sstevx.f', + 'slartgp.f', 'slartgs.f', +] + +# _dzlaux -- Auxiliary routines called from double precision +# Needs _int_cpu and _install_aux +# For droundup_lwork.f and dlamch.f +# TODO: Handle DSECOND_SRC +_dzlaux = [ + 'la_constants.f90', 'dbdsdc.f', 'dbdsvdx.f', 'dbdsqr.f', 'ddisna.f', + 'disnan.f', 'dlabad.f', 'dlacpy.f', 'dladiv.f', 'dlae2.f', 'dlaebz.f', + 'dlaed0.f', 'dlaed1.f', 'dlaed2.f', 'dlaed3.f', 'dlaed4.f', 'dlaed5.f', + 'dlaed6.f', 'dlaed7.f', 'dlaed8.f', 'dlaed9.f', 'dlaeda.f', 'dlaev2.f', + 'dlagtf.f', 'dlagts.f', 'dlamrg.f', 'dlanst.f', 'dlapy2.f', 'dlapy3.f', + 'dlarnv.f', 'dlarra.f', 'dlarrb.f', 'dlarrc.f', 'dlarrd.f', 'dlarre.f', + 'dlarrf.f', 'dlarrj.f', 'dlarrk.f', 'dlarrr.f', 'dlaneg.f', 'dlartg.f90', + 'dlaruv.f', 'dlas2.f', 'dlascl.f', 'dlasd0.f', 'dlasd1.f', 'dlasd2.f', + 'dlasd3.f', 'dlasd4.f', 'dlasd5.f', 'dlasd6.f', 'dlasd7.f', 'dlasd8.f', + 'dlasda.f', 'dlasdq.f', 'dlasdt.f', 'dlaset.f', 'dlasq1.f', 'dlasq2.f', + 'dlasq3.f', 'dlasq4.f', 'dlasq5.f', 'dlasq6.f', 'dlasr.f', 'dlasrt.f', + 'dlassq.f90', 'dlasv2.f', 'dlaisnan.f', 'dpttrf.f', 'dstebz.f', 'dstedc.f', + 'dstein.f', 'dsteqr.f', 'dsterf.f', 'dstevx.f', 'dlartgp.f', 'dlartgs.f', +] + + +# _slasrc -- Single precision real LAPACK routines +_slasrc = [ + 'sgbbrd.f', 'sgbcon.f', 'sgbequ.f', 'sgbrfs.f', 'sgbsv.f', 
'sgbsvx.f', + 'sgbtf2.f', 'sgbtrf.f', 'sgbtrs.f', 'sgebak.f', 'sgebal.f', 'sgebd2.f', + 'sgebrd.f', 'sgecon.f', 'sgeequ.f', 'sgees.f', 'sgeesx.f', 'sgeev.f', + 'sgeevx.f', 'sgehd2.f', 'sgehrd.f', 'sgelq2.f', 'sgelqf.f', 'sgels.f', + 'sgelsd.f', 'sgelss.f', 'sgelsy.f', 'sgeql2.f', 'sgeqlf.f', 'sgeqp3.f', + 'sgeqr2.f', 'sgeqr2p.f', 'sgeqrf.f', 'sgeqrfp.f', 'sgerfs.f', 'sgerq2.f', + 'sgerqf.f', 'sgesc2.f', 'sgesdd.f', 'sgesv.f', 'sgesvd.f', 'sgesvdx.f', + 'sgesvx.f', 'sgetc2.f', 'sgetf2.f', 'sgetri.f', 'sggbak.f', 'sggbal.f', + 'sgges.f', 'sgges3.f', 'sggesx.f', 'sggev.f', 'sggev3.f', 'sggevx.f', + 'sggglm.f', 'sgghrd.f', 'sgghd3.f', 'sgglse.f', 'sggqrf.f', 'sggrqf.f', + 'sggsvd3.f', 'sggsvp3.f', 'sgtcon.f', 'sgtrfs.f', 'sgtsv.f', 'sgtsvx.f', + 'sgttrf.f', 'sgttrs.f', 'sgtts2.f', 'shgeqz.f', 'slaqz0.f', 'slaqz1.f', + 'slaqz2.f', 'slaqz3.f', 'slaqz4.f', 'shsein.f', 'shseqr.f', 'slabrd.f', + 'slacon.f', 'slacn2.f', 'slaein.f', 'slaexc.f', 'slag2.f', 'slags2.f', + 'slagtm.f', 'slagv2.f', 'slahqr.f', 'slahr2.f', 'slaic1.f', 'slaln2.f', + 'slals0.f', 'slalsa.f', 'slalsd.f', 'slangb.f', 'slange.f', 'slangt.f', + 'slanhs.f', 'slansb.f', 'slansp.f', 'slansy.f', 'slantb.f', 'slantp.f', + 'slantr.f', 'slanv2.f', 'slapll.f', 'slapmt.f', 'slaqgb.f', 'slaqge.f', + 'slaqp2.f', 'slaqps.f', 'slaqsb.f', 'slaqsp.f', 'slaqsy.f', 'slaqr0.f', + 'slaqr1.f', 'slaqr2.f', 'slaqr3.f', 'slaqr4.f', 'slaqr5.f', 'slaqtr.f', + 'slar1v.f', 'slar2v.f', 'ilaslr.f', 'ilaslc.f', 'slarf.f', 'slarfb.f', + 'slarfb_gett.f', 'slarfg.f', 'slarfgp.f', 'slarft.f', 'slarfx.f', + 'slarfy.f', 'slargv.f', 'slarrv.f', 'slartv.f', 'slarz.f', 'slarzb.f', + 'slarzt.f', 'slasy2.f', 'slasyf.f', 'slasyf_rook.f', 'slasyf_rk.f', + 'slasyf_aa.f', 'slatbs.f', 'slatdf.f', 'slatps.f', 'slatrd.f', 'slatrs.f', + 'slatrz.f', 'slauu2.f', 'slauum.f', 'sopgtr.f', 'sopmtr.f', 'sorg2l.f', + 'sorg2r.f', 'sorgbr.f', 'sorghr.f', 'sorgl2.f', 'sorglq.f', 'sorgql.f', + 'sorgqr.f', 'sorgr2.f', 'sorgrq.f', 'sorgtr.f', 'sorgtsqr.f', + 
'sorgtsqr_row.f', 'sorm2l.f', 'sorm2r.f', 'sorm22.f', 'sormbr.f', + 'sormhr.f', 'sorml2.f', 'sormlq.f', 'sormql.f', 'sormqr.f', 'sormr2.f', + 'sormr3.f', 'sormrq.f', 'sormrz.f', 'sormtr.f', 'spbcon.f', 'spbequ.f', + 'spbrfs.f', 'spbstf.f', 'spbsv.f', 'spbsvx.f', 'spbtf2.f', 'spbtrf.f', + 'spbtrs.f', 'spocon.f', 'spoequ.f', 'sporfs.f', 'sposv.f', 'sposvx.f', + 'spotf2.f', 'spotri.f', 'spstrf.f', 'spstf2.f', 'sppcon.f', 'sppequ.f', + 'spprfs.f', 'sppsv.f', 'sppsvx.f', 'spptrf.f', 'spptri.f', 'spptrs.f', + 'sptcon.f', 'spteqr.f', 'sptrfs.f', 'sptsv.f', 'sptsvx.f', 'spttrs.f', + 'sptts2.f', 'srscl.f', 'ssbev.f', 'ssbevd.f', 'ssbevx.f', 'ssbgst.f', + 'ssbgv.f', 'ssbgvd.f', 'ssbgvx.f', 'ssbtrd.f', 'sspcon.f', 'sspev.f', + 'sspevd.f', 'sspevx.f', 'sspgst.f', 'sspgv.f', 'sspgvd.f', 'sspgvx.f', + 'ssprfs.f', 'sspsv.f', 'sspsvx.f', 'ssptrd.f', 'ssptrf.f', 'ssptri.f', + 'ssptrs.f', 'sstegr.f', 'sstev.f', 'sstevd.f', 'sstevr.f', 'ssycon.f', + 'ssyev.f', 'ssyevd.f', 'ssyevr.f', 'ssyevx.f', 'ssygs2.f', 'ssygst.f', + 'ssygv.f', 'ssygvd.f', 'ssygvx.f', 'ssyrfs.f', 'ssysv.f', 'ssysvx.f', + 'ssytd2.f', 'ssytf2.f', 'ssytrd.f', 'ssytrf.f', 'ssytri.f', 'ssytri2.f', + 'ssytri2x.f', 'ssyswapr.f', 'ssytrs.f', 'ssytrs2.f', 'ssyconv.f', + 'ssyconvf.f', 'ssyconvf_rook.f', 'ssytf2_rook.f', 'ssytrf_rook.f', + 'ssytrs_rook.f', 'ssytri_rook.f', 'ssycon_rook.f', 'ssysv_rook.f', + 'ssytf2_rk.f', 'ssytrf_rk.f', 'ssytrs_3.f', 'ssytri_3.f', 'ssytri_3x.f', + 'ssycon_3.f', 'ssysv_rk.f', 'ssysv_aa.f', 'ssytrf_aa.f', 'ssytrs_aa.f', + 'ssysv_aa_2stage.f', 'ssytrf_aa_2stage.f', 'ssytrs_aa_2stage.f', 'stbcon.f', + 'stbrfs.f', 'stbtrs.f', 'stgevc.f', 'stgex2.f', 'stgexc.f', 'stgsen.f', + 'stgsja.f', 'stgsna.f', 'stgsy2.f', 'stgsyl.f', 'stpcon.f', 'stprfs.f', + 'stptri.f', 'stptrs.f', 'strcon.f', 'strevc.f', 'strevc3.f', 'strexc.f', + 'strrfs.f', 'strsen.f', 'strsna.f', 'strsyl.f', 'strti2.f', 'strtri.f', + 'strtrs.f', 'stzrzf.f', 'sstemr.f', 'slansf.f', 'spftrf.f', 'spftri.f', + 'spftrs.f', 'ssfrk.f', 
'stfsm.f', 'stftri.f', 'stfttp.f', 'stfttr.f', + 'stpttf.f', 'stpttr.f', 'strttf.f', 'strttp.f', 'sgejsv.f', 'sgesvj.f', + 'sgsvj0.f', 'sgsvj1.f', 'sgeequb.f', 'ssyequb.f', 'spoequb.f', 'sgbequb.f', + 'sbbcsd.f', 'slapmr.f', 'sorbdb.f', 'sorbdb1.f', 'sorbdb2.f', 'sorbdb3.f', + 'sorbdb4.f', 'sorbdb5.f', 'sorbdb6.f', 'sorcsd.f', 'sorcsd2by1.f', + 'sgeqrt.f', 'sgeqrt2.f', 'sgeqrt3.f', 'sgemqrt.f', 'stpqrt.f', 'stpqrt2.f', + 'stpmqrt.f', 'stprfb.f', 'sgelqt.f', 'sgelqt3.f', 'sgemlqt.f', 'sgetsls.f', + 'sgetsqrhrt.f', 'sgeqr.f', 'slatsqr.f', 'slamtsqr.f', 'sgemqr.f', 'sgelq.f', + 'slaswlq.f', 'slamswlq.f', 'sgemlq.f', 'stplqt.f', 'stplqt2.f', 'stpmlqt.f', + 'sorhr_col.f', 'slaorhr_col_getrfnp.f', 'slaorhr_col_getrfnp2.f', + 'ssytrd_2stage.f', 'ssytrd_sy2sb.f', 'ssytrd_sb2st.F', 'ssb2st_kernels.f', + 'ssyevd_2stage.f', 'ssyev_2stage.f', 'ssyevx_2stage.f', 'ssyevr_2stage.f', + 'ssbev_2stage.f', 'ssbevx_2stage.f', 'ssbevd_2stage.f', 'ssygv_2stage.f', + 'sgesvdq.f', +] + +# DSLASRC -- Double-single mixed precision real routines +_dslasrc = [ + 'sgetrf.f', 'sgetrf2.f', 'sgetrs.f', 'sisnan.f', 'slaisnan.f', 'slaswp.f', + 'spotrf.f', 'spotrf2.f', 'spotrs.f', +] + +# SXLASRC -- Single precision real LAPACK routines using extra precision +_sxlasrc = [ + 'sgesvxx.f', 'sgerfsx.f', 'sla_gerfsx_extended.f', 'sla_geamv.f', + 'sla_gercond.f', 'sla_gerpvgrw.f', 'ssysvxx.f', 'ssyrfsx.f', + 'sla_syrfsx_extended.f', 'sla_syamv.f', 'sla_syrcond.f', 'sla_syrpvgrw.f', + 'sposvxx.f', 'sporfsx.f', 'sla_porfsx_extended.f', 'sla_porcond.f', + 'sla_porpvgrw.f', 'sgbsvxx.f', 'sgbrfsx.f', 'sla_gbrfsx_extended.f', + 'sla_gbamv.f', 'sla_gbrcond.f', 'sla_gbrpvgrw.f', 'sla_lin_berr.f', + 'slarscl2.f', 'slascl2.f', 'sla_wwaddw.f' +] + +# CLASRC -- Single precision complex LAPACK routines +_clasrc = [ + 'cbdsqr.f', 'cgbbrd.f', 'cgbcon.f', 'cgbequ.f', 'cgbrfs.f', 'cgbsv.f', + 'cgbsvx.f', 'cgbtf2.f', 'cgbtrf.f', 'cgbtrs.f', 'cgebak.f', 'cgebal.f', + 'cgebd2.f', 'cgebrd.f', 'cgecon.f', 'cgeequ.f', 
'cgees.f', 'cgeesx.f', + 'cgeev.f', 'cgeevx.f', 'cgehd2.f', 'cgehrd.f', 'cgelq2.f', 'cgelqf.f', + 'cgels.f', 'cgelsd.f', 'cgelss.f', 'cgelsy.f', 'cgeql2.f', 'cgeqlf.f', + 'cgeqp3.f', 'cgeqr2.f', 'cgeqr2p.f', 'cgeqrf.f', 'cgeqrfp.f', 'cgerfs.f', + 'cgerq2.f', 'cgerqf.f', 'cgesc2.f', 'cgesdd.f', 'cgesv.f', 'cgesvd.f', + 'cgesvdx.f', 'cgesvj.f', 'cgejsv.f', 'cgsvj0.f', 'cgsvj1.f', 'cgesvx.f', + 'cgetc2.f', 'cgetf2.f', 'cgetrf2.f', 'cgetri.f', 'cggbak.f', 'cggbal.f', + 'cgges.f', 'cgges3.f', 'cggesx.f', 'cggev.f', 'cggev3.f', 'cggevx.f', + 'cggglm.f', 'cgghrd.f', 'cgghd3.f', 'cgglse.f', 'cggqrf.f', 'cggrqf.f', + 'cggsvd3.f', 'cggsvp3.f', 'cgtcon.f', 'cgtrfs.f', 'cgtsv.f', 'cgtsvx.f', + 'cgttrf.f', 'cgttrs.f', 'cgtts2.f', 'chbev.f', 'chbevd.f', 'chbevx.f', + 'chbgst.f', 'chbgv.f', 'chbgvd.f', 'chbgvx.f', 'chbtrd.f', 'checon.f', + 'cheev.f', 'cheevd.f', 'cheevr.f', 'cheevx.f', 'chegs2.f', 'chegst.f', + 'chegv.f', 'chegvd.f', 'chegvx.f', 'cherfs.f', 'chesv.f', 'chesvx.f', + 'chetd2.f', 'chetf2.f', 'chetrd.f', 'chetrf.f', 'chetri.f', 'chetri2.f', + 'chetri2x.f', 'cheswapr.f', 'chetrs.f', 'chetrs2.f', 'chetf2_rook.f', + 'chetrf_rook.f', 'chetri_rook.f', 'chetrs_rook.f', 'checon_rook.f', + 'chesv_rook.f', 'chetf2_rk.f', 'chetrf_rk.f', 'chetri_3.f', 'chetri_3x.f', + 'chetrs_3.f', 'checon_3.f', 'chesv_rk.f', 'chesv_aa.f', 'chetrf_aa.f', + 'chetrs_aa.f', 'chesv_aa_2stage.f', 'chetrf_aa_2stage.f', + 'chetrs_aa_2stage.f', 'chgeqz.f', 'chpcon.f', 'chpev.f', 'chpevd.f', + 'claqz0.f', 'claqz1.f', 'claqz2.f', 'claqz3.f', 'chpevx.f', 'chpgst.f', + 'chpgv.f', 'chpgvd.f', 'chpgvx.f', 'chprfs.f', 'chpsv.f', 'chpsvx.f', + 'chptrd.f', 'chptrf.f', 'chptri.f', 'chptrs.f', 'chsein.f', 'chseqr.f', + 'clabrd.f', 'clacgv.f', 'clacon.f', 'clacn2.f', 'clacp2.f', 'clacpy.f', + 'clacrm.f', 'clacrt.f', 'cladiv.f', 'claed0.f', 'claed7.f', 'claed8.f', + 'claein.f', 'claesy.f', 'claev2.f', 'clags2.f', 'clagtm.f', 'clahef.f', + 'clahef_rook.f', 'clahef_rk.f', 'clahef_aa.f', 'clahqr.f', 'clahr2.f', + 
'claic1.f', 'clals0.f', 'clalsa.f', 'clalsd.f', 'clangb.f', 'clange.f', + 'clangt.f', 'clanhb.f', 'clanhe.f', 'clanhp.f', 'clanhs.f', 'clanht.f', + 'clansb.f', 'clansp.f', 'clansy.f', 'clantb.f', 'clantp.f', 'clantr.f', + 'clapll.f', 'clapmt.f', 'clarcm.f', 'claqgb.f', 'claqge.f', 'claqhb.f', + 'claqhe.f', 'claqhp.f', 'claqp2.f', 'claqps.f', 'claqsb.f', 'claqr0.f', + 'claqr1.f', 'claqr2.f', 'claqr3.f', 'claqr4.f', 'claqr5.f', 'claqsp.f', + 'claqsy.f', 'clar1v.f', 'clar2v.f', 'ilaclr.f', 'ilaclc.f', 'clarf.f', + 'clarfb.f', 'clarfb_gett.f', 'clarfg.f', 'clarfgp.f', 'clarft.f', + 'clarfx.f', 'clarfy.f', 'clargv.f', 'clarnv.f', 'clarrv.f', 'clartg.f90', + 'clartv.f', 'clarz.f', 'clarzb.f', 'clarzt.f', 'clascl.f', 'claset.f', + 'clasr.f', 'classq.f90', 'claswp.f', 'clasyf.f', 'clasyf_rook.f', + 'clasyf_rk.f', 'clasyf_aa.f', 'clatbs.f', 'clatdf.f', 'clatps.f', + 'clatrd.f', 'clatrs.f', 'clatrz.f', 'clauu2.f', 'clauum.f', 'cpbcon.f', + 'cpbequ.f', 'cpbrfs.f', 'cpbstf.f', 'cpbsv.f', 'cpbsvx.f', 'cpbtf2.f', + 'cpbtrf.f', 'cpbtrs.f', 'cpocon.f', 'cpoequ.f', 'cporfs.f', 'cposv.f', + 'cposvx.f', 'cpotf2.f', 'cpotrf2.f', 'cpotri.f', 'cpstrf.f', 'cpstf2.f', + 'cppcon.f', 'cppequ.f', 'cpprfs.f', 'cppsv.f', 'cppsvx.f', 'cpptrf.f', + 'cpptri.f', 'cpptrs.f', 'cptcon.f', 'cpteqr.f', 'cptrfs.f', 'cptsv.f', + 'cptsvx.f', 'cpttrf.f', 'cpttrs.f', 'cptts2.f', 'crscl.f', 'crot.f', + 'cspcon.f', 'cspmv.f', 'cspr.f', 'csprfs.f', 'cspsv.f', 'cspsvx.f', + 'csptrf.f', 'csptri.f', 'csptrs.f', 'csrscl.f', 'cstedc.f', 'cstegr.f', + 'cstein.f', 'csteqr.f', 'csycon.f', 'csymv.f', 'csyr.f', 'csyrfs.f', + 'csysv.f', 'csysvx.f', 'csytf2.f', 'csytrf.f', 'csytri.f', 'csytri2.f', + 'csytri2x.f', 'csyswapr.f', 'csytrs.f', 'csytrs2.f', 'csyconv.f', + 'csyconvf.f', 'csyconvf_rook.f', 'csytf2_rook.f', 'csytrf_rook.f', + 'csytrs_rook.f', 'csytri_rook.f', 'csycon_rook.f', 'csysv_rook.f', + 'csytf2_rk.f', 'csytrf_rk.f', 'csytrf_aa.f', 'csytrs_3.f', 'csytrs_aa.f', + 'csytri_3.f', 'csytri_3x.f', 'csycon_3.f', 
'csysv_rk.f', 'csysv_aa.f', + 'csysv_aa_2stage.f', 'csytrf_aa_2stage.f', 'csytrs_aa_2stage.f', 'ctbcon.f', + 'ctbrfs.f', 'ctbtrs.f', 'ctgevc.f', 'ctgex2.f', 'ctgexc.f', 'ctgsen.f', + 'ctgsja.f', 'ctgsna.f', 'ctgsy2.f', 'ctgsyl.f', 'ctpcon.f', 'ctprfs.f', + 'ctptri.f', 'ctptrs.f', 'ctrcon.f', 'ctrevc.f', 'ctrevc3.f', 'ctrexc.f', + 'ctrrfs.f', 'ctrsen.f', 'ctrsna.f', 'ctrsyl.f', 'ctrti2.f', 'ctrtri.f', + 'ctrtrs.f', 'ctzrzf.f', 'cung2l.f', 'cung2r.f', 'cungbr.f', 'cunghr.f', + 'cungl2.f', 'cunglq.f', 'cungql.f', 'cungqr.f', 'cungr2.f', 'cungrq.f', + 'cungtr.f', 'cungtsqr.f', 'cungtsqr_row.f', 'cunm2l.f', 'cunm2r.f', + 'cunmbr.f', 'cunmhr.f', 'cunml2.f', 'cunm22.f', 'cunmlq.f', 'cunmql.f', + 'cunmqr.f', 'cunmr2.f', 'cunmr3.f', 'cunmrq.f', 'cunmrz.f', 'cunmtr.f', + 'cupgtr.f', 'cupmtr.f', 'icmax1.f', 'scsum1.f', 'cstemr.f', 'chfrk.f', + 'ctfttp.f', 'clanhf.f', 'cpftrf.f', 'cpftri.f', 'cpftrs.f', 'ctfsm.f', + 'ctftri.f', 'ctfttr.f', 'ctpttf.f', 'ctpttr.f', 'ctrttf.f', 'ctrttp.f', + 'cgeequb.f', 'cgbequb.f', 'csyequb.f', 'cpoequb.f', 'cheequb.f', 'cbbcsd.f', + 'clapmr.f', 'cunbdb.f', 'cunbdb1.f', 'cunbdb2.f', 'cunbdb3.f', 'cunbdb4.f', + 'cunbdb5.f', 'cunbdb6.f', 'cuncsd.f', 'cuncsd2by1.f', 'cgeqrt.f', + 'cgeqrt2.f', 'cgeqrt3.f', 'cgemqrt.f', 'ctpqrt.f', 'ctpqrt2.f', 'ctpmqrt.f', + 'ctprfb.f', 'cgelqt.f', 'cgelqt3.f', 'cgemlqt.f', 'cgetsls.f', + 'cgetsqrhrt.f', 'cgeqr.f', 'clatsqr.f', 'clamtsqr.f', 'cgemqr.f', 'cgelq.f', + 'claswlq.f', 'clamswlq.f', 'cgemlq.f', 'ctplqt.f', 'ctplqt2.f', 'ctpmlqt.f', + 'cunhr_col.f', 'claunhr_col_getrfnp.f', 'claunhr_col_getrfnp2.f', + 'chetrd_2stage.f', 'chetrd_he2hb.f', 'chetrd_hb2st.F', 'chb2st_kernels.f', + 'cheevd_2stage.f', 'cheev_2stage.f', 'cheevx_2stage.f', 'cheevr_2stage.f', + 'chbev_2stage.f', 'chbevx_2stage.f', 'chbevd_2stage.f', 'chegv_2stage.f', + 'cgesvdq.f', +] + +# _cxlasrc -- Single precision complex LAPACK routines using extra precision +_cxlasrc = [ + 'cgesvxx.f', 'cgerfsx.f', 'cla_gerfsx_extended.f', 'cla_geamv.f', + 
'cla_gercond_c.f', 'cla_gercond_x.f', 'cla_gerpvgrw.f', 'csysvxx.f',
+  'csyrfsx.f', 'cla_syrfsx_extended.f', 'cla_syamv.f', 'cla_syrcond_c.f',
+  'cla_syrcond_x.f', 'cla_syrpvgrw.f', 'cposvxx.f', 'cporfsx.f',
+  'cla_porfsx_extended.f', 'cla_porcond_c.f', 'cla_porcond_x.f',
+  'cla_porpvgrw.f', 'cgbsvxx.f', 'cgbrfsx.f', 'cla_gbrfsx_extended.f',
+  'cla_gbamv.f', 'cla_gbrcond_c.f', 'cla_gbrcond_x.f', 'cla_gbrpvgrw.f',
+  'chesvxx.f', 'cherfsx.f', 'cla_herfsx_extended.f', 'cla_heamv.f',
+  'cla_hercond_c.f', 'cla_hercond_x.f', 'cla_herpvgrw.f', 'cla_lin_berr.f',
+  'clarscl2.f', 'clascl2.f', 'cla_wwaddw.f',
+]
+
+# _zclasrc -- Double-single mixed precision complex routines (each source listed exactly once)
+_zclasrc = [
+  'cgetrf.f', 'cgetrf2.f', 'cgetrs.f', 'claswp.f', 'cpotrf.f', 'cpotrf2.f',
+  'cpotrs.f', 'sisnan.f', 'slaisnan.f'
+]
+
+# _dlasrc -- Double precision real LAPACK routines
+_dlasrc = [
+  'dbdsvdx.f', 'dgbbrd.f', 'dgbcon.f', 'dgbequ.f', 'dgbrfs.f', 'dgbsv.f',
+  'dgbsvx.f', 'dgbtf2.f', 'dgbtrf.f', 'dgbtrs.f', 'dgebak.f', 'dgebal.f',
+  'dgebd2.f', 'dgebrd.f', 'dgecon.f', 'dgeequ.f', 'dgees.f', 'dgeesx.f',
+  'dgeev.f', 'dgeevx.f', 'dgehd2.f', 'dgehrd.f', 'dgelq2.f', 'dgelqf.f',
+  'dgels.f', 'dgelsd.f', 'dgelss.f', 'dgelsy.f', 'dgeql2.f', 'dgeqlf.f',
+  'dgeqp3.f', 'dgeqr2.f', 'dgeqr2p.f', 'dgeqrf.f', 'dgeqrfp.f', 'dgerfs.f',
+  'dgerq2.f', 'dgerqf.f', 'dgesc2.f', 'dgesdd.f', 'dgesv.f', 'dgesvd.f',
+  'dgesvdx.f', 'dgesvx.f', 'dgetc2.f', 'dgetf2.f', 'dgetrf.f', 'dgetrf2.f',
+  'dgetri.f', 'dgetrs.f', 'dggbak.f', 'dggbal.f', 'dgges.f', 'dgges3.f',
+  'dggesx.f', 'dggev.f', 'dggev3.f', 'dggevx.f', 'dggglm.f', 'dgghrd.f',
+  'dgghd3.f', 'dgglse.f', 'dggqrf.f', 'dggrqf.f', 'dggsvd3.f', 'dggsvp3.f',
+  'dgtcon.f', 'dgtrfs.f', 'dgtsv.f', 'dgtsvx.f', 'dgttrf.f', 'dgttrs.f',
+  'dgtts2.f', 'dhgeqz.f', 'dlaqz0.f', 'dlaqz1.f', 'dlaqz2.f', 'dlaqz3.f',
+  'dlaqz4.f', 'dhsein.f', 'dhseqr.f', 'dlabrd.f', 'dlacon.f', 'dlacn2.f',
+  'dlaein.f', 'dlaexc.f', 'dlag2.f', 'dlags2.f', 
'dlagtm.f', 'dlagv2.f', + 'dlahqr.f', 'dlahr2.f', 'dlaic1.f', 'dlaln2.f', 'dlals0.f', 'dlalsa.f', + 'dlalsd.f', 'dlangb.f', 'dlange.f', 'dlangt.f', 'dlanhs.f', 'dlansb.f', + 'dlansp.f', 'dlansy.f', 'dlantb.f', 'dlantp.f', 'dlantr.f', 'dlanv2.f', + 'dlapll.f', 'dlapmt.f', 'dlaqgb.f', 'dlaqge.f', 'dlaqp2.f', 'dlaqps.f', + 'dlaqsb.f', 'dlaqsp.f', 'dlaqsy.f', 'dlaqr0.f', 'dlaqr1.f', 'dlaqr2.f', + 'dlaqr3.f', 'dlaqr4.f', 'dlaqr5.f', 'dlaqtr.f', 'dlar1v.f', 'dlar2v.f', + 'iladlr.f', 'iladlc.f', 'dlarf.f', 'dlarfb.f', 'dlarfb_gett.f', 'dlarfg.f', + 'dlarfgp.f', 'dlarft.f', 'dlarfx.f', 'dlarfy.f', 'dlargv.f', 'dlarrv.f', + 'dlartv.f', 'dlarz.f', 'dlarzb.f', 'dlarzt.f', 'dlaswp.f', 'dlasy2.f', + 'dlasyf.f', 'dlasyf_rook.f', 'dlasyf_rk.f', 'dlasyf_aa.f', 'dlatbs.f', + 'dlatdf.f', 'dlatps.f', 'dlatrd.f', 'dlatrs.f', 'dlatrz.f', 'dlauu2.f', + 'dlauum.f', 'dopgtr.f', 'dopmtr.f', 'dorg2l.f', 'dorg2r.f', 'dorgbr.f', + 'dorghr.f', 'dorgl2.f', 'dorglq.f', 'dorgql.f', 'dorgqr.f', 'dorgr2.f', + 'dorgrq.f', 'dorgtr.f', 'dorgtsqr.f', 'dorgtsqr_row.f', 'dorm2l.f', + 'dorm2r.f', 'dorm22.f', 'dormbr.f', 'dormhr.f', 'dorml2.f', 'dormlq.f', + 'dormql.f', 'dormqr.f', 'dormr2.f', 'dormr3.f', 'dormrq.f', 'dormrz.f', + 'dormtr.f', 'dpbcon.f', 'dpbequ.f', 'dpbrfs.f', 'dpbstf.f', 'dpbsv.f', + 'dpbsvx.f', 'dpbtf2.f', 'dpbtrf.f', 'dpbtrs.f', 'dpocon.f', 'dpoequ.f', + 'dporfs.f', 'dposv.f', 'dposvx.f', 'dpotf2.f', 'dpotrf.f', 'dpotrf2.f', + 'dpotri.f', 'dpotrs.f', 'dpstrf.f', 'dpstf2.f', 'dppcon.f', 'dppequ.f', + 'dpprfs.f', 'dppsv.f', 'dppsvx.f', 'dpptrf.f', 'dpptri.f', 'dpptrs.f', + 'dptcon.f', 'dpteqr.f', 'dptrfs.f', 'dptsv.f', 'dptsvx.f', 'dpttrs.f', + 'dptts2.f', 'drscl.f', 'dsbev.f', 'dsbevd.f', 'dsbevx.f', 'dsbgst.f', + 'dsbgv.f', 'dsbgvd.f', 'dsbgvx.f', 'dsbtrd.f', 'dspcon.f', 'dspev.f', + 'dspevd.f', 'dspevx.f', 'dspgst.f', 'dspgv.f', 'dspgvd.f', 'dspgvx.f', + 'dsprfs.f', 'dspsv.f', 'dspsvx.f', 'dsptrd.f', 'dsptrf.f', 'dsptri.f', + 'dsptrs.f', 'dstegr.f', 'dstev.f', 'dstevd.f', 'dstevr.f', 
'dsycon.f', + 'dsyev.f', 'dsyevd.f', 'dsyevr.f', 'dsyevx.f', 'dsygs2.f', 'dsygst.f', + 'dsygv.f', 'dsygvd.f', 'dsygvx.f', 'dsyrfs.f', 'dsysv.f', 'dsysvx.f', + 'dsytd2.f', 'dsytf2.f', 'dsytrd.f', 'dsytrf.f', 'dsytri.f', 'dsytrs.f', + 'dsytrs2.f', 'dsytri2.f', 'dsytri2x.f', 'dsyswapr.f', 'dsyconv.f', + 'dsyconvf.f', 'dsyconvf_rook.f', 'dsytf2_rook.f', 'dsytrf_rook.f', + 'dsytrs_rook.f', 'dsytri_rook.f', 'dsycon_rook.f', 'dsysv_rook.f', + 'dsytf2_rk.f', 'dsytrf_rk.f', 'dsytrs_3.f', 'dsytri_3.f', 'dsytri_3x.f', + 'dsycon_3.f', 'dsysv_rk.f', 'dsysv_aa.f', 'dsytrf_aa.f', 'dsytrs_aa.f', + 'dsysv_aa_2stage.f', 'dsytrf_aa_2stage.f', 'dsytrs_aa_2stage.f', 'dtbcon.f', + 'dtbrfs.f', 'dtbtrs.f', 'dtgevc.f', 'dtgex2.f', 'dtgexc.f', 'dtgsen.f', + 'dtgsja.f', 'dtgsna.f', 'dtgsy2.f', 'dtgsyl.f', 'dtpcon.f', 'dtprfs.f', + 'dtptri.f', 'dtptrs.f', 'dtrcon.f', 'dtrevc.f', 'dtrevc3.f', 'dtrexc.f', + 'dtrrfs.f', 'dtrsen.f', 'dtrsna.f', 'dtrsyl.f', 'dtrti2.f', 'dtrtri.f', + 'dtrtrs.f', 'dtzrzf.f', 'dstemr.f', 'dsgesv.f', 'dsposv.f', 'dlag2s.f', + 'slag2d.f', 'dlat2s.f', 'dlansf.f', 'dpftrf.f', 'dpftri.f', 'dpftrs.f', + 'dsfrk.f', 'dtfsm.f', 'dtftri.f', 'dtfttp.f', 'dtfttr.f', 'dtpttf.f', + 'dtpttr.f', 'dtrttf.f', 'dtrttp.f', 'dgejsv.f', 'dgesvj.f', 'dgsvj0.f', + 'dgsvj1.f', 'dgeequb.f', 'dsyequb.f', 'dpoequb.f', 'dgbequb.f', 'dbbcsd.f', + 'dlapmr.f', 'dorbdb.f', 'dorbdb1.f', 'dorbdb2.f', 'dorbdb3.f', 'dorbdb4.f', + 'dorbdb5.f', 'dorbdb6.f', 'dorcsd.f', 'dorcsd2by1.f', 'dgeqrt.f', + 'dgeqrt2.f', 'dgeqrt3.f', 'dgemqrt.f', 'dtpqrt.f', 'dtpqrt2.f', 'dtpmqrt.f', + 'dtprfb.f', 'dgelqt.f', 'dgelqt3.f', 'dgemlqt.f', 'dgetsls.f', + 'dgetsqrhrt.f', 'dgeqr.f', 'dlatsqr.f', 'dlamtsqr.f', 'dgemqr.f', 'dgelq.f', + 'dlaswlq.f', 'dlamswlq.f', 'dgemlq.f', 'dtplqt.f', 'dtplqt2.f', 'dtpmlqt.f', + 'dorhr_col.f', 'dlaorhr_col_getrfnp.f', 'dlaorhr_col_getrfnp2.f', + 'dsytrd_2stage.f', 'dsytrd_sy2sb.f', 'dsytrd_sb2st.F', 'dsb2st_kernels.f', + 'dsyevd_2stage.f', 'dsyev_2stage.f', 'dsyevx_2stage.f', 
'dsyevr_2stage.f', + 'dsbev_2stage.f', 'dsbevx_2stage.f', 'dsbevd_2stage.f', 'dsygv_2stage.f', + 'dgesvdq.f', +] + +# _dxlasrc -- Double precision real LAPACK routines using extra precision +_dxlasrc = [ + 'dgesvxx.f', 'dgerfsx.f', 'dla_gerfsx_extended.f', 'dla_geamv.f', + 'dla_gercond.f', 'dla_gerpvgrw.f', 'dsysvxx.f', 'dsyrfsx.f', + 'dla_syrfsx_extended.f', 'dla_syamv.f', 'dla_syrcond.f', 'dla_syrpvgrw.f', + 'dposvxx.f', 'dporfsx.f', 'dla_porfsx_extended.f', 'dla_porcond.f', + 'dla_porpvgrw.f', 'dgbsvxx.f', 'dgbrfsx.f', 'dla_gbrfsx_extended.f', + 'dla_gbamv.f', 'dla_gbrcond.f', 'dla_gbrpvgrw.f', 'dla_lin_berr.f', + 'dlarscl2.f', 'dlascl2.f', 'dla_wwaddw.f' +] + +# _zlasrc -- Double precision complex LAPACK routines +_zlasrc = [ + 'zbdsqr.f', 'zgbbrd.f', 'zgbcon.f', 'zgbequ.f', 'zgbrfs.f', 'zgbsv.f', + 'zgbsvx.f', 'zgbtf2.f', 'zgbtrf.f', 'zgbtrs.f', 'zgebak.f', 'zgebal.f', + 'zgebd2.f', 'zgebrd.f', 'zgecon.f', 'zgeequ.f', 'zgees.f', 'zgeesx.f', + 'zgeev.f', 'zgeevx.f', 'zgehd2.f', 'zgehrd.f', 'zgelq2.f', 'zgelqf.f', + 'zgels.f', 'zgelsd.f', 'zgelss.f', 'zgelsy.f', 'zgeql2.f', 'zgeqlf.f', + 'zgeqp3.f', 'zgeqr2.f', 'zgeqr2p.f', 'zgeqrf.f', 'zgeqrfp.f', 'zgerfs.f', + 'zgerq2.f', 'zgerqf.f', 'zgesc2.f', 'zgesdd.f', 'zgesv.f', 'zgesvd.f', + 'zgesvdx.f', 'zgesvx.f', 'zgesvj.f', 'zgejsv.f', 'zgsvj0.f', 'zgsvj1.f', + 'zgetc2.f', 'zgetf2.f', 'zgetrf.f', 'zgetrf2.f', 'zgetri.f', 'zgetrs.f', + 'zggbak.f', 'zggbal.f', 'zgges.f', 'zgges3.f', 'zggesx.f', 'zggev.f', + 'zggev3.f', 'zggevx.f', 'zggglm.f', 'zgghrd.f', 'zgghd3.f', 'zgglse.f', + 'zggqrf.f', 'zggrqf.f', 'zggsvd3.f', 'zggsvp3.f', 'zgtcon.f', 'zgtrfs.f', + 'zgtsv.f', 'zgtsvx.f', 'zgttrf.f', 'zgttrs.f', 'zgtts2.f', 'zhbev.f', + 'zhbevd.f', 'zhbevx.f', 'zhbgst.f', 'zhbgv.f', 'zhbgvd.f', 'zhbgvx.f', + 'zhbtrd.f', 'zhecon.f', 'zheev.f', 'zheevd.f', 'zheevr.f', 'zheevx.f', + 'zhegs2.f', 'zhegst.f', 'zhegv.f', 'zhegvd.f', 'zhegvx.f', 'zherfs.f', + 'zhesv.f', 'zhesvx.f', 'zhetd2.f', 'zhetf2.f', 'zhetrd.f', 'zhetrf.f', + 
'zhetri.f', 'zhetri2.f', 'zhetri2x.f', 'zheswapr.f', 'zhetrs.f', + 'zhetrs2.f', 'zhetf2_rook.f', 'zhetrf_rook.f', 'zhetri_rook.f', + 'zhetrs_rook.f', 'zhecon_rook.f', 'zhesv_rook.f', 'zhetf2_rk.f', + 'zhetrf_rk.f', 'zhetri_3.f', 'zhetri_3x.f', 'zhetrs_3.f', 'zhecon_3.f', + 'zhesv_rk.f', 'zhesv_aa.f', 'zhetrf_aa.f', 'zhetrs_aa.f', + 'zhesv_aa_2stage.f', 'zhetrf_aa_2stage.f', 'zhetrs_aa_2stage.f', 'zhgeqz.f', + 'zhpcon.f', 'zhpev.f', 'zhpevd.f', 'zlaqz0.f', 'zlaqz1.f', 'zlaqz2.f', + 'zlaqz3.f', 'zhpevx.f', 'zhpgst.f', 'zhpgv.f', 'zhpgvd.f', 'zhpgvx.f', + 'zhprfs.f', 'zhpsv.f', 'zhpsvx.f', 'zhptrd.f', 'zhptrf.f', 'zhptri.f', + 'zhptrs.f', 'zhsein.f', 'zhseqr.f', 'zlabrd.f', 'zlacgv.f', 'zlacon.f', + 'zlacn2.f', 'zlacp2.f', 'zlacpy.f', 'zlacrm.f', 'zlacrt.f', 'zladiv.f', + 'zlaed0.f', 'zlaed7.f', 'zlaed8.f', 'zlaein.f', 'zlaesy.f', 'zlaev2.f', + 'zlags2.f', 'zlagtm.f', 'zlahef.f', 'zlahef_rook.f', 'zlahef_rk.f', + 'zlahef_aa.f', 'zlahqr.f', 'zlahr2.f', 'zlaic1.f', 'zlals0.f', 'zlalsa.f', + 'zlalsd.f', 'zlangb.f', 'zlange.f', 'zlangt.f', 'zlanhb.f', 'zlanhe.f', + 'zlanhp.f', 'zlanhs.f', 'zlanht.f', 'zlansb.f', 'zlansp.f', 'zlansy.f', + 'zlantb.f', 'zlantp.f', 'zlantr.f', 'zlapll.f', 'zlapmt.f', 'zlaqgb.f', + 'zlaqge.f', 'zlaqhb.f', 'zlaqhe.f', 'zlaqhp.f', 'zlaqp2.f', 'zlaqps.f', + 'zlaqsb.f', 'zlaqr0.f', 'zlaqr1.f', 'zlaqr2.f', 'zlaqr3.f', 'zlaqr4.f', + 'zlaqr5.f', 'zlaqsp.f', 'zlaqsy.f', 'zlar1v.f', 'zlar2v.f', 'ilazlr.f', + 'ilazlc.f', 'zlarcm.f', 'zlarf.f', 'zlarfb.f', 'zlarfb_gett.f', 'zlarfg.f', + 'zlarfgp.f', 'zlarft.f', 'zlarfx.f', 'zlarfy.f', 'zlargv.f', 'zlarnv.f', + 'zlarrv.f', 'zlartg.f90', 'zlartv.f', 'zlarz.f', 'zlarzb.f', 'zlarzt.f', + 'zlascl.f', 'zlaset.f', 'zlasr.f', 'zlassq.f90', 'zlaswp.f', 'zlasyf.f', + 'zlasyf_rook.f', 'zlasyf_rk.f', 'zlasyf_aa.f', 'zlatbs.f', 'zlatdf.f', + 'zlatps.f', 'zlatrd.f', 'zlatrs.f', 'zlatrz.f', 'zlauu2.f', 'zlauum.f', + 'zpbcon.f', 'zpbequ.f', 'zpbrfs.f', 'zpbstf.f', 'zpbsv.f', 'zpbsvx.f', + 'zpbtf2.f', 'zpbtrf.f', 
'zpbtrs.f', 'zpocon.f', 'zpoequ.f', 'zporfs.f', + 'zposv.f', 'zposvx.f', 'zpotf2.f', 'zpotrf.f', 'zpotrf2.f', 'zpotri.f', + 'zpotrs.f', 'zpstrf.f', 'zpstf2.f', 'zppcon.f', 'zppequ.f', 'zpprfs.f', + 'zppsv.f', 'zppsvx.f', 'zpptrf.f', 'zpptri.f', 'zpptrs.f', 'zptcon.f', + 'zpteqr.f', 'zptrfs.f', 'zptsv.f', 'zptsvx.f', 'zpttrf.f', 'zpttrs.f', + 'zptts2.f', 'zrscl.f', 'zrot.f', 'zspcon.f', 'zspmv.f', 'zspr.f', + 'zsprfs.f', 'zspsv.f', 'zspsvx.f', 'zsptrf.f', 'zsptri.f', 'zsptrs.f', + 'zdrscl.f', 'zstedc.f', 'zstegr.f', 'zstein.f', 'zsteqr.f', 'zsycon.f', + 'zsymv.f', 'zsyr.f', 'zsyrfs.f', 'zsysv.f', 'zsysvx.f', 'zsytf2.f', + 'zsytrf.f', 'zsytri.f', 'zsytri2.f', 'zsytri2x.f', 'zsyswapr.f', 'zsytrs.f', + 'zsytrs2.f', 'zsyconv.f', 'zsyconvf.f', 'zsyconvf_rook.f', 'zsytf2_rook.f', + 'zsytrf_rook.f', 'zsytrs_rook.f', 'zsytrs_aa.f', 'zsytri_rook.f', + 'zsycon_rook.f', 'zsysv_rook.f', 'zsytf2_rk.f', 'zsytrf_rk.f', + 'zsytrf_aa.f', 'zsytrs_3.f', 'zsysv_aa_2stage.f', 'zsytrf_aa_2stage.f', + 'zsytrs_aa_2stage.f', 'zsytri_3.f', 'zsytri_3x.f', 'zsycon_3.f', + 'zsysv_rk.f', 'zsysv_aa.f', 'ztbcon.f', 'ztbrfs.f', 'ztbtrs.f', 'ztgevc.f', + 'ztgex2.f', 'ztgexc.f', 'ztgsen.f', 'ztgsja.f', 'ztgsna.f', 'ztgsy2.f', + 'ztgsyl.f', 'ztpcon.f', 'ztprfs.f', 'ztptri.f', 'ztptrs.f', 'ztrcon.f', + 'ztrevc.f', 'ztrevc3.f', 'ztrexc.f', 'ztrrfs.f', 'ztrsen.f', 'ztrsna.f', + 'ztrsyl.f', 'ztrti2.f', 'ztrtri.f', 'ztrtrs.f', 'ztzrzf.f', 'zung2l.f', + 'zung2r.f', 'zungbr.f', 'zunghr.f', 'zungl2.f', 'zunglq.f', 'zungql.f', + 'zungqr.f', 'zungr2.f', 'zungrq.f', 'zungtr.f', 'zungtsqr.f', + 'zungtsqr_row.f', 'zunm2l.f', 'zunm2r.f', 'zunmbr.f', 'zunmhr.f', + 'zunml2.f', 'zunm22.f', 'zunmlq.f', 'zunmql.f', 'zunmqr.f', 'zunmr2.f', + 'zunmr3.f', 'zunmrq.f', 'zunmrz.f', 'zunmtr.f', 'zupgtr.f', 'zupmtr.f', + 'izmax1.f', 'dzsum1.f', 'zstemr.f', 'zcgesv.f', 'zcposv.f', 'zlag2c.f', + 'clag2z.f', 'zlat2c.f', 'zhfrk.f', 'ztfttp.f', 'zlanhf.f', 'zpftrf.f', + 'zpftri.f', 'zpftrs.f', 'ztfsm.f', 'ztftri.f', 'ztfttr.f', 
'ztpttf.f', + 'ztpttr.f', 'ztrttf.f', 'ztrttp.f', 'zgeequb.f', 'zgbequb.f', 'zsyequb.f', + 'zpoequb.f', 'zheequb.f', 'zbbcsd.f', 'zlapmr.f', 'zunbdb.f', 'zunbdb1.f', + 'zunbdb2.f', 'zunbdb3.f', 'zunbdb4.f', 'zunbdb5.f', 'zunbdb6.f', 'zuncsd.f', + 'zuncsd2by1.f', 'zgeqrt.f', 'zgeqrt2.f', 'zgeqrt3.f', 'zgemqrt.f', + 'ztpqrt.f', 'ztpqrt2.f', 'ztpmqrt.f', 'ztprfb.f', 'ztplqt.f', 'ztplqt2.f', + 'ztpmlqt.f', 'zgelqt.f', 'zgelqt3.f', 'zgemlqt.f', 'zgetsls.f', + 'zgetsqrhrt.f', 'zgeqr.f', 'zlatsqr.f', 'zlamtsqr.f', 'zgemqr.f', 'zgelq.f', + 'zlaswlq.f', 'zlamswlq.f', 'zgemlq.f', 'zunhr_col.f', + 'zlaunhr_col_getrfnp.f', 'zlaunhr_col_getrfnp2.f', 'zhetrd_2stage.f', + 'zhetrd_he2hb.f', 'zhetrd_hb2st.F', 'zhb2st_kernels.f', 'zheevd_2stage.f', + 'zheev_2stage.f', 'zheevx_2stage.f', 'zheevr_2stage.f', 'zhbev_2stage.f', + 'zhbevx_2stage.f', 'zhbevd_2stage.f', 'zhegv_2stage.f', 'zgesvdq.f', +] + +# _zxlasrc -- Double precision complex LAPACK routines using extra precision +_zxlasrc = [ + 'zgesvxx.f', 'zgerfsx.f', 'zla_gerfsx_extended.f', 'zla_geamv.f', + 'zla_gercond_c.f', 'zla_gercond_x.f', 'zla_gerpvgrw.f', 'zsysvxx.f', + 'zsyrfsx.f', 'zla_syrfsx_extended.f', 'zla_syamv.f', 'zla_syrcond_c.f', + 'zla_syrcond_x.f', 'zla_syrpvgrw.f', 'zposvxx.f', 'zporfsx.f', + 'zla_porfsx_extended.f', 'zla_porcond_c.f', 'zla_porcond_x.f', + 'zla_porpvgrw.f', 'zgbsvxx.f', 'zgbrfsx.f', 'zla_gbrfsx_extended.f', + 'zla_gbamv.f', 'zla_gbrcond_c.f', 'zla_gbrcond_x.f', 'zla_gbrpvgrw.f', + 'zhesvxx.f', 'zherfsx.f', 'zla_herfsx_extended.f', 'zla_heamv.f', + 'zla_hercond_c.f', 'zla_hercond_x.f', 'zla_herpvgrw.f', 'zla_lin_berr.f', + 'zlarscl2.f', 'zlascl2.f', 'zla_wwaddw.f' +] + +# Start making the lapack target +if use_xblas + _lapack_netlib_srcs += _sxlasrc + _dxlasrc + _cxlasrc + _zxlasrc + _deps = dependency('xblas') +else + _libs += netlib_blas +endif + +if prec == 's' or build_single or build_all_prec + _lapack_netlib_srcs += _slasrc + _dslasrc + _sclaux +endif + +if prec == 'd' or build_double or 
build_all_prec
+  _lapack_netlib_srcs += _dlasrc + _dslasrc + _dzlaux
+endif
+
+if prec == 'c' or build_complex or build_all_prec
+  _lapack_netlib_srcs += _clasrc + _zclasrc + _sclaux
+endif
+
+if prec == 'z' or build_complex16 or build_all_prec
+  _lapack_netlib_srcs += _zlasrc + _zclasrc + _dzlaux
+endif
+
+# Create the LAPACK library (netlib_lapack) from the source lists selected above
+netlib_lapack = library(_netlib_lapack_name,
+  sources: _lapack_netlib_srcs,
+  link_with: _libs,  # NOTE(review): _deps (assigned in the use_xblas branch) is not passed to this target -- confirm
+  version: lapack_version,
+  soversion: lapack_major_version,
+  install: true,
+  )
+
+pkg.generate(netlib_lapack,  # pkg-config metadata for the library built above
+  name: 'lapack',
+  filebase: 'meson-lapack',
+  description: 'lapack via meson build',
+  version: f'@pkg_ver@_lapack',
+  install_dir: pkg_install_dir,
+  )
diff --git a/subprojects/packagefiles/openblas/lapack-netlib/meson.build b/subprojects/packagefiles/openblas/lapack-netlib/meson.build
new file mode 100644
index 000000000..28472cca8
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/meson.build
@@ -0,0 +1,26 @@
+add_languages('fortran', native: false)
+ff = meson.get_compiler('fortran')
+# TODO(mtsokol): make it a local setting
+# if ff.has_argument('-Wno-conversion')
+#   add_project_arguments('-Wno-conversion', language: 'fortran')
+# endif
+
+lapack_major_version = 3 # soversion
+lapack_minor_version = 12
+lapack_patch_version = 0
+lapack_version = f'@lapack_major_version@.@lapack_minor_version@.@lapack_patch_version@'
+
+# Common variables: option values read once here for the sub-directories entered below
+prec = get_option('realkind')  # 's', 'd', 'c' or 'z' per the option description
+build_single = get_option('build_single')
+build_double = get_option('build_double')
+build_complex = get_option('build_complex')
+build_complex16 = get_option('build_complex16')
+build_all_prec = get_option('build_all_prec')
+use_c_lapack = get_option('use_c_lapack')
+use_xblas = get_option('use_xblas')
+
+# Sub-directories
+subdir('BLAS') # Defines netlib_blas and _allblas
+subdir('INSTALL') # Defines _install_aux and _int_cpu_time
+subdir('SRC') # Defines netlib_lapack
diff --git 
a/subprojects/packagefiles/openblas/lapack-netlib/meson_options.txt b/subprojects/packagefiles/openblas/lapack-netlib/meson_options.txt
new file mode 100644
index 000000000..b378e3329
--- /dev/null
+++ b/subprojects/packagefiles/openblas/lapack-netlib/meson_options.txt
@@ -0,0 +1,3 @@
+option('realkind', type : 'string', value : 'd',
+  description : 's: real32 d: real64 c: complex32 z: complex64')
+
diff --git a/subprojects/packagefiles/openblas/meson.build b/subprojects/packagefiles/openblas/meson.build
new file mode 100644
index 000000000..b7464396b
--- /dev/null
+++ b/subprojects/packagefiles/openblas/meson.build
@@ -0,0 +1,649 @@
+# Conventions:
+# A leading _ implies that the variable is not meant to be used outside this file
+# Options are applied from the top-level meson_options.txt
+# They are declared at the top
+# Typically derived from (in order) the CMakeLists.txt and Makefiles
+#
+# Installation (the prefix is chosen at setup time; 'meson install' has no --prefix option):
+# meson setup build --buildtype release --prefix=$HOME/.local/lapack
+# meson compile -C build
+# meson install -C build
+#
+# NOTE: This is still a work in progress, the Makefiles are canonical
+project('OpenBLAS', ['c', 'fortran'],
+  default_options: ['c_std=c99'],
+  meson_version: '>=1.0.0',
+  version: '0.3.28')
+
+openblas_major_version = 0 # soversion
+openblas_minor_version = 3
+openblas_patch_version = '28'  # a string, unlike the two integer components above
+openblas_version = f'@openblas_major_version@.@openblas_minor_version@.@openblas_patch_version@'
+
+# Skip the check for valid CC
+cc = meson.get_compiler('c')
+fc = meson.get_compiler('fortran')
+cc_id = cc.get_id()
+fc_id = fc.get_id()
+_mbproot = meson.project_build_root()
+_msproot = meson.project_source_root()
+
+pkg = import('pkgconfig')
+pkg_ver = meson.project_version()
+pkg_install_dir = '../../pkgconfig'
+
+# Common args
+_args = []
+# TODO(rg): Max parallel number should be conditional
+_cargs = ['-DMAX_PARALLEL_NUMBER=1', f'-DVERSION=@openblas_version@']
+_fargs = []
+
+# System configuration
+build_single = get_option('build_single')
+build_double = 
get_option('build_double')
+build_complex = get_option('build_complex')
+build_complex16 = get_option('build_complex16')
+
+# TODO: Be conditional
+_cargs = [  # NOTE(review): plain '=' discards the -DMAX_PARALLEL_NUMBER/-DVERSION entries assigned to _cargs earlier -- confirm intended
+  '-DBUILD_SINGLE=1',
+  '-DBUILD_DOUBLE=1',
+  '-DBUILD_COMPLEX=1',
+  '-DBUILD_COMPLEX16=1',
+]
+
+# Options from CMakeLists.txt
+build_without_lapack = get_option('build_without_lapack')
+build_lapack_deprecated = get_option('build_lapack_deprecated')
+build_testing = get_option('build_testing')
+use_c_lapack = get_option('use_c_lapack')
+build_without_cblas = get_option('build_without_cblas')
+dynamic_arch = get_option('dynamic_arch')
+dynamic_older = get_option('dynamic_older')
+build_relapack = get_option('build_relapack')
+use_locking = get_option('use_locking')
+use_perl = get_option('use_perl')
+no_warmup = get_option('no_warmup')
+no_affinity = get_option('no_affinity')
+build_cpp_thread_safety_test = get_option('build_cpp_thread_safety_test')
+build_cpp_thread_safety_gemv = get_option('build_cpp_thread_safety_gemv')
+build_static_libs = get_option('build_static_libs')
+max_stack_alloc = get_option('max_stack_alloc')
+quad_prec = get_option('quad_precision')
+exprecision = get_option('exprecision')
+
+is_linux = host_machine.system() == 'linux'
+is_darwin = host_machine.system() == 'darwin'
+
+no_affinity = true  # unconditionally overrides the 'no_affinity' option value read above
+# TODO(mtsokol): Find out in which cases affinity should be on
+# if is_linux or is_darwin
+#   no_affinity = true
+# else
+#   no_affinity = false
+# endif
+
+if cc_id == 'clang'
+  asm_name_prefix = '_'
+else
+  asm_name_prefix = ''
+endif
+
+_check_prefix = []
+conf_data = configuration_data()  # written out below as getarch_conf.h
+is_win = host_machine.system() == 'windows' or host_machine.system() == 'cygwin'
+conf_data.set('OS_WINDOWS', is_win)
+hostcpu = host_machine.cpu_family()
+conf_data.set('ARCH', hostcpu.to_upper())
+
+if is_win
+  blas_server_src = 'blas_server_win32.c'
+else
+  blas_server_src = 'blas_server.c'
+endif
+
+defarch_array = [
+  # {'system': ['windows', 'cygwin'], 'def': ['OS_WINDOWS']},
+  {'cpu': ['aarch64'], 
'def': ['ARCH_ARM64']},
+  {'cpu': ['alpha'], 'def': ['ARCH_ALPHA']},
+  {'cpu': ['arm'], 'def': ['ARCH_ARM', 'USE_TRMM']},
+  {'cpu': ['x86_64'], 'def': ['INTEL_AMD', 'USE_GEMM3M', 'USE_DIRECT_SGEMM']},
+  {'cpu': ['s390x'], 'def': ['ARCH_ZARCH', 'ZARCH']},
+  {'cpu': ['ia64'], 'def': ['ARCH_IA64', 'USE_GEMM3M']},
+  {'cpu': ['sparc'], 'def': ['ARCH_SPARC']},
+  {'cpu': ['mips'], 'def': ['ARCH_MIPS']},
+  {'cpu': ['mips64'], 'def': ['ARCH_MIPS64', 'USE_TRMM']},
+  {'cpu': ['loongarch64'], 'def': ['ARCH_LOONGARCH64']},
+  {'cpu': ['riscv64'], 'def': ['ARCH_RISCV64', 'USE_TRMM']},
+  {'cpu': ['ppc64', 'ppc'], 'def': ['ARCH_POWER', 'POWER']},
+  {'cpu': ['generic'], 'def': ['USE_TRMM']},
+]
+
+# TODO(rg): Handle the kernel architectures later
+
+foreach arch : defarch_array
+  is_cpu = hostcpu in arch['cpu']  # true when this table row names the host CPU family
+  foreach def : arch['def']
+    conf_data.set(def, is_cpu or conf_data.get(def, false))  # OR with the stored value: defines shared by several rows (USE_TRMM, USE_GEMM3M) must not be reset by a later non-matching row
+  endforeach
+endforeach
+
+configure_file(output : 'getarch_conf.h',
+  configuration : conf_data)
+
+# Makefile.system
+cpu_fam = host_machine.cpu_family()  # host_machine is the machine the built library runs on
+
+if cpu_fam in ['x86_64', 'ppc64', 'ppc']
+  add_project_arguments([
+    '-DSMALL_MATRIX_OPT',
+    f'-DMAX_STACK_ALLOC=@max_stack_alloc@',
+  ], language:'c')
+endif
+
+if cpu_fam == 'x86_64'
+  _cargs += ['-m64']
+endif
+
+quadmath_dep = fc.find_library('quadmath', required: false)
+
+if fc_id == 'gcc'
+  add_project_arguments('-DF_INTERFACE_GFORT', language: 'c')
+else
+  upper_fcid = fc_id.to_upper()
+  add_project_arguments(f'-DF_INTERFACE_@upper_fcid@', language: 'c')
+endif
+
+py3 = find_program('python')
+
+simd_extensions = [
+  {'flag': '-mmmx', 'define': 'HAVE_MMX'},
+  {'flag': '-msse', 'define': 'HAVE_SSE'},
+  {'flag': '-msse2', 'define': 'HAVE_SSE2'},
+  {'flag': '-msse3', 'define': 'HAVE_SSE3'},
+  {'flag': '-mssse3', 'define': 'HAVE_SSSE3'},
+  {'flag': '-msse4.1', 'define': 'HAVE_SSE4_1'},
+  {'flag': '-msse4.2', 'define': 'HAVE_SSE4_2'},
+  {'flag': '-mfma', 'define': 'HAVE_FMA3'},
+  {'flag': '-mavx', 'define': 'HAVE_AVX'},
+  {'flag': '-mavx2', 'define': 
'HAVE_AVX2'},
+]
+
+simd_cargs = []
+simd_conf_data = configuration_data()
+
+foreach ext : simd_extensions
+  if cc.has_argument(ext['flag'])  # tests only that the compiler accepts the flag
+    simd_cargs += ext['flag']
+    simd_conf_data.set(ext['define'], true)
+  else
+    simd_conf_data.set(ext['define'], false)
+  endif
+endforeach
+
+# Generate configuration header
+configure_file(output : 'simd_conf.h',
+  configuration : simd_conf_data)
+# Can't be added as a project argument since it won't be generated until later
+add_project_arguments(simd_cargs, language: 'c')  # NOTE(review): every accepted SIMD flag is applied project-wide to C -- confirm this is intended for non-native deployments
+
+# Common symbol related options
+symnames = ['ASMNAME', 'ASMFNAME', 'NAME', 'CNAME', 'CHAR_NAME', 'CHAR_CNAME']
+
+# TODO(rg): Maybe make these conditional..
+_cargs += [  # NOTE(review): the four BUILD_* defines repeat the ones _cargs was initialised with earlier
+  '-DSMP_SERVER', # This is evidently necessary for the driver/level2
+  '-DBUILD_SINGLE=1',
+  '-DBUILD_DOUBLE=1',
+  '-DBUILD_COMPLEX=1',
+  '-DBUILD_COMPLEX16=1',
+]
+
+# Other common options, move later
+# Undefine to help prevent clashes
+foreach symb : symnames
+  _cargs += f'-U@symb@'
+endforeach
+# Based on options
+if no_affinity
+  _cargs += '-DNO_AFFINITY'
+endif
+if no_warmup
+  _cargs += '-DNO_WARMUP'
+endif
+# Parallel builds
+# TODO: This can be cleaned up significantly
+# Also use multiprocessing.cpu_count()
+# TODO: Handle SMP_SERVER
+num_parallel = get_option('num_parallel')
+if num_parallel > 1
+  _cargs += f'-DMAX_PARALLEL_NUMBER=@num_parallel@'
+endif
+num_cores = get_option('num_cores')
+if num_cores > 0
+  num_threads = num_cores
+else
+  num_threads = 0
+endif
+use_thread = false
+if num_threads > 2  # NOTE(review): threading is enabled only for three or more cores -- confirm the threshold
+  use_thread = true
+endif
+if use_thread
+  message('Multi-threading enabled with ' + num_threads.to_string() + ' threads.')
+  _cargs += f'-DMAX_CPU_NUMBER=@num_threads@'
+  _cargs += f'-DSMP_SERVER'  # already appended unconditionally above
+else
+  if get_option('use_locking')
+    _cargs += '-DUSE_LOCKING'
+  endif
+endif
+
+# Common maps
+# Naming conventions: https://www.intel.com/content/www/us/en/docs/onemkl/developer-reference-c/2024-1/naming-conventions-for-blas-routines.html
+# Also see:
+# L1: 
https://www.intel.com/content/www/us/en/docs/onemkl/developer-reference-c/2024-1/blas-level-1-routines-and-functions.html +precision_mappings = { + 's': {'undef': ['COMPLEX', 'DOUBLE']}, + 'd': {'undef': ['COMPLEX'], 'def': ['DOUBLE']}, + 'q': {'undef': ['COMPLEX'], 'def': ['XDOUBLE']}, + 'c': {'undef': ['DOUBLE'], 'def': ['COMPLEX']}, + 'z': {'def': ['COMPLEX', 'DOUBLE']}, + 'x': {'def': ['COMPLEX', 'XDOUBLE']}, + 'cs': {'undef': ['DOUBLE'], 'def': ['COMPLEX']}, + 'sc': {'undef': ['DOUBLE'], 'def': ['COMPLEX']}, + 'dz': {'def': ['COMPLEX', 'DOUBLE']}, + 'zd': {'def': ['COMPLEX', 'DOUBLE']}, + 'qx': {'def': ['COMPLEX', 'XDOUBLE']}, + 'xq': {'def': ['COMPLEX', 'XDOUBLE']}, + '': {}, # special case, for cblas_?dot*_sub + # NOTE: Anything with XDOUBLE aka longdouble has no cblas_ + # xq / qx == x + # sc / cs == c + # zd / dz is the same as z + # 'zd': {'undef': [], 'def': ['COMPLEX', 'DOUBLE']}, +} + +ext_mappings = { + # TRANSA is only in drivers level2, kernel uses TRANS + # LOWER is not there for only for ?ger_thread_U in drivers level2 + # TODO(rg): Does that, i.e. having (un)used symbols (un)defined matter? 
+ '': {}, # special case + '_k': {}, + '_U': { + 'undef': ['LOWER', 'CONJ', 'XCONJ'], + 'except': [ + '?hemv', '?hemv_thread', '?hpmv', '?hpmv_thread', '?her', '?her_thread', + '?her2', '?her2_thread', '?hpr2', '?hpr2_thread', + ], + }, + '_C': {'def': ['CONJ'], 'undef': ['XCONJ']}, + '_D': {'def': ['CONJ', 'XCONJ']}, + '_L': { + 'def': ['LOWER'], + 'except': [ + '?hemv', '?hemv_thread', '?hpmv', '?hpmv_thread', '?her', '?her_thread', + '?her2', '?her2_thread', '?hpr2', '?hpr2_thread', + ], + }, + '_LN': {'def': ['LEFT'], 'undef': ['TRANSA'], + 'except': ['?syrk', '?syrk_thread', + '?syr2k', '?herk', '?herk_kernel', + '?trsm_kernel', '?her2k', '?her2k_kernel']}, + # Handle HEMV and HEMVREV better + '_V': { + 'def': ['HEMV', 'HEMVREV', 'XCONJ'], + 'undef': ['LOWER', 'CONJ'], + 'except': [ + '?hemv', '?hemv_thread', '?hpmv', '?hpmv_thread', '?her', '?her_thread', + '?her2', '?her2_thread', '?hpr2', '?hpr2_thread', + ], + }, + '_M': { + 'def': ['HEMV', 'HEMVREV', 'LOWER'], + 'except': [ + '?hemv', '?hemv_thread', '?hpmv', '?hpmv_thread', '?her', '?her_thread', + '?her2', '?her2_thread', '?hpr2', '?hpr2_thread', + ], + }, + '_n': {'undef': ['TRANS', 'TRANSA', 'CONJ', 'XCONJ']}, + '_t': {'def': ['TRANS', 'TRANSA'], 'undef': ['CONJ', 'XCONJ']}, + '_r': {'def': ['CONJ'], 'undef': ['TRANS', 'TRANSA', 'XCONJ']}, + '_c': {'def': ['TRANS', 'TRANSA', 'CONJ'], 'undef': ['XCONJ']}, + '_o': {'def': ['XCONJ'], 'undef': ['TRANS', 'TRANSA', 'CONJ']}, + '_u': {'def': ['TRANS', 'TRANSA', 'XCONJ'], 'undef': ['CONJ']}, + '_s': {'def': ['CONJ', 'XCONJ'], 'undef': ['TRANS', 'TRANSA']}, + '_d': {'def': ['TRANS', 'TRANSA', 'CONJ', 'XCONJ']}, + '_nn': {'def': ['NN']}, + '_nt': {'def': ['NT']}, + '_nr': {'def': ['NR']}, + '_nc': {'def': ['NC']}, + '_tn': {'def': ['TN']}, + '_tt': {'def': ['TT']}, + '_tr': {'def': ['TR']}, + '_tc': {'def': ['TC']}, + '_rn': {'def': ['RN']}, + '_rt': {'def': ['RT']}, + '_rr': {'def': ['RR']}, + '_rc': {'def': ['RC=RC']}, + '_cn': {'def': ['CN']}, + '_ct': 
{'def': ['CT']}, + '_cr': {'def': ['CR=CR']}, + '_cc': {'def': ['CC']}, + # Level 3 symbols + '_LU': {'def': ['NN'], 'undef': ['LOWER', 'RSIDE']}, + '_LL': {'def': ['LOWER', 'NN'], 'undef': ['RSIDE']}, + '_RU': {'def': ['RSIDE', 'NN'], 'undef': ['LOWER'], 'except': ['?hemm', '?hemm_thread']}, + '_RL': {'def': ['RSIDE', 'NN', 'LOWER'], 'except': ['?hemm', '?hemm_thread']}, + '_RN': {'undef': ['LEFT', 'TRANSA']}, + '_RR': {'undef': ['LEFT', 'TRANSA']}, + '_RT': {'def': ['TRANSA'], 'undef': ['LEFT']}, + '_RC': {'def': ['TRANSA'], 'undef': ['LEFT']}, + # TODO(rg): is CONJ OK for interface symbols? + '_UN': {'undef': ['TRANS', 'LOWER', 'CONJ'], 'except': ['?syrk']}, + '_UT': {'def': ['TRANS'], 'undef': ['LOWER'], 'except': ['?syrk']}, + '_UC': {'def': ['TRANS', 'CONJ'], 'undef': ['LOWER']}, + '_LC': {'def': ['LOWER', 'TRANS', 'CONJ']}, +} + +ext_mappings_l2 = [ + {'ext': '_NUU', 'def': ['UNIT'], 'undef': ['TRANSA', 'LOWER'], 'for': ['s', 'd']}, + {'ext': '_NUN', 'undef': ['TRANSA', 'UNIT', 'LOWER'], 'for': ['s', 'd']}, + {'ext': '_TLU', 'def': ['UNIT', 'TRANSA', 'LOWER'], 'for': ['s', 'd']}, + {'ext': '_TLN', 'def': ['TRANSA', 'LOWER'], 'undef': ['UNIT'], 'for': ['s', 'd']}, + {'ext': '_NLU', 'def': ['UNIT', 'LOWER'], 'undef': ['TRANSA'], 'for': ['s', 'd']}, + {'ext': '_NLN', 'def': ['LOWER'], 'undef': ['TRANSA', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_TUU', 'def': ['UNIT', 'TRANSA'], 'undef': ['LOWER'], 'for': ['s', 'd']}, + {'ext': '_TUN', 'def': ['TRANSA'], 'undef': ['UNIT', 'LOWER'], 'for': ['s', 'd']}, + {'ext': '_NUU', 'def': ['UNIT', 'TRANSA=1'], 'undef': ['LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_NUN', 'def': ['TRANSA=1'], 'undef': ['UNIT', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_TLU', 'def': ['UNIT', 'TRANSA=2', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_TLN', 'def': ['TRANSA=2', 'LOWER'], 'undef': ['UNIT'], 'for': ['c', 'x', 'z']}, + {'ext': '_RLU', 'def': ['UNIT', 'TRANSA=3', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_RLN', 'def': 
['TRANSA=3', 'LOWER'], 'undef': ['UNIT'], 'for': ['c', 'x', 'z']}, + {'ext': '_CLU', 'def': ['UNIT', 'TRANSA=4', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_CLN', 'def': ['TRANSA=4', 'LOWER'], 'undef': ['UNIT'], 'for': ['c', 'x', 'z']}, + {'ext': '_NLU', 'def': ['UNIT', 'TRANSA=1', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_NLN', 'def': ['TRANSA=1', 'LOWER'], 'undef': ['UNIT'], 'for': ['c', 'x', 'z']}, + {'ext': '_TUU', 'def': ['UNIT', 'TRANSA=2'], 'undef': ['LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_TUN', 'def': ['TRANSA=2'], 'undef': ['UNIT', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_RUU', 'def': ['UNIT', 'TRANSA=3'], 'undef': ['LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_RUN', 'def': ['TRANSA=3'], 'undef': ['UNIT', 'LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_CUU', 'def': ['UNIT', 'TRANSA=4'], 'undef': ['LOWER'], 'for': ['c', 'x', 'z']}, + {'ext': '_CUN', 'def': ['TRANSA=4'], 'undef': ['UNIT', 'LOWER'], 'for': ['c', 'x', 'z']} +] + +ext_mappings_l3 = [ + {'ext': '_LNUU', 'def': ['UPPER', 'UNIT'], 'undef': ['TRANSA'], 'for': ['s', 'd']}, + {'ext': '_LNUN', 'def': ['UPPER'], 'undef': ['TRANSA', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_LNLU', 'def': ['UNIT'], 'undef': ['TRANSA', 'UPPER'], 'for': ['s', 'd']}, + {'ext': '_LNLN', 'undef': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['s', 'd', 'q']}, + {'ext': '_LTUU', 'def': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_LTUN', 'def': ['TRANSA', 'UPPER'], 'undef': ['UNIT'], 'for': ['s', 'd']}, + {'ext': '_LTLU', 'def': ['TRANSA', 'UNIT'], 'undef': ['UPPER'], 'for': ['s', 'd', 'q']}, + {'ext': '_LTLN', 'def': ['TRANSA'], 'undef': ['UPPER', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_RNUU', 'def': ['UPPER', 'UNIT'], 'undef': ['TRANSA'], 'for': ['s', 'd']}, + {'ext': '_RNUN', 'def': ['UPPER'], 'undef': ['TRANSA', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_RNLU', 'def': ['UNIT'], 'undef': ['TRANSA', 'UPPER'], 'for': ['s', 'd']}, + {'ext': '_RNLN', 'undef': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['s', 
'd']}, + {'ext': '_RTUU', 'def': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['s', 'd']}, + {'ext': '_RTUN', 'def': ['TRANSA', 'UPPER'], 'undef': ['UNIT'], 'for': ['s', 'd', 'q']}, + {'ext': '_RTLU', 'def': ['TRANSA', 'UNIT'], 'undef': ['UPPER'], 'for': ['s', 'd', 'q']}, + {'ext': '_RTLN', 'def': ['TRANSA'], 'undef': ['UPPER', 'UNIT'], 'for': ['s', 'd']}, + + # For trmm + {'ext': '_LNUU', 'def': ['UPPER', 'UNIT'], 'undef': ['TRANSA', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LNUN', 'def': ['UPPER'], 'undef': ['TRANSA', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LNLU', 'def': ['UNIT'], 'undef': ['TRANSA', 'UPPER', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LNLN', 'def': [], 'undef': ['TRANSA', 'UPPER', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LTUU', 'def': ['TRANSA', 'UPPER', 'UNIT'], 'undef': ['CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LTUN', 'def': ['TRANSA', 'UPPER'], 'undef': ['UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LTLU', 'def': ['TRANSA', 'UNIT'], 'undef': ['UPPER', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LTLN', 'def': ['TRANSA'], 'undef': ['UPPER', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_LRUU', 'def': ['UPPER', 'UNIT', 'CONJ'], 'undef': ['TRANSA'], 'for': ['c', 'z', 'x']}, + {'ext': '_LRUN', 'def': ['UPPER', 'CONJ'], 'undef': ['TRANSA', 'UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_LRLU', 'def': ['UNIT', 'CONJ'], 'undef': ['TRANSA', 'UPPER'], 'for': ['c', 'z', 'x']}, + {'ext': '_LRLN', 'def': ['CONJ'], 'undef': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_LCUU', 'def': ['TRANSA', 'UPPER', 'UNIT', 'CONJ'], 'undef': [], 'for': ['c', 'z', 'x']}, + {'ext': '_LCUN', 'def': ['TRANSA', 'UPPER', 'CONJ'], 'undef': ['UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_LCLU', 'def': ['TRANSA', 'UNIT', 'CONJ'], 'undef': ['UPPER'], 'for': ['c', 'z', 'x']}, + {'ext': '_LCLN', 'def': ['TRANSA', 'CONJ'], 'undef': ['UPPER', 'UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_RNUU', 'def': ['UPPER', 'UNIT'], 
'undef': ['TRANSA', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RNUN', 'def': ['UPPER'], 'undef': ['TRANSA', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RNLU', 'def': ['UNIT'], 'undef': ['TRANSA', 'UPPER', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RNLN', 'def': [], 'undef': ['TRANSA', 'UPPER', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RTUU', 'def': ['TRANSA', 'UPPER', 'UNIT'], 'undef': ['CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RTUN', 'def': ['TRANSA', 'UPPER'], 'undef': ['UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RTLU', 'def': ['TRANSA', 'UNIT'], 'undef': ['UPPER', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RTLN', 'def': ['TRANSA'], 'undef': ['UPPER', 'UNIT', 'CONJ'], 'for': ['c', 'z', 'x']}, + {'ext': '_RRUU', 'def': ['UPPER', 'UNIT', 'CONJ'], 'undef': ['TRANSA'], 'for': ['c', 'z', 'x']}, + {'ext': '_RRUN', 'def': ['UPPER', 'CONJ'], 'undef': ['TRANSA', 'UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_RRLU', 'def': ['UNIT', 'CONJ'], 'undef': ['TRANSA', 'UPPER'], 'for': ['c', 'z', 'x']}, + {'ext': '_RRLN', 'def': ['CONJ'], 'undef': ['TRANSA', 'UPPER', 'UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_RCUU', 'def': ['TRANSA', 'UPPER', 'UNIT', 'CONJ'], 'undef': [], 'for': ['c', 'z', 'x']}, + {'ext': '_RCUN', 'def': ['TRANSA', 'UPPER', 'CONJ'], 'undef': ['UNIT'], 'for': ['c', 'z', 'x']}, + {'ext': '_RCLU', 'def': ['TRANSA', 'UNIT', 'CONJ'], 'undef': ['UPPER'], 'for': ['c', 'z', 'x']}, + {'ext': '_RCLN', 'def': ['TRANSA', 'CONJ'], 'undef': ['UPPER', 'UNIT'], 'for': ['c', 'z', 'x']}, + + # symm + # syrk + {'ext': '_UN', 'def': [], 'undef': ['LOWER', 'TRANS', 'CONJ'], 'for': ['s', 'd', 'c', 'z']}, + {'ext': '_UT', 'def': ['TRANS'], 'undef': ['LOWER', 'CONJ'], 'for': ['s', 'd', 'c', 'z']}, + {'ext': '_LN', 'def': ['LOWER'], 'undef': ['TRANS', 'CONJ'], 'for': ['s', 'd', 'c', 'z']}, + {'ext': '_LT', 'def': ['TRANS', 'LOWER'], 'undef': ['CONJ'], 'for': ['s', 'd', 'c', 'z']}, + {'ext': '_RU', 'def': ['RSIDE', 'NC'], 'undef': ['LOWER'], 
'for': ['c', 'z']}, + {'ext': '_RL', 'def': ['RSIDE', 'NC', 'LOWER'], 'for': ['c', 'z']}, + # hem hemv_thread + {'ext': '_U', 'def': ['HEMV', 'HER'], 'undef': ['LOWER'], 'for': ['c', 'z']}, + {'ext': '_L', 'def': ['HEMV', 'HER', 'LOWER'], 'for': ['c', 'z']}, + {'ext': '_V', 'def': ['HEMVREV', 'HERREV'], 'undef': ['LOWER'], 'for': ['c', 'z']}, + {'ext': '_M', 'def': ['HEMVREV', 'HERREV', 'LOWER'], 'for': ['c', 'z']} +] + + # cc -c -O2 -DSMALL_MATRIX_OPT -DMAX_STACK_ALLOC=2048 -Wall -m64 -DF_INTERFACE_GFORT -fPIC -DSMP_SERVER -DNO_WARMUP -DMAX_CPU_NUMBER=12 -DMAX_PARALLEL_NUMBER=1 -DBUILD_SINGLE=1 -DBUILD_DOUBLE=1 -DBUILD_COMPLEX=1 -DBUILD_COMPLEX16=1 -DVERSION=\"0.3.26.dev\" -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mavx2 -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -DASMNAME=strmm_RTUU -DASMFNAME=strmm_RTUU_ -DNAME=strmm_RTUU_ -DCNAME=strmm_RTUU -DCHAR_NAME=\"strmm_RTUU_\" -DCHAR_CNAME=\"strmm_RTUU\" -DNO_AFFINITY -I../.. -UDOUBLE -UCOMPLEX -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT trmm_R.c -o strmm_RTUU.o + +symb_defs = { + '?amax': {'def': ['USE_ABS'], 'undef': ['USE_MIN']}, + '?amin': {'def': ['USE_ABS', 'USE_MIN']}, + 'i?max': {'undef': ['USE_ABS', 'USE_MIN']}, + 'i?amax': {'def': ['USE_ABS'], 'undef': ['USE_MIN']}, + 'i?amin': {'def': ['USE_ABS', 'USE_MIN']}, + 'i?min': {'def': ['USE_MIN'], 'undef': ['USE_ABS']}, + '?max': {'undef': ['USE_ABS', 'USE_MIN']}, + '?min': {'def': ['USE_MIN'], 'undef': ['USE_ABS']}, + '?axpyc': {'def': ['CONJ']}, + '?dotu': {'undef': ['CONJ']}, + '?dotc': {'def': ['CONJ']}, + '?geru': {'undef': ['CONJ']}, + '?gerc': {'def': ['CONJ']}, + '?hemm': {'def': ['HEMM']}, + '?herk': {'def': ['HEMM']}, + '?her2k': {'def': ['HEMM']}, + '?gemm3m': {'def': ['GEMM3M']}, + '?symm3m': {'def': ['GEMM3M']}, + '?hemm3m': {'def': ['HEMM', 'GEMM3M']}, + '?her_thread': {'def': ['HER']}, + '?her2_thread': {'def': ['HER']}, + '?hpr_thread': {'def': ['HEMV']}, + '?trmm_kernel': {'def': ['TRMMKERNEL']}, + '?trsm_kernel': {'def': 
['TRSMKERNEL']},
+  '?dsdot': {'def': ['DSDOT']},
+  '?bgemm': {'def': ['HALF']},
+  '?gemm_small_kernel_b0': {'def': ['B0']},
+  'cblas_?dotu_sub': {'def': ['CBLAS', 'FORCE_USE_STACK'], 'undef': ['CONJ']},
+  'cblas_?dotc_sub': {'def': ['CBLAS', 'FORCE_USE_STACK', 'CONJ']},
+}
+
+# 3m_ rules: extra compiler flags keyed by a name prefix / suffix (presumably matched against kernel names - confirm in kernel/meson.build)
+m3_startswith = {
+  '3m_i': ['-UUSE_ALPHA'],
+  '3m_o': ['-DUSE_ALPHA'],
+}
+m3_endswith = {
+  'copyr': ['-DREAL_ONLY'],
+  'copyi': ['-DIMAGE_ONLY'],
+}
+
+# config.h / Makefile.conf generation: runs at configure time; getarch output is appended to both files via write_to_file.py
+
+_config_h = f'@_mbproot@/config.h'
+_makefile_conf = f'@_mbproot@/Makefile.conf'
+run_command(f'@_msproot@/c_check', _makefile_conf, _config_h, cc_id, check: true)
+run_command(f'@_msproot@/f_check', _makefile_conf, _config_h, fc_id, check: true)
+
+run_command(cc_id, '-o', f'@_mbproot@/getarch', 'getarch.c', 'cpuid.S', check: true)
+_getarch_1_result = run_command(f'@_mbproot@/getarch', '1', check: true, capture: true)  # '1' output goes to config.h
+run_command(py3,
+  f'@_msproot@/write_to_file.py',
+  _getarch_1_result.stdout(),
+  _config_h,
+  check: true)
+_getarch_0_result = run_command(f'@_mbproot@/getarch', '0', check: true, capture: true)  # '0' output goes to Makefile.conf
+run_command(py3,
+  f'@_msproot@/write_to_file.py',
+  _getarch_0_result.stdout(),
+  _makefile_conf,
+  check: true)
+
+run_command(cc_id,
+  '-DGEMM_MULTITHREAD_THRESHOLD=4',
+  '-I.',
+  f'-I@_mbproot@',  # picks up the config.h generated above
+  '-o', f'@_mbproot@/getarch_2nd',  # build root, like getarch, so the source tree stays clean
+  'getarch_2nd.c',
+  capture: true, check: true)
+_getarch_2nd_1_result = run_command(f'@_mbproot@/getarch_2nd', '1', check: true, capture: true)
+run_command(py3,
+  f'@_msproot@/write_to_file.py',
+  _getarch_2nd_1_result.stdout(),
+  _config_h,
+  check: true)
+_getarch_2nd_0_result = run_command(f'@_mbproot@/getarch_2nd', '0', check: true, capture: true)
+run_command(py3,
+  f'@_msproot@/write_to_file.py',
+  _getarch_2nd_0_result.stdout(),
+  _makefile_conf,
+  check: true)
+
+
+_read_config_py = f'@_msproot@/read_config.py'
+
+run_command(py3,
+  _read_config_py,
+  '--file1', _config_h,
+  '--build_dir', f'@_mbproot@',  # read_config.py writes config.kconf into this directory
+  check: true)
+
+keyval = import('keyval')
+conf_kv = keyval.load(f'@_mbproot@/config.kconf')
+# NOTE(rg): conf_kv doesn't do any parsing, setup manually (keys containing 'CHAR' are stored quoted)
+conf_hdat = configuration_data()
+foreach key,val : conf_kv
+  if 'CHAR' in key
+    conf_hdat.set_quoted(key, val)
+  else
+    conf_hdat.set(key, val)
+  endif
+endforeach
+
+makefile_conf_kv = keyval.load(f'@_mbproot@/Makefile.conf')
+makefile_conf_dat = configuration_data()
+foreach key,val : makefile_conf_kv
+  makefile_conf_dat.set(key, val)
+endforeach
+
+# Ignoring other hostarch checks and conflicts for arch in BSD for now
+_inc = [include_directories('.')]
+subdir('lapack-netlib')
+subdir('interface')
+subdir('driver/level2')
+subdir('driver/level3')
+subdir('driver/others')
+subdir('kernel')
+
+openblas = both_libraries('openblas',
+  link_whole: [
+    _interface,
+    _l2_driver,
+    _l3_driver,
+    _others,
+    _kern,
+  ],
+  dependencies: [dependency('threads'), quadmath_dep],
+  override_options: ['b_lundef=false', 'b_asneeded=false'],
+  install: true,
+  )
+openblas_static = openblas.get_static_lib()
+
+# Handle headers: LAPACKE headers are installed as-is unless LAPACK is disabled
+fs = import('fs')
+
+if not get_option('build_without_lapack')
+  lapacke_root = 'lapack-netlib/LAPACKE/include'
+  lapacke_headers = [ 'lapack.h', 'lapacke_config.h', 'lapacke.h', 'lapacke_mangling.h', 'lapacke_utils.h', ]
+
+  foreach head : lapacke_headers
+    fs.copyfile(f'@lapacke_root@/@head@',
+      install_dir: 'include',
+      install: true)
+  endforeach
+
+endif
+
+# Paths to necessary files for header generation
+fconfig_h = f'@_mbproot@/config.h'
+fconfig_last = f'@_mbproot@/config_last.h'
+template = f'@_msproot@/openblas_config_template.h'
+common_interface = f'@_msproot@/common_interface.h'
+cblas = f'@_msproot@/cblas.h'
+
+# Prepare config_last.h from config.h (a copy plus optional QUAD_PRECISION / EXPRECISION defines)
+pcl = custom_target('prepare_config_last',
+  output : 'config_last.h',
+  command : [py3, f'@_msproot@/prepare_config_last.py',
+    '--config', fconfig_h,
+    '--output', fconfig_last,
+    quad_prec ? '--quad-precision' : [],
+    exprecision ? '--exprecision' : [],
+  ],
+  build_by_default : true,
+)
+
+# Generate the headers (needs config_last.h, hence depends on pcl)
+custom_target('gen_install_headers',
+  output : ['openblas_config.h', 'f77blas.h', 'cblas.h'],
+  command : [py3, f'@_msproot@/gen_install_headers.py',
+    '--dest-dir', f'@_mbproot@',
+    '--version', meson.project_version(),
+    '--config-last', fconfig_last,
+    '--template', template,
+    '--common-interface', common_interface,
+    '--cblas', cblas,
+    not get_option('no_fortran') ? '--generate-f77blas' : [],
+    not get_option('build_without_cblas') ? '--generate-cblas' : [],
+  ],
+  install : true,
+  install_dir : 'include',
+  depends : pcl,
+)
+
+pkg.generate(openblas,
+  name: 'openblas',
+  filebase: 'meson-openblas',
+  description: 'OpenBLAS via meson build',
+  version: f'@pkg_ver@_meson',
+  install_dir: pkg_install_dir,
+  )
+
+if build_testing
+  subdir('test')
+  subdir('ctest')
+  subdir('utest')
+endif
+
+# Dependency setup: what the wrap's [provide] section exposes as openblas_dep
+openblas_dep = declare_dependency(
+  link_whole : openblas_static,
+  include_directories : _inc,
+  version: meson.project_version(),
+)
diff --git a/subprojects/packagefiles/openblas/meson_options.txt b/subprojects/packagefiles/openblas/meson_options.txt
new file mode 100644
index 000000000..4d5accc52
--- /dev/null
+++ b/subprojects/packagefiles/openblas/meson_options.txt
@@ -0,0 +1,60 @@
+# From CMakeLists
+option('build_without_lapack', type: 'boolean', value: false, description: 'Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)')
+option('build_lapack_deprecated', type: 'boolean', value: true, description: 'When building LAPACK, include also some older, deprecated routines')
+option('build_testing', type: 'boolean', value: false, description: 'Build LAPACK testsuite when building LAPACK')
+option('use_c_lapack', type: 'boolean', value: false, description: 'Build LAPACK from C sources instead of the original Fortran')
+option('build_without_cblas', type: 'boolean', value: false, description: 'Do not build the C interface (CBLAS) to the BLAS functions')
+option('use_perl', type: 'boolean', value: false, description: 'Use the older PERL scripts for build preparation instead of universal shell scripts')
+option('no_warmup', type: 'boolean', value: true, description: 'Do not run a benchmark on each startup just to find the best location for the memory buffer')
+option('no_affinity', type: 'boolean', value: true, description: 'Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core')
+option('build_cpp_thread_safety_test', type: 'boolean', value: false, description: 'Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)')
+option('build_cpp_thread_safety_gemv', type: 'boolean', value: false, description: 'Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)')
+option('build_static_libs', type: 'boolean', value: false, description: 'Build static library')
+# Parallel options
+option('num_cores', type: 'integer',
+       min: 1, value: 1, description: 'Number of CPUs')
+option('num_parallel', type: 'integer',
+       min: 1, value: 1, description: 'Max CPU')
+option('num_threads', type: 'integer',
+       min: 1, value: 12, description: 'Max threads')
+option('use_locking', type: 'boolean', value: false,
+       description: 'Use locks even in single-threaded builds to make them callable from multiple threads')
+
+# From Makefile
+option('dynamic_arch', type: 'boolean', value: false, description: 'Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)')
+option('dynamic_older', type: 'boolean', value: false, description: 'Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH')
+option('no_lapack', type: 'boolean', value: false, description: 'Disable LAPACK')
+option('build_relapack', type: 'boolean', value: false, description: 'Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)')
+option('no_fortran', type: 'boolean', value: false, description: 'Disable Fortran compiler usage')
+
+# From Makefile.system
+option('relapack_replace', type: 'boolean', value: false, description: 'Replace RELAPACK functions')
+option('build_single', type: 'boolean', value: true, description: 'Build single precision')
+option('build_double', type: 'boolean', value: true, description: 'Build double precision')
+option('build_complex', type: 'boolean', value: true, description: 'Build complex precision')
+option('build_complex16', type: 'boolean', value: true, description: 'Build double complex precision')
+option('max_stack_alloc', type: 'integer', value: 2048, description: 'Max stack allocation')
+
+# From lapack-netlib
+option('realkind', type : 'string', value : 'd',
+       description : 's: real32 d: real64 c: complex32 z: complex64')
+option('use_xblas', type : 'boolean', value : false,
+       description : 'Build extended precision (needs XBLAS)')
+
+# From / for interface/Makefile
+option('build_bfloat16', type: 'boolean', value: false, description: 'Build bfloat16')
+# option('exprecision', type: 'boolean', value: false, description: 'Build the q suffixes')
+
+# Meson only
+# This is the equivalent of producing all precisions via make all inside lapack-netlib/blas/src
+option('build_all_prec', type: 'boolean', value: true, description: 'Build all precisions')
+# For installation; presumably these map to the QUAD_PRECISION / EXPRECISION defines written by prepare_config_last.py (confirm where quad_prec / exprecision are set)
+option('quad_precision', type: 'boolean', value: false, description: 'Build quad precision')
+option('exprecision', type: 'boolean', value: false, description: 'Build extended precision')
+
+# For naming netlib libraries
+option('netlib_blas_name', type: 'string', value: 'refblas', description: 'Name for the Netlib BLAS library')
+option('netlib_cblas_name', type: 'string', value: 'cblas', description: 'Name for the Netlib CBLAS library')
+option('netlib_lapack_name', type: 'string', value: 'lapack', description: 'Name for the
Netlib LAPACK library')
+option('netlib_tmglib_name', type: 'string', value: 'tmglib', description: 'Name for the Netlib TMGLIB library')
+option('netlib_lapacke_name', type: 'string', value: 'lapacke', description: 'Name for the Netlib LAPACKE library')
diff --git a/subprojects/packagefiles/openblas/prepare_config_last.py b/subprojects/packagefiles/openblas/prepare_config_last.py
new file mode 100644
index 000000000..80602db88
--- /dev/null
+++ b/subprojects/packagefiles/openblas/prepare_config_last.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+
+
+def prepare_config_last(config_path, output_path, quad_precision, exprecision):
+    config_path = Path(config_path)
+    output_path = Path(output_path)
+
+    with config_path.open("r") as config_file:
+        config_lines = config_file.readlines()
+
+    with output_path.open("w") as output_file:
+        output_file.writelines(config_lines)
+        if quad_precision:
+            output_file.write("#define QUAD_PRECISION\n")
+        if exprecision:
+            output_file.write("#define EXPRECISION\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Prepare config_last.h from config.h")
+    parser.add_argument("--config", required=True, help="Path to config.h")
+    parser.add_argument("--output", required=True, help="Path to output config_last.h")
+    parser.add_argument(
+        "--quad-precision", action="store_true", help="Enable QUAD_PRECISION"
+    )
+    parser.add_argument("--exprecision", action="store_true", help="Enable EXPRECISION")
+
+    args = parser.parse_args()
+
+    prepare_config_last(args.config, args.output, args.quad_precision, args.exprecision)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/subprojects/packagefiles/openblas/read_config.py b/subprojects/packagefiles/openblas/read_config.py
new file mode 100644
index 000000000..216d70849
--- /dev/null
+++ b/subprojects/packagefiles/openblas/read_config.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+
+
+def read_config_file(file_path: Path) -> dict:
+    """Parse the ``#define`` lines of a config.h-style file into a dict.
+
+    ``#define KEY`` maps KEY to 1; ``#define KEY VALUE`` maps KEY to VALUE
+    (an int when VALUE is all digits, unquoted when wrapped in double quotes).
+    Defines with more than one value token, and all other lines, are ignored.
+    """
+    config_data = {}
+    with file_path.open("r") as file:
+        for line in file:
+            line = line.strip()
+            if not line.startswith("#define"):
+                continue
+            parts = line.split()
+            if len(parts) < 2:
+                # A bare "#define" has no name to record.
+                continue
+            key = parts[1]
+            if len(parts) == 3:
+                value = parts[2]
+                if value.isdigit():
+                    value = int(value)
+                elif value.startswith('"') and value.endswith('"'):
+                    value = value.strip('"')
+                config_data[key] = value
+            elif len(parts) == 2:
+                config_data[key] = 1
+    return config_data
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Read a config.h file and write to a specified build directory."
+    )
+    parser.add_argument("--file1", type=Path, help="Path to the config.h file.")
+    parser.add_argument("--build_dir", type=Path, help="Path to the build directory.")
+
+    args = parser.parse_args()
+
+    config_data = read_config_file(args.file1)
+    fdat = []
+    for key, value in config_data.items():
+        fdat.append(f"{key}={value}")
+    result = "\n".join(fdat)
+
+    # Ensure the build directory exists
+    args.build_dir.mkdir(parents=True, exist_ok=True)
+
+    # Overwrite (not append) so a reconfigure cannot glue stale entries onto
+    # new ones; end with a newline so the last KEY=VALUE pair is complete.
+    output_file_path = args.build_dir / "config.kconf"
+    with output_file_path.open("w") as f:
+        f.write(result + "\n")
diff --git a/subprojects/packagefiles/openblas/test/meson.build b/subprojects/packagefiles/openblas/test/meson.build
new file mode 100644
index 000000000..033fd01d0
--- /dev/null
+++ b/subprojects/packagefiles/openblas/test/meson.build
@@ -0,0 +1,50 @@
+_test_input_array = [
+  {'id': 'sblat1', 'has_dat': false},
+  {'id': 'sblat2', 'has_dat': true},
+  {'id': 'sblat3', 'has_dat': true},
+  {'id': 'dblat1', 'has_dat': false},
+  {'id': 'dblat2', 'has_dat': true},
+  {'id': 'dblat3', 'has_dat': true},
+  {'id': 'cblat1', 'has_dat': false},
+  {'id': 'cblat2', 'has_dat': true},
+  {'id': 'cblat3', 'has_dat': true},
+  {'id': 'zblat1', 'has_dat': false},
+  {'id': 'zblat2', 'has_dat': true},
+  {'id': 'zblat3', 'has_dat': true},
+]
+
+if
conf_hdat.has('ARCH_X86_64') or conf_hdat.has('ARCH_X86')
+  _test_input_array += [
+    {'id': 'cblat3_3m', 'has_dat': true}, {'id': 'zblat3_3m', 'has_dat': true},
+  ]
+endif
+
+_test_runner = executable('test_runner', sources: ['test_runner.c'], install: false)
+
+foreach _test : _test_input_array
+  test_id = _test['id']
+  executable(
+    test_id,
+    sources: f'@test_id@.f',
+    link_with: [openblas_static],
+    dependencies: [dependency('threads')],
+  )
+
+  if is_win
+    obj_name = test_id
+  else
+    obj_name = f'./@test_id@'
+  endif
+
+  _args = [obj_name]
+  if _test['has_dat']
+    _args += [ meson.current_source_dir() / f'@test_id@.dat']
+  endif
+
+  test(
+    test_id,
+    _test_runner,
+    args: _args,
+    workdir: meson.current_build_dir(),
+  )
+endforeach
diff --git a/subprojects/packagefiles/openblas/test/test_runner.c b/subprojects/packagefiles/openblas/test/test_runner.c
new file mode 100644
index 000000000..749a4f929
--- /dev/null
+++ b/subprojects/packagefiles/openblas/test/test_runner.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) { /* runs argv[1] through the shell, optionally with stdin redirected from argv[2] */
+  if (argc != 2 && argc != 3) {
+    fprintf(stderr, "Usage: %s <executable> [input_file]\n", argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  char command[1024]; /* snprintf truncates longer command lines to fit this buffer */
+  if (argc == 2) {
+    snprintf(command, sizeof(command), "%s", argv[1]);
+  } else {
+    snprintf(command, sizeof(command), "%s < %s", argv[1], argv[2]);
+  }
+
+  int result = system(command);
+  if (result != EXIT_SUCCESS) {
+    fprintf(stderr, "Error: Command '%s' failed with return code %d.\n", command, result);
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/subprojects/packagefiles/openblas/utest/meson.build b/subprojects/packagefiles/openblas/utest/meson.build
new file mode 100644
index 000000000..336c7d792
--- /dev/null
+++ b/subprojects/packagefiles/openblas/utest/meson.build
@@ -0,0 +1,116 @@
+# TODO(rg): Add handling of utest_main2 as per CMakeLists
+sources_utest = [
+  'utest_main.c',
+  'test_min.c',
+  'test_amax.c',
+  'test_ismin.c',
+  'test_rotmg.c',
+  'test_axpy.c',
+  'test_dotu.c',
+  'test_dsdot.c',
+  'test_swap.c',
+  'test_rot.c',
+  'test_dnrm2.c',
+  'test_zscal.c',
+  'test_amin.c',
+  'test_axpby.c',
+]
+
+dir_ext = 'test_extensions'
+sources_utest_ext = [
+  'utest_main.c',
+  dir_ext / 'xerbla.c',
+  dir_ext / 'common.c',
+  dir_ext / 'test_isamin.c',
+  dir_ext / 'test_idamin.c',
+  dir_ext / 'test_icamin.c',
+  dir_ext / 'test_izamin.c',
+  dir_ext / 'test_ssum.c',
+  dir_ext / 'test_dsum.c',
+  dir_ext / 'test_scsum.c',
+  dir_ext / 'test_dzsum.c',
+  dir_ext / 'test_saxpby.c',
+  dir_ext / 'test_daxpby.c',
+  dir_ext / 'test_caxpby.c',
+  dir_ext / 'test_zaxpby.c',
+  dir_ext / 'test_zaxpyc.c',
+  dir_ext / 'test_caxpyc.c',
+  dir_ext / 'test_samin.c',
+  dir_ext / 'test_damin.c',
+  dir_ext / 'test_scamin.c',
+  dir_ext / 'test_dzamin.c',
+  dir_ext / 'test_scamax.c',
+  dir_ext / 'test_dzamax.c',
+  dir_ext / 'test_drotmg.c',
+  dir_ext / 'test_srotmg.c',
+  dir_ext / 'test_zrotg.c',
+  dir_ext / 'test_crotg.c',
+  dir_ext / 'test_crot.c',
+  dir_ext / 'test_zrot.c',
+  dir_ext / 'test_zscal.c',
+  dir_ext / 'test_cscal.c',
+  dir_ext / 'test_domatcopy.c',
+  dir_ext / 'test_somatcopy.c',
+  dir_ext / 'test_zomatcopy.c',
+  dir_ext / 'test_comatcopy.c',
+  dir_ext / 'test_simatcopy.c',
+  dir_ext / 'test_dimatcopy.c',
+  dir_ext / 'test_cimatcopy.c',
+  dir_ext / 'test_zimatcopy.c',
+  dir_ext / 'test_sgeadd.c',
+  dir_ext / 'test_dgeadd.c',
+  dir_ext / 'test_cgeadd.c',
+  dir_ext / 'test_zgeadd.c',
+  dir_ext / 'test_cgemv_t.c',
+  dir_ext / 'test_zgemv_t.c',
+  dir_ext / 'test_cgemv_n.c',
+  dir_ext / 'test_zgemv_n.c',
+  dir_ext / 'test_sgemmt.c',
+  dir_ext / 'test_dgemmt.c',
+  dir_ext / 'test_cgemmt.c',
+  dir_ext / 'test_zgemmt.c',
+  dir_ext / 'test_ztrmv.c',
+  dir_ext / 'test_ctrmv.c',
+  dir_ext / 'test_ztrsv.c',
+  dir_ext / 'test_ctrsv.c',
+  dir_ext / 'test_zgemm.c',
+  dir_ext / 'test_cgemm.c',
+  dir_ext / 'test_zgbmv.c',
+  dir_ext / 'test_cgbmv.c',
+  dir_ext / 'test_zspmv.c',
+  dir_ext / 'test_cspmv.c',
+  dir_ext / 'test_zsbmv.c',
+  dir_ext / 'test_csbmv.c',
+]
+
+utest_inc = _inc + [include_directories('.')]
+
+# TODO(rg): Can this be fixed?
+# ‘strsignal’ in utest, seems like a false positive
+_utest_nowarn = cc.get_supported_arguments(
+  '-Wno-implicit-function-declaration',
+)
+
+openblas_utest_exec = executable(
+  'openblas_utest',
+  sources: sources_utest,
+  link_with: openblas_static,
+  dependencies: [dependency('threads')],
+  include_directories: utest_inc,
+  c_args: _cargs + _utest_nowarn,
+)
+
+openblas_utest_ext_exec = executable(
+  'openblas_utest_ext',
+  sources: sources_utest_ext,
+  link_with: openblas_static,
+  dependencies: [dependency('threads')],
+  # This hack is required as `utest/test_extensions/common.c` needs
+  # `common.h` from the source root, not `utest/test_extensions` directory
+  # one. Also, `config.h` is needed that lives in the build directory.
+  implicit_include_directories: false,
+  c_args: _cargs + _utest_nowarn + [f'-I@_msproot@', f'-I@_mbproot@'],
+)
+
+test('openblas_utest_tests', openblas_utest_exec)
+test('openblas_utest_ext_tests', openblas_utest_ext_exec)  # distinct name so the two suites can be selected and reported separately
diff --git a/subprojects/packagefiles/openblas/write_to_file.py b/subprojects/packagefiles/openblas/write_to_file.py
new file mode 100644
index 000000000..eafb0ba2e
--- /dev/null
+++ b/subprojects/packagefiles/openblas/write_to_file.py
@@ -0,0 +1,13 @@
+import argparse
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Write contents to file.")
+    parser.add_argument("contents", help="Contents.")
+    parser.add_argument("file_path", help="File path.")
+
+    args = parser.parse_args()
+
+    # Append on purpose: meson.build uses this to add getarch output to config.h / Makefile.conf.
+    with open(args.file_path, "a") as f:
+        f.write(args.contents)
diff --git a/tools/sanity_checks.py b/tools/sanity_checks.py
index 84ecf3dbd..001987fc2 100755
--- a/tools/sanity_checks.py
+++ b/tools/sanity_checks.py
@@ -87,6 +87,14 @@
     'openal-soft': [
         'hexify.py'
     ],
+    'openblas': [
+        'gen_install_headers.py',
+        'prepare_config_last.py',
+        'read_config.py',
+        'write_to_file.py',
+        'test_runner.c',
+        'run_fortran.c',
+    ],
'protobuf': [ 'symlink.py', ],