examples: add matmul performance test
Showing 1 changed file with 271 additions and 0 deletions.
/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

struct gemm_dims_t {
    memory::dim m, n, k;
};

static const int min_runs = 4;

const char *get_type_string(dt type) {
    const char *type_string = "unknown";

#define TYPE_CASE(T) \
    if (type == dt::T) type_string = #T;
    TYPE_CASE(f16);
    TYPE_CASE(f32);
    TYPE_CASE(f64);
    TYPE_CASE(bf16);
    TYPE_CASE(s8);
    TYPE_CASE(u8);
#undef TYPE_CASE

    return type_string;
}

void print_test_case(dt type, gemm_dims_t dims) {
    std::cout << '[' << std::setw(4) << get_type_string(type);
    if (dims.m == dims.n && dims.m == dims.k)
        std::cout << " m = n = k = " << dims.m;
    else
        std::cout << " m = " << dims.m << ", n = " << dims.n
                  << ", k = " << dims.k;
    std::cout << "] " << std::flush;
}

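// Fill 'out' by tiling a small, lazily initialized pool of pseudo-random
// values (integers in [-16, 15] for integer types, floats in [-1, 1)
// otherwise) across the buffer, which keeps initialization cheap for
// large matrices.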
void fill_random(std::vector<float> &out, bool is_integer) {
    static std::vector<float> random_data_i, random_data_f;
    constexpr size_t nrand = 1037;

    if (random_data_i.empty() || random_data_f.empty()) {
        std::mt19937 generator;
        std::uniform_int_distribution<int> dist_i(-16, 15);
        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);

        random_data_i.resize(nrand);
        for (auto &d : random_data_i)
            d = dist_i(generator);

        random_data_f.resize(nrand);
        for (auto &d : random_data_f)
            d = dist_f(generator);
    }

    auto &rd = is_integer ? random_data_i : random_data_f;

    for (size_t i = 0; i < out.size(); i += nrand) {
        size_t chunk = std::min(nrand, out.size() - i);
        std::memcpy(&out[i], rd.data(), chunk * sizeof(float));
    }
}

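// Benchmark one (type, dims) combination. With time_limit == 0 a single
// timed execution (after a warm-up) is used as a quick estimate; otherwise
// enough runs are made to roughly fill the time budget, and the average
// time per run plus the achieved throughput are reported.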
double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit = 0.) {
    bool is_integer = (type == dt::s8 || type == dt::u8);
    bool quick_test = (time_limit == 0.);

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Source (A), weights (B), and destination (C) matrix dimensions.
    memory::dims a_dims = {dims.m, dims.k};
    memory::dims b_dims = {dims.k, dims.n};
    memory::dims c_dims = {dims.m, dims.n};

    // Allocate buffers and random-initialize A/B.
    std::vector<float> a_data(product(a_dims));
    std::vector<float> b_data(product(b_dims));
    std::vector<float> c_data(product(c_dims));

    fill_random(a_data, is_integer);
    fill_random(b_data, is_integer);

    // Create memory descriptors and memory objects for src, weights, and dst.
    auto a_md = memory::desc(a_dims, type, tag::any);
    auto b_md = memory::desc(b_dims, type, tag::any);
    auto c_md = memory::desc(c_dims, type, tag::any);
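    // tag::any lets the matmul primitive choose optimized memory layouts for
    // A, B, and C; the f32 'ab' (plain row-major) descriptors below describe
    // the user data, which is converted and repacked into those layouts by
    // the reorders further down.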

    auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab);
    auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab);

    auto a_in_mem = memory(a_in_md, engine);
    auto b_in_mem = memory(b_in_md, engine);

    // Write data to memory object's handles.
    write_to_dnnl_memory(a_data.data(), a_in_mem);
    write_to_dnnl_memory(b_data.data(), b_in_mem);

    // Create operation descriptor
    auto matmul_d = matmul::desc(a_md, b_md, c_md);

    // Create primitive descriptor.
    auto matmul_pd = matmul::primitive_desc(matmul_d, engine);

    // Repack and convert input data.
    auto a_mem = memory(matmul_pd.src_desc(), engine);
    reorder(a_in_mem, a_mem).execute(engine_stream, a_in_mem, a_mem);

    auto b_mem = memory(matmul_pd.weights_desc(), engine);
    reorder(b_in_mem, b_mem).execute(engine_stream, b_in_mem, b_mem);

    auto c_mem = memory(matmul_pd.dst_desc(), engine);

    // Create the primitive.
    auto matmul_prim = matmul(matmul_pd);

    // Start output.
    if (!quick_test) print_test_case(type, dims);

    // Primitive arguments.
    std::unordered_map<int, memory> matmul_args;
    matmul_args.insert({DNNL_ARG_SRC, a_mem});
    matmul_args.insert({DNNL_ARG_WEIGHTS, b_mem});
    matmul_args.insert({DNNL_ARG_DST, c_mem});

    // Warmup executions.
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

    auto start_first = std::chrono::steady_clock::now();
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();
    auto end_first = std::chrono::steady_clock::now();

    std::chrono::duration<double> dur_first = end_first - start_first;

    if (quick_test) return dur_first.count();

    int runs = std::max(min_runs, int(time_limit / dur_first.count()));

    // Timing runs.
    auto start = std::chrono::steady_clock::now();

    for (int i = 0; i <= runs; i++)
        matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<double> duration = end - start;

    // Display the result.
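    // Note: the timing loop executed runs + 1 times; subtracting the earlier
    // single-run estimate (dur_first) discounts the extra execution, so the
    // average covers approximately 'runs' executions.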
    double avg_time = (duration.count() - dur_first.count()) / runs;
    double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2;
    double perf = (total_ops / avg_time) * 1e-9;

    auto scale_string = "G";
    auto unit_string = is_integer ? "Op/s" : "Flop/s";

    if (perf >= 1000) {
        perf /= 1000;
        scale_string = "T";
    }

    std::cout << perf << ' ' << scale_string << unit_string << std::endl;

    return avg_time;
}

void run(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit) {
    try {
        if (dims.m * dims.n != 0) {
            // Dimensions manually specified by user.
            run_case(engine_kind, type, dims, time_limit);
        } else {
            // Automatically choose dimensions to fit time limit.
            int mnk = 128;
            const int max_mnk = 8192;

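            // Grow the problem until min_runs executions at the current size
            // would roughly fill the time budget. Matmul work scales as
            // mnk^3, so mnk is expanded by about the cube root of the
            // remaining headroom, rounded to the nearest power of two.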
            while (mnk < max_mnk) {
                dims.m = dims.n = dims.k = mnk;
                double time1 = run_case(engine_kind, type, dims);
                double nruns_est = std::max(1., time_limit / time1);
                double mnk_expand = std::exp2(
                        std::round(std::log2(nruns_est / min_runs) / 3.));
                if (mnk_expand <= 1) break;
                mnk = std::min<double>(max_mnk, mnk * mnk_expand);
            }

            dims.m = dims.n = dims.k = mnk;
            run_case(engine_kind, type, dims, time_limit);
        }
    } catch (dnnl::error &e) {
        // Catch and report unimplemented cases.
        if (e.status == dnnl_unimplemented) {
            print_test_case(type, dims);
            std::cout << "unsupported" << std::endl;
        } else
            throw;
    }
}

void bad_args() {
    std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n"
                 "       matmul-perf-cpp [cpu|gpu] <size>\n"
                 "       matmul-perf-cpp [cpu|gpu] <m> <n> <k>\n"
                 "If a single <size> is specified, it is used for all three "
                 "dimensions (m/n/k).\n";
    throw std::invalid_argument("Incorrect input arguments.");
}

void matmul_perf(engine::kind engine_kind, int argc, char **argv) {
    gemm_dims_t dims = {0, 0, 0};

    if (argc > 2) {
        if (argc == 3)
            dims.m = dims.n = dims.k = std::atoi(argv[2]);
        else if (argc == 5) {
            dims.m = std::atoi(argv[2]);
            dims.n = std::atoi(argv[3]);
            dims.k = std::atoi(argv[4]);
        } else
            bad_args();

        if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args();
    }

    run(engine_kind, dt::f32, dims, 2.0);
    run(engine_kind, dt::f16, dims, 2.0);
    run(engine_kind, dt::bf16, dims, 2.0);
    run(engine_kind, dt::s8, dims, 2.0);
}

int main(int argc, char **argv) {
    return handle_example_errors(
            matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv);
}
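A note on invocation, assuming the example builds into the matmul-perf-cpp binary named in its usage message: running matmul-perf-cpp cpu auto-sizes a square matmul to the 2-second-per-case budget and reports throughput for f32, f16, bf16, and s8; matmul-perf-cpp cpu 4096 fixes m = n = k = 4096, and matmul-perf-cpp gpu 4096 2048 1024 sets m, n, and k individually. Results are printed in GFlop/s (GOp/s for integer types), switching to the T prefix when the rate exceeds 1000.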