Skip to content

Commit

Permalink
Fixes #680: Added the matrixMulCUBLAS (modified) CUDA sample to the…
Browse files Browse the repository at this point in the history
… examples
  • Loading branch information
eyalroz committed Sep 19, 2024
1 parent eb6a73a commit d6cc076
Show file tree
Hide file tree
Showing 2 changed files with 301 additions and 0 deletions.
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ link_libraries(cuda-api-wrappers::runtime-and-driver)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin")
add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu)
if (TARGET CUDA::cublas)
add_executable(matrixMulCUBLAS modified_cuda_samples/matrixMulCUBLAS/matrixMulCUBLAS.cpp)
target_link_libraries(matrixMulCUBLAS CUDA::cublas)
endif()
add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu)
add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu)
add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu)
Expand Down
297 changes: 297 additions & 0 deletions examples/modified_cuda_samples/matrixMulCUBLAS/matrixMulCUBLAS.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
/*
* Original code Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2024, Eyal Rozenberg <[email protected]>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication as described in Chapter 3
* of the programming guide and uses the CUBLAS library to demonstrate
* the best performance.
* SOME PRECAUTIONS:
* IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
* WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
* The reason is explained as follows:
* CUBLAS library uses column-major storage, but C/C++ use row-major storage.
* When passing the matrix pointer to CUBLAS, the memory layout alters from
* row-major to column-major, which is equivalent to an implicit transpose.
* In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
* C = A * B, we can't use the input order like cublasSgemm(A, B) because of
* implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
* If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
* multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
* is a column-based cublas matrix, which means C(T) in C/C++, we need extra
* transpose code to convert it to a row-based C/C++ matrix.
* To solve the problem, let's consider our desired result C, a row-major matrix.
* In cublas format, it is C(T) actually (because of the implicit transpose).
* C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
* happen to be C/C++ matrice B and A (still because of the implicit transpose)!
* We don't need extra transpose code, we only need alter the input order!
*
* CUBLAS provides high-performance matrix multiplication.
* See also:
* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
*/

// Utilities and system includes
#include <assert.h>

#include <cublas_v2.h>

// CUDA and CUBLAS functions

#include "../../common.hpp"



// Optional Command-line multiplier for matrix sizes
typedef struct _matrixSize {
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! @param C reference data, computed but preallocated
//! @param A matrix A as provided to device
//! @param B matrix B as provided to device
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA,
unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) {
double sum = 0;

for (unsigned int k = 0; k < wA; ++k) {
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}

C[i * wB + j] = (float) sum;
}
}

inline bool compare_l2_norm(
cuda::span<float const> reference,
cuda::span<const float> data,
float const epsilon)
{
if (reference.size() != data.size()) {
std::cerr << "Sizes of two spans to be compared - differ.";
exit(EXIT_FAILURE);
}
assert(epsilon >= 0);

float error = 0;
float ref = 0;

for (unsigned int i = 0; i < data.size(); ++i) {
float diff = reference[i] - data[i];
error += diff * diff;
ref += reference[i] * reference[i];
}

float normRef = ::sqrtf(ref);

if (fabs(ref) < 1e-7) {
std::cerr << "ERROR, reference l2-norm is 0\n";
exit(EXIT_FAILURE);
}

float normError = ::sqrtf(error);
error = normError / normRef;
bool result = error < epsilon;
if (not result) {
std::cerr << "ERROR, L2-norm error " << error << " is greater than epsilon " << epsilon << "\n";
}
return result;
}

sMatrixSize initialize_matrix_dimensions()
{
auto matrix_size_multiplier { 5 };
sMatrixSize matrix_dims;
int block_size { 32 };

matrix_dims.uiWA = 3 * block_size * matrix_size_multiplier;
matrix_dims.uiHA = 4 * block_size * matrix_size_multiplier;

matrix_dims.uiWB = 2 * block_size * matrix_size_multiplier;
matrix_dims.uiHB = 3 * block_size * matrix_size_multiplier;

matrix_dims.uiWC = 2 * block_size * matrix_size_multiplier;
matrix_dims.uiHC = 4 * block_size * matrix_size_multiplier;

std::cout
<< "MatrixA(" << matrix_dims.uiHA << ',' << matrix_dims.uiWA << "), "
<< "MatrixB(" << matrix_dims.uiHB << ',' << matrix_dims.uiWB << "), "
<< "MatrixC(" << matrix_dims.uiHC << ',' << matrix_dims.uiWC << ")\\n";

if (matrix_dims.uiWA != matrix_dims.uiHB ||
matrix_dims.uiHA != matrix_dims.uiHC ||
matrix_dims.uiWB != matrix_dims.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(EXIT_FAILURE);
}
return matrix_dims;
}

void multiply_and_time_with_cublas(
cuda::device_t device,
cuda::span<float> d_A,
cuda::span<float> d_B,
cuda::span<float> d_C,
cuda::span<float> h_CUBLAS,
sMatrixSize matrix_dims,
int num_iterations)
{
std::cout << "Computing result using CUBLAS...";

const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;

cublasCreate(&handle);

// Perform warmup operation with cublas
cublasSgemm(
handle, CUBLAS_OP_N, CUBLAS_OP_N,
matrix_dims.uiWB, matrix_dims.uiHA, matrix_dims.uiWA, // m, n, k
&alpha, d_B.data(),
matrix_dims.uiWB, // lda
d_A.data(),
matrix_dims.uiWA, // ldb
&beta,
d_C.data(),
matrix_dims.uiWB // ldc
);

// Allocate CUDA events that we'll use for timing

// Record the start event
auto stream = device.default_stream();
auto start = stream.enqueue.event(cuda::event::sync_by_blocking, cuda::event::do_record_timings);

for (int iteration_index = 0; iteration_index < num_iterations; iteration_index++) {
// note cublas is column primary!
// need to transpose the order
cublasSgemm(
handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_dims.uiWB, matrix_dims.uiHA,
matrix_dims.uiWA, &alpha, d_B.data(), matrix_dims.uiWB, d_A.data(),
matrix_dims.uiWA, &beta, d_C.data(), matrix_dims.uiWB);
}
auto end = stream.enqueue.event(cuda::event::sync_by_blocking, cuda::event::do_record_timings);

std::cout << "done.\n";

// Wait for the stop event to complete
end.synchronize();

auto total = cuda::event::time_elapsed_between(start, end);

// Compute and print the performance
auto msec_per_iteration = total.count() / num_iterations;
double ops_per_multiplication = 2.0 * (double) matrix_dims.uiHC *
(double) matrix_dims.uiWC *
(double) matrix_dims.uiHB;
double giga_ops_per_second =
(ops_per_multiplication * 1.0e-9f) / (msec_per_iteration / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
giga_ops_per_second, msec_per_iteration, ops_per_multiplication);

cuda::memory::copy(h_CUBLAS, d_C);

// Destroy the handle
cublasDestroy(handle);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
std::cout << "[Matrix Multiply CUBLAS] - Starting...\n";
auto device_id = choose_device(argc, argv);
auto device = cuda::device::get(device_id);

std::cout << "GPU Device " << device_id << ": \"" << device.name() << "\" "
<< "with compute capability " << device.compute_capability() << '\n';

auto matrix_dims = initialize_matrix_dimensions();
int num_iterations = 30;

auto size_A = matrix_dims.uiWA * matrix_dims.uiHA;
auto size_B = matrix_dims.uiWB * matrix_dims.uiHB;
auto size_C = matrix_dims.uiWC * matrix_dims.uiHC;

auto h_A = cuda::make_unique_span<float>(size_A);
auto h_B = cuda::make_unique_span<float>(size_B);
auto h_CUBLAS_result = cuda::make_unique_span<float>(size_C);

// set seed for rand()
srand(2006);

// initialize host memory
auto generator = []() { return rand() / (float) RAND_MAX; };
std::generate(h_A.begin(), h_A.end(), generator);
std::generate(h_B.begin(), h_B.end(), generator);

// allocate device memory
auto d_A = cuda::memory::device::make_unique_span<float>(size_A);
auto d_B = cuda::memory::device::make_unique_span<float>(size_B);
auto d_C = cuda::memory::device::make_unique_span<float>(size_C);

cuda::memory::copy(d_A, h_A);
cuda::memory::copy(d_B, h_B);

multiply_and_time_with_cublas(device, d_A, d_B, d_C, h_CUBLAS_result, matrix_dims, num_iterations);

// compute reference solution
std::cout << "Computing result using host CPU...";
auto h_CPU_result = cuda::make_unique_span<float>(size_C);
matrixMulCPU(h_CPU_result.data(), h_A.data(), h_B.data(), matrix_dims.uiHA, matrix_dims.uiWA, matrix_dims.uiWB);
std::cout << "Done.\n";

bool about_equal = compare_l2_norm(h_CPU_result, h_CUBLAS_result, 1.0e-6f);

std::cout << "CUBLAS Matrix Multiply is close enough to CPU results: " << (about_equal ? "Yes" : "No") << '\n';
std::cout << (about_equal ? "SUCCESS" : "FAILURE");
}

0 comments on commit d6cc076

Please sign in to comment.