Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cutensor #25

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
# Specify the minimum version for CMake
cmake_minimum_required(VERSION 3.5)
cmake_minimum_required(VERSION 3.12)

# Project's name
project(EigenCuda LANGUAGES CXX)

###############################################################################
# build setup
###############################################################################

# Set the output folder where your program will be created
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})

###############################################################################
# Compiler flags
###############################################################################

if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
#release comes with -O3 by default
set(CMAKE_BUILD_TYPE Release CACHE STRING
Expand All @@ -17,9 +30,33 @@ if(ENABLE_TESTING)
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
endif(ENABLE_TESTING)

###############################################################################
# Dependencies
###############################################################################

# Search for Cuda
find_package(CUDA REQUIRED)

# Search for Cutensor
# CUTENSOR_ROOT may come either from the environment or from the CMake cache
# (-DCUTENSOR_ROOT=...); the environment variable takes precedence when both
# are set. Configuration aborts early if neither is defined or the path does
# not exist, so later target setup can assume a valid root.
if(NOT DEFINED ENV{CUTENSOR_ROOT} AND NOT DEFINED CUTENSOR_ROOT)
message(FATAL_ERROR "CUTENSOR_ROOT not set!")
else()
if(DEFINED ENV{CUTENSOR_ROOT})
set(CUTENSOR_ROOT "$ENV{CUTENSOR_ROOT}")
endif()
message("-- Looking for cuTENSOR in ${CUTENSOR_ROOT}")
if(NOT EXISTS ${CUTENSOR_ROOT})
message(FATAL_ERROR "Cannot find CUTENSOR_ROOT")
endif()
endif()

# Expose the prebuilt cuTENSOR shared library as an IMPORTED target so
# consumers can simply `target_link_libraries(... cutensor)` and inherit the
# include directory.
# NOTE(review): the "lib/10.2" path segment hard-codes the CUDA toolkit
# version of the cuTENSOR layout — verify it matches the CUDA version found
# by find_package(CUDA) above; consider making it configurable.
if(NOT TARGET cutensor)
add_library(cutensor SHARED IMPORTED)
set_target_properties(cutensor PROPERTIES
IMPORTED_LOCATION "${CUTENSOR_ROOT}/lib/10.2/libcutensor.so"
INTERFACE_INCLUDE_DIRECTORIES "${CUTENSOR_ROOT}/include")
endif()

# search for Eigen
find_package (Eigen3 3.3 REQUIRED CONFIG
HINTS ${CONDA_PREFIX}/include
Expand Down
27 changes: 3 additions & 24 deletions include/cudamatrix.hpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
#ifndef EIGENCUDA_H_
#define EIGENCUDA_H_

#include <Eigen/Core>
#include <Eigen/Dense>
#include <cublas_v2.h>
#include "cudatensorbase.hpp"
#include <curand.h>
#include <iostream>
#include <memory>
#include <sstream>
#include <vector>

/*
* \brief Perform Tensor-matrix multiplications in a GPU
*
Expand All @@ -19,14 +16,9 @@

namespace eigencuda {

cudaError_t checkCuda(cudaError_t result);

using Index = Eigen::Index;
Index count_available_gpus();

class CudaMatrix {
class CudaMatrix : CudaTensorBase {
public:
Index size() const { return _rows * _cols; };
Index size() const override { return _rows * _cols; };
Index rows() const { return _rows; };
Index cols() const { return _cols; };
double *data() const { return _data.get(); };
Expand All @@ -42,19 +34,6 @@ class CudaMatrix {
void copy_to_gpu(const Eigen::MatrixXd &A);

private:
// Unique pointer with custom delete function
using Unique_ptr_to_GPU_data = std::unique_ptr<double, void (*)(double *)>;

Unique_ptr_to_GPU_data alloc_matrix_in_gpu(size_t size_arr) const;

void throw_if_not_enough_memory_in_gpu(size_t requested_memory) const;

size_t size_matrix() const { return this->size() * sizeof(double); }

// Attributes of the matrix in the device
Unique_ptr_to_GPU_data _data{nullptr,
[](double *x) { checkCuda(cudaFree(x)); }};
cudaStream_t _stream = nullptr;
Index _rows;
Index _cols;
};
Expand Down
2 changes: 1 addition & 1 deletion include/cudapipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CUDA_PIPELINE__H

#include "cudamatrix.hpp"

#include <cublas_v2.h>
/*
* \brief Perform Tensor-matrix multiplications in a GPU
*
Expand Down
37 changes: 37 additions & 0 deletions include/cudatensor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef CUDATENSOR_H_
#define CUDATENSOR_H_

#include "cudatensorbase.hpp"  // quoted, consistent with cudamatrix.hpp
#include <cuda_runtime.h>
#include <memory>
#include <unsupported/Eigen/CXX11/Tensor>

/**
 * \brief Manage the tensor memory in the GPU
 */

namespace eigencuda {

/**
 * \brief Rank-3 tensor whose storage lives on the GPU.
 *
 * Inherits the device buffer, stream and allocation helpers from
 * CudaTensorBase (private inheritance, matching CudaMatrix).
 */
class CudaTensor : CudaTensorBase {
 public:
  CudaTensor() = default;

  /**
   * \brief Total number of elements (product of the three dimensions).
   */
  Index size() const override;

 private:
  // Extent of each of the three dimensions. Typed off Tensor<double, 3> to
  // match the double-precision data handled below (the Dimensions type is
  // scalar-independent, so this is purely for readability).
  const Eigen::Tensor<double, 3>::Dimensions _dimensions;

  /**
   * \brief Copy the tensor to the GPU
   * @param A host-side tensor to transfer
   */
  void copy_to_gpu(const Eigen::Tensor<double, 3> &A);

  /**
   * \brief Convert A CudaTensor to an EigenTensor
   */
  operator Eigen::Tensor<double, 3>() const;
};

}  // namespace eigencuda

#endif  // CUDATENSOR_H_
63 changes: 63 additions & 0 deletions include/cudatensorbase.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef CUDATENSORBASE_H_
#define CUDATENSORBASE_H_

#include <Eigen/Core>
#include <Eigen/Dense>
#include <cuda_runtime.h>
#include <memory>
/**
 * \brief Manage the tensor memory on the GPU
 */

namespace eigencuda {

// Check the return value of the cuda operations
cudaError_t checkCuda(cudaError_t result);

// Int64
using Index = Eigen::Index;

/**
 * \brief Number of GPUs on the host
 */
Index count_available_gpus();

/**
 * \brief Common base for device-resident tensors (CudaMatrix, CudaTensor).
 *
 * Owns the raw GPU buffer through a unique_ptr whose deleter calls cudaFree,
 * so device memory is released automatically (RAII). Instances are movable
 * but not copyable (copying would alias the device allocation).
 */
class CudaTensorBase {
 public:
  // Pointer to the GPU memory (non-owning view; ownership stays here)
  double *data() const { return _data.get(); };

  /**
   * \brief calculate the total number of elements
   */
  virtual Index size() const = 0;

 protected:
  // Destructor is protected and non-virtual: this base is used via private
  // inheritance and is never deleted through a base pointer, so we forbid it
  // at compile time instead of paying for a vtable-based virtual destructor.
  CudaTensorBase() = default;
  ~CudaTensorBase() = default;

  // Copies are deleted (the unique_ptr member already made them implicitly
  // deleted); moves keep their implicitly-generated behavior.
  CudaTensorBase(const CudaTensorBase &) = delete;
  CudaTensorBase &operator=(const CudaTensorBase &) = delete;
  CudaTensorBase(CudaTensorBase &&) = default;
  CudaTensorBase &operator=(CudaTensorBase &&) = default;

  // Unique pointer with custom delete function
  using Unique_ptr_to_GPU_data = std::unique_ptr<double, void (*)(double *)>;

  // GPU stream
  // NOTE(review): raw handle, not released here — confirm the owning class
  // (or pipeline) is responsible for cudaStreamDestroy.
  cudaStream_t _stream = nullptr;

  // Attributes of the matrix in the device
  Unique_ptr_to_GPU_data _data{
      nullptr, [](double *x) { eigencuda::checkCuda(cudaFree(x)); }};

  // Size of the device buffer in bytes
  size_t size_tensor() const { return this->size() * sizeof(double); }

  /**
   * \brief reserve space for the tensor in the GPU
   * @param size of the tensor in bytes
   */
  Unique_ptr_to_GPU_data alloc_tensor_in_gpu(size_t) const;

  /**
   * \brief Check that the GPU has enough memory
   * @param requested_memory
   */
  void throw_if_not_enough_memory_in_gpu(size_t requested_memory) const;
};

}  // namespace eigencuda

#endif  // CUDATENSORBASE_H_
11 changes: 10 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

add_library(eigencuda cudamatrix.cc cudapipeline.cc)
add_library(eigencuda cudamatrix.cc cudapipeline.cc cudatensor.cc cudatensorbase.cc)

add_executable(contraction contraction.cc)

target_include_directories(eigencuda
PUBLIC
Expand All @@ -25,6 +27,13 @@ target_link_libraries(eigencuda
${CUDA_CUBLAS_LIBRARIES}
)

target_link_libraries(contraction
PUBLIC
eigencuda
cutensor
)


if(ENABLE_TESTING)
add_subdirectory(tests)
endif()
110 changes: 110 additions & 0 deletions src/contraction.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@

#include <cutensor.h>
#include <iostream>
#include <unordered_map>
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
// Host element type definition
using floatTypeCompute = float;

// CUDA types
cudaDataType_t typeA = CUDA_R_64F;
cudaDataType_t typeB = CUDA_R_64F;
cudaDataType_t typeC = CUDA_R_64F;
cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_64F;

floatTypeCompute alpha = 1.1;
floatTypeCompute beta = 0.9;

std::cout << "Include headers and define data types\n";

// Create vector of modes
std::vector<int> modeC{'m', 'u', 'n', 'v'};
std::vector<int> modeA{'m', 'h', 'k', 'n'};
std::vector<int> modeB{'u', 'k', 'v', 'h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();

// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;

// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for (auto mode : modeC) extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for (auto mode : modeA) extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for (auto mode : modeB) extentB.push_back(extent[mode]);

// Number of elements of each tensor
size_t elementsA = 1;
for (auto mode : modeA) elementsA *= extent[mode];
size_t elementsB = 1;
for (auto mode : modeB) elementsB *= extent[mode];
size_t elementsC = 1;
for (auto mode : modeC) elementsC *= extent[mode];

// Eigen Tensors
Eigen::Tensor<double, 4> tensorA(96, 96, 64, 64);
Eigen::Tensor<double, 4> tensorB(96, 64, 64, 64);
Eigen::Tensor<double, 4> tensorC(96, 96, 96, 64);

// Initialize tensor as Random
tensorA.setRandom();
tensorB.setRandom();
tensorC.setRandom();

// Size in bytes
size_t sizeA = sizeof(double) * tensorA.size();
size_t sizeB = sizeof(double) * tensorB.size();
size_t sizeC = sizeof(double) * tensorC.size();

const auto& dims = tensorA.dimensions();
Eigen::Index total = std::accumulate(dims.cbegin(), dims.cend(), 1,
[](auto x, auto y) { return x * y; });
std::cout << "total size: " << total << "\n";

// // Allocate on device
// eigencuda::Unique_ptr_to_GPU_data A_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeA);
// eigencuda::Unique_ptr_to_GPU_data B_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeB);
// eigencuda::Unique_ptr_to_GPU_data C_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeC);

// // Initialize data on host
// for (int64_t i = 0; i < elementsA; i++)
// A[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;
// for (int64_t i = 0; i < elementsB; i++)
// B[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;
// for (int64_t i = 0; i < elementsC; i++)
// C[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;

// Copy to device
// cudaMemcpy(C_in_gpu, tensorC, sizeC, cudaMemcpyHostToDevice);
// cudaMemcpy(A_in_gpu, tensorA, sizeA, cudaMemcpyHostToDevice);
// cudaMemcpy(B_in_gpu, tensorB, sizeB, cudaMemcpyHostToDevice);
// eigencuda::checkCuda(cudaMemcpy(A_in_gpu.get(), tensorA.data(),
// sizeA,
// cudaMemcpyHostToDevice));
// eigencuda::checkCuda(cudaMemcpy(B_in_gpu.get(), tensorB.data(),
// sizeB,
// cudaMemcpyHostToDevice));

// eigencuda::checkCuda(cudaMemcpy(C_in_gpu.get(), tensorC.data(),
// sizeC,
// cudaMemcpyHostToDevice));

std::cout << "Allocate, initialize and transfer tensors\n";

return 0;
}
Loading