Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cutensor #25

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
# Specify the minimum version for CMake
cmake_minimum_required(VERSION 3.5)
cmake_minimum_required(VERSION 3.12)

# Project's name
project(EigenCuda LANGUAGES CXX)

###############################################################################
# build setup
###############################################################################

# Set the output folder where your program will be created
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})

###############################################################################
# Compiler flags
###############################################################################

if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
#release comes with -O3 by default
set(CMAKE_BUILD_TYPE Release CACHE STRING
Expand All @@ -17,9 +30,33 @@ if(ENABLE_TESTING)
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
endif(ENABLE_TESTING)

###############################################################################
# Dependencies
###############################################################################

# Search for Cuda
find_package(CUDA REQUIRED)

# Search for Cutensor
# CUTENSOR_ROOT may come either from the environment or from the CMake cache
# (-DCUTENSOR_ROOT=...); the environment variable takes precedence when both
# are set. Configuration aborts early if neither is defined or the path does
# not exist, so later target setup can assume a valid root.
if(NOT DEFINED ENV{CUTENSOR_ROOT} AND NOT DEFINED CUTENSOR_ROOT)
message(FATAL_ERROR "CUTENSOR_ROOT not set!")
else()
if(DEFINED ENV{CUTENSOR_ROOT})
set(CUTENSOR_ROOT "$ENV{CUTENSOR_ROOT}")
endif()
message("-- Looking for cuTENSOR in ${CUTENSOR_ROOT}")
if(NOT EXISTS ${CUTENSOR_ROOT})
message(FATAL_ERROR "Cannot find CUTENSOR_ROOT")
endif()
endif()

# Expose the prebuilt cuTENSOR shared library as an IMPORTED target so
# consumers can simply `target_link_libraries(... cutensor)` and inherit the
# include directory.
# NOTE(review): the "lib/10.2" path segment hard-codes the CUDA toolkit
# version of the cuTENSOR layout — verify it matches the CUDA version found
# by find_package(CUDA) above; consider making it configurable.
if(NOT TARGET cutensor)
add_library(cutensor SHARED IMPORTED)
set_target_properties(cutensor PROPERTIES
IMPORTED_LOCATION "${CUTENSOR_ROOT}/lib/10.2/libcutensor.so"
INTERFACE_INCLUDE_DIRECTORIES "${CUTENSOR_ROOT}/include")
endif()

# search for Eigen
find_package (Eigen3 3.3 REQUIRED CONFIG
HINTS ${CONDA_PREFIX}/include
Expand Down
27 changes: 3 additions & 24 deletions include/cudamatrix.hpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
#ifndef EIGENCUDA_H_
#define EIGENCUDA_H_

#include <Eigen/Core>
#include <Eigen/Dense>
#include <cublas_v2.h>
#include "cudatensorbase.hpp"
#include <curand.h>
#include <iostream>
#include <memory>
#include <sstream>
#include <vector>

/*
* \brief Perform Tensor-matrix multiplications in a GPU
*
Expand All @@ -19,14 +16,9 @@

namespace eigencuda {

cudaError_t checkCuda(cudaError_t result);

using Index = Eigen::Index;
Index count_available_gpus();

class CudaMatrix {
class CudaMatrix : CudaTensorBase {
public:
Index size() const { return _rows * _cols; };
Index size() const override { return _rows * _cols; };
Index rows() const { return _rows; };
Index cols() const { return _cols; };
double *data() const { return _data.get(); };
Expand All @@ -42,19 +34,6 @@ class CudaMatrix {
void copy_to_gpu(const Eigen::MatrixXd &A);

private:
// Unique pointer with custom delete function
using Unique_ptr_to_GPU_data = std::unique_ptr<double, void (*)(double *)>;

Unique_ptr_to_GPU_data alloc_matrix_in_gpu(size_t size_arr) const;

void throw_if_not_enough_memory_in_gpu(size_t requested_memory) const;

size_t size_matrix() const { return this->size() * sizeof(double); }

// Attributes of the matrix in the device
Unique_ptr_to_GPU_data _data{nullptr,
[](double *x) { checkCuda(cudaFree(x)); }};
cudaStream_t _stream = nullptr;
Index _rows;
Index _cols;
};
Expand Down
2 changes: 1 addition & 1 deletion include/cudapipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CUDA_PIPELINE__H

#include "cudamatrix.hpp"

#include <cublas_v2.h>
/*
* \brief Perform Tensor-matrix multiplications in a GPU
*
Expand Down
37 changes: 37 additions & 0 deletions include/cudatensor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef CUDATENSOR_H_
#define CUDATENSOR_H_

#include "cudatensorbase.hpp"  // quoted, consistent with cudamatrix.hpp
#include <cuda_runtime.h>
#include <memory>
#include <unsupported/Eigen/CXX11/Tensor>

/**
 * \brief Manage the tensor memory in the GPU
 */

namespace eigencuda {

/**
 * \brief Rank-3 tensor whose storage lives on the GPU.
 *
 * Inherits the device buffer, stream and allocation helpers from
 * CudaTensorBase (private inheritance, matching CudaMatrix).
 */
class CudaTensor : CudaTensorBase {
 public:
  CudaTensor() = default;

  /**
   * \brief Total number of elements (product of the three dimensions).
   */
  Index size() const override;

 private:
  // Extent of each of the three dimensions. Typed off Tensor<double, 3> to
  // match the double-precision data handled below (the Dimensions type is
  // scalar-independent, so this is purely for readability).
  const Eigen::Tensor<double, 3>::Dimensions _dimensions;

  /**
   * \brief Copy the tensor to the GPU
   * @param A host-side tensor to transfer
   */
  void copy_to_gpu(const Eigen::Tensor<double, 3> &A);

  /**
   * \brief Convert A CudaTensor to an EigenTensor
   */
  operator Eigen::Tensor<double, 3>() const;
};

}  // namespace eigencuda

#endif  // CUDATENSOR_H_
63 changes: 63 additions & 0 deletions include/cudatensorbase.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef CUDATENSORBASE_H_
#define CUDATENSORBASE_H_

#include <Eigen/Core>
#include <Eigen/Dense>
#include <cuda_runtime.h>
#include <memory>
/**
 * \brief Manage the tensor memory on the GPU
 */

namespace eigencuda {

// Check the return value of the cuda operations
cudaError_t checkCuda(cudaError_t result);

// Int64
using Index = Eigen::Index;

/**
 * \brief Number of GPUs on the host
 */
Index count_available_gpus();

/**
 * \brief Common base for device-resident tensors (CudaMatrix, CudaTensor).
 *
 * Owns the raw GPU buffer through a unique_ptr whose deleter calls cudaFree,
 * so device memory is released automatically (RAII). Instances are movable
 * but not copyable (copying would alias the device allocation).
 */
class CudaTensorBase {
 public:
  // Pointer to the GPU memory (non-owning view; ownership stays here)
  double *data() const { return _data.get(); };

  /**
   * \brief calculate the total number of elements
   */
  virtual Index size() const = 0;

 protected:
  // Destructor is protected and non-virtual: this base is used via private
  // inheritance and is never deleted through a base pointer, so we forbid it
  // at compile time instead of paying for a vtable-based virtual destructor.
  CudaTensorBase() = default;
  ~CudaTensorBase() = default;

  // Copies are deleted (the unique_ptr member already made them implicitly
  // deleted); moves keep their implicitly-generated behavior.
  CudaTensorBase(const CudaTensorBase &) = delete;
  CudaTensorBase &operator=(const CudaTensorBase &) = delete;
  CudaTensorBase(CudaTensorBase &&) = default;
  CudaTensorBase &operator=(CudaTensorBase &&) = default;

  // Unique pointer with custom delete function
  using Unique_ptr_to_GPU_data = std::unique_ptr<double, void (*)(double *)>;

  // GPU stream
  // NOTE(review): raw handle, not released here — confirm the owning class
  // (or pipeline) is responsible for cudaStreamDestroy.
  cudaStream_t _stream = nullptr;

  // Attributes of the matrix in the device
  Unique_ptr_to_GPU_data _data{
      nullptr, [](double *x) { eigencuda::checkCuda(cudaFree(x)); }};

  // Size of the device buffer in bytes
  size_t size_tensor() const { return this->size() * sizeof(double); }

  /**
   * \brief reserve space for the tensor in the GPU
   * @param size of the tensor in bytes
   */
  Unique_ptr_to_GPU_data alloc_tensor_in_gpu(size_t) const;

  /**
   * \brief Check that the GPU has enough memory
   * @param requested_memory
   */
  void throw_if_not_enough_memory_in_gpu(size_t requested_memory) const;
};

}  // namespace eigencuda

#endif  // CUDATENSORBASE_H_
11 changes: 10 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

add_library(eigencuda cudamatrix.cc cudapipeline.cc)
add_library(eigencuda cudamatrix.cc cudapipeline.cc cudatensor.cc cudatensorbase.cc)

add_executable(contraction contraction.cc)

target_include_directories(eigencuda
PUBLIC
Expand All @@ -25,6 +27,13 @@ target_link_libraries(eigencuda
${CUDA_CUBLAS_LIBRARIES}
)

target_link_libraries(contraction
PUBLIC
eigencuda
cutensor
)


if(ENABLE_TESTING)
add_subdirectory(tests)
endif()
110 changes: 110 additions & 0 deletions src/contraction.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@

#include <cutensor.h>
#include <iostream>
#include <unordered_map>
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
// Host element type definition
using floatTypeCompute = float;

// CUDA types
cudaDataType_t typeA = CUDA_R_64F;
cudaDataType_t typeB = CUDA_R_64F;
cudaDataType_t typeC = CUDA_R_64F;
cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_64F;

floatTypeCompute alpha = 1.1;
floatTypeCompute beta = 0.9;

std::cout << "Include headers and define data types\n";

// Create vector of modes
std::vector<int> modeC{'m', 'u', 'n', 'v'};
std::vector<int> modeA{'m', 'h', 'k', 'n'};
std::vector<int> modeB{'u', 'k', 'v', 'h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();

// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;

// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for (auto mode : modeC) extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for (auto mode : modeA) extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for (auto mode : modeB) extentB.push_back(extent[mode]);

// Number of elements of each tensor
size_t elementsA = 1;
for (auto mode : modeA) elementsA *= extent[mode];
size_t elementsB = 1;
for (auto mode : modeB) elementsB *= extent[mode];
size_t elementsC = 1;
for (auto mode : modeC) elementsC *= extent[mode];

// Eigen Tensors
Eigen::Tensor<double, 4> tensorA(96, 96, 64, 64);
Eigen::Tensor<double, 4> tensorB(96, 64, 64, 64);
Eigen::Tensor<double, 4> tensorC(96, 96, 96, 64);

// Initialize tensor as Random
tensorA.setRandom();
tensorB.setRandom();
tensorC.setRandom();

// Size in bytes
size_t sizeA = sizeof(double) * tensorA.size();
size_t sizeB = sizeof(double) * tensorB.size();
size_t sizeC = sizeof(double) * tensorC.size();

const auto& dims = tensorA.dimensions();
Eigen::Index total = std::accumulate(dims.cbegin(), dims.cend(), 1,
[](auto x, auto y) { return x * y; });
std::cout << "total size: " << total << "\n";

// // Allocate on device
// eigencuda::Unique_ptr_to_GPU_data A_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeA);
// eigencuda::Unique_ptr_to_GPU_data B_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeB);
// eigencuda::Unique_ptr_to_GPU_data C_in_gpu =
// eigencuda::alloc_tensor_in_gpu(sizeC);

// // Initialize data on host
// for (int64_t i = 0; i < elementsA; i++)
// A[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;
// for (int64_t i = 0; i < elementsB; i++)
// B[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;
// for (int64_t i = 0; i < elementsC; i++)
// C[i] = (((float)rand()) / RAND_MAX - 0.5) * 100;

// Copy to device
// cudaMemcpy(C_in_gpu, tensorC, sizeC, cudaMemcpyHostToDevice);
// cudaMemcpy(A_in_gpu, tensorA, sizeA, cudaMemcpyHostToDevice);
// cudaMemcpy(B_in_gpu, tensorB, sizeB, cudaMemcpyHostToDevice);
// eigencuda::checkCuda(cudaMemcpy(A_in_gpu.get(), tensorA.data(),
// sizeA,
// cudaMemcpyHostToDevice));
// eigencuda::checkCuda(cudaMemcpy(B_in_gpu.get(), tensorB.data(),
// sizeB,
// cudaMemcpyHostToDevice));

// eigencuda::checkCuda(cudaMemcpy(C_in_gpu.get(), tensorC.data(),
// sizeC,
// cudaMemcpyHostToDevice));

std::cout << "Allocate, initialize and transfer tensors\n";

return 0;
}
Loading