Draft: Add support for CUDA shared memory #1135

Draft · wants to merge 2 commits into base: master
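This draft carries no written description. As a quick orientation, here is a condensed, illustrative sketch of the usage it targets, mirroring the add_kernel_3 test added below; the device buffers and stream (out, in, d_out, d_in, stream) are assumed to be allocated the same way as in that test's main():

__global__ void add_kernel_3(int *out, int *in) {
  __shared__ int shared;                             // block-shared scratch value
  shared = 1;
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  out[index] += in[index] + shared;
}

// Differentiate w.r.t. "in, out" and launch the generated kernel through
// clad's kernel interface; the shared-memory size and stream arguments
// mirror the add_3 launch in the test below.
auto add_3 = clad::gradient(add_kernel_3, "in, out");
add_3.execute_kernel(dim3(10), dim3(1), sizeof(int), stream,
                     out, in, d_out, d_in);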
1 change: 1 addition & 0 deletions include/clad/Differentiator/ReverseModeVisitor.h
@@ -396,6 +396,7 @@ namespace clad {
StmtDiff
VisitImplicitValueInitExpr(const clang::ImplicitValueInitExpr* IVIE);
StmtDiff VisitCStyleCastExpr(const clang::CStyleCastExpr* CSCE);
StmtDiff VisitPseudoObjectExpr(const clang::PseudoObjectExpr* POE);
StmtDiff VisitInitListExpr(const clang::InitListExpr* ILE);
StmtDiff VisitIntegerLiteral(const clang::IntegerLiteral* IL);
StmtDiff VisitMemberExpr(const clang::MemberExpr* ME);
6 changes: 4 additions & 2 deletions include/clad/Differentiator/VisitorBase.h
@@ -295,7 +295,8 @@ namespace clad {
clang::Scope* scope, clang::Expr* Init = nullptr,
bool DirectInit = false, clang::TypeSourceInfo* TSI = nullptr,
clang::VarDecl::InitializationStyle IS =
clang::VarDecl::InitializationStyle::CInit);
clang::VarDecl::InitializationStyle::CInit,
clang::StorageClass SC = clang::SC_None);
/// Builds variable declaration to be used inside the derivative
/// body.
/// \param[in] Type The type of variable declaration to build.
@@ -336,7 +337,8 @@ namespace clad {
clang::Expr* Init = nullptr, bool DirectInit = false,
clang::TypeSourceInfo* TSI = nullptr,
clang::VarDecl::InitializationStyle IS =
clang::VarDecl::InitializationStyle::CInit);
clang::VarDecl::InitializationStyle::CInit,
clang::StorageClass SC = clang::SC_None);
/// Creates a namespace declaration and enters its context. All subsequent
/// Stmts are built inside that namespace, until
/// m_Sema.PopDeclContextIsUsed.
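Because the new StorageClass parameter is defaulted to clang::SC_None, existing call sites keep compiling unchanged while the reverse-mode visitor can forward the storage class of the variable it is cloning. A hedged illustration of the two call styles (Ty, Id and scope are placeholder variables, not code from this PR):

// Old-style call: unchanged, SC defaults to clang::SC_None.
clang::VarDecl* plain = BuildVarDecl(Ty, Id, scope);

// New-style call: forward a storage class (e.g. VD->getStorageClass())
// so the clone keeps the original storage duration.
clang::VarDecl* kept =
    BuildVarDecl(Ty, Id, scope, /*Init=*/nullptr, /*DirectInit=*/false,
                 /*TSI=*/nullptr, clang::VarDecl::InitializationStyle::CInit,
                 clang::SC_Static);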
56 changes: 39 additions & 17 deletions lib/Differentiator/ReverseModeVisitor.cpp
@@ -240,14 +240,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
addToCurrentBlock(BuildDeclStmt(gradientVD));
}

// If the function is a global kernel, we need to transform it
// into a device function when calling it inside the overload function
// which is the final global kernel returned.
if (m_Derivative->hasAttr<clang::CUDAGlobalAttr>()) {
m_Derivative->dropAttr<clang::CUDAGlobalAttr>();
m_Derivative->addAttr(clang::CUDADeviceAttr::CreateImplicit(m_Context));
}

Expr* callExpr = BuildCallExprToFunction(m_Derivative, callArgs,
/*UseRefQualifiedThisObj=*/true);
addToCurrentBlock(callExpr);
@@ -346,7 +338,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
bool shouldCreateOverload = false;
// FIXME: Gradient overload doesn't know how to handle additional parameters
// added by the plugins yet.
if (request.Mode != DiffMode::jacobian && numExtraParam == 0)
if (request.Mode != DiffMode::jacobian && numExtraParam == 0 &&
!FD->hasAttr<CUDAGlobalAttr>())
shouldCreateOverload = true;
if (!request.DeclarationOnly && !request.DerivedFDPrototypes.empty())
// If the overload is already created, we don't need to create it again.
@@ -2884,10 +2877,19 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
VDDerivedInit = getZeroInit(VDDerivedType);
}
}
if (initializeDerivedVar)
VDDerived = BuildGlobalVarDecl(
VDDerivedType, "_d_" + VD->getNameAsString(), VDDerivedInit, false,
nullptr, VD->getInitStyle());
if (initializeDerivedVar) {
if (VD->hasAttr<clang::CUDASharedAttr>()) {
VDDerived = BuildGlobalVarDecl(
VDDerivedType, "_d_" + VD->getNameAsString(), nullptr, false,
nullptr, VD->getInitStyle(), VD->getStorageClass());
VDDerived->addAttr(
VD->getAttr<clang::CUDASharedAttr>()->clone(m_Context));
} else {
VDDerived = BuildGlobalVarDecl(
VDDerivedType, "_d_" + VD->getNameAsString(), VDDerivedInit,
false, nullptr, VD->getInitStyle());
}
}
}

// If `VD` is a reference to a local variable, then it is already
@@ -2970,11 +2972,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
VDClone = BuildGlobalVarDecl(
VDCloneType, VD->getNameAsString(),
BuildOp(UnaryOperatorKind::UO_AddrOf, initDiff.getExpr()),
VD->isDirectInit());
VD->isDirectInit(), nullptr,
clang::VarDecl::InitializationStyle::CInit, VD->getStorageClass());
else
VDClone = BuildGlobalVarDecl(VDCloneType, VD->getNameAsString(),
initDiff.getExpr(), VD->isDirectInit(),
nullptr, VD->getInitStyle());
VDClone =
BuildGlobalVarDecl(VDCloneType, VD->getNameAsString(),
initDiff.getExpr(), VD->isDirectInit(), nullptr,
VD->getInitStyle(), VD->getStorageClass());
if (VD->hasAttr<clang::CUDASharedAttr>())
VDClone->addAttr(VD->getAttr<clang::CUDASharedAttr>()->clone(m_Context));
if (isPointerType && derivedVDE) {
if (promoteToFnScope) {
Expr* assignDerivativeE = BuildOp(BinaryOperatorKind::BO_Assign,
@@ -3083,6 +3089,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
llvm::SmallVector<Decl*, 4> declsDiff;
// Need to put array decls inlined.
llvm::SmallVector<Decl*, 4> localDeclsDiff;
llvm::SmallVector<Stmt*, 16> sharedMemInits;
// reverse_mode_forward_pass does not have a reverse pass so declarations
// don't have to be moved to the function global scope.
bool promoteToFnScope =
@@ -3172,6 +3179,13 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
localDeclsDiff.push_back(VDDiff.getDecl_dx());
else
declsDiff.push_back(VDDiff.getDecl_dx());
if (VD->hasAttr<clang::CUDASharedAttr>()) {
VarDecl* VDDerived = VDDiff.getDecl_dx();
Expr* declRef = BuildDeclRef(VDDerived);
Stmt* assignToZero = BuildOp(BinaryOperatorKind::BO_Assign, declRef,
getZeroInit(VDDerived->getType()));
sharedMemInits.push_back(assignToZero);
}
}
} else if (auto* SAD = dyn_cast<StaticAssertDecl>(D)) {
DeclDiff<StaticAssertDecl> SADDiff = DifferentiateStaticAssertDecl(SAD);
@@ -3210,6 +3224,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
Stmts& block =
promoteToFnScope ? m_Globals : getCurrentBlock(direction::forward);
addToBlock(DSDiff, block);
for (Stmt* sharedMemInitsStmt : sharedMemInits)
addToBlock(sharedMemInitsStmt, block);
}

if (m_ExternalSource) {
@@ -3272,6 +3288,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
return {castExpr, castExprDiff};
}

StmtDiff
ReverseModeVisitor::VisitPseudoObjectExpr(const PseudoObjectExpr* POE) {
// Used for CUDA builtin variables (threadIdx, blockIdx, etc.), which Clang
// represents as pseudo-object expressions; they do not depend on the
// differentiated parameters, so the expression is simply cloned.
return {Clone(POE), Clone(POE)};
}

StmtDiff ReverseModeVisitor::VisitMemberExpr(const MemberExpr* ME) {
auto baseDiff = VisitWithExplicitNoDfDx(ME->getBase());
auto* field = ME->getMemberDecl();
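To summarize the shared-memory handling added above in one place, here is a conceptual sketch only, with illustrative names rather than clad's verbatim output (the authoritative expectations are the CHECK lines in GradientKernels.cu below): the clone of a __shared__ variable keeps its attribute and storage class, the adjoint is declared __shared__ without an initializer (CUDA does not allow initializers on __shared__ declarations), and the zeroing is emitted as a separate assignment collected in sharedMemInits.

__global__ void example_grad(int *out, int *in, int *_d_out, int *_d_in) {
  __shared__ int shared;     // clone keeps the __shared__ attribute
  __shared__ int _d_shared;  // adjoint is also __shared__, declared without
                             // an initializer ...
  _d_shared = 0;             // ... and zeroed by a separate assignment
                             // (one of the sharedMemInits statements)
  shared = 1;                // forward sweep proceeds as in the original kernel
  // ... remainder of the forward and reverse sweeps ...
}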
14 changes: 8 additions & 6 deletions lib/Differentiator/VisitorBase.cpp
@@ -113,12 +113,13 @@ namespace clad {
VarDecl* VisitorBase::BuildVarDecl(QualType Type, IdentifierInfo* Identifier,
Scope* Scope, Expr* Init, bool DirectInit,
TypeSourceInfo* TSI,
VarDecl::InitializationStyle IS) {
VarDecl::InitializationStyle IS,
StorageClass SC) {
// add namespace specifier in variable declaration if needed.
Type = utils::AddNamespaceSpecifier(m_Sema, m_Context, Type);
auto* VD = VarDecl::Create(
m_Context, m_Sema.CurContext, m_DiffReq->getLocation(),
m_DiffReq->getLocation(), Identifier, Type, TSI, SC_None);
auto* VD =
VarDecl::Create(m_Context, m_Sema.CurContext, m_DiffReq->getLocation(),
m_DiffReq->getLocation(), Identifier, Type, TSI, SC);

if (Init) {
m_Sema.AddInitializerToDecl(VD, Init, DirectInit);
@@ -149,9 +150,10 @@ namespace clad {
VarDecl* VisitorBase::BuildGlobalVarDecl(QualType Type,
llvm::StringRef prefix, Expr* Init,
bool DirectInit, TypeSourceInfo* TSI,
VarDecl::InitializationStyle IS) {
VarDecl::InitializationStyle IS,
StorageClass SC) {
return BuildVarDecl(Type, CreateUniqueIdentifier(prefix),
m_DerivativeFnScope, Init, DirectInit, TSI, IS);
m_DerivativeFnScope, Init, DirectInit, TSI, IS, SC);
}

NamespaceDecl* VisitorBase::BuildNamespaceDecl(IdentifierInfo* II,
110 changes: 108 additions & 2 deletions test/CUDA/GradientKernels.cu
@@ -34,6 +34,58 @@ void fake_kernel(int *a) {
*a *= *a;
}

__global__ void add_kernel(int *out, int *in) {
int index = threadIdx.x;
out[index] += in[index];
}

// CHECK: void add_kernel_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: int _t0 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_2(int *out, int *in) {
out[threadIdx.x] += in[threadIdx.x];
}

// CHECK: void add_kernel_2_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _t0 = out[threadIdx.x];
//CHECK-NEXT: out[threadIdx.x] += in[threadIdx.x];
//CHECK-NEXT: {
//CHECK-NEXT: out[threadIdx.x] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x];
//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_3(int *out, int *in) {
__shared__ int shared;
shared = 1;
int index = threadIdx.x + blockIdx.x * blockDim.x;
out[index] += in[index] + shared;
}

// CHECK: void add_kernel_3_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: int _t2 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t2;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

int main(void) {
int *a = (int*)malloc(sizeof(int));
*a = 2;

cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, a^2 = 4
printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, a^2 = 4

auto error = clad::gradient(fake_kernel);
error.execute_kernel(grid, block, d_a, d_square); // CHECK-EXEC: Use execute() for non-global CUDA kernels

cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, a^2 = 10
printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, a^2 = 10

int *dummy_in, *dummy_out;
cudaMalloc(&dummy_in, sizeof(int));
cudaMalloc(&dummy_out, sizeof(int));

int *out = (int*)malloc(10 * sizeof(int));
for(int i = 0; i < 10; i++) {
out[i] = 5;
}
int *d_out;
cudaMalloc(&d_out, 10 * sizeof(int));
cudaMemcpy(d_out, out, 10 * sizeof(int), cudaMemcpyHostToDevice);

int *d_in;
cudaMalloc(&d_in, 10 * sizeof(int));

auto add = clad::gradient(add_kernel, "in, out");
add.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

int *res = (int*)malloc(10 * sizeof(int));
cudaMemcpy(res, d_in, 10 * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i++) {
if (res[i] != 5) {
std::cerr << "wrong result of add_kernel_grad at index " << i << std::endl;
return 1;
}
}

cudaMemset(d_in, 0, 10 * sizeof(int));
auto add_2 = clad::gradient(add_kernel_2, "in, out");
add_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

cudaMemcpy(res, d_in, 10 * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i++) {
if (res[i] != 5) {
std::cerr << "wrong result of add_kernel_2_grad at index " << i << std::endl;
return 1;
}
}

cudaMemset(d_in, 0, 10 * sizeof(int));
auto add_3 = clad::gradient(add_kernel_3, "in, out");
add_3.execute_kernel(dim3(10), dim3(1), sizeof(int), cudaStream, dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

cudaMemcpy(res, d_in, 10 * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i++) {
if (res[i] != 5) {
std::cerr << "wrong result of add_kernel_3_grad at index " << i << std::endl;
return 1;
}
}

return 0;
}