nvidia: update hello world following changes in #456
mratsim committed Aug 27, 2024
1 parent 0b24651 commit 6fc9da3
Showing 4 changed files with 67 additions and 85 deletions.
39 changes: 0 additions & 39 deletions constantine/math_compiler/codegen_amdgpu.nim
@@ -67,45 +67,6 @@ proc hipDeviceInit*(deviceID = 0'i32): HipDevice =

return hipDevice

# ############################################################
#
# LLVM IR for AMD GPUs
#
# ############################################################
#
# Note:
# __device__ functions for field and elliptic curve arithmetic
# might be compiled by default with scalar codegen
#
# We will need to either:
# - Derive explicitly a vectorized version of the warp/wave size (32)
# - Derive implicitly a vectorized version, probably with __forceinline__

proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) =
## Create a public wrapper of a Hip device function
##
## A function named `addmod` can be found by appending _public
## check hipModuleGetFunction(fnPointer, hipModule, "addmod_public")

let pubName = fn.fnImpl.getName() & "_public"
let pubFn = module.addFunction(cstring(pubName), fn.fnTy)

let ctx = module.getContext()
let builder = ctx.createBuilder()
defer: builder.dispose()

let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
builder.positionAtEnd(blck)

var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
for i, arg in mpairs(args):
arg = pubFn.getParam(i.uint32)
discard builder.call2(fn.fnTy, fn.fnImpl, args)

# A public kernel must return void
builder.retVoid()
pubFn.setCallingConvention(AMDGPU_KERNEL)
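
# For context, a minimal sketch of the lookup convention this wrapper established,
# mirroring the doc comment above; the `HipFunction` type and `hipMod` handle names
# are assumptions, and this assumes the binding accepts a string literal kernel name:
#
#   var fnPointer: HipFunction
#   # the device function `addmod` is exposed to the host as "addmod_public"
#   check hipModuleGetFunction(fnPointer, hipMod, "addmod_public")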

# ############################################################
#
# Code generation
35 changes: 1 addition & 34 deletions constantine/math_compiler/codegen_nvidia.nim
@@ -109,37 +109,6 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =

return cuDevice

# ############################################################
#
# LLVM IR for Nvidia GPUs
#
# ############################################################

proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
## Create a public wrapper of a cuda device function
##
## A function named `addmod` can be found by appending _public
## check cuModuleGetFunction(fnPointer, cuModule, "addmod_public")

let pubName = fn.fnImpl.getName() & "_public"
let pubFn = module.addFunction(cstring(pubName), fn.fnTy)

let ctx = module.getContext()
let builder = ctx.createBuilder()
defer: builder.dispose()

let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
builder.positionAtEnd(blck)

var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
for i, arg in mpairs(args):
arg = pubFn.getParam(i.uint32)
discard builder.call2(fn.fnTy, fn.fnImpl, args)

# A public kernel must return void
builder.retVoid()
module.tagCudaKernel((fn.fnTy, pubFn))

# ############################################################
#
# Code generation
@@ -194,9 +163,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str
#
# ############################################################

proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction =
# Public kernels are appended _public
let fnName = cm.genSymbol(opcode) & "_public"
proc getCudaKernel*(cuMod: CUmodule, fnName: string): CUfunction =
check cuModuleGetFunction(result, cuMod, fnName)
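
# With this change, kernels are looked up by their exact symbol instead of a name
# derived from CurveMetadata plus a "_public" suffix. A minimal usage sketch, with
# a hypothetical kernel name:
#
#   let addmod = cuMod.getCudaKernel("addmod")  # name as emitted by the code generator, no suffix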

proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
2 changes: 1 addition & 1 deletion constantine/math_compiler/ir.nim
@@ -484,7 +484,7 @@ template llvmFnDef[N: static int](
fn.setLinkage(linkInternal)
else:
asy.setPublic(fn)
fn.setSection(sectionName)
fn.setSection(cstring sectionName)
asy.addAttributes(fn, attrs)

asy.br.positionAtEnd(savedLoc)
76 changes: 65 additions & 11 deletions tests/gpu/hello_world_nvidia.nim
@@ -8,7 +8,8 @@

import
constantine/platforms/llvm/llvm,
constantine/math_compiler/codegen_nvidia
constantine/platforms/abis/nvidia_abi {.all.},
constantine/platforms/abis/c_abi

# ############################################################
#
@@ -79,7 +80,17 @@ proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
#
# ############################################################

template check*(status: NvvmResult) =
template check(status: CUresult) =
## Check the status code of a CUDA operation
## Exit program with error if failure

let code = status # ensure that the input expression is evaluated once only
if code != CUDA_SUCCESS:
writeStackTrace()
stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
quit 1
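
# A minimal usage sketch; any CUDA driver call returning a CUresult can be wrapped this way:
#
#   check cuInit(0)   # prints a stack trace and exits if the driver call fails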

template check(status: NvvmResult) =
let code = status # Assign so execution is done once only.
if code != NVVM_SUCCESS:
stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code
@@ -148,6 +159,22 @@ proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]):

echo "Nvidia JIT compiler Hello World"

proc tagCudaKernel(module: ModuleRef, fnTy: TypeRef, fnImpl: ValueRef) =
## Tag a function as a Cuda Kernel, i.e. callable from host

doAssert fnTy.getReturnType().isVoid(), block:
"Kernels must not return values but function returns " & $fnTy.getReturnType().getTypeKind()

let ctx = module.getContext()
module.addNamedMetadataOperand(
"nvvm.annotations",
ctx.asValueRef(ctx.metadataNode([
fnImpl.asMetadataRef(),
ctx.metadataNode("kernel"),
constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
]))
)
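
# For reference, the annotation added above is expected to show up in textual LLVM IR
# roughly as follows (approximate form, for a hypothetical kernel @addKernel):
#
#   !nvvm.annotations = !{!0}
#   !0 = !{ptr @addKernel, !"kernel", i32 1}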

proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) =

# ######################################
@@ -175,7 +202,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe
block:
let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let addKernel = module.addFunction(addKernelName, addType)
let blck = ctx.appendBasicBlock(addKernel, "addBody")
let blck = ctx.appendBasicBlock(addKernel)
builder.positionAtEnd(blck)
let r = addKernel.getParam(0)
let a = addKernel.getParam(1)
@@ -184,12 +211,12 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe
builder.store(sum, r)
builder.retVoid()

module.wrapInCallableCudaKernel((addType, addKernel))
module.tagCudaKernel(addType, addKernel)

block:
let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
let mulKernel = module.addFunction(mulKernelName, mulType)
let blck = ctx.appendBasicBlock(mulKernel, "mulBody")
let blck = ctx.appendBasicBlock(mulKernel)
builder.positionAtEnd(blck)
let r = mulKernel.getParam(0)
let a = mulKernel.getParam(1)
@@ -198,7 +225,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe
builder.store(prod, r)
builder.retVoid()

module.wrapInCallableCudaKernel((mulType, mulKernel))
module.tagCudaKernel(mulType, mulKernel)

module.verify(AbortProcessAction)

@@ -228,7 +255,33 @@ type
PTXviaLlvmNvptx

proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction =
check cuModuleGetFunction(result, cuMod, fnName & "_public")
check cuModuleGetFunction(result, cuMod, fnName[0].unsafeAddr)

proc cudaDeviceInit(deviceID = 0'i32): CUdevice =

check cuInit(deviceID.uint32)

var devCount: int32
check cuDeviceGetCount(devCount)
if devCount == 0:
echo "cudaDeviceInit error: no devices supporting CUDA"
quit 1

var cuDevice: CUdevice
check cuDeviceGet(cuDevice, deviceID)
var name = newString(128)
check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
echo "Using CUDA Device [", deviceID, "]: ", cstring(name)

var major, minor: int32
check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
echo "Compute Capability: SM ", major, ".", minor
if major < 6:
echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later"
quit 1

return cuDevice
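
# A minimal usage sketch (device 0, as used by `main` below):
#
#   let cuDevice = cudaDeviceInit()        # selects device 0 and prints its name and SM version
#   var cuCtx: CUcontext
#   check cuCtxCreate(cuCtx, 0, cuDevice)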

proc main(backend: CodegenBackend) =

@@ -271,7 +324,7 @@ proc main(backend: CodegenBackend) =
var cuCtx: CUcontext
var cuMod: CUmodule
check cuCtxCreate(cuCtx, 0, cudaDevice)
check cuModuleLoadData(cuMod, ptx)
check cuModuleLoadData(cuMod, ptx[0].unsafeAddr)
let addKernel = cuMod.getCudaKernel(addKernelName)
let mulKernel = cuMod.getCudaKernel(mulKernelName)

@@ -323,9 +376,10 @@ proc main(backend: CodegenBackend) =
check cuCtxDestroy(cuCtx)
cuCtx = CUcontext(nil)

echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
main(PTXviaNvidiaNvvm)
echo "\n\nEnd: Compilation via Nvidia NVVM\n################################"
# echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
# main(PTXviaNvidiaNvvm)
# echo "\n\nEnd: Compilation via Nvidia NVVM\n################################"
echo "[Skip] Compilation via Nvidia NVVM, incompatibilities between LLVM IR and NVVM IR"

echo "\n\nCompilation via LLVM NVPTX\n##########################\n"
main(PTXviaLlvmNvptx)
