From af2bc47d226be3fd7117854dfe1e7861372655b0 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 29 Apr 2024 23:10:51 +0000 Subject: [PATCH] remove forceinline. blame @arund42 if this ever does not get inlined by the compiler --- dev/cuda/common.h | 20 ++++++++++---------- train_gpt2.cu | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 13fcb0f81..7edb1e7af 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -40,22 +40,22 @@ void cublasCheck(cublasStatus_t status, const char *file, int line) template struct alignas(16) Packed128 { - __device__ __forceinline__ Packed128() = default; - __device__ __forceinline__ explicit Packed128(int4 bits) { + __device__ Packed128() = default; + __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); } - __device__ __forceinline__ ElementType& operator[](int index) { + __device__ ElementType& operator[](int index) { return payload[index]; } - __device__ __forceinline__ const ElementType& operator[](int index) const { + __device__ const ElementType& operator[](int index) const { return payload[index]; } - __device__ __forceinline__ float fp32(int index) { + __device__ float fp32(int index) { return static_cast(payload[index]); } - __device__ __forceinline__ int4 get_bits() const { + __device__ int4 get_bits() const { int4 bits; static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&bits, &payload, sizeof(bits)); @@ -71,25 +71,25 @@ typedef Packed128 f128; // load a Packed128 from an aligned memory address template -__device__ __forceinline__ Packed128 load128(const ElementType* address) { +__device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } // load a Packed128 from an aligned memory address with streaming cache hint template -__device__ __forceinline__ Packed128 load128cs(const ElementType* address) { +__device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } // store a Packed128 to an aligned memory address template -__device__ __forceinline__ void store128(ElementType* target, Packed128 value) { +__device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } // store a Packed128 to an aligned memory address with streaming cache hint template -__device__ __forceinline__ void store128cs(ElementType* target, Packed128 value) { +__device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } diff --git a/train_gpt2.cu b/train_gpt2.cu index 85b409235..3913bc5ab 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -199,22 +199,22 @@ __device__ void atomicAddX(float* addr, float val) { template struct alignas(16) Packed128 { - __device__ __forceinline__ Packed128() = default; - __device__ __forceinline__ explicit Packed128(int4 bits) { + __device__ Packed128() = default; + __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); } - __device__ __forceinline__ ElementType& operator[](int index) { + __device__ ElementType& operator[](int index) { return payload[index]; } - __device__ __forceinline__ const ElementType& operator[](int index) const { + __device__ const ElementType& operator[](int index) const { return payload[index]; } - __device__ __forceinline__ float fp32(int index) { + __device__ float fp32(int index) { return static_cast(payload[index]); } - __device__ __forceinline__ int4 get_bits() const { + __device__ int4 get_bits() const { int4 bits; static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&bits, &payload, sizeof(bits)); @@ -230,25 +230,25 @@ typedef Packed128 f128; // load a Packed128 from an aligned memory address template -__device__ __forceinline__ Packed128 load128(const ElementType* address) { +__device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } // load a Packed128 from an aligned memory address with streaming cache hint template -__device__ __forceinline__ Packed128 load128cs(const ElementType* address) { +__device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } // store a Packed128 to an aligned memory address template -__device__ __forceinline__ void store128(ElementType* target, Packed128 value) { +__device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } // store a Packed128 to an aligned memory address with streaming cache hint template -__device__ __forceinline__ void store128cs(ElementType* target, Packed128 value) { +__device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); }