From 175d00e2f9da37353aa06d0ace71c4d2a43f1640 Mon Sep 17 00:00:00 2001
From: austinvhuang
Date: Fri, 14 Jun 2024 16:07:51 -0400
Subject: [PATCH] fix up resource deallocation runtime errors, first draft of
 tutorial

---
 gpu.h   |  35 ++----------
 run.cpp | 171 ++++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 126 insertions(+), 80 deletions(-)

diff --git a/gpu.h b/gpu.h
index ad8a634..5d49cb0 100644
--- a/gpu.h
+++ b/gpu.h
@@ -129,7 +129,7 @@ struct Kernel {
   size_t outputSize;
   size_t numBuffers;
   size_t numInputs;
-  WGPUCommandBuffer commandBuffer;
+  WGPUCommandBuffer commandBuffer; // managed automatically by wgpuQueueSubmit
   WGPUBuffer readbackBuffer;
   CallbackDataDyn callbackData;
   std::promise<void> promise;
@@ -174,43 +174,16 @@ bool operator<(const Kernel &lhs, const Kernel &rhs) {
   return lhs.commandBuffer < rhs.commandBuffer;
 }
 
-void FreeKernel(Kernel *op) {
-  log(kDefLog, kInfo, "Freeing kernel");
-  // TODO(avh): nullptr is insufficient check for freeable resources
-  if (op->commandBuffer != nullptr) {
-    wgpuCommandBufferRelease(op->commandBuffer);
-  }
-  if (op->readbackBuffer != nullptr) {
-    wgpuBufferRelease(op->readbackBuffer);
-  }
-  if (op->callbackData.buffer != nullptr) {
-    wgpuBufferRelease(op->callbackData.buffer);
-  }
-}
-
-void FreeMultiKernel(MultiKernel *pipeline) {
-  log(kDefLog, kInfo, "Freeing multi kernel");
-  if (pipeline->commandBuffer) {
-    // wgpuCommandBufferRelease(pipeline->commandBuffer);
-  }
-  if (pipeline->readbackBuffer) {
-    // wgpuBufferRelease(pipeline->readbackBuffer);
-  }
-}
-
 struct KernelPool {
   KernelPool(GPUContext *ctx) : ctx(ctx), data() {}
   GPUContext *ctx;
   std::set<Kernel *> data;
   std::set<MultiKernel *> multiData;
   ~KernelPool() {
-    for (auto kernelPtr : data) {
-      FreeKernel(kernelPtr);
-    }
+    // Note: commandBuffer is destroyed upon queue submission;
+    // explicitly destroying readback and callback buffers
+    // produces runtime errors.
     data.clear();
-    for (MultiKernel *multiKernelPtr : multiData) {
-      FreeMultiKernel(multiKernelPtr);
-    }
     multiData.clear();
   }
 };
diff --git a/run.cpp b/run.cpp
index 3bf4190..d278edd 100644
--- a/run.cpp
+++ b/run.cpp
@@ -24,13 +24,14 @@ void wait() {
 void section(const char *content) {
   fprintf(stdout, "\033[2J\033[1;1H"); // clear screen
   fprintf(stdout, "%s\n", kAsciiBanner);
-  fprintf(stdout, "================================================================================\n");
+  fprintf(stdout, "============================================================"
+                  "====================\n");
   fprintf(stdout, "%s\n", content);
   wait();
   // fprintf(stdout, "\033[4A\033[0J"); // clear lines
 }
 
-void runHelloGELU(GPUContext& ctx) {
+void runHelloGELU(GPUContext &ctx) {
   // Device code (runs on the GPU) using WGSL (WebGPU Shading Language)
   const char *kGELU = R"(
 const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@@ -55,8 +56,7 @@ void runHelloGELU(GPUContext& ctx) {
   }
   GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
-  Kernel op =
-      CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+  Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
   DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
@@ -152,7 +152,6 @@ Let's try running this.
 GPUContext ctx = CreateContext();
 runHelloGELU(ctx);
-
 section(R"(
 Design Objectives of gpu.cpp
 ----------------------------
@@ -196,11 +195,14 @@ when the GPU computation occurs:
 )");
 
 section(R"(
+Ahead-of-time GPU Resource Preparation
+--------------------------------------
+
+In the next sections, we'll look at ahead-of-time GPU resource preparation.
-
-*Ahead-of-time GPU Resource Preparation*
-
-
+These are functions that acquire resources and prepare state for GPU
+computation. They are assumed to be less performance critical and not on hot
+code paths.
 )");
@@ -245,7 +247,6 @@ for allocating and deallocating tensor data on the GPU. In practice
 )");
-
 section(R"(
 `CreateContext()` creates a GPUContext
 --------------------------------------
@@ -288,19 +289,56 @@ Let's try creating some data on the GPU now.
 )");
-std::array<float, 3072> inputArr;
-std::array<float, 3072> outputArr;
-for (int i = 0; i < 3072; ++i) {
-  inputArr[i] = static_cast<float>(i); // dummy input data
-}
-GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
-GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+  std::array<float, 3072> inputArr;
+  std::array<float, 3072> outputArr;
+  for (int i = 0; i < 3072; ++i) {
+    inputArr[i] = static_cast<float>(i); // dummy input data
+  }
+  GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
+  GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+
+  fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
+  wait();
-fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
-wait();
+
+  section(R"(
+Create a Kernel with `CreateKernel()`
+-------------------------------------
+
+Reviewing our GELU example: after using `CreateTensor()` to allocate and bind
+buffers for input and output data, we can use `CreateKernel()` to create a
+kernel.
+
+```
+  // Previously: Create the input and output tensors
+  GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
+  GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
+
+  // ...
+
+  Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+```
+
+Note this *does not run* the kernel; it just prepares the kernel as a resource
+to be dispatched later.
+
+There are four arguments to `CreateKernel()`:
+- `GPUContext` - the context for the GPU
+- `ShaderCode` - the shader code for the kernel
+- `GPUTensor` - the input tensor. Even though the kernel is not yet executed,
+`GPUTensor` provides a handle to the buffers on the GPU to be loaded when the
+kernel is run. If there's more than one input, `GPUTensors` can be used, which
+is an ordered collection of `GPUTensor`.
+- `GPUTensor` - the output tensor. As with the input tensor, the values are not
+important at this point; the underlying reference to the GPU buffer is bound to
+the kernel so that when the kernel is dispatched, it will know where to write
+the output data.
+
+The `kGELU` string that goes into `ShaderCode` is the WGSL shader code for the
+kernel. We'll look at this next.
+)");
+
+  section(R"(
 WGSL Compute Kernels are Programs that run Computation on the GPU
 ------------------------------------------------------------------
@@ -337,44 +375,71 @@ that this is a compute kernel. The `@workgroup_size(256)` annotation specifies
 the workgroup size for the kernel.
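+
+As a minimal sketch of these pieces together (an illustrative kernel, not the
+GELU shader itself), a WGSL compute kernel declares its buffer bindings and an
+entry point like this; we assume here that binding indices follow the order of
+the tensors passed to `CreateKernel()`, inputs first, then output:
+
+```
+@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
+@group(0) @binding(1) var<storage, read_write> out: array<f32>;
+
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i: u32 = gid.x;
+    // Guard against out-of-bounds access when the array length is not a
+    // multiple of the workgroup size.
+    if (i < arrayLength(&inp)) {
+        out[i] = inp[i]; // identity copy; the GELU kernel computes its activation here
+    }
+}
+```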
)"); -section(R"( -`CreateKernel()` is used to create a Kernel -------------------------------------------- + section(R"( +Performance critical dispatch of GPU computation +------------------------------------------------ -Reviewing our GELU example and after using `CreateTensor()` to allocate and -bind buffers for input and output data, we can use `CreateKernel()` to create a -kernel. +The past few sections have covered the ahead-of-time GPU resource preparation +consisting of `Create*()` and supporting functions. + +None of these actually execute computation on the GPU yet. + +Next we'll look at the dispatch functions which asynchronously dispatches the +kernel for execution. +)"); + + + section(R"( +Dispatch a kernel for execution with `DispatchKernel()` +------------------------------------------------------ + +After creating a kernel, you can dispatch it for execution on the GPU using +`DispatchKernel()`. ``` - GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data()); - GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data()); + // Previously: Create the kernel Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output); + + // ... + + DispatchKernel(ctx, op); + Wait(ctx, op.future); + ToCPU(ctx, output, outputArr.data(), sizeof(outputArr)); +} ``` -Note this *does not run* the kernel, it just prepares the kernel as a resource -to be dispatched later. +Note that the kernel is executed asynchronously on the GPU, in other words, +execution will continue on the CPU while the GPU is running the kernel. -There are four arguments to `CreateKernel()`: -- `GPUContext` - the context for the GPU -- `ShaderCode` - the shader code for the kernel -- `GPUTensor` - the input tensor. Even though the kernel is not executed, -GPUTensor provides a handle to the buffers on the GPU to be loaded when the -kernel is run. If there's more than one input, `GPUTensors` can be used which -is an ordered collection of `GPUTensor`. -- `GPUTensor` - the output tensor. As with the input tensor, the values are not -important at this point, the underlying reference to the GPU buffer is bound to -the kernel so that when the kernel is dispatched, it will know where to write -the output data. +To wait for the kernel to finish, you can use `Wait(ctx, op.future)`. This will +block until the kernel has finished executing. -)"); +Note the output of the kernel (if any) is written to the output tensor on the +GPU. It is not copied back to CPU by default until you call `ToCPU()` to copy +the data back to the CPU. +This is intentional to allow for efficient pipelining of GPU computation and +reusing GPU resources without copying data back and forth unless it's specified. +)"); section(R"( -Dispatching a kernel ------------------- +Dispatch multiple kernels for execution with `DispatchMultiKernel()` +--------------------------------------------------------------------- + +If you have multiple kernels to dispatch, you can use `CreateMultiKernel()` and +`DispatchMultiKernel()`. + +These create and dispatch multiple kernels together and are similar to +`CreateKernel()` and `DispatchKernel()`, but with multiple kernels and multiple +inputs per kernel. + +With a more complex input signature, `CreateMultiKernel()` takes a structured +input type `MultiKernelDesc` that specifies the kernels and their inputs. But +otherwise usage is similar. -TODO(avh) +Note that inputs can even be shared between kernels, allowing for building a +complex computation graphs with shared inputs between them. 
)"); @@ -382,13 +447,21 @@ TODO(avh) gpu.cpp vs. the raw WebGPU API ------------------------------ -The main responsibility of the types and functions of the library is to make -it trivial to represent these common building blocks of computation +The main responsibility of the types and functions of the library is to make it +simple to represent these common building blocks of computation. + +If you look at `examples/webgpu_intro/run.cpp` you can learn more about what +it's like to interact directly with the WebGPU API. +)"); + + section(R"( +That's it for the introduction to gpu.cpp. + +Have fun and let us know if you have any questions or feedback! -If you look at `examples/webgpu_intro/run.cpp` you can get a sense of what it's -like to interact directly with the WebGPU. +We're happy to collaborate with contributors or hear what you're building with +gpu.cpp. )"); - fprintf(stdout, "Goodbye!\n"); return 0; }