feature: implement GPUMemoryAllocator

w3ntao · Jan 13, 2025 · 365971d · 365971d
1 parent 9e55f00
commit 365971d
Show file tree

Hide file tree

Showing 209 changed files with 1,513 additions and 1,901 deletions.
diff --git a/src/pbrt/accelerator/hlbvh.cu b/src/pbrt/accelerator/hlbvh.cu
@@ -1,6 +1,7 @@
-#include "pbrt/accelerator/hlbvh.h"
-#include "pbrt/util/stack.h"
-#include "pbrt/util/thread_pool.h"
+#include <pbrt/accelerator/hlbvh.h>
+#include <pbrt/gpu/gpu_memory_allocator.h>
+#include <pbrt/util/stack.h>
+#include <pbrt/util/thread_pool.h>
 #include <chrono>
 
 constexpr uint TREELET_MORTON_BITS_PER_DIMENSION = 10;
@@ -168,12 +169,9 @@ __global__ void init_bvh_args(HLBVH::BottomBVHArgs *bvh_args_array,
 }
 
 const HLBVH *HLBVH::create(const std::vector<const Primitive *> &gpu_primitives,
-                           std::vector<void *> &gpu_dynamic_pointers) {
-    HLBVH *bvh;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&bvh, sizeof(HLBVH)));
-    gpu_dynamic_pointers.push_back(bvh);
-
-    bvh->build_bvh(gpu_primitives, gpu_dynamic_pointers);
+                           GPUMemoryAllocator &allocator) {
+    auto bvh = allocator.allocate<HLBVH>();
+    bvh->build_bvh(gpu_primitives, allocator);
 
     return bvh;
 }
@@ -384,7 +382,7 @@ void HLBVH::build_bottom_bvh(const BottomBVHArgs *bvh_args_array, uint array_len
 }
 
 void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
-                      std::vector<void *> &gpu_dynamic_pointers) {
+                      GPUMemoryAllocator &allocator) {
     auto start_sorting = std::chrono::system_clock::now();
 
     primitives = nullptr;
@@ -398,46 +396,35 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
 
     printf("\ntotal primitives: %u\n", num_total_primitives);
 
-    MortonPrimitive *gpu_morton_primitives;
-    CHECK_CUDA_ERROR(
-        cudaMallocManaged(&gpu_morton_primitives, sizeof(MortonPrimitive) * num_total_primitives));
+    GPUMemoryAllocator local_allocator;
+
+    auto sparse_treelets = local_allocator.allocate<Treelet>(MAX_TREELET_NUM);
 
-    CHECK_CUDA_ERROR(cudaGetLastError());
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
+    auto gpu_morton_primitives = allocator.allocate<MortonPrimitive>(num_total_primitives);
+    auto gpu_primitives_array = allocator.allocate<const Primitive *>(num_total_primitives);
 
-    const Primitive **gpu_primitives_array;
-    CHECK_CUDA_ERROR(
-        cudaMallocManaged(&gpu_primitives_array, sizeof(Primitive *) * num_total_primitives));
     CHECK_CUDA_ERROR(cudaMemcpy(gpu_primitives_array, gpu_primitives.data(),
                                 sizeof(Primitive *) * num_total_primitives,
                                 cudaMemcpyHostToDevice));
 
-    Treelet *sparse_treelets;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&sparse_treelets, sizeof(Treelet) * MAX_TREELET_NUM));
-
-    CHECK_CUDA_ERROR(cudaGetLastError());
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-
-    gpu_dynamic_pointers.push_back(gpu_morton_primitives);
-    gpu_dynamic_pointers.push_back(gpu_primitives_array);
-
     this->init(gpu_primitives_array, gpu_morton_primitives);
 
     constexpr uint threads = 1024;
     {
         const uint blocks = divide_and_ceil(num_total_primitives, threads);
         hlbvh_init_morton_primitives<<<blocks, threads>>>(morton_primitives, primitives,
                                                           num_total_primitives);
+        CHECK_CUDA_ERROR(cudaGetLastError());
+        CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
 
     {
         const uint blocks = divide_and_ceil(MAX_TREELET_NUM, threads);
         hlbvh_init_treelets<<<blocks, threads>>>(sparse_treelets);
+        CHECK_CUDA_ERROR(cudaGetLastError());
+        CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
 
-    CHECK_CUDA_ERROR(cudaGetLastError());
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-
     Bounds3f bounds_of_primitives_centroids;
     for (uint idx = 0; idx < num_total_primitives; idx++) {
         bounds_of_primitives_centroids += gpu_morton_primitives[idx].bounds.centroid();
@@ -467,10 +454,8 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
         CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
 
-    uint *primitives_counter;
-    uint *primitives_indices_offset;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&primitives_counter, sizeof(uint) * MAX_TREELET_NUM));
-    CHECK_CUDA_ERROR(cudaMallocManaged(&primitives_indices_offset, sizeof(uint) * MAX_TREELET_NUM));
+    auto primitives_counter = local_allocator.allocate<uint>(MAX_TREELET_NUM);
+    auto primitives_indices_offset = local_allocator.allocate<uint>(MAX_TREELET_NUM);
 
     {
         const uint blocks = divide_and_ceil(MAX_TREELET_NUM, threads);
@@ -502,10 +487,7 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
         CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
 
-    MortonPrimitive *buffer_morton_primitives;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&buffer_morton_primitives,
-                                       sizeof(MortonPrimitive) * num_total_primitives));
-
+    auto buffer_morton_primitives = local_allocator.allocate<MortonPrimitive>(num_total_primitives);
     {
         const uint blocks = divide_and_ceil(num_total_primitives, threads);
         sort_morton_primitives<<<blocks, threads>>>(buffer_morton_primitives, morton_primitives,
@@ -533,10 +515,6 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
         CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
 
-    CHECK_CUDA_ERROR(cudaFree(primitives_counter));
-    CHECK_CUDA_ERROR(cudaFree(primitives_indices_offset));
-    CHECK_CUDA_ERROR(cudaFree(buffer_morton_primitives));
-
     std::vector<uint> dense_treelet_indices;
     {
         uint max_primitive_num_in_a_treelet = 0;
@@ -564,45 +542,38 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
                max_primitive_num_in_a_treelet);
     }
 
-    Treelet *dense_treelets;
-    CHECK_CUDA_ERROR(
-        cudaMallocManaged(&dense_treelets, sizeof(Treelet) * dense_treelet_indices.size()));
-
+    auto dense_treelets = local_allocator.allocate<Treelet>(dense_treelet_indices.size());
     for (uint idx = 0; idx < dense_treelet_indices.size(); idx++) {
         uint sparse_idx = dense_treelet_indices[idx];
         CHECK_CUDA_ERROR(cudaMemcpy(&dense_treelets[idx], &sparse_treelets[sparse_idx],
                                     sizeof(Treelet), cudaMemcpyDeviceToDevice));
     }
-    CHECK_CUDA_ERROR(cudaFree(sparse_treelets));
 
     uint max_build_node_length =
         (2 * dense_treelet_indices.size() + 1) + (2 * num_total_primitives + 1);
-    CHECK_CUDA_ERROR(cudaMallocManaged(&build_nodes, sizeof(BVHBuildNode) * max_build_node_length));
-    gpu_dynamic_pointers.push_back(build_nodes);
+
+    build_nodes = allocator.allocate<BVHBuildNode>(max_build_node_length);
 
     auto start_top_bvh = std::chrono::system_clock::now();
 
     ThreadPool thread_pool;
     const uint top_bvh_node_num =
         build_top_bvh_for_treelets(dense_treelets, dense_treelet_indices.size(), thread_pool);
 
-    CHECK_CUDA_ERROR(cudaFree(dense_treelets));
-
     auto start_bottom_bvh = std::chrono::system_clock::now();
 
     uint start = 0;
     uint end = top_bvh_node_num;
 
-    uint *shared_offset;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&shared_offset, sizeof(uint)));
+    auto shared_offset = local_allocator.allocate<uint>();
     *shared_offset = end;
 
     uint depth = 0;
     while (end > start) {
         const uint array_length = end - start;
 
-        BottomBVHArgs *bvh_args_array;
-        CHECK_CUDA_ERROR(cudaMallocManaged(&bvh_args_array, sizeof(BottomBVHArgs) * array_length));
+        auto bvh_args_array = local_allocator.allocate<BottomBVHArgs>(array_length);
+        // TODO: this part can be optimized to prevent allocating memory every loop
 
         {
             uint blocks = divide_and_ceil(uint(end - start), threads);
@@ -626,10 +597,7 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
         hlbvh_build_bottom_bvh<<<blocks, threads>>>(this, bvh_args_array, array_length);
         CHECK_CUDA_ERROR(cudaGetLastError());
         CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-
-        CHECK_CUDA_ERROR(cudaFree(bvh_args_array));
     }
-    CHECK_CUDA_ERROR(cudaFree(shared_offset));
 
     printf("HLBVH: bottom BVH nodes: %u, max depth: %u, max primitives in a leaf: %u\n",
            end - top_bvh_node_num, depth, MAX_PRIMITIVES_NUM_IN_LEAF);

diff --git a/src/pbrt/accelerator/hlbvh.h b/src/pbrt/accelerator/hlbvh.h
@@ -1,11 +1,12 @@
 #pragma once
 
-#include "pbrt/base/primitive.h"
-#include "pbrt/base/shape.h"
-#include "pbrt/euclidean_space/bounds3.h"
+#include <pbrt/base/primitive.h>
+#include <pbrt/base/shape.h>
+#include <pbrt/euclidean_space/bounds3.h>
 #include <atomic>
 #include <vector>
 
+class GPUMemoryAllocator;
 class ThreadPool;
 
 class HLBVH {
@@ -73,7 +74,7 @@ class HLBVH {
     };
 
     static const HLBVH *create(const std::vector<const Primitive *> &gpu_primitives,
-                               std::vector<void *> &gpu_dynamic_pointers);
+                               GPUMemoryAllocator &allocator);
 
     PBRT_CPU_GPU
     Bounds3f bounds() const {
@@ -101,7 +102,7 @@ class HLBVH {
     }
 
     void build_bvh(const std::vector<const Primitive *> &gpu_primitives,
-                   std::vector<void *> &gpu_dynamic_pointers);
+                   GPUMemoryAllocator &allocator);
 
     uint build_top_bvh_for_treelets(const Treelet *treelets, uint num_dense_treelets,
                                     ThreadPool &thread_pool);

diff --git a/src/pbrt/base/bsdf.cu b/src/pbrt/base/bsdf.cu
@@ -1,5 +1,5 @@
-#include "pbrt/base/bsdf.h"
-#include "pbrt/base/material.h"
+#include <pbrt/base/bsdf.h>
+#include <pbrt/base/material.h>
 
 PBRT_CPU_GPU
 void BSDF::init_bxdf(const Material *material, SampledWavelengths &lambda,

diff --git a/src/pbrt/base/bsdf.h b/src/pbrt/base/bsdf.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include "pbrt/base/bxdf.h"
-#include "pbrt/euclidean_space/frame.h"
-#include "pbrt/euclidean_space/normal3f.h"
+#include <pbrt/base/bxdf.h>
+#include <pbrt/euclidean_space/frame.h>
+#include <pbrt/euclidean_space/normal3f.h>
 
 class Material;
 class MaterialEvalContext;

diff --git a/src/pbrt/base/bxdf.cu b/src/pbrt/base/bxdf.cu
@@ -1,4 +1,4 @@
-#include "pbrt/base/bxdf.h"
+#include <pbrt/base/bxdf.h>
 
 PBRT_CPU_GPU
 void BxDF::init(const CoatedConductorBxDF &_coated_conductor_bxdf) {

diff --git a/src/pbrt/base/bxdf.h b/src/pbrt/base/bxdf.h
@@ -1,13 +1,13 @@
 #pragma once
 
-#include "pbrt/base/bxdf_util.h"
-#include "pbrt/bxdfs/coated_conductor_bxdf.h"
-#include "pbrt/bxdfs/coated_diffuse_bxdf.h"
-#include "pbrt/bxdfs/conductor_bxdf.h"
-#include "pbrt/bxdfs/dielectric_bxdf.h"
-#include "pbrt/bxdfs/diffuse_bxdf.h"
-#include "pbrt/spectrum_util/sampled_spectrum.h"
-#include "pbrt/util/macro.h"
+#include <pbrt/base/bxdf_util.h>
+#include <pbrt/bxdfs/coated_conductor_bxdf.h>
+#include <pbrt/bxdfs/coated_diffuse_bxdf.h>
+#include <pbrt/bxdfs/conductor_bxdf.h>
+#include <pbrt/bxdfs/dielectric_bxdf.h>
+#include <pbrt/bxdfs/diffuse_bxdf.h>
+#include <pbrt/spectrum_util/sampled_spectrum.h>
+#include <pbrt/gpu/macro.h>
 
 class BxDF {
   public:

diff --git a/src/pbrt/base/bxdf_util.h b/src/pbrt/base/bxdf_util.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "pbrt/base/spectrum.h"
-#include "pbrt/util/macro.h"
+#include <pbrt/base/spectrum.h>
+#include <pbrt/gpu/macro.h>
 
 enum BxDFFlags {
     Unset = 0,

diff --git a/src/pbrt/base/camera.cu b/src/pbrt/base/camera.cu
@@ -1,19 +1,14 @@
-#include "pbrt/base/camera.h"
-#include "pbrt/cameras/perspective.h"
+#include <pbrt/base/camera.h>
+#include <pbrt/cameras/perspective.h>
+#include <pbrt/gpu/gpu_memory_allocator.h>
 
 Camera *Camera::create_perspective_camera(const Point2i &resolution,
                                           const CameraTransform &camera_transform, const Film *film,
                                           const Filter *filter,
                                           const ParameterDictionary &parameters,
-                                          std::vector<void *> &gpu_dynamic_pointers) {
-    PerspectiveCamera *perspective_camera;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&perspective_camera, sizeof(PerspectiveCamera)));
-
-    Camera *camera;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&camera, sizeof(Camera)));
-
-    gpu_dynamic_pointers.push_back(perspective_camera);
-    gpu_dynamic_pointers.push_back(camera);
+                                          GPUMemoryAllocator &allocator) {
+    auto perspective_camera = allocator.allocate<PerspectiveCamera>();
+    auto camera = allocator.allocate<Camera>();
 
     perspective_camera->init(resolution, camera_transform, film, filter, parameters);
     camera->init(perspective_camera);
@@ -63,7 +58,7 @@ void Camera::pdf_we(const Ray &ray, FloatType *pdfPos, FloatType *pdfDir) const
 
 PBRT_CPU_GPU
 pbrt::optional<CameraWiSample> Camera::sample_wi(const Interaction &ref, const Point2f u,
-                                                      SampledWavelengths &lambda) const {
+                                                 SampledWavelengths &lambda) const {
     switch (type) {
     case Type::perspective: {
         return static_cast<const PerspectiveCamera *>(ptr)->sample_wi(ref, u, lambda);

diff --git a/src/pbrt/base/camera.h b/src/pbrt/base/camera.h
@@ -1,14 +1,14 @@
 #pragma once
 
-#include "pbrt/base/interaction.h"
-#include "pbrt/base/ray.h"
-#include "pbrt/euclidean_space/point2.h"
-#include "pbrt/euclidean_space/transform.h"
-#include "pbrt/spectrum_util/sampled_spectrum.h"
-#include <vector>
+#include <pbrt/base/interaction.h>
+#include <pbrt/base/ray.h>
+#include <pbrt/euclidean_space/point2.h>
+#include <pbrt/euclidean_space/transform.h>
+#include <pbrt/spectrum_util/sampled_spectrum.h>
 
 class Film;
 class Filter;
+class GPUMemoryAllocator;
 class Sampler;
 class ParameterDictionary;
 class PerspectiveCamera;
@@ -154,7 +154,7 @@ class Camera {
                                              const CameraTransform &camera_transform,
                                              const Film *film, const Filter *filter,
                                              const ParameterDictionary &parameters,
-                                             std::vector<void *> &gpu_dynamic_pointers);
+                                             GPUMemoryAllocator &allocator);
 
     void init(const PerspectiveCamera *perspective_camera);
 
@@ -175,7 +175,7 @@ class Camera {
 
     PBRT_CPU_GPU
     pbrt::optional<CameraWiSample> sample_wi(const Interaction &ref, const Point2f u,
-                                                  SampledWavelengths &lambda) const;
+                                             SampledWavelengths &lambda) const;
 
   private:
     Type type;

diff --git a/src/pbrt/base/film.cu b/src/pbrt/base/film.cu
@@ -1,16 +1,15 @@
-#include "ext/lodepng/lodepng.h"
-#include "pbrt/base/film.h"
-#include "pbrt/films/rgb_film.h"
-#include "pbrt/spectrum_util/color_encoding.h"
+#include <ext/lodepng/lodepng.h>
+#include <pbrt/base/film.h>
+#include <pbrt/films/rgb_film.h>
+#include <pbrt/spectrum_util/color_encoding.h>
+#include <pbrt/gpu/gpu_memory_allocator.h>
 #include <vector>
 
 Film *Film::create_rgb_film(const Filter *filter, const ParameterDictionary &parameters,
-                            std::vector<void *> &gpu_dynamic_pointers) {
-    Film *film;
-    CHECK_CUDA_ERROR(cudaMallocManaged(&film, sizeof(Film)));
-    gpu_dynamic_pointers.push_back(film);
+                            GPUMemoryAllocator &allocator) {
+    auto rgb_film = RGBFilm::create(filter, parameters, allocator);
 
-    auto rgb_film = RGBFilm::create(filter, parameters, gpu_dynamic_pointers);
+    auto film = allocator.allocate<Film>();
     film->init(rgb_film);
 
     return film;