Skip to content

Commit

Permalink
feature: implement GPUMemoryAllocator
Browse files Browse the repository at this point in the history
  • Loading branch information
w3ntao committed Jan 13, 2025
1 parent 9e55f00 commit 365971d
Show file tree
Hide file tree
Showing 209 changed files with 1,513 additions and 1,901 deletions.
84 changes: 26 additions & 58 deletions src/pbrt/accelerator/hlbvh.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "pbrt/accelerator/hlbvh.h"
#include "pbrt/util/stack.h"
#include "pbrt/util/thread_pool.h"
#include <pbrt/accelerator/hlbvh.h>
#include <pbrt/gpu/gpu_memory_allocator.h>
#include <pbrt/util/stack.h>
#include <pbrt/util/thread_pool.h>
#include <chrono>

constexpr uint TREELET_MORTON_BITS_PER_DIMENSION = 10;
Expand Down Expand Up @@ -168,12 +169,9 @@ __global__ void init_bvh_args(HLBVH::BottomBVHArgs *bvh_args_array,
}

const HLBVH *HLBVH::create(const std::vector<const Primitive *> &gpu_primitives,
std::vector<void *> &gpu_dynamic_pointers) {
HLBVH *bvh;
CHECK_CUDA_ERROR(cudaMallocManaged(&bvh, sizeof(HLBVH)));
gpu_dynamic_pointers.push_back(bvh);

bvh->build_bvh(gpu_primitives, gpu_dynamic_pointers);
GPUMemoryAllocator &allocator) {
auto bvh = allocator.allocate<HLBVH>();
bvh->build_bvh(gpu_primitives, allocator);

return bvh;
}
Expand Down Expand Up @@ -384,7 +382,7 @@ void HLBVH::build_bottom_bvh(const BottomBVHArgs *bvh_args_array, uint array_len
}

void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
std::vector<void *> &gpu_dynamic_pointers) {
GPUMemoryAllocator &allocator) {
auto start_sorting = std::chrono::system_clock::now();

primitives = nullptr;
Expand All @@ -398,46 +396,35 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,

printf("\ntotal primitives: %u\n", num_total_primitives);

MortonPrimitive *gpu_morton_primitives;
CHECK_CUDA_ERROR(
cudaMallocManaged(&gpu_morton_primitives, sizeof(MortonPrimitive) * num_total_primitives));
GPUMemoryAllocator local_allocator;

auto sparse_treelets = local_allocator.allocate<Treelet>(MAX_TREELET_NUM);

CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
auto gpu_morton_primitives = allocator.allocate<MortonPrimitive>(num_total_primitives);
auto gpu_primitives_array = allocator.allocate<const Primitive *>(num_total_primitives);

const Primitive **gpu_primitives_array;
CHECK_CUDA_ERROR(
cudaMallocManaged(&gpu_primitives_array, sizeof(Primitive *) * num_total_primitives));
CHECK_CUDA_ERROR(cudaMemcpy(gpu_primitives_array, gpu_primitives.data(),
sizeof(Primitive *) * num_total_primitives,
cudaMemcpyHostToDevice));

Treelet *sparse_treelets;
CHECK_CUDA_ERROR(cudaMallocManaged(&sparse_treelets, sizeof(Treelet) * MAX_TREELET_NUM));

CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

gpu_dynamic_pointers.push_back(gpu_morton_primitives);
gpu_dynamic_pointers.push_back(gpu_primitives_array);

this->init(gpu_primitives_array, gpu_morton_primitives);

constexpr uint threads = 1024;
{
const uint blocks = divide_and_ceil(num_total_primitives, threads);
hlbvh_init_morton_primitives<<<blocks, threads>>>(morton_primitives, primitives,
num_total_primitives);
CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
}

{
const uint blocks = divide_and_ceil(MAX_TREELET_NUM, threads);
hlbvh_init_treelets<<<blocks, threads>>>(sparse_treelets);
CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
}

CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

Bounds3f bounds_of_primitives_centroids;
for (uint idx = 0; idx < num_total_primitives; idx++) {
bounds_of_primitives_centroids += gpu_morton_primitives[idx].bounds.centroid();
Expand Down Expand Up @@ -467,10 +454,8 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
}

uint *primitives_counter;
uint *primitives_indices_offset;
CHECK_CUDA_ERROR(cudaMallocManaged(&primitives_counter, sizeof(uint) * MAX_TREELET_NUM));
CHECK_CUDA_ERROR(cudaMallocManaged(&primitives_indices_offset, sizeof(uint) * MAX_TREELET_NUM));
auto primitives_counter = local_allocator.allocate<uint>(MAX_TREELET_NUM);
auto primitives_indices_offset = local_allocator.allocate<uint>(MAX_TREELET_NUM);

{
const uint blocks = divide_and_ceil(MAX_TREELET_NUM, threads);
Expand Down Expand Up @@ -502,10 +487,7 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
}

MortonPrimitive *buffer_morton_primitives;
CHECK_CUDA_ERROR(cudaMallocManaged(&buffer_morton_primitives,
sizeof(MortonPrimitive) * num_total_primitives));

auto buffer_morton_primitives = local_allocator.allocate<MortonPrimitive>(num_total_primitives);
{
const uint blocks = divide_and_ceil(num_total_primitives, threads);
sort_morton_primitives<<<blocks, threads>>>(buffer_morton_primitives, morton_primitives,
Expand Down Expand Up @@ -533,10 +515,6 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
}

CHECK_CUDA_ERROR(cudaFree(primitives_counter));
CHECK_CUDA_ERROR(cudaFree(primitives_indices_offset));
CHECK_CUDA_ERROR(cudaFree(buffer_morton_primitives));

std::vector<uint> dense_treelet_indices;
{
uint max_primitive_num_in_a_treelet = 0;
Expand Down Expand Up @@ -564,45 +542,38 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
max_primitive_num_in_a_treelet);
}

Treelet *dense_treelets;
CHECK_CUDA_ERROR(
cudaMallocManaged(&dense_treelets, sizeof(Treelet) * dense_treelet_indices.size()));

auto dense_treelets = local_allocator.allocate<Treelet>(dense_treelet_indices.size());
for (uint idx = 0; idx < dense_treelet_indices.size(); idx++) {
uint sparse_idx = dense_treelet_indices[idx];
CHECK_CUDA_ERROR(cudaMemcpy(&dense_treelets[idx], &sparse_treelets[sparse_idx],
sizeof(Treelet), cudaMemcpyDeviceToDevice));
}
CHECK_CUDA_ERROR(cudaFree(sparse_treelets));

uint max_build_node_length =
(2 * dense_treelet_indices.size() + 1) + (2 * num_total_primitives + 1);
CHECK_CUDA_ERROR(cudaMallocManaged(&build_nodes, sizeof(BVHBuildNode) * max_build_node_length));
gpu_dynamic_pointers.push_back(build_nodes);

build_nodes = allocator.allocate<BVHBuildNode>(max_build_node_length);

auto start_top_bvh = std::chrono::system_clock::now();

ThreadPool thread_pool;
const uint top_bvh_node_num =
build_top_bvh_for_treelets(dense_treelets, dense_treelet_indices.size(), thread_pool);

CHECK_CUDA_ERROR(cudaFree(dense_treelets));

auto start_bottom_bvh = std::chrono::system_clock::now();

uint start = 0;
uint end = top_bvh_node_num;

uint *shared_offset;
CHECK_CUDA_ERROR(cudaMallocManaged(&shared_offset, sizeof(uint)));
auto shared_offset = local_allocator.allocate<uint>();
*shared_offset = end;

uint depth = 0;
while (end > start) {
const uint array_length = end - start;

BottomBVHArgs *bvh_args_array;
CHECK_CUDA_ERROR(cudaMallocManaged(&bvh_args_array, sizeof(BottomBVHArgs) * array_length));
auto bvh_args_array = local_allocator.allocate<BottomBVHArgs>(array_length);
// TODO: this part can be optimized to prevent allocating memory every loop

{
uint blocks = divide_and_ceil(uint(end - start), threads);
Expand All @@ -626,10 +597,7 @@ void HLBVH::build_bvh(const std::vector<const Primitive *> &gpu_primitives,
hlbvh_build_bottom_bvh<<<blocks, threads>>>(this, bvh_args_array, array_length);
CHECK_CUDA_ERROR(cudaGetLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

CHECK_CUDA_ERROR(cudaFree(bvh_args_array));
}
CHECK_CUDA_ERROR(cudaFree(shared_offset));

printf("HLBVH: bottom BVH nodes: %u, max depth: %u, max primitives in a leaf: %u\n",
end - top_bvh_node_num, depth, MAX_PRIMITIVES_NUM_IN_LEAF);
Expand Down
11 changes: 6 additions & 5 deletions src/pbrt/accelerator/hlbvh.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#pragma once

#include "pbrt/base/primitive.h"
#include "pbrt/base/shape.h"
#include "pbrt/euclidean_space/bounds3.h"
#include <pbrt/base/primitive.h>
#include <pbrt/base/shape.h>
#include <pbrt/euclidean_space/bounds3.h>
#include <atomic>
#include <vector>

class GPUMemoryAllocator;
class ThreadPool;

class HLBVH {
Expand Down Expand Up @@ -73,7 +74,7 @@ class HLBVH {
};

static const HLBVH *create(const std::vector<const Primitive *> &gpu_primitives,
std::vector<void *> &gpu_dynamic_pointers);
GPUMemoryAllocator &allocator);

PBRT_CPU_GPU
Bounds3f bounds() const {
Expand Down Expand Up @@ -101,7 +102,7 @@ class HLBVH {
}

void build_bvh(const std::vector<const Primitive *> &gpu_primitives,
std::vector<void *> &gpu_dynamic_pointers);
GPUMemoryAllocator &allocator);

uint build_top_bvh_for_treelets(const Treelet *treelets, uint num_dense_treelets,
ThreadPool &thread_pool);
Expand Down
4 changes: 2 additions & 2 deletions src/pbrt/base/bsdf.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "pbrt/base/bsdf.h"
#include "pbrt/base/material.h"
#include <pbrt/base/bsdf.h>
#include <pbrt/base/material.h>

PBRT_CPU_GPU
void BSDF::init_bxdf(const Material *material, SampledWavelengths &lambda,
Expand Down
6 changes: 3 additions & 3 deletions src/pbrt/base/bsdf.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once

#include "pbrt/base/bxdf.h"
#include "pbrt/euclidean_space/frame.h"
#include "pbrt/euclidean_space/normal3f.h"
#include <pbrt/base/bxdf.h>
#include <pbrt/euclidean_space/frame.h>
#include <pbrt/euclidean_space/normal3f.h>

class Material;
class MaterialEvalContext;
Expand Down
2 changes: 1 addition & 1 deletion src/pbrt/base/bxdf.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "pbrt/base/bxdf.h"
#include <pbrt/base/bxdf.h>

PBRT_CPU_GPU
void BxDF::init(const CoatedConductorBxDF &_coated_conductor_bxdf) {
Expand Down
16 changes: 8 additions & 8 deletions src/pbrt/base/bxdf.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#pragma once

#include "pbrt/base/bxdf_util.h"
#include "pbrt/bxdfs/coated_conductor_bxdf.h"
#include "pbrt/bxdfs/coated_diffuse_bxdf.h"
#include "pbrt/bxdfs/conductor_bxdf.h"
#include "pbrt/bxdfs/dielectric_bxdf.h"
#include "pbrt/bxdfs/diffuse_bxdf.h"
#include "pbrt/spectrum_util/sampled_spectrum.h"
#include "pbrt/util/macro.h"
#include <pbrt/base/bxdf_util.h>
#include <pbrt/bxdfs/coated_conductor_bxdf.h>
#include <pbrt/bxdfs/coated_diffuse_bxdf.h>
#include <pbrt/bxdfs/conductor_bxdf.h>
#include <pbrt/bxdfs/dielectric_bxdf.h>
#include <pbrt/bxdfs/diffuse_bxdf.h>
#include <pbrt/spectrum_util/sampled_spectrum.h>
#include <pbrt/gpu/macro.h>

class BxDF {
public:
Expand Down
4 changes: 2 additions & 2 deletions src/pbrt/base/bxdf_util.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "pbrt/base/spectrum.h"
#include "pbrt/util/macro.h"
#include <pbrt/base/spectrum.h>
#include <pbrt/gpu/macro.h>

enum BxDFFlags {
Unset = 0,
Expand Down
19 changes: 7 additions & 12 deletions src/pbrt/base/camera.cu
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
#include "pbrt/base/camera.h"
#include "pbrt/cameras/perspective.h"
#include <pbrt/base/camera.h>
#include <pbrt/cameras/perspective.h>
#include <pbrt/gpu/gpu_memory_allocator.h>

Camera *Camera::create_perspective_camera(const Point2i &resolution,
const CameraTransform &camera_transform, const Film *film,
const Filter *filter,
const ParameterDictionary &parameters,
std::vector<void *> &gpu_dynamic_pointers) {
PerspectiveCamera *perspective_camera;
CHECK_CUDA_ERROR(cudaMallocManaged(&perspective_camera, sizeof(PerspectiveCamera)));

Camera *camera;
CHECK_CUDA_ERROR(cudaMallocManaged(&camera, sizeof(Camera)));

gpu_dynamic_pointers.push_back(perspective_camera);
gpu_dynamic_pointers.push_back(camera);
GPUMemoryAllocator &allocator) {
auto perspective_camera = allocator.allocate<PerspectiveCamera>();
auto camera = allocator.allocate<Camera>();

perspective_camera->init(resolution, camera_transform, film, filter, parameters);
camera->init(perspective_camera);
Expand Down Expand Up @@ -63,7 +58,7 @@ void Camera::pdf_we(const Ray &ray, FloatType *pdfPos, FloatType *pdfDir) const

PBRT_CPU_GPU
pbrt::optional<CameraWiSample> Camera::sample_wi(const Interaction &ref, const Point2f u,
SampledWavelengths &lambda) const {
SampledWavelengths &lambda) const {
switch (type) {
case Type::perspective: {
return static_cast<const PerspectiveCamera *>(ptr)->sample_wi(ref, u, lambda);
Expand Down
16 changes: 8 additions & 8 deletions src/pbrt/base/camera.h
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#pragma once

#include "pbrt/base/interaction.h"
#include "pbrt/base/ray.h"
#include "pbrt/euclidean_space/point2.h"
#include "pbrt/euclidean_space/transform.h"
#include "pbrt/spectrum_util/sampled_spectrum.h"
#include <vector>
#include <pbrt/base/interaction.h>
#include <pbrt/base/ray.h>
#include <pbrt/euclidean_space/point2.h>
#include <pbrt/euclidean_space/transform.h>
#include <pbrt/spectrum_util/sampled_spectrum.h>

class Film;
class Filter;
class GPUMemoryAllocator;
class Sampler;
class ParameterDictionary;
class PerspectiveCamera;
Expand Down Expand Up @@ -154,7 +154,7 @@ class Camera {
const CameraTransform &camera_transform,
const Film *film, const Filter *filter,
const ParameterDictionary &parameters,
std::vector<void *> &gpu_dynamic_pointers);
GPUMemoryAllocator &allocator);

void init(const PerspectiveCamera *perspective_camera);

Expand All @@ -175,7 +175,7 @@ class Camera {

PBRT_CPU_GPU
pbrt::optional<CameraWiSample> sample_wi(const Interaction &ref, const Point2f u,
SampledWavelengths &lambda) const;
SampledWavelengths &lambda) const;

private:
Type type;
Expand Down
17 changes: 8 additions & 9 deletions src/pbrt/base/film.cu
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
#include "ext/lodepng/lodepng.h"
#include "pbrt/base/film.h"
#include "pbrt/films/rgb_film.h"
#include "pbrt/spectrum_util/color_encoding.h"
#include <ext/lodepng/lodepng.h>
#include <pbrt/base/film.h>
#include <pbrt/films/rgb_film.h>
#include <pbrt/spectrum_util/color_encoding.h>
#include <pbrt/gpu/gpu_memory_allocator.h>
#include <vector>

Film *Film::create_rgb_film(const Filter *filter, const ParameterDictionary &parameters,
std::vector<void *> &gpu_dynamic_pointers) {
Film *film;
CHECK_CUDA_ERROR(cudaMallocManaged(&film, sizeof(Film)));
gpu_dynamic_pointers.push_back(film);
GPUMemoryAllocator &allocator) {
auto rgb_film = RGBFilm::create(filter, parameters, allocator);

auto rgb_film = RGBFilm::create(filter, parameters, gpu_dynamic_pointers);
auto film = allocator.allocate<Film>();
film->init(rgb_film);

return film;
Expand Down
Loading

0 comments on commit 365971d

Please sign in to comment.