add rough idea for kernel

Prapti Devansh Trivedi · Prapti Devansh Trivedi · commit 53e12d2067a0 · 2024-04-21T19:32:02.000-04:00
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
@@ -424,6 +424,101 @@ int CudaRasterizer::Rasterizer::preprocessForward(
 	return num_rendered;
 }
 
+// Rough batched counterpart of preprocessForward: issues one FORWARD::preprocess call with tile_grid.z = viewmatrix.size(); always returns 0 for now.
+int CudaRasterizer::Rasterizer::preprocessForwardBatches(
+	float2* means2D,
+	float* depths,
+	int* radii,
+	float* cov3D,
+	float4* conic_opacity,
+	float* rgb,
+	bool* clamped,//the above are all per-Gaussian intermediate results.
+	const int P, int D, int M,
+	const std::vector<int>& width, std::vector<int>& height,
+	const float* means3D,
+	const float* scales,
+	const float* rotations,
+	const float* shs,
+	const float* opacities,//3dgs parameters
+	const std::vector<float>& scale_modifier,
+	const std::vector<torch::Tensor>& viewmatrix,
+	const std::vector<torch::Tensor>& projmatrix,
+	const std::vector<float>& cam_pos,
+	const std::vector<float>& tan_fovx, std::vector<float>& tan_fovy,
+	const std::vector<bool>& prefiltered,
+	std::vector<bool>& debug,//raster_settings
+	const std::vector<pybind11::dict> &args)
+{
+	auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+	char* log_tmp = new char[500];// scratch buffer for the log line below; released with delete[] before returning
+
+	// print out the environment variables
+	if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
+		snprintf(log_tmp, 500, "world_size: %d, global_rank: %d, iteration: %d, log_folder: %s, zhx_debug: %d, zhx_time: %d, device: %d, log_interval: %d, dist_division_mode: %s", 
+				world_size, global_rank, iteration, log_folder.c_str(), zhx_debug, zhx_time, device, log_interval, dist_division_mode.c_str());
+		save_log_in_file(iteration, global_rank, world_size, log_folder, "cuda", log_tmp);
+	}
+
+	MyTimerOnGPU timer;
+	// const float focal_y = height / (2.0f * tan_fovy);
+	// const float focal_x = width / (2.0f * tan_fovx);
+	const int num_viewpoints=viewmatrix.size();
+
+	// TODO: width/height/tan_fov*/cam_pos are per-view std::vectors here; tile_grid, focal_x/focal_y and the cam_pos cast below still use them as scalars/pointers and need converting.
+
+	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, num_viewpoints);
+	dim3 block(BLOCK_X, BLOCK_Y, num_viewpoints);// NOTE(review): not used in this function; CUDA caps blockDim.z at 64 and a block at 1024 threads - confirm before launching with it
+	int tile_num = tile_grid.x * tile_grid.y*tile_grid.z;
+
+	// allocate temporary buffer for tiles_touched.
+	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 
+	// TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case
+	uint32_t* tiles_touched_temp_buffer;
+	CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, P * sizeof(uint32_t)), debug);
+	CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, P * sizeof(uint32_t)), debug);
+
+	timer.start("10 preprocess");
+	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
+	CHECK_CUDA(FORWARD::preprocess(
+		P, D, M,
+		means3D,
+		(glm::vec3*)scales,
+		scale_modifier,
+		(glm::vec4*)rotations,
+		opacities,
+		shs,
+		clamped,
+		nullptr,//cov3D_precomp,
+		nullptr,//colors_precomp,TODO: this is correct?
+		viewmatrix, projmatrix,
+		(glm::vec3*)cam_pos,
+		width, height,
+		focal_x, focal_y,
+		tan_fovx, tan_fovy,
+		radii,
+		means2D,
+		depths,
+		cov3D,
+		rgb,
+		conic_opacity,
+		tile_grid,
+		tiles_touched_temp_buffer,
+		prefiltered
+	), debug)
+	timer.stop("10 preprocess");
+
+	int num_rendered = 0;//TODO: should I calculate this here?
+
+	// Print out timing information
+	if (zhx_time && iteration % log_interval == 1) {
+		timer.printAllTimes(iteration, world_size, global_rank, log_folder, true);
+	}
+	delete[] log_tmp;
+	// free temporary buffer for tiles_touched. TODO: remove it. 
+	CHECK_CUDA(cudaFree(tiles_touched_temp_buffer), debug);
+	return num_rendered;
+}
+
 void CudaRasterizer::Rasterizer::preprocessBackward(
 	const int* radii,
 	const float* cov3D,
diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
@@ -31,7 +31,7 @@ def preprocess_gaussians(
     sh,
     opacities,
     raster_settings,
-    cuda_args,
+    cuda_args,flag_batched=False
 ):
     return _PreprocessGaussians.apply(
         means3D,
@@ -40,7 +40,7 @@ def preprocess_gaussians(
         sh,
         opacities,
         raster_settings,
-        cuda_args,
+        cuda_args,flag_batched
     )
 
 class _PreprocessGaussians(torch.autograd.Function):
@@ -52,45 +52,88 @@ def forward(
         rotations,
         sh,
         opacities,
-        raster_settings,
-        cuda_args,
+        raster_settings_list,
+        batched_cuda_args,flag_batched
     ):
 
         # Restructure arguments the way that the C++ lib expects them
-        args = (
-            means3D,
-            scales,
-            rotations,
-            sh,
-            opacities,# 3dgs' parametes.
-            raster_settings.scale_modifier,
-            raster_settings.viewmatrix,
-            raster_settings.projmatrix,
-            raster_settings.tanfovx,
-            raster_settings.tanfovy,
-            raster_settings.image_height,
-            raster_settings.image_width,
-            raster_settings.sh_degree,
-            raster_settings.campos,
-            raster_settings.prefiltered,
-            raster_settings.debug,#raster_settings
-            cuda_args
-        )
-
-        # TODO: update this. 
-        num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+        if flag_batched==False:
+            args = (
+                means3D,
+                scales,
+                rotations,
+                sh,
+                opacities,# 3dgs' parametes.
+                raster_settings.scale_modifier,
+                raster_settings.viewmatrix,
+                raster_settings.projmatrix,
+                raster_settings.tanfovx,
+                raster_settings.tanfovy,
+                raster_settings.image_height,
+                raster_settings.image_width,
+                raster_settings.sh_degree,
+                raster_settings.campos,
+                raster_settings.prefiltered,
+                raster_settings.debug,#raster_settings
+                cuda_args
+            )
+
+            # TODO: update this. 
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+
+            # Keep relevant tensors for backward
+            ctx.raster_settings = raster_settings
+            ctx.cuda_args = cuda_args
+            ctx.num_rendered = num_rendered
+            ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
+            ctx.mark_non_differentiable(radii, depths)
+
+            # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
+            # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
+            # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
+            return means2D, rgb, conic_opacity, radii, depths
+
+        else:
+            args_list=[]
+            for raster_settings,cuda_args in zip(raster_settings_list,batched_cuda_args):
+
+                args = (
+                    means3D,
+                    scales,
+                    rotations,
+                    sh,
+                    opacities,# 3dgs' parametes.
+                    raster_settings.scale_modifier,
+                    raster_settings.viewmatrix,
+                    raster_settings.projmatrix,
+                    raster_settings.tanfovx,
+                    raster_settings.tanfovy,
+                    raster_settings.image_height,
+                    raster_settings.image_width,
+                    raster_settings.sh_degree,
+                    raster_settings.campos,
+                    raster_settings.prefiltered,
+                    raster_settings.debug,#raster_settings
+                    cuda_args
+                )
+                args_list.append(args)
+
+            # TODO: update this. 
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batches(*args_list)
+
+            # Keep relevant tensors for backward
+            ctx.raster_settings = raster_settings_list
+            ctx.cuda_args = batched_cuda_args
+            ctx.num_rendered = num_rendered
+            ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
+            ctx.mark_non_differentiable(radii, depths)
+
+            # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
+            # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
+            # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
+            return means2D, rgb, conic_opacity, radii, depths
 
-        # Keep relevant tensors for backward
-        ctx.raster_settings = raster_settings
-        ctx.cuda_args = cuda_args
-        ctx.num_rendered = num_rendered
-        ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
-        ctx.mark_non_differentiable(radii, depths)
 
-        # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
-        # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
-        # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
-        return means2D, rgb, conic_opacity, radii, depths
 
     @staticmethod # TODO: gradient for conic_opacity is tricky. because cuda render backward generate dL_dconic and dL_dopacity sperately. 
     def backward(ctx, grad_means2D, grad_rgb, grad_conic_opacity, grad_radii, grad_depths):
@@ -320,14 +363,14 @@ def markVisible(self, positions):
     def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None):
         # Invoke C++/CUDA rasterization routine
         
-            return preprocess_gaussians_batches(
+            return preprocess_gaussians(
                 means3D,
                 scales,
                 rotations,
                 shs,
                 opacities,
                 self.raster_settings_list,
-                batched_cuda_args)
+                batched_cuda_args,True)
 
 class GaussianRasterizer(nn.Module):
     def __init__(self, raster_settings):
diff --git a/ext.cpp b/ext.cpp
@@ -16,6 +16,7 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("mark_visible", &markVisible);
   m.def("preprocess_gaussians", &PreprocessGaussiansCUDA);
+  m.def("preprocess_gaussians_batched", &PreprocessGaussiansCUDABatches);
   m.def("preprocess_gaussians_backward", &PreprocessGaussiansBackwardCUDA);
   m.def("get_distribution_strategy", &GetDistributionStrategyCUDA);
   m.def("render_gaussians", &RenderGaussiansCUDA);
diff --git a/rasterization_tests.py b/rasterization_tests.py
@@ -143,7 +143,7 @@ def test_improved_gaussian_rasterizer():
 
     rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list)
     start_time = time.time()
-    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians_batches(
+    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(
             means3D=means3D,
             scales=scales,
             rotations=rotations,
diff --git a/rasterize_points.cu b/rasterize_points.cu
@@ -142,6 +142,88 @@ PreprocessGaussiansCUDA(
 	return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped);
 }
 
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+PreprocessGaussiansCUDABatches( // NOTE(review): ext.cpp binds this as "preprocess_gaussians_batched" but __init__.py calls _C.preprocess_gaussians_batches - confirm and align the names
+	const torch::Tensor& means3D,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const torch::Tensor& sh,
+    const torch::Tensor& opacity,//3dgs' parameters.
+	const std::vector<float>& scale_modifier,
+	const std::vector<torch::Tensor>& viewmatrix,
+	const std::vector<torch::Tensor>& projmatrix,
+	const std::vector<float>& tan_fovx, 
+	const std::vector<float>& tan_fovy,
+    const std::vector<int>& image_height,
+    const std::vector<int>& image_width,
+	const std::vector<int>& degree,
+	const std::vector<torch::Tensor>& campos,
+	const std::vector<bool>& prefiltered,//raster_settings
+	const std::vector<bool>& debug,
+	const std::vector<pybind11::dict> &args) {
+	// Batched entry point: allocates the per-Gaussian output tensors, forwards them with the per-view settings to preprocessForwardBatches, and returns them with the rendered count.
+	if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
+		AT_ERROR("means3D must have dimensions (num_points, 3)");
+	}
+
+	const int P = means3D.size(0);
+	// const int H = image_height;
+	// const int W = image_width;
+
+	// of shape (P, 2). means2D is (P, 2) in cuda. It will be converted to (P, 3) when is sent back to python to meet torch graph's requirement.
+	torch::Tensor means2D = torch::full({P, 2}, 0.0, means3D.options());//TODO: what about require_grads?
+	// of shape (P)
+	torch::Tensor depths = torch::full({P}, 0.0, means3D.options());
+	// of shape (P)
+	torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
+	// of shape (P, 6)
+	torch::Tensor cov3D = torch::full({P, 6}, 0.0, means3D.options());
+	// of shape (P, 4)
+	torch::Tensor conic_opacity = torch::full({P, 4}, 0.0, means3D.options());
+	// of shape (P, 3)
+	torch::Tensor rgb = torch::full({P, 3}, 0.0, means3D.options());
+	// of shape (P, 3)
+	torch::Tensor clamped = torch::full({P, 3}, false, means3D.options().dtype(at::kBool));
+	//TODO: compare to original GeometryState implementation, this one does not explicitly do gpu memory alignment. 
+	//That may lead to problems. However, pytorch does implicit memory alignment.
+
+	int rendered = 0;//TODO: I could compute rendered here by summing up geomState.tiles_touched. 
+	if(P != 0)
+	{
+		int M = 0;
+		if(sh.size(0) != 0)
+		{
+			M = sh.size(1);
+		}
+
+		rendered = CudaRasterizer::Rasterizer::preprocessForwardBatches(
+			reinterpret_cast<float2*>(means2D.contiguous().data<float>()),//TODO: check whether it supports float2?
+			depths.contiguous().data<float>(),
+			radii.contiguous().data<int>(),
+			cov3D.contiguous().data<float>(),
+			reinterpret_cast<float4*>(conic_opacity.contiguous().data<float>()),
+			rgb.contiguous().data<float>(),
+			clamped.contiguous().data<bool>(),
+			P, degree, M,// NOTE(review): callee declares D as a single int but degree is std::vector<int> - still to be resolved
+			image_width, image_height,
+			means3D.contiguous().data<float>(),// callee takes raw const float* pointers for the 3dgs parameters, not tensors
+			scales.contiguous().data<float>(),
+			rotations.contiguous().data<float>(),
+			sh.contiguous().data<float>(),
+			opacity.contiguous().data<float>(), 
+			scale_modifier,
+			viewmatrix, 
+			projmatrix,
+			campos,// NOTE(review): callee declares cam_pos as std::vector<float> but campos is std::vector<torch::Tensor> - still to be resolved
+			tan_fovx,
+			tan_fovy,
+			prefiltered,
+			debug,
+			args);
+	}
+	return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped);
+}
+
 
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
   PreprocessGaussiansBackwardCUDA(