
Commit 12f2dcc

sparse conv fix bug (#72404)
1 parent 2eaa787 commit 12f2dcc

File tree (4 files changed, +29 -128 lines):

  cmake/operators.cmake
  paddle/phi/kernels/sparse/gpu/conv.cu.h
  paddle/phi/kernels/sparse/gpu/conv_with_buffer.cu.h
  paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh

cmake/operators.cmake (-1)

@@ -705,7 +705,6 @@ function(prune_pybind_h)
   list(APPEND op_list "fusion_seqconv_eltadd_relu")
   list(APPEND op_list "fusion_seqpool_cvm_concat")
   list(APPEND op_list "fusion_gru")
-  list(APPEND op_list "fusion_seqexpand_concat_fc")
   list(APPEND op_list "fusion_repeated_fc_relu")
   list(APPEND op_list "fusion_squared_mat_sub")

paddle/phi/kernels/sparse/gpu/conv.cu.h (-94)

@@ -207,73 +207,6 @@ __global__ void UniqueKernel(const IntT* in_indices,
   }
 }
 
-template <int BS>
-__global__ void GetOutIndices(const int* flags,
-                              const int n,
-                              const int* offsets,
-                              const int out_nnz,
-                              int* out) {
-  int tid = threadIdx.x + blockDim.x * blockIdx.x;
-  __shared__ int block_counts[BS];
-  __shared__ int block_outs[BS * 32];
-
-  int count = 0;
-
-  if (tid < n) {
-    // get the count of 1 in flags[tid]
-    int flag = flags[tid];
-    count = BitCount(static_cast<uint32_t>(flag));
-  }
-
-  // call block prefix_sum
-  // using namespace cub;
-  typedef cub::BlockScan<int, BS> BlockScan;
-  __shared__ typename BlockScan::TempStorage temp_storage;
-  BlockScan(temp_storage).ExclusiveSum(count, count);
-  __syncthreads();
-
-  // write index to out
-  if (tid < n) {
-    // get the count of 1 in flags[tid]
-    int flag = flags[tid];
-    // int j = block_counts[threadIdx.x];
-    int j = count;
-    // TODO(zhangkaihuo): opt the loop
-    for (int i = 0; i < 32; ++i) {
-      if ((1 & (flag >> i)) == 1) {
-        block_outs[j++] = (tid << 5) + i;
-      }
-    }
-  }
-
-  __syncthreads();
-  // write to block_outs
-  int start = offsets[blockIdx.x];
-  int end = blockIdx.x == gridDim.x - 1 ? out_nnz : offsets[blockIdx.x + 1];
-  for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
-    out[start + i] = block_outs[i];
-  }
-}
-
-template <typename IntT>
-__global__ void GroupIndices(const int* out_index_table,
-                             const int n,
-                             const int kernel_size,
-                             IntT* out_indices,
-                             int* out_index_counts,
-                             int* out_index_groups) {
-  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
-    IntT index = out_indices[i];
-    int real_index = out_index_table[index];
-    out_indices[i] = real_index;
-
-    // kernel_size at most
-    int j = atomicAdd(out_index_counts + real_index, 1);
-    // nnz * kernel_size
-    out_index_groups[real_index * kernel_size + j] = i;
-  }
-}
-
 template <typename IntT>
 __global__ void GetOutIndexTable1(const IntT* indices,
                                   const IntT non_zero_num,
@@ -294,33 +227,6 @@ __global__ void GetOutIndexTable1(const IntT* indices,
   }
 }
 
-template <typename IntT>
-__global__ void GetOutIndexTable(int* indices,
-                                 const int non_zero_num,
-                                 const Dims4D out_dims,
-                                 const bool is2D,
-                                 int* out_index_table,
-                                 IntT* out_indices) {
-  CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
-    IntT index = static_cast<IntT>(indices[i]);
-    out_index_table[index] = i;
-    IntT batch, x, y, z;
-    phi::funcs::sparse::IndexToPoint<Dims4D>(
-        index, out_dims, &batch, &x, &y, &z);
-    // get out indices
-    out_indices[i] = batch;
-    if (is2D) {
-      out_indices[i + non_zero_num] = y;
-      out_indices[i + non_zero_num * 2] = x;
-    } else {
-      out_indices[i + non_zero_num] = z;
-      out_indices[i + non_zero_num * 2] = y;
-      out_indices[i + non_zero_num * 3] = x;
-    }
-    indices[i] = 0;
-  }
-}
-
 template <typename IntT>
 __global__ void CopyRuleBook(const int* counters,
                              const int* offsets,
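
The kernels deleted above (GetOutIndices, GroupIndices, and GetOutIndexTable) have the same names and, after the signature changes in the next file, the same parameter lists as the copies kept in conv_with_buffer.cu.h, so dropping them presumably avoids duplicate definitions of the now-identical kernels. For readers who want the compaction idea behind GetOutIndices in isolation, here is a minimal standalone sketch; the kernel name CompactSetBits and the assumption that offsets[] already holds a per-block exclusive prefix sum of set-bit counts are illustrative, not part of the Paddle source.

// Sketch: compact the positions of set bits in 32-bit flag words into a dense
// index list. Launch with BS threads per block; one thread handles one flag
// word, a cub::BlockScan exclusive sum gives each thread its write slot inside
// the block, and the block's staged results are copied to the global output at
// a precomputed per-block offset.
#include <cub/block/block_scan.cuh>
#include <cstdint>

template <int BS>
__global__ void CompactSetBits(const uint32_t* flags,
                               const int n,         // number of flag words
                               const int* offsets,  // per-block output offsets
                               const int total,     // total number of set bits
                               int* out) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  __shared__ int block_outs[BS * 32];  // worst case: every bit of every word set

  // Count the set bits of this thread's flag word.
  int count = (tid < n) ? __popc(flags[tid]) : 0;

  // Exclusive prefix sum over the block turns counts into write positions.
  typedef cub::BlockScan<int, BS> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;
  BlockScan(temp_storage).ExclusiveSum(count, count);
  __syncthreads();

  // Expand each flag word into the shared staging buffer.
  if (tid < n) {
    uint32_t flag = flags[tid];
    int j = count;
    for (int i = 0; i < 32; ++i) {
      if ((flag >> i) & 1u) {
        block_outs[j++] = (tid << 5) + i;  // global bit index = word * 32 + bit
      }
    }
  }
  __syncthreads();

  // Flush this block's staged indices to global memory.
  int start = offsets[blockIdx.x];
  int end = (blockIdx.x == gridDim.x - 1) ? total : offsets[blockIdx.x + 1];
  for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
    out[start + i] = block_outs[i];
  }
}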

paddle/phi/kernels/sparse/gpu/conv_with_buffer.cu.h (+28, -33)

@@ -181,7 +181,7 @@ template <int BS>
 __global__ void GetOutIndices(const int* flags,
                               const int n,
                               const int* offsets,
-                              const int* out_nnz,
+                              const int out_nnz,
                               int* out) {
   int tid = threadIdx.x + blockDim.x * blockIdx.x;
   __shared__ int block_counts[BS];
@@ -219,21 +219,19 @@ __global__ void GetOutIndices(const int* flags,
   __syncthreads();
   // write to block_outs
   int start = offsets[blockIdx.x];
-  int end = blockIdx.x == gridDim.x - 1 ? out_nnz[0] : offsets[blockIdx.x + 1];
+  int end = blockIdx.x == gridDim.x - 1 ? out_nnz : offsets[blockIdx.x + 1];
   for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
     out[start + i] = block_outs[i];
   }
 }
 
 template <typename IntT>
 __global__ void GroupIndices(const int* out_index_table,
-                             const int* rulebook_len_ptr,
+                             const int n,
                              const int kernel_size,
                              IntT* out_indices,
                              int* out_index_counts,
                              int* out_index_groups) {
-  int n = rulebook_len_ptr[0] / 2;
-  out_indices = out_indices + n;
   CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
     IntT index = out_indices[i];
     int real_index = out_index_table[index];
@@ -248,12 +246,11 @@ __global__ void GroupIndices(const int* out_index_table,
 
 template <typename IntT>
 __global__ void GetOutIndexTable(int* indices,
-                                 const int* non_zero_num_ptr,
+                                 const int non_zero_num,
                                  const Dims4D out_dims,
                                  const bool is2D,
                                  int* out_index_table,
                                  IntT* out_indices) {
-  int non_zero_num = non_zero_num_ptr[0];
   CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
     IntT index = static_cast<IntT>(indices[i]);
     out_index_table[index] = i;
@@ -463,6 +460,18 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx,
                                      sizeof(int),
                                      gpuMemcpyDeviceToDevice,
                                      dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(h_buffer,
+                                     d_buffer.data<int>(),
+                                     (2 * kernel_size + 3) * sizeof(int),
+                                     gpuMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+
+  dev_ctx.Wait();
+  int rulebook_len = h_buffer[2 * kernel_size + 1] / 2;
+  int out_nnz = h_buffer[2 * kernel_size + 2];
+
+  rulebook->Resize({rulebook_rows, static_cast<int>(rulebook_len)});
+  out_index->Resize({static_cast<int>(rulebook_len)});
 
   const int threads = 256;
   const int blocks = (index_flags->numel() + threads - 1) / threads;
@@ -493,57 +502,43 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx,
       <<<blocks, threads, 0, dev_ctx.stream()>>>(index_flags_ptr,
                                                  index_flags->numel(),
                                                  out_index_table_ptr,
-                                                 unique_key_ptr,
+                                                 out_nnz,
                                                  out_index_ptr);
 
   const int64_t sparse_dim = is2D ? 3 : 4;
   phi::DenseTensor out_indices =
-      phi::Empty<IntT>(dev_ctx, {sparse_dim, static_cast<int>(max_nnz)});
-  phi::DenseTensor out_values = phi::Empty<T>(
-      dev_ctx, {static_cast<int>(max_nnz), kernel_sizes[sparse_dim]});
+      phi::Empty<IntT>(dev_ctx, {sparse_dim, out_nnz});
+
+  phi::DenseTensor out_values =
+      phi::Empty<T>(dev_ctx, {out_nnz, kernel_sizes[sparse_dim]});
+  out->SetMember(out_indices, out_values, out_dims, false);
 
   IntT* out_indices_ptr = out_indices.data<IntT>();
 
-  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, max_nnz, 1);
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1);
   GetOutIndexTable<IntT>
       <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
           out_index_ptr,
-          unique_key_ptr,
+          out_nnz,
           d_out_dims,
           is2D,
           out_index_table_ptr,
          out_indices_ptr);
 
-  config = phi::backends::gpu::GetGpuLaunchConfig1D(
-      dev_ctx, static_cast<int>(max_nnz), 1);
-  unique_value->ResizeAndAllocate({static_cast<int>(max_nnz * kernel_size)});
+  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
+  unique_value->ResizeAndAllocate({static_cast<int>(out_nnz * kernel_size)});
   int* unique_value_ptr = unique_value->data<int>();
 
   GroupIndices<<<config.block_per_grid,
                  config.thread_per_block,
                  0,
                  dev_ctx.stream()>>>(out_index_table_ptr,
-                                     rulebook_len_tensor.data<int>(),
+                                     rulebook_len,
                                      kernel_size,
-                                     rulebook_ptr,
+                                     rulebook_ptr + rulebook_len,
                                      out_index_ptr,
                                      unique_value_ptr);
 
-  phi::backends::gpu::GpuMemcpyAsync(h_buffer,
-                                     d_buffer.data<int>(),
-                                     (2 * kernel_size + 3) * sizeof(int),
-                                     gpuMemcpyDeviceToHost,
-                                     dev_ctx.stream());
-  dev_ctx.Wait();
-  int rulebook_len = h_buffer[2 * kernel_size + 1] / 2;
-  int out_nnz = h_buffer[2 * kernel_size + 2];
-  rulebook->Resize({rulebook_rows, static_cast<int>(rulebook_len)});
-  out_index->Resize({static_cast<int>(rulebook_len)});
-  out_indices.Resize({sparse_dim, static_cast<int>(out_nnz)});
-  unique_value->Resize(
-      {static_cast<int>(static_cast<int>(out_nnz) * kernel_size)});
-  out_values.Resize({out_nnz, kernel_sizes[sparse_dim]});
-  out->SetMember(out_indices, out_values, out_dims, false);
   return rulebook_len;
 }
 }  // namespace sparse
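
Taken together, the changes in this file move the device-to-host copy of h_buffer and the dev_ctx.Wait() ahead of the kernel launches in ProductRuleBookWithBuffer. With rulebook_len and out_nnz known on the host before launch, GetOutIndices, GetOutIndexTable, and GroupIndices can take plain int arguments instead of dereferencing single-element device buffers, the launch configurations are built from the real counts rather than max_nnz, out_indices and out_values are allocated at their final shapes, and the trailing Resize/SetMember block is no longer needed; GroupIndices also stops offsetting out_indices internally, since the caller now passes rulebook_ptr + rulebook_len directly. The general host-side shape of this pattern is sketched below; the function name, buffer layout, and raw CUDA calls are illustrative only, not the phi API.

#include <cuda_runtime.h>
#include <vector>

// Sketch: d_counters was filled by earlier kernels with, e.g., a rulebook
// length and an output nnz. Copy it back, synchronize, then size outputs and
// launch the follow-up kernels with host-side scalars.
void LaunchWithExactSizes(const int* d_counters, cudaStream_t stream) {
  std::vector<int> h_counters(2);
  cudaMemcpyAsync(h_counters.data(), d_counters, 2 * sizeof(int),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // counters are now valid on the host

  const int rulebook_len = h_counters[0];
  const int out_nnz = h_counters[1];

  // Allocate outputs at their exact sizes and pass the scalars to the kernels,
  // e.g. SomeKernel<<<grid, block, 0, stream>>>(..., out_nnz, ...),
  // instead of allocating an upper bound and resizing after the fact.
  (void)rulebook_len;
  (void)out_nnz;
}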

paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh (+1)

@@ -118,6 +118,7 @@ __global__ void lookup_coords_kernel(
 {
     int tidx = blockIdx.x * blockDim.x + threadIdx.x;
     int idx = tidx / kernel_volume;
+    if (idx >= n) return;
     int _kernel_idx = tidx % kernel_volume;
     int kernel_idx = _kernel_idx;
     const int* in_coords = coords + _width * idx;
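
The one-line guard added above makes lookup_coords_kernel ignore threads whose derived point index idx lands past the end of the input: with one thread per (point, kernel offset) pair and a grid that is presumably rounded up to whole blocks, the last block can contain threads with idx >= n, and without the early return those threads would index coords at _width * idx beyond the valid range. A minimal standalone illustration of the pattern follows; the kernel name and launch arithmetic are hypothetical, not the Paddle code.

// Sketch: one thread per (point, kernel-offset) pair, with surplus threads in
// the final block returning before they touch memory indexed by idx.
__global__ void per_pair_kernel(const int* coords,
                                const int width,
                                const int n,             // number of points
                                const int kernel_volume) {
  int tidx = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = tidx / kernel_volume;         // which input point
  if (idx >= n) return;                   // guard: surplus threads do nothing
  int kernel_idx = tidx % kernel_volume;  // which kernel offset
  const int* in_coords = coords + width * idx;  // safe: idx < n
  // ... use in_coords and kernel_idx ...
  (void)in_coords;
  (void)kernel_idx;
}

// Hypothetical launch: round the pair count up to whole blocks.
// int threads = 256;
// int blocks = (n * kernel_volume + threads - 1) / threads;
// per_pair_kernel<<<blocks, threads>>>(coords, width, n, kernel_volume);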
