@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cooperative_groups.h>
+#include "cub/cub.cuh"
 #include "cuda_runtime.h"
 #include "thrust/device_vector.h"
 #include "thrust/execution_policy.h"
@@ -104,18 +105,11 @@ template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
 void gpu_boolean_mask(size_t grid_size, size_t block_size, const bool* masks,
                       size_t n, size_t* n_evicted, Tidx* offsets,
                       K* __restrict keys, V* __restrict values,
-                      S* __restrict scores, size_t dim, cudaStream_t stream) {
+                      S* __restrict scores, Tidx* offset_ws, size_t offset_ws_bytes, size_t dim, cudaStream_t stream) {
   size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;
   gpu_cell_count<Tidx, TILE_SIZE>
       <<<grid_size, block_size, 0, stream>>>(masks, offsets, n, n_evicted);
-#if THRUST_VERSION >= 101600
-  auto policy = thrust::cuda::par_nosync.on(stream);
-#else
-  auto policy = thrust::cuda::par.on(stream);
-#endif
-  thrust::device_ptr<Tidx> d_src(offsets);
-  thrust::device_ptr<Tidx> d_dest(offsets);
-  thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest);
+  cub::DeviceScan::ExclusiveSum(offset_ws, offset_ws_bytes, offsets, offsets, n_offsets, stream);
   gpu_select_kvm_kernel<K, V, S, Tidx, TILE_SIZE>
       <<<grid_size, block_size, 0, stream>>>(masks, n, offsets, keys, values,
                                              scores, dim);
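
The new `offset_ws` / `offset_ws_bytes` parameters follow CUB's standard two-phase pattern: call `cub::DeviceScan::ExclusiveSum` once with a null workspace pointer to query the required temporary-storage size, allocate at least that many bytes on the device, then pass the buffer in. A minimal caller-side sketch is below; the wrapper name, the `cudaMalloc`/`cudaFree` allocation strategy, and the explicit template parameters are illustrative assumptions, not part of this change.

```cpp
#include <cuda_runtime.h>
#include <cub/cub.cuh>

// Illustrative helper (assumption, not from this commit): size the CUB scan
// workspace with a null-pointer query, allocate it, and forward it to
// gpu_boolean_mask, which now runs the exclusive scan with that buffer.
template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
void boolean_mask_with_workspace(size_t grid_size, size_t block_size,
                                 const bool* masks, size_t n, size_t* n_evicted,
                                 Tidx* offsets, K* keys, V* values, S* scores,
                                 size_t dim, cudaStream_t stream) {
  size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;
  size_t offset_ws_bytes = 0;
  // Pass 1: a null d_temp_storage only writes the required byte count.
  cub::DeviceScan::ExclusiveSum(nullptr, offset_ws_bytes, offsets, offsets,
                                n_offsets, stream);
  Tidx* offset_ws = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&offset_ws), offset_ws_bytes);
  // Pass 2 happens inside gpu_boolean_mask, which receives the sized buffer.
  gpu_boolean_mask<K, V, S, Tidx, TILE_SIZE>(
      grid_size, block_size, masks, n, n_evicted, offsets, keys, values,
      scores, offset_ws, offset_ws_bytes, dim, stream);
  cudaFree(offset_ws);
}
```

In practice a caller would more likely reuse a persistent workspace buffer across calls rather than allocate and free each time; moving the allocation out of `gpu_boolean_mask` presumably exists to allow exactly that.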