nosferalatu · mgrabban · Sep 12, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,19 +1,79 @@
-cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+ # Modifications Copyright (C) 2023 Intel Corporation
+ #
+ # This Program is subject to the terms of The Unlicense.
+ # If a copy of the license was not distributed with this file, 
+ # you can obtain one at https://spdx.org/licenses/Unlicense.html
+ #
+ #
+ # SPDX-License-Identifier: Unlicense
+ #
 
-project(SimpleConcurrentGPUHashTable LANGUAGES CXX CUDA)
+cmake_minimum_required(VERSION 3.10)
 
-# put predefined cmake projects in their own solution folder
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+project(hashtable_sycl LANGUAGES CXX)
 
-string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=sm_35")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
-set(SOURCE_FILES src/main.cpp src/test.cpp src/linearprobing.h src/linearprobing.cu)
+option(GPU_AOT                 "Build AOT for Intel GPU"      OFF)
+option(USE_NVIDIA_BACKEND      "Build for NVIDIA backend"     OFF)
+option(USE_AMDHIP_BACKEND      "Build for AMD HIP backend"    OFF)
+option(USE_SM                  "Build for specific SM"        OFF)
+
+set(INTEL_GPU_CXX_FLAGS  " -O2 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+set(AMD_GPU_CXX_FLAGS    " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+
+set(USE_DEFAULT_FLAGS ON)
+if("${CMAKE_CXX_FLAGS}" STREQUAL "")
+    message(STATUS "Using DEFAULT compilation flags")
+else()
+    message(STATUS "Overriding DEFAULT compilation flags")
+    set(USE_DEFAULT_FLAGS OFF)
+endif()
+
+# AOT compilation
+if(GPU_AOT)
+    message(STATUS "Enabling INTEL backend")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Default flags for Intel backend
+    endif()
+    if( (${GPU_AOT} STREQUAL "pvc") OR (${GPU_AOT} STREQUAL "PVC") )
+        message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 0x2f\" ")
+    else()
+        message(STATUS "Using custom AOT compilation flag ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " ${GPU_AOT} ")
+    endif()
+elseif(USE_NVIDIA_BACKEND)
+    message(STATUS "Enabling NVIDIA backend")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend
+    endif()
+    if(USE_SM)
+        message("-- Building for SM_${USE_SM} compatibility")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_${USE_SM} ")
+    else()
+        message("-- Building for SM_80 compatibility (DEFAULT)")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 ")
+    endif()
+elseif(USE_AMDHIP_BACKEND)
+    message(STATUS "Enabling AMD HIP backend for ${USE_AMDHIP_BACKEND} AMD architecture")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}") # Default flags for AMD backend (gfx90a for MI250)
+    endif()
+    string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND} ")
+endif()
+
+set(SOURCES
+    ${CMAKE_SOURCE_DIR}/src/main.cpp
+    ${CMAKE_SOURCE_DIR}/src/test.cpp
+    ${CMAKE_SOURCE_DIR}/src/linearprobing.cpp
+)
 
 include_directories(${CMAKE_SOURCE_DIR}/src)
 
-add_executable(test ${SOURCE_FILES})
+add_executable(${PROJECT_NAME} ${SOURCES})
 
-# visual studio project should mimic directory structure
-# this isn't working for me; I think because
-# https://developercommunity.visualstudio.com/content/problem/777578/source-grouptree-no-longer-works.html
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES})
+target_link_libraries(${PROJECT_NAME} sycl OpenCL stdc++fs)
diff --git a/LICENSE b/LICENSE
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,8 @@
+Modifications Copyright (C) 2023 Intel Corporation
+
+This Program is subject to the terms of The Unlicense.
+If a copy of the license was not distributed with this file, 
+you can obtain one at https://spdx.org/licenses/Unlicense.html
+
+
+SPDX-License-Identifier: Unlicense
diff --git a/README.md b/README.md
@@ -1,89 +1,64 @@
-![](screenshot.png)
+# hashtable
 
-# About
+hashtable implements a simple hash table in GPU (original CUDA source code is from [here](https://github.com/nosferalatu/SimpleGPUHashTable)).
 
-This project shows how to implement a simple GPU hash table. Thanks to the high bandwidth and massive parallelism of
-GPU's, the result is a high performance hash table capable of hundreds of millions of operations per second.
 
-The code achieves an average insertion rate of 326 million key/second on my development laptop with an NVIDIA GTX 1060,
-measured by inserting 64 million elements.
+## SYCL version
 
-[Read my blog post about the code here](http://nosferalatu.com/SimpleGPUHashTable.html) for more information about the
-implementation.
+- The CUDA code was migrated using Intel DPCT, and then the resulting code was modified to remove the DPCT headers.
+- Timing code was later added for performance measurement purpose.
+- The same SYCL code runs on Intel GPUs & CPUs as well as NVIDIA (tested on A100 and H100) and AMD (tested on MI100 and MI250) GPUs.
 
-The code implements a lock free hash table using linear probing. Concurrent inserts, deletes, and lookups are supported by
-this hash table. The hash table works on 32 bit keys and 32 bit values (although 0xffffffff is reserved for both keys
-and values). The load factor of the table is set to 50% in the code, and the table size must be a power of two.
+# Build Instructions
+- icpx compiler mentioned below is included in oneAPI Base Toolkit available [here](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html).
+- clang++ compiler mentioned below is available [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md).
+## To build for SYCL
 
-Atomic operations are used to insert key/value pairs into the hash table on multiple GPU threads. It uses CUDA for ease
-of development, but could easily be ported to HLSL or GLSL. 64 bit keys and/or values could be supported using 64 bit
-atomics.
-
-Resizing the hash table is not implemented (it's a *simple* hash table!) although this can be achieved by inserting the
-contents of a table into another, larger table.
-
-The code was kept simple for readability. There are many optimizations that can be done, but they muddy the waters. I
-wanted to illustrate the basic design of the lock free hash table and how it can be implemented on a GPU.
-
-# How To Use
-
-If you build and run the executable, it enters an infinite loop of inserting and deleting random numbers into the
-GPU hash table and verifying that the results are correct. The seed used to generate random numbers changes every time
-you run the executable, but you can set the seed to a specific value in code if you'd like to reproduce results across
-runs.
-
-This is how you insert a vector of `KeyValue` pairs into the hash table and then retrieve all the `KeyValue` pairs back:
-
-```cpp
-    std::vector<KeyValue> things_to_insert = { {0,1}, {1,2}, {2,3}, {3,4} };
-
-    KeyValue* pHashTable = create_hashtable();
-    insert_hashtable(pHashTable, things_to_insert.data(), (uint32_t)things_to_insert.size());
-    std::vector<KeyValue> result = iterate_hashtable(pHashTable);
-    destroy_hashtable(pHashTable);
-```
-
-After that runs, the vectors `things_to_insert` and `result` should be the same, but possibly in a different order.
-
-# Prerequisites
-
-* CMake
-* CUDA
-
-This has been tested on Windows with Visual Studio Community 2019 on a machine with an NVIDIA GTX 1060.
-An easy way to get CMake is to open a Visual Studio command prompt (in Windows, run "x64 Native Tools Command Prompt for
-VS 2019"; that will put CMake in your path).
-
-This should work on other CUDA-supported platforms, but I have not tested this.
-
-# Cloning
+For Intel GPU -  
+First, source icpx compiler. Then,
 
 ```
-git clone https://github.com/nosferalatu/SimpleConcurrentGPUHashTable.git SimpleConcurrentGPUHashTable
+mkdir build
+cd build
+CXX=icpx cmake -DGPU_AOT=pvc ..
+make -sj
 ```
+Note:
+- To enable AOT compilation, please use the flag `-DGPU_AOT=pvc` for PVC.
 
-# Generating Build Files
-
-Run the following commands to generate .sln and .vcxproj's that can be opened in Visual Studio:
-
+For AMD GPU -  
+First source clang++ compiler. Then,
 ```
-cd ConcurrentHashTables
-md build
+mkdir build
 cd build
-cmake ..
+CXX=clang++ cmake -DUSE_AMDHIP_BACKEND=gfx90a ..
+make -sj
 ```
+Note:
+- We use the flag `-DUSE_AMDHIP_BACKEND=gfx90a` for MI250. Use the correct value for your GPU.
 
-You can now open `SimpleConcurrentGPUHashTable.sln` in Visual Studio.
-
-If CMake fails to find CUDA above, then run a CMake generator for 64 bit builds:
+For NVIDIA GPU -  
+First source clang++ compiler. Then,
 ```
-cmake -G "Visual Studio 16 2019 Win64" ..
+mkdir build
+cd build
+CXX=clang++ cmake -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=80 ..
+make -sj
 ```
+Note:
+- We use the flag `-DUSE_SM=80` for A100 or `-DUSE_SM=90` for H100.
 
-# Building
+# Run instructions
 
-You can build within Visual Studio, or from the command line with:
+After building, to run the workload, cd into the build folder. Then
 
 ```
-cmake --build . --config Release
+./hashtable_sycl
+```
+By default a verification is done and that takes some time. To skip verification:
 ```
+./hashtable_sycl --no-verify
+```
+# Output
+
+Output gives number of keys per second.
diff --git a/screenshot.png b/screenshot.png
diff --git a/src/acas.h b/src/acas.h
@@ -0,0 +1,66 @@
+/* Modifications Copyright (C) 2023 Intel Corporation
+ *
+ * This Program is subject to the terms of The Unlicense.
+ * If a copy of the license was not distributed with this file, 
+ * you can obtain one at https://spdx.org/licenses/Unlicense.html
+ *
+ *
+ * SPDX-License-Identifier: Unlicense
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Following code is copied from atomic.hpp of dpct
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+
+namespace acas {
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in, out] addr Multi_ptr.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
+T atomic_compare_exchange_strong(
+    sycl::multi_ptr<T, sycl::access::address_space::global_space> addr,
+    T expected,
+    T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail    = sycl::memory_order::relaxed
+) {
+    // sycl::atomic_ref<T, addressSpace> obj(addr);
+    sycl::atomic_ref<T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> obj(addr[0]);
+    obj.compare_exchange_strong(expected, desired, success, fail);
+    return expected;
+}
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in] addr The pointer to the data.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
+T atomic_compare_exchange_strong(
+    T* addr,
+    T expected,
+    T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail    = sycl::memory_order::relaxed
+) {
+    return atomic_compare_exchange_strong(sycl::multi_ptr<T, addressSpace>(addr), expected, desired, success, fail);
+}
+
+} // namespace acas