Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 71 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,19 +1,79 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
# Modifications Copyright (C) 2023 Intel Corporation
#
# This Program is subject to the terms of The Unlicense.​
# If a copy of the license was not distributed with this file, ​
# you can obtain one at https://spdx.org/licenses/Unlicense.html​
#​
#
# SPDX-License-Identifier: Unlicense
#

project(SimpleConcurrentGPUHashTable LANGUAGES CXX CUDA)
cmake_minimum_required(VERSION 3.10)

# put predefined cmake projects in their own solution folder
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
project(hashtable_sycl LANGUAGES CXX)

string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=sm_35")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(SOURCE_FILES src/main.cpp src/test.cpp src/linearprobing.h src/linearprobing.cu)
option(GPU_AOT "Build AOT for Intel GPU" OFF)
option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF)
option(USE_AMDHIP_BACKEND "Build for AMD HIP backend" OFF)
option(USE_SM "Build for specific SM" OFF)

set(INTEL_GPU_CXX_FLAGS " -O2 -fsycl -Wall -Wextra -Wno-unused-parameter ")
set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")
set(AMD_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ")

set(USE_DEFAULT_FLAGS ON)
if("${CMAKE_CXX_FLAGS}" STREQUAL "")
message(STATUS "Using DEFAULT compilation flags")
else()
message(STATUS "Overriding DEFAULT compilation flags")
set(USE_DEFAULT_FLAGS OFF)
endif()

# AOT compilation
if(GPU_AOT)
message(STATUS "Enabling INTEL backend")
if(USE_DEFAULT_FLAGS)
set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Default flags for Intel backend
endif()
if( (${GPU_AOT} STREQUAL "pvc") OR (${GPU_AOT} STREQUAL "PVC") )
message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}")
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 0x2f\" ")
else()
message(STATUS "Using custom AOT compilation flag ${GPU_AOT}")
string(APPEND CMAKE_CXX_FLAGS " ${GPU_AOT} ")
endif()
elseif(USE_NVIDIA_BACKEND)
message(STATUS "Enabling NVIDIA backend")
if(USE_DEFAULT_FLAGS)
set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend
endif()
if(USE_SM)
message("-- Building for SM_${USE_SM} compatibility")
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_${USE_SM} ")
else()
message("-- Building for SM_80 compatibility (DEFAULT)")
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 ")
endif()
elseif(USE_AMDHIP_BACKEND)
message(STATUS "Enabling AMD HIP backend for ${USE_AMDHIP_BACKEND} AMD architecture")
if(USE_DEFAULT_FLAGS)
set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}") # Default flags for AMD backend (gfx90a for MI250)
endif()
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND} ")
endif()

set(SOURCES
${CMAKE_SOURCE_DIR}/src/main.cpp
${CMAKE_SOURCE_DIR}/src/test.cpp
${CMAKE_SOURCE_DIR}/src/linearprobing.cpp
)

include_directories(${CMAKE_SOURCE_DIR}/src)

add_executable(test ${SOURCE_FILES})
add_executable(${PROJECT_NAME} ${SOURCES})

# visual studio project should mimic directory structure
# this isn't working for me; I think because
# https://developercommunity.visualstudio.com/content/problem/777578/source-grouptree-no-longer-works.html
source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES})
target_link_libraries(${PROJECT_NAME} sycl OpenCL stdc++fs)
24 changes: 0 additions & 24 deletions LICENSE

This file was deleted.

8 changes: 8 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Modifications Copyright (C) 2023 Intel Corporation

This Program is subject to the terms of The Unlicense.​
If a copy of the license was not distributed with this file, ​
you can obtain one at https://spdx.org/licenses/Unlicense.html​


SPDX-License-Identifier: Unlicense
111 changes: 43 additions & 68 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,89 +1,64 @@
![](screenshot.png)
# hashtable

# About
hashtable implements a simple hash table in GPU (original CUDA source code is from [here](https://github.com/nosferalatu/SimpleGPUHashTable)).

This project shows how to implement a simple GPU hash table. Thanks to the high bandwidth and massive parallelism of
GPU's, the result is a high performance hash table capable of hundreds of millions of operations per second.

The code achieves an average insertion rate of 326 million key/second on my development laptop with an NVIDIA GTX 1060,
measured by inserting 64 million elements.
## SYCL version

[Read my blog post about the code here](http://nosferalatu.com/SimpleGPUHashTable.html) for more information about the
implementation.
- The CUDA code was migrated using Intel DPCT, and then the resulting code was modified to remove the DPCT headers.
- Timing code was later added for performance measurement purpose.
- The same SYCL code runs on Intel GPUs & CPUs as well as NVIDIA (tested on A100 and H100) and AMD (tested on MI100 and MI250) GPUs.

The code implements a lock free hash table using linear probing. Concurrent inserts, deletes, and lookups are supported by
this hash table. The hash table works on 32 bit keys and 32 bit values (although 0xffffffff is reserved for both keys
and values). The load factor of the table is set to 50% in the code, and the table size must be a power of two.
# Build Instructions
- icpx compiler mentioned below is included in oneAPI Base Toolkit available [here](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html).
- clang++ compiler mentioned below is available [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md).
## To build for SYCL

Atomic operations are used to insert key/value pairs into the hash table on multiple GPU threads. It uses CUDA for ease
of development, but could easily be ported to HLSL or GLSL. 64 bit keys and/or values could be supported using 64 bit
atomics.

Resizing the hash table is not implemented (it's a *simple* hash table!) although this can be achieved by inserting the
contents of a table into another, larger table.

The code was kept simple for readability. There are many optimizations that can be done, but they muddy the waters. I
wanted to illustrate the basic design of the lock free hash table and how it can be implemented on a GPU.

# How To Use

If you build and run the executable, it enters an infinite loop of inserting and deleting random numbers into the
GPU hash table and verifying that the results are correct. The seed used to generate random numbers changes every time
you run the executable, but you can set the seed to a specific value in code if you'd like to reproduce results across
runs.

This is how you insert a vector of `KeyValue` pairs into the hash table and then retrieve all the `KeyValue` pairs back:

```cpp
std::vector<KeyValue> things_to_insert = { {0,1}, {1,2}, {2,3}, {3,4} };

KeyValue* pHashTable = create_hashtable();
insert_hashtable(pHashTable, things_to_insert.data(), (uint32_t)things_to_insert.size());
std::vector<KeyValue> result = iterate_hashtable(pHashTable);
destroy_hashtable(pHashTable);
```

After that runs, the vectors `things_to_insert` and `result` should be the same, but possibly in a different order.

# Prerequisites

* CMake
* CUDA

This has been tested on Windows with Visual Studio Community 2019 on a machine with an NVIDIA GTX 1060.
An easy way to get CMake is to open a Visual Studio command prompt (in Windows, run "x64 Native Tools Command Prompt for
VS 2019"; that will put CMake in your path).

This should work on other CUDA-supported platforms, but I have not tested this.

# Cloning
For Intel GPU -
First, source icpx compiler. Then,

```
git clone https://github.com/nosferalatu/SimpleConcurrentGPUHashTable.git SimpleConcurrentGPUHashTable
mkdir build
cd build
CXX=icpx cmake -DGPU_AOT=pvc ..
make -sj
```
Note:
- To enable AOT compilation, please use the flag `-DGPU_AOT=pvc` for PVC.

# Generating Build Files

Run the following commands to generate .sln and .vcxproj's that can be opened in Visual Studio:

For AMD GPU -
First source clang++ compiler. Then,
```
cd ConcurrentHashTables
md build
mkdir build
cd build
cmake ..
CXX=clang++ cmake -DUSE_AMDHIP_BACKEND=gfx90a ..
make -sj
```
Note:
- We use the flag `-DUSE_AMDHIP_BACKEND=gfx90a` for MI250. Use the correct value for your GPU.

You can now open `SimpleConcurrentGPUHashTable.sln` in Visual Studio.

If CMake fails to find CUDA above, then run a CMake generator for 64 bit builds:
For NVIDIA GPU -
First source clang++ compiler. Then,
```
cmake -G "Visual Studio 16 2019 Win64" ..
mkdir build
cd build
CXX=clang++ cmake -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=80 ..
make -sj
```
Note:
- We use the flag `-DUSE_SM=80` for A100 or `-DUSE_SM=90` for H100.

# Building
# Run instructions

You can build within Visual Studio, or from the command line with:
After building, to run the workload, cd into the build folder. Then

```
cmake --build . --config Release
./hashtable_sycl
```
By default a verification is done and that takes some time. To skip verification:
```
./hashtable_sycl --no-verify
```
# Output

Output gives number of keys per second.
Binary file removed screenshot.png
Binary file not shown.
66 changes: 66 additions & 0 deletions src/acas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/* Modifications Copyright (C) 2023 Intel Corporation
*
* This Program is subject to the terms of The Unlicense.​
* If a copy of the license was not distributed with this file, ​
* you can obtain one at https://spdx.org/licenses/Unlicense.html​
*​
*
* SPDX-License-Identifier: Unlicense
*/

//===----------------------------------------------------------------------===//
//
// Following code is copied from atomic.hpp of dpct
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/sycl.hpp>

namespace acas {

/// Atomically compare the value at \p addr to the value expected and exchange
/// with the value desired if the value at \p addr is equal to the value expected.
/// Returns the value at the \p addr before the call.
/// \param [in, out] addr Multi_ptr.
/// \param expected The value to compare against the value at \p addr.
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
/// \param success The memory ordering used when comparison succeeds.
/// \param fail The memory ordering used when comparison fails.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
T atomic_compare_exchange_strong(
sycl::multi_ptr<T, sycl::access::address_space::global_space> addr,
T expected,
T desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed
) {
// sycl::atomic_ref<T, addressSpace> obj(addr);
sycl::atomic_ref<T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> obj(addr[0]);
obj.compare_exchange_strong(expected, desired, success, fail);
return expected;
}

/// Atomically compare the value at \p addr to the value expected and exchange
/// with the value desired if the value at \p addr is equal to the value expected.
/// Returns the value at the \p addr before the call.
/// \param [in] addr The pointer to the data.
/// \param expected The value to compare against the value at \p addr.
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
/// \param success The memory ordering used when comparison succeeds.
/// \param fail The memory ordering used when comparison fails.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace = sycl::access::address_space::global_space>
T atomic_compare_exchange_strong(
T* addr,
T expected,
T desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed
) {
return atomic_compare_exchange_strong(sycl::multi_ptr<T, addressSpace>(addr), expected, desired, success, fail);
}

} // namespace acas
Loading