Skip to content

[AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc #151882

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2944,6 +2944,40 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}

bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
if (!Pool->isGlobal() || !Pool->isCoarseGrained())
continue;
uint64_t Value;
hsa_status_t Status =
Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
if (Status != HSA_STATUS_SUCCESS)
continue;
constexpr uint64_t Almost64Gig = 0xFF0000000;
if (Value >= Almost64Gig)
return true;
}
return false; // CoarseGrain pool w/ 64GB or more capacity not found
}

size_t getMemoryManagerSizeThreshold() override {
// Targeting high memory capacity GPUs such as
// data center GPUs.
if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
// Set GenericDeviceTy::MemoryManager's Threshold to 3GiB,
// if threshold is not already set by ENV var
// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
// This MemoryManager is used for omp_target_alloc(), OpenMP
// (non-usm) map clause, etc.
//
// Ideally, this kind of pooling is best performed at
// a common level (e.g, user side of HSA) between OpenMP and HIP
// but that feature does not exist (yet).
return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
}
return 0;
}

/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;
Expand Down
3 changes: 3 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Pointer to the memory manager or nullptr if not available.
MemoryManagerTy *MemoryManager;

/// Per device setting of MemoryManager's Threshold
virtual size_t getMemoryManagerSizeThreshold() { return 0; }

/// Environment variables defined by the OpenMP standard.
Int32Envar OMP_TeamLimit;
Int32Envar OMP_NumTeams;
Expand Down
5 changes: 4 additions & 1 deletion offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {

// Enable the memory manager if required.
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
if (EnableMM)
if (EnableMM) {
if (ThresholdMM == 0)
ThresholdMM = getMemoryManagerSizeThreshold();
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
}

return Plugin::success();
}
Expand Down
11 changes: 8 additions & 3 deletions offload/test/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
# For all other targets, we currently assume it is.
supports_unified_shared_memory = True
supports_apu = False
supports_large_allocation_memory_pool = False
if config.libomptarget_current_target.startswith('nvptx'):
try:
cuda_arch = int(config.cuda_test_arch[:3])
Expand All @@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
elif config.libomptarget_current_target.startswith('amdgcn'):
# amdgpu_test_arch contains a list of AMD GPUs in the system
# only check the first one assuming that we will run the test on it.
if not (config.amdgpu_test_arch.startswith("gfx90a") or
config.amdgpu_test_arch.startswith("gfx942") or
config.amdgpu_test_arch.startswith("gfx950")):
if (config.amdgpu_test_arch.startswith("gfx90a") or
config.amdgpu_test_arch.startswith("gfx942") or
config.amdgpu_test_arch.startswith("gfx950")):
supports_large_allocation_memory_pool = True
else:
supports_unified_shared_memory = False
# check if AMD architecture is an APU:
if ((config.amdgpu_test_arch.startswith("gfx942") and
Expand All @@ -144,6 +147,8 @@ if supports_unified_shared_memory:
config.available_features.add('unified_shared_memory')
if supports_apu:
config.available_features.add('apu')
if supports_large_allocation_memory_pool:
config.available_features.add('large_allocation_memory_pool')

# Setup environment to find dynamic library at runtime
if config.operating_system == 'Windows':
Expand Down
3 changes: 3 additions & 0 deletions offload/test/sanitizer/use_after_free_2.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

// If offload memory pooling is enabled for a large allocation, reuse error is
// not detected. UNSUPPORTED: large_allocation_memory_pool

#include <omp.h>

int main() {
Expand Down
37 changes: 37 additions & 0 deletions offload/test/sanitizer/use_after_free_3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
// clang-format on

// If offload memory pooling is enabled for a large allocation, reuse error is
// not detected. Run the test w/ and w/o ENV var override on memory pooling
// threshold. REQUIRES: large_allocation_memory_pool

#include <omp.h>
#include <stdio.h>

int main() {
int N = (1 << 30);
char *A = (char *)malloc(N);
char *P;
#pragma omp target map(A[ : N]) map(from : P)
{
P = &A[N / 2];
*P = 3;
}
// clang-format off
// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
// CHECK: Last deallocation:
// CHECK: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{
*P = 5;
}

// CHECK-PASS: PASS
printf("PASS\n");
return 0;
}
Loading