Skip to content

Commit 83e5a99

Browse files
authored
[AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc (#151882)
Enables AMD data-center-class GPUs to use the memory manager's memory pooling for allocations of up to 3 GiB by default, up from the "1 << 13" byte threshold that all plugin-nextgen devices use.
1 parent c310306 commit 83e5a99

File tree

6 files changed

+89
-4
lines changed

6 files changed

+89
-4
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2945,6 +2945,40 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29452945
return Plugin::success();
29462946
}
29472947

2948+
bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
2949+
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2950+
if (!Pool->isGlobal() || !Pool->isCoarseGrained())
2951+
continue;
2952+
uint64_t Value;
2953+
hsa_status_t Status =
2954+
Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2955+
if (Status != HSA_STATUS_SUCCESS)
2956+
continue;
2957+
constexpr uint64_t Almost64Gig = 0xFF0000000;
2958+
if (Value >= Almost64Gig)
2959+
return true;
2960+
}
2961+
return false; // CoarseGrain pool w/ 64GB or more capacity not found
2962+
}
2963+
2964+
size_t getMemoryManagerSizeThreshold() override {
2965+
// Targeting high memory capacity GPUs such as
2966+
// data center GPUs.
2967+
if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
2968+
// Set GenericDeviceTy::MemoryManager's Threshold to 3GiB,
2969+
// if threshold is not already set by ENV var
2970+
// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
2971+
// This MemoryManager is used for omp_target_alloc(), OpenMP
2972+
// (non-usm) map clause, etc.
2973+
//
2974+
// Ideally, this kind of pooling is best performed at
2975+
// a common level (e.g, user side of HSA) between OpenMP and HIP
2976+
// but that feature does not exist (yet).
2977+
return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
2978+
}
2979+
return 0;
2980+
}
2981+
29482982
/// Envar for controlling the number of HSA queues per device. High number of
29492983
/// queues may degrade performance.
29502984
UInt32Envar OMPX_NumQueues;

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11391139
/// Pointer to the memory manager or nullptr if not available.
11401140
MemoryManagerTy *MemoryManager;
11411141

1142+
/// Per device setting of MemoryManager's Threshold
1143+
virtual size_t getMemoryManagerSizeThreshold() { return 0; }
1144+
11421145
/// Environment variables defined by the OpenMP standard.
11431146
Int32Envar OMP_TeamLimit;
11441147
Int32Envar OMP_NumTeams;

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
815815

816816
// Enable the memory manager if required.
817817
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
818-
if (EnableMM)
818+
if (EnableMM) {
819+
if (ThresholdMM == 0)
820+
ThresholdMM = getMemoryManagerSizeThreshold();
819821
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
822+
}
820823

821824
return Plugin::success();
822825
}

offload/test/lit.cfg

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
121121
# For all other targets, we currently assume it is.
122122
supports_unified_shared_memory = True
123123
supports_apu = False
124+
supports_large_allocation_memory_pool = False
124125
if config.libomptarget_current_target.startswith('nvptx'):
125126
try:
126127
cuda_arch = int(config.cuda_test_arch[:3])
@@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
132133
elif config.libomptarget_current_target.startswith('amdgcn'):
133134
# amdgpu_test_arch contains a list of AMD GPUs in the system
134135
# only check the first one assuming that we will run the test on it.
135-
if not (config.amdgpu_test_arch.startswith("gfx90a") or
136-
config.amdgpu_test_arch.startswith("gfx942") or
137-
config.amdgpu_test_arch.startswith("gfx950")):
136+
if (config.amdgpu_test_arch.startswith("gfx90a") or
137+
config.amdgpu_test_arch.startswith("gfx942") or
138+
config.amdgpu_test_arch.startswith("gfx950")):
139+
supports_large_allocation_memory_pool = True
140+
else:
138141
supports_unified_shared_memory = False
139142
# check if AMD architecture is an APU:
140143
if ((config.amdgpu_test_arch.startswith("gfx942") and
@@ -144,6 +147,8 @@ if supports_unified_shared_memory:
144147
config.available_features.add('unified_shared_memory')
145148
if supports_apu:
146149
config.available_features.add('apu')
150+
if supports_large_allocation_memory_pool:
151+
config.available_features.add('large_allocation_memory_pool')
147152

148153
# Setup environment to find dynamic library at runtime
149154
if config.operating_system == 'Windows':

offload/test/sanitizer/use_after_free_2.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// UNSUPPORTED: s390x-ibm-linux-gnu
1111
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
1212

13+
// If offload memory pooling is enabled for a large allocation, reuse error is
14+
// not detected. UNSUPPORTED: large_allocation_memory_pool
15+
1316
#include <omp.h>
1417

1518
int main() {
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compileopt-generic
3+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
4+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
5+
// clang-format on
6+
7+
// If offload memory pooling is enabled for a large allocation, reuse error is
8+
// not detected. Run the test w/ and w/o ENV var override on memory pooling
9+
// threshold. REQUIRES: large_allocation_memory_pool
10+
11+
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
13+
14+
// Maps a 1 GiB host buffer for one target region, captures a device pointer
// into it, and then dereferences that pointer in a second target region after
// the mapping has ended. The RUN lines execute this once with
// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 (a crash with an allocation
// trace is expected) and once with the default threshold ("PASS" is
// expected).
int main() {
  int N = (1 << 30);
  char *A = (char *)malloc(N);
  // A failed 1 GiB host allocation would otherwise surface as an unrelated
  // mapping failure; bail out with a clear message instead.
  if (!A) {
    fprintf(stderr, "host malloc of %d bytes failed\n", N);
    return 1;
  }
  char *P;
  // P is written inside the region and copied back via map(from : P).
#pragma omp target map(A[ : N]) map(from : P)
  {
    P = &A[N / 2];
    *P = 3;
  }
  // clang-format off
// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
// CHECK: Last deallocation:
// CHECK: Last allocation of size 1073741824
  // clang-format on
  // The mapping of A ended with the region above; this store goes through
  // the stale pointer.
#pragma omp target
  {
    *P = 5;
  }

  // Release the host buffer on the non-crashing path.
  free(A);

  // CHECK-PASS: PASS
  printf("PASS\n");
  return 0;
}

0 commit comments

Comments
 (0)