-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc #151882
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: None (hidekisaito) ChangesFull diff: https://github.com/llvm/llvm-project/pull/151882.diff 6 Files Affected:
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f8db9bf0ae739..94e635d94ed3b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2944,6 +2944,41 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ if (Pool->isGlobal() && Pool->isCoarseGrained()) {
+ uint64_t Value;
+ hsa_status_t Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+ if (Status != HSA_STATUS_SUCCESS)
+ continue;
+ constexpr uint64_t Almost64Gig = 0xFF0000000;
+ if (Value >= Almost64Gig)
+ return true;
+ }
+ }
+ return false; // CoarseGrain pool w/ 64GB or more capacity not found
+ }
+
+ size_t getMemoryManagerSizeThreshold() override {
+ // Targeting high memory capacity GPUs such as
+ // MI210 or later data center GPUs.
+ if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
+ // Set GenericDeviceTy::MemoryManager's Threshold to ~3GB,
+ // if threshold is not already set by ENV var
+ // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+ // This MemoryManager is used for omp_target_alloc(), OpenMP
+ // (non-usm) map clause, etc.
+ //
+ // Ideally, this kind of pooling is best performed at
+ // a common level (e.g, user side of HSA) between OpenMP and HIP
+ // but that feature does not exist (yet).
+ constexpr size_t Almost3Gig = 3000000000u;
+ return Almost3Gig;
+ }
+ return 0;
+ }
+
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c17a2ee07047..87c3777b0eda7 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1090,6 +1090,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Pointer to the memory manager or nullptr if not available.
MemoryManagerTy *MemoryManager;
+ /// Per device setting of MemoryManager's Threshold
+ virtual size_t getMemoryManagerSizeThreshold() { return 0 /* use default */; }
+
/// Environment variables defined by the OpenMP standard.
Int32Envar OMP_TeamLimit;
Int32Envar OMP_NumTeams;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94a050b559efe..46151cc9abce6 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
// Enable the memory manager if required.
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
- if (EnableMM)
+ if (EnableMM) {
+ if (ThresholdMM == 0)
+ ThresholdMM = getMemoryManagerSizeThreshold();
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
+ }
return Plugin::success();
}
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 800a63bc0ee32..f3e8e9a66685e 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
# For all other targets, we currently assume it is.
supports_unified_shared_memory = True
supports_apu = False
+supports_large_allocation_memory_pool = False
if config.libomptarget_current_target.startswith('nvptx'):
try:
cuda_arch = int(config.cuda_test_arch[:3])
@@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
elif config.libomptarget_current_target.startswith('amdgcn'):
# amdgpu_test_arch contains a list of AMD GPUs in the system
# only check the first one assuming that we will run the test on it.
- if not (config.amdgpu_test_arch.startswith("gfx90a") or
- config.amdgpu_test_arch.startswith("gfx942") or
- config.amdgpu_test_arch.startswith("gfx950")):
+ if (config.amdgpu_test_arch.startswith("gfx90a") or
+ config.amdgpu_test_arch.startswith("gfx942") or
+ config.amdgpu_test_arch.startswith("gfx950")):
+ supports_large_allocation_memory_pool = True
+ else:
supports_unified_shared_memory = False
# check if AMD architecture is an APU:
if ((config.amdgpu_test_arch.startswith("gfx942") and
@@ -144,6 +147,8 @@ if supports_unified_shared_memory:
config.available_features.add('unified_shared_memory')
if supports_apu:
config.available_features.add('apu')
+if supports_large_allocation_memory_pool:
+ config.available_features.add('large_allocation_memory_pool')
# Setup environment to find dynamic library at runtime
if config.operating_system == 'Windows':
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 02aa453d0a975..1c1e09744a750 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -10,6 +10,9 @@
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. UNSUPPORTED: large_allocation_memory_pool
+
#include <omp.h>
int main() {
diff --git a/offload/test/sanitizer/use_after_free_3.c b/offload/test/sanitizer/use_after_free_3.c
new file mode 100644
index 0000000000000..fd77cff0d5c81
--- /dev/null
+++ b/offload/test/sanitizer/use_after_free_3.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
+// clang-format on
+
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. Run the test w/ and w/o ENV var override on memory pooling
+// threshold. REQUIRES: large_allocation_memory_pool
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+ int N = (1 << 30);
+ char *A = (char *)malloc(N);
+ char *P;
+#pragma omp target map(A[:N]) map(from : P)
+ {
+ P = &A[N / 2];
+ *P = 3;
+ }
+ // clang-format off
+// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
+// CHECK: Last deallocation:
+// CHECK: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+ { *P = 5; }
+
+ // CHECK-PASS: PASS
+ printf("PASS\n");
+ return 0;
+}
|
@llvm/pr-subscribers-offload Author: None (hidekisaito) ChangesFull diff: https://github.com/llvm/llvm-project/pull/151882.diff 6 Files Affected:
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f8db9bf0ae739..94e635d94ed3b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2944,6 +2944,41 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ if (Pool->isGlobal() && Pool->isCoarseGrained()) {
+ uint64_t Value;
+ hsa_status_t Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+ if (Status != HSA_STATUS_SUCCESS)
+ continue;
+ constexpr uint64_t Almost64Gig = 0xFF0000000;
+ if (Value >= Almost64Gig)
+ return true;
+ }
+ }
+ return false; // CoarseGrain pool w/ 64GB or more capacity not found
+ }
+
+ size_t getMemoryManagerSizeThreshold() override {
+ // Targeting high memory capacity GPUs such as
+ // MI210 or later data center GPUs.
+ if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
+ // Set GenericDeviceTy::MemoryManager's Threshold to ~3GB,
+ // if threshold is not already set by ENV var
+ // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+ // This MemoryManager is used for omp_target_alloc(), OpenMP
+ // (non-usm) map clause, etc.
+ //
+ // Ideally, this kind of pooling is best performed at
+ // a common level (e.g, user side of HSA) between OpenMP and HIP
+ // but that feature does not exist (yet).
+ constexpr size_t Almost3Gig = 3000000000u;
+ return Almost3Gig;
+ }
+ return 0;
+ }
+
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c17a2ee07047..87c3777b0eda7 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1090,6 +1090,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Pointer to the memory manager or nullptr if not available.
MemoryManagerTy *MemoryManager;
+ /// Per device setting of MemoryManager's Threshold
+ virtual size_t getMemoryManagerSizeThreshold() { return 0 /* use default */; }
+
/// Environment variables defined by the OpenMP standard.
Int32Envar OMP_TeamLimit;
Int32Envar OMP_NumTeams;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94a050b559efe..46151cc9abce6 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
// Enable the memory manager if required.
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
- if (EnableMM)
+ if (EnableMM) {
+ if (ThresholdMM == 0)
+ ThresholdMM = getMemoryManagerSizeThreshold();
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
+ }
return Plugin::success();
}
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 800a63bc0ee32..f3e8e9a66685e 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
# For all other targets, we currently assume it is.
supports_unified_shared_memory = True
supports_apu = False
+supports_large_allocation_memory_pool = False
if config.libomptarget_current_target.startswith('nvptx'):
try:
cuda_arch = int(config.cuda_test_arch[:3])
@@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
elif config.libomptarget_current_target.startswith('amdgcn'):
# amdgpu_test_arch contains a list of AMD GPUs in the system
# only check the first one assuming that we will run the test on it.
- if not (config.amdgpu_test_arch.startswith("gfx90a") or
- config.amdgpu_test_arch.startswith("gfx942") or
- config.amdgpu_test_arch.startswith("gfx950")):
+ if (config.amdgpu_test_arch.startswith("gfx90a") or
+ config.amdgpu_test_arch.startswith("gfx942") or
+ config.amdgpu_test_arch.startswith("gfx950")):
+ supports_large_allocation_memory_pool = True
+ else:
supports_unified_shared_memory = False
# check if AMD architecture is an APU:
if ((config.amdgpu_test_arch.startswith("gfx942") and
@@ -144,6 +147,8 @@ if supports_unified_shared_memory:
config.available_features.add('unified_shared_memory')
if supports_apu:
config.available_features.add('apu')
+if supports_large_allocation_memory_pool:
+ config.available_features.add('large_allocation_memory_pool')
# Setup environment to find dynamic library at runtime
if config.operating_system == 'Windows':
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 02aa453d0a975..1c1e09744a750 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -10,6 +10,9 @@
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. UNSUPPORTED: large_allocation_memory_pool
+
#include <omp.h>
int main() {
diff --git a/offload/test/sanitizer/use_after_free_3.c b/offload/test/sanitizer/use_after_free_3.c
new file mode 100644
index 0000000000000..fd77cff0d5c81
--- /dev/null
+++ b/offload/test/sanitizer/use_after_free_3.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
+// clang-format on
+
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. Run the test w/ and w/o ENV var override on memory pooling
+// threshold. REQUIRES: large_allocation_memory_pool
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+ int N = (1 << 30);
+ char *A = (char *)malloc(N);
+ char *P;
+#pragma omp target map(A[:N]) map(from : P)
+ {
+ P = &A[N / 2];
+ *P = 3;
+ }
+ // clang-format off
+// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
+// CHECK: Last deallocation:
+// CHECK: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+ { *P = 5; }
+
+ // CHECK-PASS: PASS
+ printf("PASS\n");
+ return 0;
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
3ba5726
to
904d01e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering if it wouldn't be better to just expose the device's memory size and have the MemoryManager use that internally when configuring this.
904d01e
to
c8ed60b
Compare
What default threshold to use at a given memory capacity is inherently device dependent. |
c8ed60b
to
77976ae
Compare
… size in omp_target_alloc
77976ae
to
2a09047
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
I thought the memory manager already took the device into account. |
@hidekisaito any chance you could add an appropriate patch description? Thanks so much :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
No, it does not. Currently, all devices under plugin-nextgen get the Memory Manager enabled by default at a threshold of "1 << 13" |
Done. |
Enables AMD data-center-class GPUs to use memory-manager pooling for allocations of up to ~3GB by default, up from the "1 << 13"-byte threshold that all plugin-nextgen devices currently use.