diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index f8db9bf0ae739..f3375e3a43acc 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2944,6 +2944,40 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } + bool checkIfCoarseGrainMemoryNearOrAbove64GB() { + for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { + if (!Pool->isGlobal() || !Pool->isCoarseGrained()) + continue; + uint64_t Value; + hsa_status_t Status = + Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); + if (Status != HSA_STATUS_SUCCESS) + continue; + constexpr uint64_t Almost64Gig = 0xFF0000000; + if (Value >= Almost64Gig) + return true; + } + return false; // CoarseGrain pool w/ 64GB or more capacity not found + } + + size_t getMemoryManagerSizeThreshold() override { + // Targeting high memory capacity GPUs such as + // data center GPUs. + if (checkIfCoarseGrainMemoryNearOrAbove64GB()) { + // Set GenericDeviceTy::MemoryManager's Threshold to 3GiB, + // if threshold is not already set by ENV var + // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. + // This MemoryManager is used for omp_target_alloc(), OpenMP + // (non-usm) map clause, etc. + // + // Ideally, this kind of pooling is best performed at + // a common level (e.g, user side of HSA) between OpenMP and HIP + // but that feature does not exist (yet). + return 3ul * 1024 * 1024 * 1024 /* 3 GiB */; + } + return 0; + } + /// Envar for controlling the number of HSA queues per device. High number of /// queues may degrade performance. UInt32Envar OMPX_NumQueues; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 8c17a2ee07047..8cadcacd405cd 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1090,6 +1090,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; + /// Per device setting of MemoryManager's Threshold + virtual size_t getMemoryManagerSizeThreshold() { return 0; } + /// Environment variables defined by the OpenMP standard. Int32Envar OMP_TeamLimit; Int32Envar OMP_NumTeams; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 94a050b559efe..46151cc9abce6 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { // Enable the memory manager if required. auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv(); - if (EnableMM) + if (EnableMM) { + if (ThresholdMM == 0) + ThresholdMM = getMemoryManagerSizeThreshold(); MemoryManager = new MemoryManagerTy(*this, ThresholdMM); + } return Plugin::success(); } diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 800a63bc0ee32..f3e8e9a66685e 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -121,6 +121,7 @@ if config.libomptarget_test_pgo: # For all other targets, we currently assume it is. supports_unified_shared_memory = True supports_apu = False +supports_large_allocation_memory_pool = False if config.libomptarget_current_target.startswith('nvptx'): try: cuda_arch = int(config.cuda_test_arch[:3]) @@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'): elif config.libomptarget_current_target.startswith('amdgcn'): # amdgpu_test_arch contains a list of AMD GPUs in the system # only check the first one assuming that we will run the test on it. - if not (config.amdgpu_test_arch.startswith("gfx90a") or - config.amdgpu_test_arch.startswith("gfx942") or - config.amdgpu_test_arch.startswith("gfx950")): + if (config.amdgpu_test_arch.startswith("gfx90a") or + config.amdgpu_test_arch.startswith("gfx942") or + config.amdgpu_test_arch.startswith("gfx950")): + supports_large_allocation_memory_pool = True + else: supports_unified_shared_memory = False # check if AMD architecture is an APU: if ((config.amdgpu_test_arch.startswith("gfx942") and @@ -144,6 +147,8 @@ if supports_unified_shared_memory: config.available_features.add('unified_shared_memory') if supports_apu: config.available_features.add('apu') +if supports_large_allocation_memory_pool: + config.available_features.add('large_allocation_memory_pool') # Setup environment to find dynamic library at runtime if config.operating_system == 'Windows': diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c index 02aa453d0a975..1c1e09744a750 100644 --- a/offload/test/sanitizer/use_after_free_2.c +++ b/offload/test/sanitizer/use_after_free_2.c @@ -10,6 +10,9 @@ // UNSUPPORTED: s390x-ibm-linux-gnu // UNSUPPORTED: s390x-ibm-linux-gnu-LTO +// If offload memory pooling is enabled for a large allocation, reuse error is +// not detected. UNSUPPORTED: large_allocation_memory_pool + #include int main() { diff --git a/offload/test/sanitizer/use_after_free_3.c b/offload/test/sanitizer/use_after_free_3.c new file mode 100644 index 0000000000000..9d8861433e7e5 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_3.c @@ -0,0 +1,37 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS +// clang-format on + +// If offload memory pooling is enabled for a large allocation, reuse error is +// not detected. Run the test w/ and w/o ENV var override on memory pooling +// threshold. REQUIRES: large_allocation_memory_pool + +#include +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); + char *P; +#pragma omp target map(A[ : N]) map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] points into prior host-issued allocation: +// CHECK: Last deallocation: +// CHECK: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { + *P = 5; + } + + // CHECK-PASS: PASS + printf("PASS\n"); + return 0; +}