restored NUMA-awareness (#658)

TysonRayJones · lucjaulmes · web-flow · commit 378ad71021c2 · 2025-07-09T11:18:36.000-04:00
Luc: v3.7 was sensible on NUMA machines “by default” through first-touch initialization. This had been lost in v4 as identified by James Richings. Here’s some basic NUMA-aware allocation, and a little love for general parallel/openmp usage. - If we’re on *nix _and_ we find libnuma, we enable NUMA-aware allocations - Add & use cpu_allocNumaArray() and cpu_deallocNumaArray() for the state-vector allocations (as the current alloc functions are also used for many smaller regions). Fall back to normal allocation functions if NUMA-unaware. - Perform zero-initialization in parallel (still with std::fill() but use a parallel region) - Make getCurrentNumThreads() work inside parallel regions (!) - Add getAvailableNumThreads() to get thread count outside parallel regions. Improve this from previous getCurrentNumThreads() to only call the omp function once (rather than once per thread). Luc coded the logic and Tyson added doc and error-handling. PR #658 replaced the original of #652 --------- Co-authored-by: Luc Jaulmes <ljaulmes@ed.ac.uk>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
 # @author Tyson Jones (patches including clang multithreading)
-# @author Luc Jaulmes (patching install)
+# @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
 #  - Ania Brown
@@ -262,6 +262,23 @@ if (ENABLE_MULTITHREADING)
     OpenMP::OpenMP_C
   )
 
+  # Decide the NUMA_AWARE compile option: look up libnuma through pkg-config on non-Windows hosts
+  if (WIN32)
+    compile_option(NUMA_AWARE 0)  # Windows branch: no libnuma lookup is attempted
+    message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+  else()
+    include(FindPkgConfig)
+    pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)  # sets NUMA_FOUND and, when found, the PkgConfig::NUMA target
+    if (${NUMA_FOUND})
+      compile_option(NUMA_AWARE ${NUMA_FOUND})  # passes pkg-config's found-flag through as the option value
+      target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+      message(STATUS "NUMA awareness is enabled.")
+    else()
+      compile_option(NUMA_AWARE 0)  # libnuma was not located, so the option is given the value 0
+      message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+    endif()
+  endif()
+
   if (VERBOSE_LIB_NAME)
     string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
   endif()
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
         {"numCpuCores",   printer_toStr(std::thread::hardware_concurrency()) + pm},
         {"numOmpProcs",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
         {"cpuMemory",     ram},
         {"cpuMemoryFree", un},
     });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
 
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #include "quest/include/types.h"
@@ -104,6 +105,41 @@ void error_memSizeQueriedButWouldOverflow() {
     raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation.");
 }
 
+void error_gettingPageSizeFailed() { // reports, as an internal error, that the page size could not be obtained
+
+    raiseInternalError("Failed to get the page size.");
+}
+
+void error_pageSizeNotAPowerOf2() { // reports, as an internal error, a discovered page size which is not a power of 2
+
+    raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone.");
+}
+
+void error_pageSizeNotAMultipleOfQcomp() { // reports, as an internal error, a page size not divisible by the byte-size of a qcomp
+
+    raiseInternalError("The page size was indivisible by the number of bytes in a qcomp.");
+}
+
+void error_gettingNumNumaNodesFailed() { // reports, as an internal error, that the NUMA node count could not be obtained
+
+    raiseInternalError("Failed to get the NUMA node count");
+}
+
+void error_numaAllocOrDeallocAttemptedOnWindows() { // reports, as an internal error, a NUMA-aware alloc/dealloc attempt on Windows
+
+    raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue.");
+}
+
+void error_numaBindingFailed() { // reports, as an internal error, that binding pages to NUMA nodes failed after reservation succeeded
+
+    raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding.");
+}
+
+void error_numaUnmappingFailed() { // reports, as an internal error, that NUMA-aware deallocation failed
+
+    raiseInternalError("NUMA-aware memory deallocation unexpectedly failed.");
+}
+
 
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #ifndef ERRORS_HPP
@@ -50,6 +51,20 @@ void error_allocOfQuESTEnvFailed();
 
 void error_memSizeQueriedButWouldOverflow();
 
+void error_gettingPageSizeFailed(); // internal error: the page size could not be obtained
+
+void error_pageSizeNotAPowerOf2(); // internal error: the discovered page size is not a power of 2
+
+void error_pageSizeNotAMultipleOfQcomp(); // internal error: the page size is indivisible by the bytes in a qcomp
+
+void error_gettingNumNumaNodesFailed(); // internal error: the NUMA node count could not be obtained
+
+void error_numaAllocOrDeallocAttemptedOnWindows(); // internal error: NUMA-aware (de)allocation attempted on Windows
+
+void error_numaBindingFailed(); // internal error: binding memory pages to NUMA nodes failed
+
+void error_numaUnmappingFailed(); // internal error: NUMA-aware deallocation failed
+
 
 
 /*
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
@@ -30,6 +30,7 @@
     #include <sys/sysctl.h>
 #elif defined(_WIN32)
     #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
     #include <windows.h>
 #endif
 
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
@@ -5,6 +5,7 @@
  * logic, matrix algebra, and channel parameters.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (distributing ranges over blocks)
  */
 
 #include "quest/include/types.h"
@@ -25,6 +26,7 @@
 
 #include <functional>
 #include <algorithm>
+#include <utility>
 #include <complex>
 #include <cmath>
 #include <vector>
@@ -930,6 +932,41 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
     return out;
 }
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(
+    qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges
+) {
+    // Splits [0, rangeLen) into whole blocks of blockLen elements (plus a single
+    // leftover partial block) and returns the {start, end-exclusive} indices of
+    // sub-range idSubRange out of numSubRanges. Whole blocks which do not divide
+    // evenly between sub-ranges are spread apart across them, and the partial block
+    // goes to the final sub-range. Divides by blockLen and numSubRanges (must be non-zero).
+
+    qindex numFullBlocks = rangeLen / blockLen; // floors
+    qindex subBlockLen = rangeLen % blockLen; // length of the leftover partial block
+
+    qindex baseNumBlocksPerSubRange = numFullBlocks / numSubRanges; // floors; every sub-range gets at least this many
+    qindex numExtraBlocks = numFullBlocks % numSubRanges; // whole blocks left over after the even split
+
+    // determine how many extra blocks this sub-range, and those before it, contain
+    qindex prevExtra = (idSubRange * numExtraBlocks) / numSubRanges; // extra blocks held by earlier sub-ranges
+    qindex prevShift = (idSubRange * numExtraBlocks) % numSubRanges; // remainder carried over from earlier sub-ranges
+    bool hereExtra = (prevShift + numExtraBlocks) >= numSubRanges; // carry overflows => this sub-range takes one extra block
+
+    // allocate blocks to this sub-range
+    qindex startBlockInd = idSubRange * baseNumBlocksPerSubRange + prevExtra;
+    qindex endBlockInd = startBlockInd + baseNumBlocksPerSubRange + hereExtra; // exclusive; the bool contributes 0 or 1
+
+    // convert the block indices into element indices within [0, rangeLen)
+    qindex startInd = startBlockInd * blockLen;
+    qindex endInd = endBlockInd * blockLen; // exclusive
+
+    // arbitrarily allocate the leftover sub-block to the final sub-range
+    if (idSubRange == numSubRanges - 1)
+        endInd += subBlockLen;
+
+    return std::make_pair(startInd, endInd);
+}
+
 
 
 /*
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
@@ -21,6 +21,7 @@
 
 #include <type_traits>
 #include <functional>
+#include <utility>
 #include <string>
 #include <vector>
 #include <array>
@@ -351,6 +352,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
 
 util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges); // returns {start, end-exclusive} indices of sub-range idSubRange
+
 
 
 /*
@@ -361,6 +364,7 @@ qreal util_getPhaseFromGateAngle(qreal angle);
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
 
+
 /*
  * DECOHERENCE FACTORS
  */
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us`
`36`	`36`
`37`	`37`	`// and we require more than 1 thread available at QuESTEnv creation`
`38`	`38`	`if (useMultithread == modeflag::USE_AUTO)`
`39`		`- useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;`
	`39`	`+ useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;`
`40`	`40`	`}`
`41`	`41`
`42`	`42`