Skip to content

Commit 378ad71

Browse files
restored NUMA-awareness (#658)
Luc: v3.7 was sensible on NUMA machines “by default” through first-touch initialization. This had been lost in v4 as identified by James Richings. Here’s some basic NUMA-aware allocation, and a little love for general parallel/OpenMP usage. - If we’re on *nix _and_ we find libnuma, we enable NUMA-aware allocations - Add & use cpu_allocNumaArray() and cpu_deallocNumaArray() for the state-vector allocations (as the current alloc functions are also used for many smaller regions). Fall back to normal allocation functions if NUMA-unaware. - Perform zero-initialization in parallel (still with std::fill() but use a parallel region) - Make getCurrentNumThreads() work inside parallel regions (!) - Add getAvailableNumThreads() to get thread count outside parallel regions. Improve this from previous getCurrentNumThreads() to only call the omp function once (rather than once per thread). Luc coded the logic and Tyson added doc and error-handling. PR #658 replaced the original of #652 --------- Co-authored-by: Luc Jaulmes <ljaulmes@ed.ac.uk>
1 parent 738ca89 commit 378ad71

File tree

12 files changed

+323
-25
lines changed

12 files changed

+323
-25
lines changed

CMakeLists.txt

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# @author Oliver Thomson Brown
22
# @author Erich Essmann (patches including MSVC support)
33
# @author Tyson Jones (patches including clang multithreading)
4-
# @author Luc Jaulmes (patching install)
4+
# @author Luc Jaulmes (NUMA awareness, patching install)
55
#
66
# Contributions to previous builds from:
77
# - Ania Brown
@@ -262,6 +262,23 @@ if (ENABLE_MULTITHREADING)
262262
OpenMP::OpenMP_C
263263
)
264264

265+
# Find NUMA - location of NUMA headers
# Decides the value passed to compile_option() for NUMA_AWARE and, when the
# 'numa' pkg-config module is found, links QuEST against it.
# NOTE(review): compile_option() is defined elsewhere in this project; presumably
# it exposes NUMA_AWARE to the compiled sources - confirm.
266+
if (WIN32)
267+
# no libnuma lookup is attempted on Windows; NUMA_AWARE is forced to 0
compile_option(NUMA_AWARE 0)
268+
message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
269+
else()
270+
# non-Windows: query pkg-config for a module named 'numa', requesting the
# imported target PkgConfig::NUMA (GLOBAL so it is visible outside this directory scope)
include(FindPkgConfig)
271+
pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
272+
# NUMA_FOUND is set by pkg_search_module() above
if (${NUMA_FOUND})
273+
# found: NUMA_AWARE receives the value of NUMA_FOUND, and QuEST links privately to libnuma
compile_option(NUMA_AWARE ${NUMA_FOUND})
274+
target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
275+
message(STATUS "NUMA awareness is enabled.")
276+
else()
277+
# not found: NUMA_AWARE is 0 and a warning is emitted rather than failing the configure
compile_option(NUMA_AWARE 0)
278+
message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
279+
endif()
280+
endif()
281+
265282
if (VERBOSE_LIB_NAME)
266283
string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
267284
endif()

quest/src/api/environment.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ void printCpuInfo() {
225225
"cpu", {
226226
{"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
227227
{"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
228-
{"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
228+
{"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
229229
{"cpuMemory", ram},
230230
{"cpuMemoryFree", un},
231231
});
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
494494

495495
QuESTEnv env = getQuESTEnv();
496496

497-
int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
497+
int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
498498
int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
499499
int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
500500

quest/src/api/qureg.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
154154
Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
155155

156156
// always allocate CPU memory
157-
qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
157+
qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
158158

159159
// conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
160160
// note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
334334
validate_quregFields(qureg, __func__);
335335

336336
// free CPU memory
337-
cpu_deallocArray(qureg.cpuAmps);
337+
cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
338338

339339
// free CPU communication buffer
340340
if (qureg.isDistributed)

quest/src/core/autodeployer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
3636

3737
// and we require more than 1 thread available at QuESTEnv creation
3838
if (useMultithread == modeflag::USE_AUTO)
39-
useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
39+
useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
4040
}
4141

4242

quest/src/core/errors.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* deployment is consistent with the compiled deployment modes.
66
*
77
* @author Tyson Jones
8+
* @author Luc Jaulmes (NUMA & pagesize errors)
89
*/
910

1011
#include "quest/include/types.h"
@@ -104,6 +105,41 @@ void error_memSizeQueriedButWouldOverflow() {
104105
raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation.");
105106
}
106107

108+
// Forwards a fixed message to raiseInternalError() stating that the page size
// could not be obtained.
// NOTE(review): presumably called when the OS page-size query fails - confirm at the call site.
void error_gettingPageSizeFailed() {
109+
110+
raiseInternalError("Failed to get the page size.");
111+
}
112+
113+
// Forwards a fixed message to raiseInternalError() stating that the discovered
// page size was not a power of 2.
// NOTE(review): presumably guards an assumption made by page-aligned allocation - confirm at the call site.
void error_pageSizeNotAPowerOf2() {
114+
115+
raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone.");
116+
}
117+
118+
// Forwards a fixed message to raiseInternalError() stating that the page size
// is not divisible by the number of bytes in a qcomp.
void error_pageSizeNotAMultipleOfQcomp() {
119+
120+
raiseInternalError("The page size was indivisible by the number of bytes in a qcomp.");
121+
}
122+
123+
// Forwards a fixed message to raiseInternalError() stating that the NUMA node
// count could not be obtained.
void error_gettingNumNumaNodesFailed() {
124+
125+
raiseInternalError("Failed to get the NUMA node count");
126+
}
127+
128+
// Forwards a fixed message to raiseInternalError() stating that NUMA-aware
// allocation or deallocation was attempted on Windows, where it is not yet
// implemented; the message flags this as a potential build issue.
void error_numaAllocOrDeallocAttemptedOnWindows() {
129+
130+
raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue.");
131+
}
132+
133+
// Forwards a fixed message to raiseInternalError() stating that binding memory
// pages to NUMA nodes (mbind, per the message) failed even though the prior
// reservation (mmap, per the message) succeeded.
void error_numaBindingFailed() {
134+
135+
raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding.");
136+
}
137+
138+
// Forwards a fixed message to raiseInternalError() stating that NUMA-aware
// memory deallocation failed.
// NOTE(review): the function name suggests this follows a failed unmap call - confirm at the call site.
void error_numaUnmappingFailed() {
139+
140+
raiseInternalError("NUMA-aware memory deallocation unexpectedly failed.");
141+
}
142+
107143

108144

109145
/*

quest/src/core/errors.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* deployment is consistent with the compiled deployment modes.
66
*
77
* @author Tyson Jones
8+
* @author Luc Jaulmes (NUMA & pagesize errors)
89
*/
910

1011
#ifndef ERRORS_HPP
@@ -50,6 +51,20 @@ void error_allocOfQuESTEnvFailed();
5051

5152
void error_memSizeQueriedButWouldOverflow();
5253

54+
void error_gettingPageSizeFailed();
55+
56+
void error_pageSizeNotAPowerOf2();
57+
58+
void error_pageSizeNotAMultipleOfQcomp();
59+
60+
void error_gettingNumNumaNodesFailed();
61+
62+
void error_numaAllocOrDeallocAttemptedOnWindows();
63+
64+
void error_numaBindingFailed();
65+
66+
void error_numaUnmappingFailed();
67+
5368

5469

5570
/*

quest/src/core/memory.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <sys/sysctl.h>
3131
#elif defined(_WIN32)
3232
#define NOMINMAX
33+
#define WIN32_LEAN_AND_MEAN
3334
#include <windows.h>
3435
#endif
3536

quest/src/core/utilities.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* logic, matrix algebra, and channel parameters.
66
*
77
* @author Tyson Jones
8+
* @author Luc Jaulmes (distributing ranges over blocks)
89
*/
910

1011
#include "quest/include/types.h"
@@ -25,6 +26,7 @@
2526

2627
#include <functional>
2728
#include <algorithm>
29+
#include <utility>
2830
#include <complex>
2931
#include <cmath>
3032
#include <vector>
@@ -930,6 +932,41 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
930932
return out;
931933
}
932934

935+
std::pair<qindex, qindex> util_getBlockMultipleSubRange(
    qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges
) {
    // Returns the half-open index interval [start, end) within [0, rangeLen)
    // assigned to sub-range 'idSubRange' out of 'numSubRanges'. The range is
    // cut into whole blocks of 'blockLen' elements, which are shared out as
    // evenly as possible; any surplus blocks are scattered (not clustered)
    // over the sub-ranges, and a trailing partial block always goes to the
    // final sub-range. Every boundary except the very last is therefore a
    // multiple of blockLen.

    // number of complete blocks (integer division floors) and the size of the tail
    const qindex wholeBlocks = rangeLen / blockLen;
    const qindex tailLen     = rangeLen % blockLen;

    // blocks guaranteed to each sub-range, and those left to scatter
    const qindex blocksEach = wholeBlocks / numSubRanges;
    const qindex surplus    = wholeBlocks % numSubRanges;

    // surplus blocks consumed by the preceding sub-ranges, and whether the
    // running remainder tips over so that this sub-range receives one too
    const qindex scaledId      = idSubRange * surplus;
    const qindex surplusBefore = scaledId / numSubRanges;
    const qindex carry         = scaledId % numSubRanges;
    const qindex surplusHere   = (carry + surplus >= numSubRanges)? 1 : 0;

    // block-granular bounds of this sub-range
    const qindex firstBlock    = idSubRange * blocksEach + surplusBefore;
    const qindex pastLastBlock = firstBlock + blocksEach + surplusHere;

    // element-granular bounds (end is exclusive)
    const qindex start = firstBlock * blockLen;
    qindex end = pastLastBlock * blockLen;

    // the partial block is arbitrarily given to the last sub-range
    const bool isFinalSubRange = (idSubRange == numSubRanges - 1);
    if (isFinalSubRange)
        end += tailLen;

    return {start, end};
}
969+
933970

934971

935972
/*

quest/src/core/utilities.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include <type_traits>
2323
#include <functional>
24+
#include <utility>
2425
#include <string>
2526
#include <vector>
2627
#include <array>
@@ -351,6 +352,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
351352

352353
util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);
353354

355+
std::pair<qindex, qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges);
356+
354357

355358

356359
/*
@@ -361,6 +364,7 @@ qreal util_getPhaseFromGateAngle(qreal angle);
361364
qcomp util_getPhaseFromGateAngle(qcomp angle);
362365

363366

367+
364368
/*
365369
* DECOHERENCE FACTORS
366370
*/

0 commit comments

Comments
 (0)