Commit 3e17502

Numa aware alloc
1 parent 27fe05c commit 3e17502

4 files changed (+103 −15 lines)

CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -323,6 +323,21 @@ if (ENABLE_MULTITHREADING)
         OpenMP::OpenMP_C
     )

+    # Find NUMA - location of NUMA headers
+    if (WIN32)
+        compile_option(NUMA_AWARE 0)
+        message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+    else()
+        include(FindPkgConfig)
+        pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+        compile_option(NUMA_AWARE ${NUMA_FOUND})
+        if (${NUMA_FOUND})
+            target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+        else()
+            message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+        endif()
+    endif()
+
     if (VERBOSE_LIB_NAME)
         string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
     endif()
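
Note for context: the NUMA_AWARE value that compile_option() sets here is consumed as a preprocessor guard in quest/src/cpu/cpu_config.cpp (diffed below). A minimal sketch of that guard pattern, assuming NUMA_AWARE is exposed as a 0/1 compile definition; printNumaSupport() is a hypothetical helper used only for illustration:

// sketch only: gate NUMA-specific code on the NUMA_AWARE definition set by CMake
#include <cstdio>

#if NUMA_AWARE
    #include <numa.h>   // provided by libnuma, linked via PkgConfig::NUMA above
#endif

void printNumaSupport() {
#if NUMA_AWARE
    // numa_available() returns -1 when the kernel offers no NUMA policy support
    std::printf("NUMA-aware build, numa_available() = %d\n", numa_available());
#else
    std::printf("built without NUMA awareness\n");
#endif
}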

quest/src/api/qureg.cpp

Lines changed: 2 additions & 2 deletions
@@ -151,7 +151,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);

     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed

     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -331,7 +331,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);

     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);

     // free CPU communication buffer
     if (qureg.isDistributed)
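
Because the NUMA-aware allocator can hand back mmap'd pages rather than heap memory, allocation and deallocation must stay paired, which is why destroyQureg now also forwards the length (needed to size the munmap). A standalone usage sketch of the pair, assuming only the declarations added to cpu_config.hpp (diffed below); tryAllocTempAmps() is a hypothetical caller:

#include "quest/src/cpu/cpu_config.hpp"

// hypothetical caller, for illustration only
bool tryAllocTempAmps(qindex numAmps) {

    // may be mmap'd (NUMA-aware builds) or calloc'd (fallback); nullptr signals failure
    qcomp* amps = cpu_allocNumaArray(numAmps);
    if (amps == nullptr)
        return false;

    // ... use amps ...

    // release with the matching deallocator (which needs the length),
    // not with cpu_deallocArray() or free()
    cpu_deallocNumaArray(amps, numAmps);
    return true;
}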

quest/src/cpu/cpu_config.cpp

Lines changed: 83 additions & 13 deletions
@@ -14,6 +14,7 @@
 #include <vector>
 #include <cstring>
 #include <cstdlib>
+#include <cstdint>

 using std::vector;

@@ -34,6 +35,12 @@ using std::vector;
 #include <omp.h>
 #endif

+#if NUMA_AWARE
+#include <sys/mman.h>
+#include <unistd.h>
+#include <numaif.h>
+#include <numa.h>
+#endif // NUMA_AWARE


 /*
@@ -105,23 +112,69 @@ int cpu_getCurrentNumThreads() {
  * MEMORY ALLOCATION
  */

+#if NUMA_AWARE
+unsigned long get_page_size() {
+    static unsigned long page_size = 0;
+    if (!page_size) {
+        page_size = sysconf(_SC_PAGESIZE);
+        if (page_size == ~0UL) {
+            perror("Failed to get the page size");
+        }
+    }
+    return page_size;
+}
+
+unsigned long get_numa_nodes() {
+    static int n_nodes = 0;
+    if (!n_nodes) {
+        n_nodes = numa_num_configured_nodes();
+        if (n_nodes < 1) {
+            perror("Failed to get the numa node count");
+        }
+    }
+    return n_nodes;
+}
+#endif

 qcomp* cpu_allocArray(qindex length) {
+    return (qcomp*) calloc(length, sizeof(qcomp));
+}

-    /// @todo
-    /// here, we calloc the entire array in a serial setting, rather than one malloc
-    /// followed by threads subsequently memset'ing their own partitions. The latter
-    /// approach would distribute the array pages across NUMA nodes, accelerating
-    /// their subsequent access by the same threads (via NUMA's first-touch policy).
-    /// We have so far foregone this optimisation since a thread's memory-access pattern
-    /// in many of the QuEST functions is non-trivial, and likely to be inconsistent
-    /// with the memset pattern. As such, I expect the benefit is totally occluded
-    /// and only introduces potential new bugs - but this should be tested and confirmed!
-
-    // we call calloc over malloc in order to fail immediately if mem isn't available;
-    // caller must handle nullptr result

-    return (qcomp*) calloc(length, sizeof(qcomp));
+qcomp* cpu_allocNumaArray(qindex length) {
+#if !NUMA_AWARE
+    return cpu_allocArray(length);
+#else
+    unsigned long page_size = get_page_size();
+    int n_nodes = get_numa_nodes();
+
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
+    void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (n_nodes == 1) {
+        return reinterpret_cast<qcomp*>(addr);
+    }
+
+    // distribution strategy: floor_pages per node, distribute remain_pages as spread out as possible
+    int floor_pages = pages / n_nodes;
+    int spread_pages = pages % n_nodes;
+
+    uintptr_t pos = (uintptr_t)addr;
+    for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
+        shift -= spread_pages;
+        int node_pages = floor_pages + (shift <= 0);
+
+        unsigned long node_mask = 1UL << node;
+        mbind((void*)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);
+
+        pos += node_pages * page_size;
+        if (shift <= 0) {
+            shift += n_nodes;
+        }
+    }
+
+    return reinterpret_cast<qcomp*>(addr);
+#endif // NUMA_AWARE
 }
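
To see how the loop above apportions pages, the following standalone sketch reproduces just the counting arithmetic (no mmap or mbind), using an assumed total of 10 pages over 4 nodes; it prints 2, 3, 2, 3 pages for nodes 0 to 3, i.e. the remainder pages are interleaved across the nodes rather than packed onto the first ones:

#include <cstdio>

// standalone illustration of cpu_allocNumaArray()'s page-spreading arithmetic;
// the page and node counts are example values, not taken from the commit
int main() {
    int pages   = 10;   // hypothetical total pages of the allocation
    int n_nodes = 4;    // hypothetical NUMA node count

    int floor_pages  = pages / n_nodes;   // every node gets at least this many (2)
    int spread_pages = pages % n_nodes;   // leftover pages to spread out (2)

    for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
        shift -= spread_pages;
        int node_pages = floor_pages + (shift <= 0);   // one extra page when shift wraps
        std::printf("node %d: %d pages\n", node, node_pages);
        if (shift <= 0)
            shift += n_nodes;
    }
    // prints: node 0: 2, node 1: 3, node 2: 2, node 3: 3  (total 10)
}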

@@ -132,6 +185,23 @@ void cpu_deallocArray(qcomp* arr) {
 }


+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+    if (arr == nullptr) {
+        return;
+    }
+
+#if !NUMA_AWARE
+    return cpu_deallocArray(arr);
+#else
+    unsigned long page_size = get_page_size();
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
+
+    munmap(arr, pages * page_size);
+#endif // NUMA_AWARE
+}
+
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {

     // do not allocate if arr alloc failed (caller will handle)
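
A hedged verification sketch (Linux with libnuma, compile with -lnuma; not part of this commit): after allocating, the actual placement of a buffer's pages can be inspected with move_pages(), which, given a NULL node list, only queries which node each page resides on. Pages are touched first so the mbind() policy has taken effect; reportPagePlacement() is a hypothetical helper and the buffer is assumed to come from the allocator above:

#include <cstdio>
#include <vector>
#include <unistd.h>
#include <numaif.h>

// query and print the NUMA node of every page backing buf[0..bytes)
void reportPagePlacement(void* buf, long bytes) {
    long page_size = sysconf(_SC_PAGESIZE);
    long n_pages = (bytes + page_size - 1) / page_size;

    // touch one byte per page so each page is resident before the query
    for (long p = 0; p < n_pages; ++p)
        ((volatile char*) buf)[p * page_size] = 0;

    std::vector<void*> pages(n_pages);
    std::vector<int>   status(n_pages);
    for (long p = 0; p < n_pages; ++p)
        pages[p] = (char*) buf + p * page_size;

    // pid 0 = this process; nodes == NULL turns the call into a placement query
    move_pages(0, (unsigned long) n_pages, pages.data(), NULL, status.data(), 0);

    for (long p = 0; p < n_pages; ++p)
        std::printf("page %ld -> node %d\n", p, status[p]);
}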

quest/src/cpu/cpu_config.hpp

Lines changed: 3 additions & 0 deletions
@@ -46,6 +46,9 @@ int cpu_getCurrentNumThreads();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);

+qcomp* cpu_allocNumaArray(qindex length);
+void cpu_deallocNumaArray(qcomp* arr, qindex length);
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
 void cpu_deallocMatrixWrapper(qcomp** wrapper);

0 commit comments
