#include <vector>
#include <cstring>
#include <cstdlib>
+ #include <cstdint>

using std::vector;

@@ -34,6 +35,12 @@ using std::vector;
#include <omp.h>
#endif

+ #if NUMA_AWARE
+ #include <sys/mman.h>
+ #include <unistd.h>
+ #include <numaif.h>
+ #include <numa.h>
+ #endif // NUMA_AWARE


/*
@@ -105,23 +112,69 @@ int cpu_getCurrentNumThreads() {
 * MEMORY ALLOCATION
 */

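+ // helpers which query, and cache, the system page size and NUMA node count,
+ // used by the NUMA-aware array allocation below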
+ #if NUMA_AWARE
+ unsigned long get_page_size() {
+     static unsigned long page_size = 0;
+     if (!page_size) {
+         page_size = sysconf(_SC_PAGESIZE);
+         if (page_size == ~0UL) {
+             perror("Failed to get the page size");
+         }
+     }
+     return page_size;
+ }
+
+ unsigned long get_numa_nodes() {
+     static int n_nodes = 0;
+     if (!n_nodes) {
+         n_nodes = numa_num_configured_nodes();
+         if (n_nodes < 1) {
+             perror("Failed to get the numa node count");
+         }
+     }
+     return n_nodes;
+ }
+ #endif

qcomp* cpu_allocArray(qindex length) {
+     return (qcomp*) calloc(length, sizeof(qcomp));
+ }

-     /// @todo
-     /// here, we calloc the entire array in a serial setting, rather than one malloc
-     /// followed by threads subsequently memset'ing their own partitions. The latter
-     /// approach would distribute the array pages across NUMA nodes, accelerating
-     /// their subsequent access by the same threads (via NUMA's first-touch policy).
-     /// We have so far foregone this optimisation since a thread's memory-access pattern
-     /// in many of the QuEST functions is non-trivial, and likely to be inconsistent
-     /// with the memset pattern. As such, I expect the benefit is totally occluded
-     /// and only introduces potential new bugs - but this should be tested and confirmed!
-
-     // we call calloc over malloc in order to fail immediately if mem isn't available;
-     // caller must handle nullptr result

-     return (qcomp*) calloc(length, sizeof(qcomp));
+ qcomp* cpu_allocNumaArray(qindex length) {
+ #if !NUMA_AWARE
+     return cpu_allocArray(length);
+ #else
+     unsigned long page_size = get_page_size();
+     int n_nodes = get_numa_nodes();
+
+     qindex size = length * sizeof(qcomp);
+     int pages = (size + page_size - 1) / page_size;
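+     // reserve whole pages with an anonymous mapping; the memory is page-aligned
+     // and zero-initialised, though mmap reports failure as MAP_FAILED, not nullptr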
+     void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
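+     // a single NUMA node needs no explicit page binding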
+     if (n_nodes == 1) {
+         return reinterpret_cast<qcomp*>(addr);
+     }
+
+     // distribution strategy: floor_pages per node, with the spread_pages remainder interleaved as evenly as possible across nodes
+     int floor_pages = pages / n_nodes;
+     int spread_pages = pages % n_nodes;
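+     // e.g. 10 pages across 4 nodes get bound as 2,3,2,3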
+
+     uintptr_t pos = (uintptr_t) addr;
+     for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
+         shift -= spread_pages;
+         int node_pages = floor_pages + (shift <= 0);
+
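+         // bind this contiguous run of pages to the current node via a one-bit nodemask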
+         unsigned long node_mask = 1UL << node;
+         mbind((void*) pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);
+
+         pos += node_pages * page_size;
+         if (shift <= 0) {
+             shift += n_nodes;
+         }
+     }
+
+     return reinterpret_cast<qcomp*>(addr);
+ #endif // NUMA_AWARE
}


@@ -132,6 +185,23 @@ void cpu_deallocArray(qcomp* arr) {
}


+ void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+     if (arr == nullptr) {
+         return;
+     }
+
+ #if !NUMA_AWARE
+     return cpu_deallocArray(arr);
+ #else
+     unsigned long page_size = get_page_size();
+     qindex size = length * sizeof(qcomp);
+     int pages = (size + page_size - 1) / page_size;
+
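+     // recompute the same whole-page size used at allocation so the full mapping is released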
+     munmap(arr, pages * page_size);
+ #endif // NUMA_AWARE
+ }
+
+
qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {

    // do not allocate if arr alloc failed (caller will handle)