
Implementation of GGML_NUMA_MIRROR for inference performance gains on NUMA systems #14969
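For context: the idea behind a NUMA mirror is to keep one copy of the read-only model weights on every NUMA node, so each worker thread reads tensor data from node-local memory instead of pulling it across the socket interconnect on every token. The sketch below illustrates that idea with libnuma; it is a minimal illustration under assumed names, not the code in this PR, which (per the commits below) works through hugepage-backed mappings and a tensor_data() indirection inside ggml.

```cpp
// Illustrative sketch of NUMA weight mirroring (not the PR's implementation).
// Assumes Linux + libnuma (link with -lnuma); cleanup and error handling are abbreviated.
#include <numa.h>
#include <sched.h>
#include <cstring>
#include <vector>

struct numa_mirror {
    std::vector<void *> replicas; // one copy of the weights per NUMA node
    size_t size = 0;
};

// Replicate a read-only weight buffer onto every configured NUMA node.
static bool numa_mirror_weights(const void * src, size_t size, numa_mirror & out) {
    if (numa_available() < 0) {
        return false; // no kernel/libnuma support
    }
    out.size = size;
    const int n_nodes = numa_num_configured_nodes();
    for (int node = 0; node < n_nodes; ++node) {
        void * copy = numa_alloc_onnode(size, node); // pages physically backed on `node`
        if (copy == nullptr) {
            return false;
        }
        memcpy(copy, src, size);
        out.replicas.push_back(copy);
    }
    return true;
}

// A worker thread picks the replica that lives on the node it is currently running on.
static const void * numa_local_weights(const numa_mirror & m) {
    const int node = numa_node_of_cpu(sched_getcpu());
    return (node >= 0 && (size_t) node < m.replicas.size()) ? m.replicas[node] : m.replicas[0];
}
```

The matrix-multiplication workers then read from their node-local replica, which is where the inference-time gain on multi-socket systems comes from.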


Draft: wants to merge 46 commits into master.
Commits (46, changes from all commits):
f98ac2d  merge in changes by @wkgcass  (dbsanfte, Jul 29, 2025)
99b0e80  revert inadvertent change  (dbsanfte, Jul 29, 2025)
c060a26  reverse ifdef logic  (dbsanfte, Jul 29, 2025)
824831b  fix padding  (dbsanfte, Jul 29, 2025)
daed6a1  print the struct offset at compile time to make this less annoying  (dbsanfte, Jul 29, 2025)
8956732  fix  (dbsanfte, Jul 29, 2025)
b00126a  undo cleverness  (dbsanfte, Jul 29, 2025)
c2ba046  fix typos  (dbsanfte, Jul 29, 2025)
b822399  fix typo  (dbsanfte, Jul 29, 2025)
7e53968  fix padding  (dbsanfte, Jul 29, 2025)
ab37137  refactor more t->data to tensor_data(t) etc  (dbsanfte, Jul 30, 2025)
9b8e73f  Merge branch 'numa-improvements-take2' of https://github.com/dbsanfte…  (dbsanfte, Jul 30, 2025)
14bfbf8  make a smarter macro for tensor_data / tensor_set_data to handle both…  (dbsanfte, Jul 30, 2025)
7cfc6a7  fix typo  (dbsanfte, Jul 30, 2025)
afbff14  fix for both C11 and cpp  (dbsanfte, Jul 30, 2025)
4f0c3cb  another try at a fix  (dbsanfte, Jul 30, 2025)
ea046b9  another try...  (dbsanfte, Jul 30, 2025)
1553dda  revert changes to ggml.h  (dbsanfte, Jul 30, 2025)
4998a45  actually why not just pass the memory address of the instance...  (dbsanfte, Jul 30, 2025)
debae5f  missed a few refs  (dbsanfte, Jul 30, 2025)
ebaf5cd  missed a few more refs  (dbsanfte, Jul 30, 2025)
0704760  fix more refs  (dbsanfte, Jul 30, 2025)
b97dfcb  add hugepages cleanup on exit  (dbsanfte, Jul 30, 2025)
d1d3ebd  more fixes to cleanup  (dbsanfte, Jul 30, 2025)
bf2d65e  more cleanup robustness  (dbsanfte, Jul 30, 2025)
0f4bf89  robustness ++  (dbsanfte, Jul 30, 2025)
b956e4c  don't try to emplace_back() on NUMA stuff  (dbsanfte, Jul 30, 2025)
7faf58a  don't munmap in numa in destructor  (dbsanfte, Jul 30, 2025)
fa72aa3  don't try to unmap_fragment on hugepages/numa  (dbsanfte, Jul 30, 2025)
92593e7  experimental fixes for `--threads` and numa  (dbsanfte, Jul 31, 2025)
a70929d  dev container and testing notes  (dbsanfte, Jul 31, 2025)
18f3cff  dev container  (dbsanfte, Jul 31, 2025)
1a053e3  better devcontainer setup  (dbsanfte, Jul 31, 2025)
2275a66  fix for gguf multipart mappings  (dbsanfte, Jul 31, 2025)
febdec3  fix code and instructions  (dbsanfte, Aug 1, 2025)
892b02d  fix compiler warning  (dbsanfte, Aug 1, 2025)
8bbb08b  do mmaps all at once, faster  (dbsanfte, Aug 1, 2025)
f3540e6  invert switch logic for hyperthreading/efficiency cores  (dbsanfte, Aug 1, 2025)
f57ea5f  Much better thread and numa node handling. New options: --cpu-no-hype…  (dbsanfte, Aug 1, 2025)
5fa2334  fix segfault on multi-part ggufs  (dbsanfte, Aug 1, 2025)
3a9bec9  fix segfault on multipart gguf load  (dbsanfte, Aug 1, 2025)
b8ce43b  fix another segfault  (dbsanfte, Aug 1, 2025)
e60723d  another fix  (dbsanfte, Aug 1, 2025)
d82ca84  segfault fix  (dbsanfte, Aug 1, 2025)
756fba6  segfault fix guide  (dbsanfte, Aug 1, 2025)
9d66473  Merge branch 'numa-improvements-take2-iteration' into numa-improvemen…  (dbsanfte, Aug 1, 2025)
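Several of the commits above (ab37137, 14bfbf8, afbff14, 4998a45) replace direct t->data field access throughout the tree with tensor_data(t) / tensor_set_data(t, ...) accessors, so a tensor's storage can be resolved indirectly (for example, to a node-local copy) without touching every call site; that mechanical substitution is what most of the file diffs below consist of. A hypothetical sketch of such an accessor pair, usable from both C11 and C++ (the real macros live in ggml and may differ):

```cpp
// Hypothetical accessor sketch; the ggml_numa_* helper names are invented for illustration.
#ifdef GGML_NUMA_MIRROR
    // With mirroring enabled, resolve the data pointer through a helper that can hand
    // back the replica local to the calling thread's NUMA node.
    #define tensor_data(t)         ggml_numa_tensor_data(t)
    #define tensor_set_data(t, p)  ggml_numa_tensor_set_data((t), (p))
#else
    // Without mirroring, the accessors degenerate to the plain field access they replace.
    #define tensor_data(t)         ((t)->data)
    #define tensor_set_data(t, p)  ((t)->data = (p))
#endif
```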
4 changes: 4 additions & 0 deletions .gitignore
@@ -32,6 +32,8 @@
.swiftpm
.vs/
.vscode/
.devcontainer/
.github/copilot-instructions.md
nppBackup


@@ -146,3 +148,5 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
Testing/Temporary/CTestCostData.txt

22 changes: 22 additions & 0 deletions common/arg.cpp
@@ -1386,6 +1386,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.cpuparams_batch.strict_cpu = value;
}
));
add_opt(common_arg(
{"--cpu-no-hyperthreading"},
"disable hyperthreading/SMT for math operations (use only physical cores)",
[](common_params & params) {
params.cpuparams.use_hyperthreading = false;
}
));
add_opt(common_arg(
{"--cpu-no-efficiency-cores"},
"disable efficiency cores (E-cores) for math operations (use only performance cores)",
[](common_params & params) {
params.cpuparams.use_efficiency_cores = false;
}
));
add_opt(common_arg(
{"--cpu-topology"},
"print detailed CPU topology information and exit",
[](common_params & /*params*/) {
cpu_print_topology_info();
exit(0);
}
));
add_opt(common_arg(
{"--prio-batch"}, "N",
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
215 changes: 200 additions & 15 deletions common/common.cpp
@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {

#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h>
#include <map>
#include <set>

static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,116 @@ static bool is_running_on_efficiency_core(void) {
return core_type == intel_atom;
}

static int cpu_count_math_cpus(int n_cpu) {
int result = 0;
for (int cpu = 0; cpu < n_cpu; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
// Structure to hold detailed CPU topology information
struct cpu_topology_info {
int total_logical_cpus;
int total_physical_cores;
int performance_cores;
int efficiency_cores;
std::vector<std::vector<int>> core_siblings; // Groups of hyperthreaded CPUs
std::vector<int> performance_cpus; // CPU IDs that are performance cores
std::vector<int> efficiency_cpus; // CPU IDs that are efficiency cores
};

static cpu_topology_info detect_cpu_topology() {
cpu_topology_info info = {};
info.total_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN);

// Map to group CPUs by their thread siblings
std::map<std::string, std::vector<int>> sibling_groups;

// Read topology information for each CPU
for (int cpu = 0; cpu < info.total_logical_cpus; ++cpu) {
// Read thread siblings to identify hyperthreading groups
std::ifstream siblings_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
if (siblings_file.is_open()) {
std::string siblings_str;
std::getline(siblings_file, siblings_str);
sibling_groups[siblings_str].push_back(cpu);
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading

// Test if this CPU is a performance or efficiency core
if (pin_cpu(cpu) == 0) {
if (is_running_on_efficiency_core()) {
info.efficiency_cpus.push_back(cpu);
} else {
info.performance_cpus.push_back(cpu);
}
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;

// Convert sibling groups to core_siblings vector
for (const auto& group : sibling_groups) {
info.core_siblings.push_back(group.second);
}

info.total_physical_cores = info.core_siblings.size();
info.performance_cores = info.performance_cpus.size();
info.efficiency_cores = info.efficiency_cpus.size();

return info;
}

static int cpu_count_math_cpus(int n_cpu, bool use_hyperthreading = false, bool use_efficiency_cores = false) {
GGML_UNUSED(n_cpu);
cpu_topology_info topo = detect_cpu_topology();

std::vector<int> selected_cpus;

// First, select which types of cores to use
std::vector<int> candidate_cpus;
if (!use_efficiency_cores) {
// Use only performance cores
candidate_cpus = topo.performance_cpus;
} else {
// Use all cores
candidate_cpus.reserve(topo.total_logical_cpus);
candidate_cpus.insert(candidate_cpus.end(), topo.performance_cpus.begin(), topo.performance_cpus.end());
candidate_cpus.insert(candidate_cpus.end(), topo.efficiency_cpus.begin(), topo.efficiency_cpus.end());
}

if (use_hyperthreading) {
// Use all candidate CPUs
selected_cpus = candidate_cpus;
} else {
// Select only one CPU per physical core
std::set<size_t> used_cores;
for (int cpu : candidate_cpus) {
// Find which core group this CPU belongs to
for (const auto& core_group : topo.core_siblings) {
if (std::find(core_group.begin(), core_group.end(), cpu) != core_group.end()) {
// Use a hash of the core group to identify unique cores
std::string core_id;
for (int sibling : core_group) {
core_id += std::to_string(sibling) + ",";
}
size_t core_hash = std::hash<std::string>{}(core_id);

if (used_cores.find(core_hash) == used_cores.end()) {
selected_cpus.push_back(cpu);
used_cores.insert(core_hash);
}
break;
}
}
}
}

// Validate selected CPUs by attempting to pin to them
int valid_count = 0;
cpu_set_t original_affinity;
pthread_getaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);

for (int cpu : selected_cpus) {
if (pin_cpu(cpu) == 0) {
valid_count++;
}
}

// Restore original affinity
pthread_setaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);

return valid_count;
}

#endif // __x86_64__ && __linux__
@@ -178,10 +277,40 @@ int32_t cpu_get_num_math() {
if (n_cpu < 1) {
return cpu_get_num_physical_cores();
}

if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
// Default behavior: use hyperthreading and efficiency cores for math
// This can be overridden by environment variables or command-line options
bool use_hyperthreading = std::getenv("LLAMA_CPU_NO_HYPERTHREADING") == nullptr;
bool use_efficiency_cores = std::getenv("LLAMA_CPU_NO_EFFICIENCY_CORES") == nullptr;

int result = cpu_count_math_cpus(n_cpu, use_hyperthreading, use_efficiency_cores);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
}
}
}
#endif
return cpu_get_num_physical_cores();
}

/**
* Returns the number of CPUs on the system that are useful for math, respecting cpu_params.
*/
int32_t cpu_get_num_math_from_params(const cpu_params & params) {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
if (n_cpu < 1) {
return cpu_get_num_physical_cores();
}

if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
int result = cpu_count_math_cpus(n_cpu);
int result = cpu_count_math_cpus(n_cpu, params.use_hyperthreading, params.use_efficiency_cores);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
@@ -192,6 +321,62 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}

/**
* Print CPU topology information for debugging
*/
void cpu_print_topology_info() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
if (is_hybrid_cpu()) {
cpu_topology_info topo = detect_cpu_topology();

printf("CPU Topology Information:\n");
printf(" Total logical CPUs: %d\n", topo.total_logical_cpus);
printf(" Total physical cores: %d\n", topo.total_physical_cores);
printf(" Performance cores: %d\n", topo.performance_cores);
printf(" Efficiency cores: %d\n", topo.efficiency_cores);

printf(" Performance CPU IDs: ");
for (size_t i = 0; i < topo.performance_cpus.size(); ++i) {
if (i > 0) printf(", ");
printf("%d", topo.performance_cpus[i]);
}
printf("\n");

if (!topo.efficiency_cpus.empty()) {
printf(" Efficiency CPU IDs: ");
for (size_t i = 0; i < topo.efficiency_cpus.size(); ++i) {
if (i > 0) printf(", ");
printf("%d", topo.efficiency_cpus[i]);
}
printf("\n");
}

printf(" Core sibling groups (hyperthreading):\n");
for (size_t i = 0; i < topo.core_siblings.size(); ++i) {
printf(" Core %zu: ", i);
for (size_t j = 0; j < topo.core_siblings[i].size(); ++j) {
if (j > 0) printf(", ");
printf("%d", topo.core_siblings[i][j]);
}
printf("\n");
}

// Show what would be selected with different options
printf("\n Thread count recommendations:\n");
printf(" Default (P-cores + hyperthreading): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, false));
printf(" Without hyperthreading: %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, false));
printf(" With E-cores (+ HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, true));
printf(" With E-cores (no HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, true));
} else {
printf("CPU Topology: Non-hybrid CPU detected\n");
printf(" Physical cores: %d\n", cpu_get_num_physical_cores());
printf(" Logical CPUs: %d\n", (int)std::thread::hardware_concurrency());
}
#else
printf("CPU topology detection not available on this platform\n");
#endif
}

// Helper for setting process priority

#if defined(_WIN32)
@@ -258,7 +443,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
cpuparams.n_threads = cpu_get_num_math_from_params(cpuparams);
}
}

@@ -1495,7 +1680,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

const float * src = (const float *) tensor->data;
const float * src = (const float *) tensor_data(tensor);
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
@@ -1554,8 +1739,8 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
ggml_opt_dataset_t result = ggml_opt_dataset_init(
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
llama_token * data = (llama_token *) tensor_data(ggml_opt_dataset_data(result));
llama_token * labels = (llama_token *) tensor_data(ggml_opt_dataset_labels(result));

for (int64_t idata = 0; idata < ndata; ++idata) {
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
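Putting the common.cpp additions together, a caller would drive the new core selection roughly as follows; this is a usage sketch against the functions added above, with the surrounding llama.cpp plumbing omitted.

```cpp
// Usage sketch for the new topology helpers (assumes the common.h declarations shown below).
#include "common.h"
#include <cstdio>

int main() {
    cpu_params params;
    params.use_hyperthreading   = false; // what --cpu-no-hyperthreading sets
    params.use_efficiency_cores = false; // what --cpu-no-efficiency-cores sets

    cpu_print_topology_info(); // what --cpu-topology prints before exiting

    // Resolve the math thread count under the chosen policy.
    const int32_t n_math = cpu_get_num_math_from_params(params);
    printf("using %d math threads\n", n_math);
    return 0;
}
```

As a worked example, on a hypothetical hybrid CPU with 8 P-cores (two threads each) and 4 single-threaded E-cores, the selection logic yields 16 threads for P-cores with hyperthreading, 8 for P-cores only, 20 when E-cores are included, and 12 with E-cores but no hyperthreading, matching the four "Thread count recommendations" lines printed by cpu_print_topology_info().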
4 changes: 4 additions & 0 deletions common/common.h
@@ -55,10 +55,14 @@ struct cpu_params {
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
bool use_hyperthreading = true; // Use hyperthreading/SMT for math operations (enabled by default)
bool use_efficiency_cores = true; // Use efficiency cores (E-cores) for math operations (enabled by default)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
int32_t cpu_get_num_math_from_params(const cpu_params & params);
void cpu_print_topology_info();

//
// Common params
4 changes: 2 additions & 2 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -408,12 +408,12 @@ static void init_model(struct my_llama_model * model) {
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
float * ptr = (float *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
int32_t * ptr = (int32_t *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}

2 changes: 1 addition & 1 deletion examples/eval-callback/eval-callback.cpp
@@ -121,7 +121,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}

if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
uint8_t * data = is_host ? (uint8_t *) tensor_data(t) : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}

2 changes: 1 addition & 1 deletion examples/gguf-hash/gguf-hash.cpp
@@ -336,7 +336,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
auto *raw_data = tensor_data(cur);
const std::string tensor_layer_name = fname + ":" + name;

if (hash_params.xxh64) {
8 changes: 4 additions & 4 deletions examples/gguf/gguf.cpp
@@ -63,7 +63,7 @@ static bool gguf_ex_write(const std::string & fname) {
ggml_set_name(cur, name.c_str());

{
float * data = (float *) cur->data;
float * data = (float *) tensor_data(cur);
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
@@ -201,10 +201,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, tensor_data(cur));

// print first 10 elements
const float * data = (const float *) cur->data;
const float * data = (const float *) tensor_data(cur);

printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
@@ -214,7 +214,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

// check data
if (check_data) {
const float * data = (const float *) cur->data;
const float * data = (const float *) tensor_data(cur);
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));