
Implementation of GGML_NUMA_MIRROR for inference performance gains on NUMA systems #14969
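For context: the idea behind a NUMA mirror is to keep one copy of the read-only model weights on every NUMA node, so each worker thread reads tensor data from node-local memory instead of pulling it across the socket interconnect on every token. The sketch below illustrates that idea with libnuma; it is a minimal illustration under assumed names, not the code in this PR, which (per the commits below) works through hugepage-backed mappings and a tensor_data() indirection inside ggml.

```cpp
// Illustrative sketch of NUMA weight mirroring (not the PR's implementation).
// Assumes Linux + libnuma (link with -lnuma); cleanup and error handling are abbreviated.
#include <numa.h>
#include <sched.h>
#include <cstring>
#include <vector>

struct numa_mirror {
    std::vector<void *> replicas; // one copy of the weights per NUMA node
    size_t size = 0;
};

// Replicate a read-only weight buffer onto every configured NUMA node.
static bool numa_mirror_weights(const void * src, size_t size, numa_mirror & out) {
    if (numa_available() < 0) {
        return false; // no kernel/libnuma support
    }
    out.size = size;
    const int n_nodes = numa_num_configured_nodes();
    for (int node = 0; node < n_nodes; ++node) {
        void * copy = numa_alloc_onnode(size, node); // pages physically backed on `node`
        if (copy == nullptr) {
            return false;
        }
        memcpy(copy, src, size);
        out.replicas.push_back(copy);
    }
    return true;
}

// A worker thread picks the replica that lives on the node it is currently running on.
static const void * numa_local_weights(const numa_mirror & m) {
    const int node = numa_node_of_cpu(sched_getcpu());
    return (node >= 0 && (size_t) node < m.replicas.size()) ? m.replicas[node] : m.replicas[0];
}
```

The matrix-multiplication workers then read from their node-local replica, which is where the inference-time gain on multi-socket systems comes from.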


Draft: wants to merge 46 commits into master.
Commits (46, changes from all commits):
f98ac2d  merge in changes by @wkgcass  (dbsanfte, Jul 29, 2025)
99b0e80  revert inadvertent change  (dbsanfte, Jul 29, 2025)
c060a26  reverse ifdef logic  (dbsanfte, Jul 29, 2025)
824831b  fix padding  (dbsanfte, Jul 29, 2025)
daed6a1  print the struct offset at compile time to make this less annoying  (dbsanfte, Jul 29, 2025)
8956732  fix  (dbsanfte, Jul 29, 2025)
b00126a  undo cleverness  (dbsanfte, Jul 29, 2025)
c2ba046  fix typos  (dbsanfte, Jul 29, 2025)
b822399  fix typo  (dbsanfte, Jul 29, 2025)
7e53968  fix padding  (dbsanfte, Jul 29, 2025)
ab37137  refactor more t->data to tensor_data(t) etc  (dbsanfte, Jul 30, 2025)
9b8e73f  Merge branch 'numa-improvements-take2' of https://github.com/dbsanfte…  (dbsanfte, Jul 30, 2025)
14bfbf8  make a smarter macro for tensor_data / tensor_set_data to handle both…  (dbsanfte, Jul 30, 2025)
7cfc6a7  fix typo  (dbsanfte, Jul 30, 2025)
afbff14  fix for both C11 and cpp  (dbsanfte, Jul 30, 2025)
4f0c3cb  another try at a fix  (dbsanfte, Jul 30, 2025)
ea046b9  another try...  (dbsanfte, Jul 30, 2025)
1553dda  revert changes to ggml.h  (dbsanfte, Jul 30, 2025)
4998a45  actually why not just pass the memory address of the instance...  (dbsanfte, Jul 30, 2025)
debae5f  missed a few refs  (dbsanfte, Jul 30, 2025)
ebaf5cd  missed a few more refs  (dbsanfte, Jul 30, 2025)
0704760  fix more refs  (dbsanfte, Jul 30, 2025)
b97dfcb  add hugepages cleanup on exit  (dbsanfte, Jul 30, 2025)
d1d3ebd  more fixes to cleanup  (dbsanfte, Jul 30, 2025)
bf2d65e  more cleanup robustness  (dbsanfte, Jul 30, 2025)
0f4bf89  robustness ++  (dbsanfte, Jul 30, 2025)
b956e4c  don't try to emplace_back() on NUMA stuff  (dbsanfte, Jul 30, 2025)
7faf58a  don't munmap in numa in destructor  (dbsanfte, Jul 30, 2025)
fa72aa3  don't try to unmap_fragment on hugepages/numa  (dbsanfte, Jul 30, 2025)
92593e7  experimental fixes for `--threads` and numa  (dbsanfte, Jul 31, 2025)
a70929d  dev container and testing notes  (dbsanfte, Jul 31, 2025)
18f3cff  dev container  (dbsanfte, Jul 31, 2025)
1a053e3  better devcontainer setup  (dbsanfte, Jul 31, 2025)
2275a66  fix for gguf multipart mappings  (dbsanfte, Jul 31, 2025)
febdec3  fix code and instructions  (dbsanfte, Aug 1, 2025)
892b02d  fix compiler warning  (dbsanfte, Aug 1, 2025)
8bbb08b  do mmaps all at once, faster  (dbsanfte, Aug 1, 2025)
f3540e6  invert switch logic for hyperthreading/efficiency cores  (dbsanfte, Aug 1, 2025)
f57ea5f  Much better thread and numa node handling. New options: --cpu-no-hype…  (dbsanfte, Aug 1, 2025)
5fa2334  fix segfault on multi-part ggufs  (dbsanfte, Aug 1, 2025)
3a9bec9  fix segfault on multipart gguf load  (dbsanfte, Aug 1, 2025)
b8ce43b  fix another segfault  (dbsanfte, Aug 1, 2025)
e60723d  another fix  (dbsanfte, Aug 1, 2025)
d82ca84  segfault fix  (dbsanfte, Aug 1, 2025)
756fba6  segfault fix guide  (dbsanfte, Aug 1, 2025)
9d66473  Merge branch 'numa-improvements-take2-iteration' into numa-improvemen…  (dbsanfte, Aug 1, 2025)
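Several of the commits above (ab37137, 14bfbf8, afbff14, 4998a45) replace direct t->data field access throughout the tree with tensor_data(t) / tensor_set_data(t, ...) accessors, so a tensor's storage can be resolved indirectly (for example, to a node-local copy) without touching every call site; that mechanical substitution is what most of the file diffs below consist of. A hypothetical sketch of such an accessor pair, usable from both C11 and C++ (the real macros live in ggml and may differ):

```cpp
// Hypothetical accessor sketch; the ggml_numa_* helper names are invented for illustration.
#ifdef GGML_NUMA_MIRROR
    // With mirroring enabled, resolve the data pointer through a helper that can hand
    // back the replica local to the calling thread's NUMA node.
    #define tensor_data(t)         ggml_numa_tensor_data(t)
    #define tensor_set_data(t, p)  ggml_numa_tensor_set_data((t), (p))
#else
    // Without mirroring, the accessors degenerate to the plain field access they replace.
    #define tensor_data(t)         ((t)->data)
    #define tensor_set_data(t, p)  ((t)->data = (p))
#endif
```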
4 changes: 4 additions & 0 deletions .gitignore
@@ -32,6 +32,8 @@
.swiftpm
.vs/
.vscode/
.devcontainer/
.github/copilot-instructions.md
nppBackup


@@ -146,3 +148,5 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
Testing/Temporary/CTestCostData.txt

22 changes: 22 additions & 0 deletions common/arg.cpp
@@ -1386,6 +1386,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.cpuparams_batch.strict_cpu = value;
}
));
add_opt(common_arg(
{"--cpu-no-hyperthreading"},
"disable hyperthreading/SMT for math operations (use only physical cores)",
[](common_params & params) {
params.cpuparams.use_hyperthreading = false;
}
));
add_opt(common_arg(
{"--cpu-no-efficiency-cores"},
"disable efficiency cores (E-cores) for math operations (use only performance cores)",
[](common_params & params) {
params.cpuparams.use_efficiency_cores = false;
}
));
add_opt(common_arg(
{"--cpu-topology"},
"print detailed CPU topology information and exit",
[](common_params & /*params*/) {
cpu_print_topology_info();
exit(0);
}
));
add_opt(common_arg(
{"--prio-batch"}, "N",
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
215 changes: 200 additions & 15 deletions common/common.cpp
@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {

#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h>
#include <map>
#include <set>

static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,116 @@ static bool is_running_on_efficiency_core(void) {
return core_type == intel_atom;
}

static int cpu_count_math_cpus(int n_cpu) {
int result = 0;
for (int cpu = 0; cpu < n_cpu; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
// Structure to hold detailed CPU topology information
struct cpu_topology_info {
int total_logical_cpus;
int total_physical_cores;
int performance_cores;
int efficiency_cores;
std::vector<std::vector<int>> core_siblings; // Groups of hyperthreaded CPUs
std::vector<int> performance_cpus; // CPU IDs that are performance cores
std::vector<int> efficiency_cpus; // CPU IDs that are efficiency cores
};

static cpu_topology_info detect_cpu_topology() {
cpu_topology_info info = {};
info.total_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN);

// Map to group CPUs by their thread siblings
std::map<std::string, std::vector<int>> sibling_groups;

// Read topology information for each CPU
for (int cpu = 0; cpu < info.total_logical_cpus; ++cpu) {
// Read thread siblings to identify hyperthreading groups
std::ifstream siblings_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
if (siblings_file.is_open()) {
std::string siblings_str;
std::getline(siblings_file, siblings_str);
sibling_groups[siblings_str].push_back(cpu);
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading

// Test if this CPU is a performance or efficiency core
if (pin_cpu(cpu) == 0) {
if (is_running_on_efficiency_core()) {
info.efficiency_cpus.push_back(cpu);
} else {
info.performance_cpus.push_back(cpu);
}
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;

// Convert sibling groups to core_siblings vector
for (const auto& group : sibling_groups) {
info.core_siblings.push_back(group.second);
}

info.total_physical_cores = info.core_siblings.size();
info.performance_cores = info.performance_cpus.size();
info.efficiency_cores = info.efficiency_cpus.size();

return info;
}

static int cpu_count_math_cpus(int n_cpu, bool use_hyperthreading = false, bool use_efficiency_cores = false) {
GGML_UNUSED(n_cpu);
cpu_topology_info topo = detect_cpu_topology();

std::vector<int> selected_cpus;

// First, select which types of cores to use
std::vector<int> candidate_cpus;
if (!use_efficiency_cores) {
// Use only performance cores
candidate_cpus = topo.performance_cpus;
} else {
// Use all cores
candidate_cpus.reserve(topo.total_logical_cpus);
candidate_cpus.insert(candidate_cpus.end(), topo.performance_cpus.begin(), topo.performance_cpus.end());
candidate_cpus.insert(candidate_cpus.end(), topo.efficiency_cpus.begin(), topo.efficiency_cpus.end());
}

if (use_hyperthreading) {
// Use all candidate CPUs
selected_cpus = candidate_cpus;
} else {
// Select only one CPU per physical core
std::set<size_t> used_cores;
for (int cpu : candidate_cpus) {
// Find which core group this CPU belongs to
for (const auto& core_group : topo.core_siblings) {
if (std::find(core_group.begin(), core_group.end(), cpu) != core_group.end()) {
// Use a hash of the core group to identify unique cores
std::string core_id;
for (int sibling : core_group) {
core_id += std::to_string(sibling) + ",";
}
size_t core_hash = std::hash<std::string>{}(core_id);

if (used_cores.find(core_hash) == used_cores.end()) {
selected_cpus.push_back(cpu);
used_cores.insert(core_hash);
}
break;
}
}
}
}

// Validate selected CPUs by attempting to pin to them
int valid_count = 0;
cpu_set_t original_affinity;
pthread_getaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);

for (int cpu : selected_cpus) {
if (pin_cpu(cpu) == 0) {
valid_count++;
}
}

// Restore original affinity
pthread_setaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);

return valid_count;
}

#endif // __x86_64__ && __linux__
@@ -178,10 +277,40 @@ int32_t cpu_get_num_math() {
if (n_cpu < 1) {
return cpu_get_num_physical_cores();
}

if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
// Default behavior: use hyperthreading and efficiency cores for math
// This can be overridden by environment variables or command-line options
bool use_hyperthreading = std::getenv("LLAMA_CPU_NO_HYPERTHREADING") == nullptr;
bool use_efficiency_cores = std::getenv("LLAMA_CPU_NO_EFFICIENCY_CORES") == nullptr;

int result = cpu_count_math_cpus(n_cpu, use_hyperthreading, use_efficiency_cores);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
}
}
}
#endif
return cpu_get_num_physical_cores();
}

/**
* Returns the number of CPUs on the system that are useful for math, respecting cpu_params.
*/
int32_t cpu_get_num_math_from_params(const cpu_params & params) {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
if (n_cpu < 1) {
return cpu_get_num_physical_cores();
}

if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
int result = cpu_count_math_cpus(n_cpu);
int result = cpu_count_math_cpus(n_cpu, params.use_hyperthreading, params.use_efficiency_cores);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
@@ -192,6 +321,62 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}

/**
* Print CPU topology information for debugging
*/
void cpu_print_topology_info() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
if (is_hybrid_cpu()) {
cpu_topology_info topo = detect_cpu_topology();

printf("CPU Topology Information:\n");
printf(" Total logical CPUs: %d\n", topo.total_logical_cpus);
printf(" Total physical cores: %d\n", topo.total_physical_cores);
printf(" Performance cores: %d\n", topo.performance_cores);
printf(" Efficiency cores: %d\n", topo.efficiency_cores);

printf(" Performance CPU IDs: ");
for (size_t i = 0; i < topo.performance_cpus.size(); ++i) {
if (i > 0) printf(", ");
printf("%d", topo.performance_cpus[i]);
}
printf("\n");

if (!topo.efficiency_cpus.empty()) {
printf(" Efficiency CPU IDs: ");
for (size_t i = 0; i < topo.efficiency_cpus.size(); ++i) {
if (i > 0) printf(", ");
printf("%d", topo.efficiency_cpus[i]);
}
printf("\n");
}

printf(" Core sibling groups (hyperthreading):\n");
for (size_t i = 0; i < topo.core_siblings.size(); ++i) {
printf(" Core %zu: ", i);
for (size_t j = 0; j < topo.core_siblings[i].size(); ++j) {
if (j > 0) printf(", ");
printf("%d", topo.core_siblings[i][j]);
}
printf("\n");
}

// Show what would be selected with different options
printf("\n Thread count recommendations:\n");
printf(" Default (P-cores + hyperthreading): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, false));
printf(" Without hyperthreading: %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, false));
printf(" With E-cores (+ HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, true));
printf(" With E-cores (no HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, true));
} else {
printf("CPU Topology: Non-hybrid CPU detected\n");
printf(" Physical cores: %d\n", cpu_get_num_physical_cores());
printf(" Logical CPUs: %d\n", (int)std::thread::hardware_concurrency());
}
#else
printf("CPU topology detection not available on this platform\n");
#endif
}

// Helper for setting process priority

#if defined(_WIN32)
@@ -258,7 +443,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
cpuparams.n_threads = cpu_get_num_math_from_params(cpuparams);
}
}

@@ -1495,7 +1680,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

const float * src = (const float *) tensor->data;
const float * src = (const float *) tensor_data(tensor);
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
@@ -1554,8 +1739,8 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
ggml_opt_dataset_t result = ggml_opt_dataset_init(
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
llama_token * data = (llama_token *) tensor_data(ggml_opt_dataset_data(result));
llama_token * labels = (llama_token *) tensor_data(ggml_opt_dataset_labels(result));

for (int64_t idata = 0; idata < ndata; ++idata) {
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
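Putting the common.cpp additions together, a caller would drive the new core selection roughly as follows; this is a usage sketch against the functions added above, with the surrounding llama.cpp plumbing omitted.

```cpp
// Usage sketch for the new topology helpers (assumes the common.h declarations shown below).
#include "common.h"
#include <cstdio>

int main() {
    cpu_params params;
    params.use_hyperthreading   = false; // what --cpu-no-hyperthreading sets
    params.use_efficiency_cores = false; // what --cpu-no-efficiency-cores sets

    cpu_print_topology_info(); // what --cpu-topology prints before exiting

    // Resolve the math thread count under the chosen policy.
    const int32_t n_math = cpu_get_num_math_from_params(params);
    printf("using %d math threads\n", n_math);
    return 0;
}
```

As a worked example, on a hypothetical hybrid CPU with 8 P-cores (two threads each) and 4 single-threaded E-cores, the selection logic yields 16 threads for P-cores with hyperthreading, 8 for P-cores only, 20 when E-cores are included, and 12 with E-cores but no hyperthreading, matching the four "Thread count recommendations" lines printed by cpu_print_topology_info().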
4 changes: 4 additions & 0 deletions common/common.h
@@ -55,10 +55,14 @@ struct cpu_params {
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
bool use_hyperthreading = true; // Use hyperthreading/SMT for math operations (enabled by default)
bool use_efficiency_cores = true; // Use efficiency cores (E-cores) for math operations (enabled by default)
};

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
int32_t cpu_get_num_math_from_params(const cpu_params & params);
void cpu_print_topology_info();

//
// Common params
4 changes: 2 additions & 2 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -408,12 +408,12 @@ static void init_model(struct my_llama_model * model) {
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
float * ptr = (float *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
int32_t * ptr = (int32_t *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}

2 changes: 1 addition & 1 deletion examples/eval-callback/eval-callback.cpp
@@ -121,7 +121,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}

if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
uint8_t * data = is_host ? (uint8_t *) tensor_data(t) : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}

2 changes: 1 addition & 1 deletion examples/gguf-hash/gguf-hash.cpp
@@ -336,7 +336,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
auto *raw_data = tensor_data(cur);
const std::string tensor_layer_name = fname + ":" + name;

if (hash_params.xxh64) {
8 changes: 4 additions & 4 deletions examples/gguf/gguf.cpp
@@ -63,7 +63,7 @@ static bool gguf_ex_write(const std::string & fname) {
ggml_set_name(cur, name.c_str());

{
float * data = (float *) cur->data;
float * data = (float *) tensor_data(cur);
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
@@ -201,10 +201,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, tensor_data(cur));

// print first 10 elements
const float * data = (const float *) cur->data;
const float * data = (const float *) tensor_data(cur);

printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
@@ -214,7 +214,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

// check data
if (check_data) {
const float * data = (const float *) cur->data;
const float * data = (const float *) tensor_data(cur);
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));