diff --git a/.gitignore b/.gitignore
index f8ceb1560a1df..003669d47a0f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,8 @@
 .swiftpm
 .vs/
 .vscode/
+.devcontainer/
+.github/copilot-instructions.md
 nppBackup
@@ -146,3 +148,5 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
+Testing/Temporary/CTestCostData.txt
+
diff --git a/common/arg.cpp b/common/arg.cpp
index 060053595dbfd..a475ec45f590e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1386,6 +1386,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.strict_cpu = value;
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-no-hyperthreading"},
+        "disable hyperthreading/SMT for math operations (use only physical cores)",
+        [](common_params & params) {
+            params.cpuparams.use_hyperthreading = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-no-efficiency-cores"},
+        "disable efficiency cores (E-cores) for math operations (use only performance cores)",
+        [](common_params & params) {
+            params.cpuparams.use_efficiency_cores = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-topology"},
+        "print detailed CPU topology information and exit",
+        [](common_params & /*params*/) {
+            cpu_print_topology_info();
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"--prio-batch"}, "N",
         string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
diff --git a/common/common.cpp b/common/common.cpp
index d8c4d988b6f8b..2cc1cff89ff49 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {
 
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
 #include <pthread.h>
+#include <fstream>
+#include <map>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
                   unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,116 @@ static bool is_running_on_efficiency_core(void) {
     return core_type == intel_atom;
 }
 
-static int cpu_count_math_cpus(int n_cpu) {
-    int result = 0;
-    for (int cpu = 0; cpu < n_cpu; ++cpu) {
-        if (pin_cpu(cpu)) {
-            return -1;
+// Structure to hold detailed CPU topology information
+struct cpu_topology_info {
+    int total_logical_cpus;
+    int total_physical_cores;
+    int performance_cores;
+    int efficiency_cores;
+    std::vector<std::vector<int>> core_siblings; // Groups of hyperthreaded CPUs
+    std::vector<int> performance_cpus;           // CPU IDs that are performance cores
+    std::vector<int> efficiency_cpus;            // CPU IDs that are efficiency cores
+};
+
+static cpu_topology_info detect_cpu_topology() {
+    cpu_topology_info info = {};
+    info.total_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+    // Map to group CPUs by their thread siblings
+    std::map<std::string, std::vector<int>> sibling_groups;
+
+    // Read topology information for each CPU
+    for (int cpu = 0; cpu < info.total_logical_cpus; ++cpu) {
+        // Read thread siblings to identify hyperthreading groups
+        std::ifstream siblings_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
+        if (siblings_file.is_open()) {
+            std::string siblings_str;
+            std::getline(siblings_file, siblings_str);
+            sibling_groups[siblings_str].push_back(cpu);
         }
-        if (is_running_on_efficiency_core()) {
-            continue; // efficiency cores harm lockstep threading
+
+        // Test if this CPU is a performance or efficiency core
+        if (pin_cpu(cpu) == 0) {
+            if (is_running_on_efficiency_core()) {
+                info.efficiency_cpus.push_back(cpu);
+            } else {
+                info.performance_cpus.push_back(cpu);
+            }
         }
-        ++cpu; // hyperthreading isn't useful for linear algebra
-        ++result;
     }
-    return result;
+
+    // Convert sibling groups to core_siblings vector
+    for (const auto& group : sibling_groups) {
+        info.core_siblings.push_back(group.second);
+    }
+
+    info.total_physical_cores = info.core_siblings.size();
+    info.performance_cores    = info.performance_cpus.size();
+    info.efficiency_cores     = info.efficiency_cpus.size();
+
+    return info;
+}
+
+static int cpu_count_math_cpus(int n_cpu, bool use_hyperthreading = false, bool use_efficiency_cores = false) {
+    GGML_UNUSED(n_cpu);
+    cpu_topology_info topo = detect_cpu_topology();
+
+    std::vector<int> selected_cpus;
+
+    // First, select which types of cores to use
+    std::vector<int> candidate_cpus;
+    if (!use_efficiency_cores) {
+        // Use only performance cores
+        candidate_cpus = topo.performance_cpus;
+    } else {
+        // Use all cores
+        candidate_cpus.reserve(topo.total_logical_cpus);
+        candidate_cpus.insert(candidate_cpus.end(), topo.performance_cpus.begin(), topo.performance_cpus.end());
+        candidate_cpus.insert(candidate_cpus.end(), topo.efficiency_cpus.begin(), topo.efficiency_cpus.end());
+    }
+
+    if (use_hyperthreading) {
+        // Use all candidate CPUs
+        selected_cpus = candidate_cpus;
+    } else {
+        // Select only one CPU per physical core
+        std::set<size_t> used_cores;
+        for (int cpu : candidate_cpus) {
+            // Find which core group this CPU belongs to
+            for (const auto& core_group : topo.core_siblings) {
+                if (std::find(core_group.begin(), core_group.end(), cpu) != core_group.end()) {
+                    // Use a hash of the core group to identify unique cores
+                    std::string core_id;
+                    for (int sibling : core_group) {
+                        core_id += std::to_string(sibling) + ",";
+                    }
+                    size_t core_hash = std::hash<std::string>{}(core_id);
+
+                    if (used_cores.find(core_hash) == used_cores.end()) {
+                        selected_cpus.push_back(cpu);
+                        used_cores.insert(core_hash);
+                    }
+                    break;
+                }
+            }
+        }
+    }
+
+    // Validate selected CPUs by attempting to pin to them
+    int valid_count = 0;
+    cpu_set_t original_affinity;
+    pthread_getaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);
+
+    for (int cpu : selected_cpus) {
+        if (pin_cpu(cpu) == 0) {
+            valid_count++;
+        }
+    }
+
+    // Restore original affinity
+    pthread_setaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);
+
+    return valid_count;
 }
 
 #endif // __x86_64__ && __linux__
@@ -178,10 +277,40 @@ int32_t cpu_get_num_math() {
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();
     }
+
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            // Default behavior: use hyperthreading and efficiency cores for math
+            // This can be overridden by environment variables or command-line options
+            bool use_hyperthreading   = std::getenv("LLAMA_CPU_NO_HYPERTHREADING")   == nullptr;
+            bool use_efficiency_cores = std::getenv("LLAMA_CPU_NO_EFFICIENCY_CORES") == nullptr;
+
+            int result = cpu_count_math_cpus(n_cpu, use_hyperthreading, use_efficiency_cores);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return cpu_get_num_physical_cores();
+}
+
+/**
+ * Returns number of CPUs on system that are useful for math, respecting cpu_params.
+ */ +int32_t cpu_get_num_math_from_params(const cpu_params & params) { +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) + int n_cpu = sysconf(_SC_NPROCESSORS_ONLN); + if (n_cpu < 1) { + return cpu_get_num_physical_cores(); + } + if (is_hybrid_cpu()) { cpu_set_t affinity; if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { - int result = cpu_count_math_cpus(n_cpu); + int result = cpu_count_math_cpus(n_cpu, params.use_hyperthreading, params.use_efficiency_cores); pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); if (result > 0) { return result; @@ -192,6 +321,62 @@ int32_t cpu_get_num_math() { return cpu_get_num_physical_cores(); } +/** + * Print CPU topology information for debugging + */ +void cpu_print_topology_info() { +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) + if (is_hybrid_cpu()) { + cpu_topology_info topo = detect_cpu_topology(); + + printf("CPU Topology Information:\n"); + printf(" Total logical CPUs: %d\n", topo.total_logical_cpus); + printf(" Total physical cores: %d\n", topo.total_physical_cores); + printf(" Performance cores: %d\n", topo.performance_cores); + printf(" Efficiency cores: %d\n", topo.efficiency_cores); + + printf(" Performance CPU IDs: "); + for (size_t i = 0; i < topo.performance_cpus.size(); ++i) { + if (i > 0) printf(", "); + printf("%d", topo.performance_cpus[i]); + } + printf("\n"); + + if (!topo.efficiency_cpus.empty()) { + printf(" Efficiency CPU IDs: "); + for (size_t i = 0; i < topo.efficiency_cpus.size(); ++i) { + if (i > 0) printf(", "); + printf("%d", topo.efficiency_cpus[i]); + } + printf("\n"); + } + + printf(" Core sibling groups (hyperthreading):\n"); + for (size_t i = 0; i < topo.core_siblings.size(); ++i) { + printf(" Core %zu: ", i); + for (size_t j = 0; j < topo.core_siblings[i].size(); ++j) { + if (j > 0) printf(", "); + printf("%d", topo.core_siblings[i][j]); + } + printf("\n"); + } + + // Show what would be selected with different options + printf("\n Thread count recommendations:\n"); + printf(" Default (P-cores + hyperthreading): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, false)); + printf(" Without hyperthreading: %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, false)); + printf(" With E-cores (+ HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, true)); + printf(" With E-cores (no HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, true)); + } else { + printf("CPU Topology: Non-hybrid CPU detected\n"); + printf(" Physical cores: %d\n", cpu_get_num_physical_cores()); + printf(" Logical CPUs: %d\n", (int)std::thread::hardware_concurrency()); + } +#else + printf("CPU topology detection not available on this platform\n"); +#endif +} + // Helper for setting process priority #if defined(_WIN32) @@ -258,7 +443,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = cpu_get_num_math(); + cpuparams.n_threads = cpu_get_num_math_from_params(cpuparams); } } @@ -1495,7 +1680,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co // extend if necessary - do not store data for layer 0 (it's not used) result.data.resize(std::max(result.data.size(), static_cast(result.n_embd * layer_idx)), 0.0f); - const float * src = (const float *) tensor->data; + const float * src = (const float *) tensor_data(tensor); float * dst = result.data.data() + result.n_embd * (layer_idx 
- 1); // layer 1 at [0] for (int j = 0; j < result.n_embd; j++) { dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file @@ -1554,8 +1739,8 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std ggml_opt_dataset_t result = ggml_opt_dataset_init( GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1); - llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data; - llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data; + llama_token * data = (llama_token *) tensor_data(ggml_opt_dataset_data(result)); + llama_token * labels = (llama_token *) tensor_data(ggml_opt_dataset_labels(result)); for (int64_t idata = 0; idata < ndata; ++idata) { memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token)); diff --git a/common/common.h b/common/common.h index 00f42694eafa8..ade642821d65b 100644 --- a/common/common.h +++ b/common/common.h @@ -55,10 +55,14 @@ struct cpu_params { enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) + bool use_hyperthreading = true; // Use hyperthreading/SMT for math operations (enabled by default) + bool use_efficiency_cores = true; // Use efficiency cores (E-cores) for math operations (enabled by default) }; int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_math(); +int32_t cpu_get_num_math_from_params(const cpu_params & params); +void cpu_print_topology_info(); // // Common params diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdf0eed2a9cd3..fae03e46f9d7e 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -408,12 +408,12 @@ static void init_model(struct my_llama_model * model) { } static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + float * ptr = (float *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]); return *ptr; } static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + int32_t * ptr = (int32_t *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]); return *ptr; } diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 4afd80eb454ad..764e44d095704 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -121,7 +121,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); + uint8_t * data = is_host ? 
(uint8_t *) tensor_data(t) : cb_data->data.data();
         ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
     }
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 9523ec122f573..ce92883583781 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -336,7 +336,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
         auto n_bytes = ggml_nbytes(cur);
-        auto *raw_data = cur->data;
+        auto *raw_data = tensor_data(cur);
         const std::string tensor_layer_name = fname + ":" + name;
 
         if (hash_params.xxh64) {
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f31989c8c55c6..fb4a6d22d6d90 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -63,7 +63,7 @@ static bool gguf_ex_write(const std::string & fname) {
         ggml_set_name(cur, name.c_str());
 
         {
-            float * data = (float *) cur->data;
+            float * data = (float *) tensor_data(cur);
             for (int j = 0; j < ggml_nelements(cur); ++j) {
                 data[j] = 100 + i;
             }
@@ -201,10 +201,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
         printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
-               __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
+               __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, tensor_data(cur));
 
         // print first 10 elements
-        const float * data = (const float *) cur->data;
+        const float * data = (const float *) tensor_data(cur);
 
         printf("%s data[:10] : ", name);
         for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
@@ -214,7 +214,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 
         // check data
         if (check_data) {
-            const float * data = (const float *) cur->data;
+            const float * data = (const float *) tensor_data(cur);
             for (int j = 0; j < ggml_nelements(cur); ++j) {
                 if (data[j] != 100 + i) {
                     fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index de6d789c98a03..6010eef666f59 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -208,6 +208,8 @@ option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"
 option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
 set   (GGML_OPENCL_TARGET_VERSION    "300"  CACHE STRING "gmml: OpenCL API version to target")
+
+option(GGML_NUMA_MIRROR "ggml: support numa aware tensor data" OFF)
 
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -328,6 +330,43 @@ set(variable_set_statements
 
 set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
 
+if (GGML_NUMA_MIRROR)
+    find_library(NUMA_LIBRARY NAMES numa)
+    if (NOT NUMA_LIBRARY)
+        message(FATAL_ERROR "libnuma is not found")
+    endif()
+    message(STATUS "libnuma: ${NUMA_LIBRARY}")
+
+    if (NOT DEFINED GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET)
+        set(GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET "0x200000000000ULL")
+    endif()
+    if (NOT DEFINED GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT)
+        set(GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT "0x200000000000ULL")
+    endif()
+    if (NOT DEFINED GGML_MMAP_HUGEPAGESZ)
+        set(GGML_MMAP_HUGEPAGESZ "1073741824ULL")
+    endif()
+
+    message(STATUS
+        "-----------------\n"
+        "Enabling GGML_NUMA_MIRROR\n"
+        "Hugepages must be reserved properly,\n"
+        "and your program should have write access to /dev/hugepages\n"
+        "GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET = ${GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET}\n"
+        "GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT = ${GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT}\n"
+        "GGML_MMAP_HUGEPAGESZ = ${GGML_MMAP_HUGEPAGESZ}")
+    message(STATUS
+        "-----------------")
+
+    foreach(lib "ggml" "ggml-base")
+        target_compile_definitions(${lib} PUBLIC GGML_NUMA_MIRROR)
+        target_compile_definitions(${lib} PUBLIC GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET=${GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET})
+        target_compile_definitions(${lib} PUBLIC GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT=${GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT})
+        target_compile_definitions(${lib} PUBLIC GGML_MMAP_HUGEPAGESZ=${GGML_MMAP_HUGEPAGESZ})
+        target_link_libraries(${lib} PUBLIC ${NUMA_LIBRARY})
+    endforeach()
+endif()
+
 get_cmake_property(all_variables VARIABLES)
 foreach(variable_name IN LISTS all_variables)
     if(variable_name MATCHES "^GGML_")
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a2977ea2e56d9..c096a44ed69bb 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -86,7 +86,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
-    // "offset" refers to the offset in tensor->data for setting/getting data
+    // "offset" refers to the offset in tensor_data(tensor) for setting/getting data
     GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8a8775be36583..9bb6402503f70 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -310,6 +310,9 @@ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
     GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
 
+#define GGML_LIKELY(x)   __builtin_expect(!!(x), 1)
+#define GGML_UNLIKELY(x) __builtin_expect(!!(x), 0)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -619,15 +622,68 @@ extern "C" {
         struct ggml_tensor * view_src;
         size_t               view_offs;
 
+#ifdef GGML_NUMA_MIRROR
+        union {
+    #ifdef __NVCC__
+            void * data;
+    #endif
+            void * __data[2];
+        };
+#else
         void * data;
+#endif
 
         char name[GGML_MAX_NAME];
 
         void * extra; // extra things e.g.
for ggml-cuda.cu - char padding[8]; +#ifdef GGML_NUMA_MIRROR + char padding[10]; +#else + char padding[8]; +#endif }; +#ifdef GGML_NUMA_MIRROR + extern __thread int ggml_current_numa_node; +#endif + + static inline void * tensor_data(const struct ggml_tensor * tensor) { +#ifdef GGML_NUMA_MIRROR + int n = ggml_current_numa_node; + if (n == -1) + n = 0; + return tensor->__data[n]; +#else + return tensor->data; +#endif + } + + static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) { +#ifdef GGML_NUMA_MIRROR + if ((uint64_t)data >= \ + GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \ + GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT && \ + (uint64_t)data < GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \ + 2 * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT) { + data = (void*) ((uint64_t)data - GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT); + } + tensor->__data[0] = data; + if ((uint64_t)data >= \ + GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET && \ + (uint64_t)data < \ + GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \ + GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT) { + tensor->__data[1] = (void*) ((uint64_t)data + \ + GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT); + } else { + tensor->__data[1] = data; + } +#else + tensor->data = data; +#endif + } + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); // Abort callback diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index fcc552da519b1..7abbde22dd572 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -457,7 +457,7 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return tensor_data(t) != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; } static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { @@ -478,7 +478,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor // if the node's data is external, then we cannot re-use it if (!ggml_gallocr_is_own(galloc, parent)) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, tensor_data(parent)); continue; } @@ -498,7 +498,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor if (ggml_is_view(parent)) { struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && tensor_data(view_src) == tensor_data(parent)) { AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); assert(view_src_hn->offset == p_hn->offset); hn->buffer_id = p_hn->buffer_id; @@ -689,7 +689,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (node->view_src || node->data) { + if (node->view_src || tensor_data(node)) { node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.size_max = 0; @@ -701,7 +701,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } for (int j = 0; j < 
GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; - if (!src || src->view_src || src->data) { + if (!src || src->view_src || tensor_data(src)) { node_alloc->src[j].buffer_id = -1; node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].size_max = 0; @@ -722,7 +722,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - if (leaf->view_src || leaf->data) { + if (leaf->view_src || tensor_data(leaf)) { galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.size_max = 0; @@ -771,7 +771,7 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { int buffer_id = tensor_alloc->buffer_id; - assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); + assert(tensor_data(tensor) || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); if (tensor->view_src != NULL) { if (tensor->buffer == NULL) { @@ -783,7 +783,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * ggml_backend_view_init(tensor); } } else { - if (tensor->data == NULL) { + if (tensor_data(tensor) == NULL) { assert(tensor_alloc->offset != SIZE_MAX); assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); @@ -800,7 +800,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { size_t node_size = 0; - if (!node->data && !node->view_src) { + if (!tensor_data(node) && !node->view_src) { // If we previously had data but don't now then reallocate if (talloc->buffer_id < 0) { return false; @@ -947,7 +947,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx, for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { enum ggml_status status = GGML_STATUS_SUCCESS; - if (t->data == NULL) { + if (tensor_data(t) == NULL) { if (t->view_src == NULL) { status = ggml_tallocr_alloc(&tallocr, t); } else if (t->buffer == NULL) { @@ -982,7 +982,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte struct ggml_tensor * first = ggml_get_first_tensor(ctx); for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { size_t this_size = 0; - if (t->data == NULL && t->view_src == NULL) { + if (tensor_data(t) == NULL && t->view_src == NULL) { this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b7498b8d40238..d18da6d7bd18f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -232,7 +232,7 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) { } void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor_data(tensor) != NULL && "tensor not 
allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (backend->iface.set_tensor_async == NULL) { @@ -243,7 +243,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * } void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor_data(tensor) != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (backend->iface.get_tensor_async == NULL) { @@ -262,7 +262,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz } GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor_data(tensor) != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); buf->iface.set_tensor(buf, tensor, data, offset, size); @@ -277,7 +277,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz } GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor_data(tensor) != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); buf->iface.get_tensor(buf, tensor, data, offset, size); @@ -291,7 +291,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size } GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor_data(tensor) != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer"); @@ -360,9 +360,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst } if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); + ggml_backend_tensor_set(dst, tensor_data(src), 0, ggml_nbytes(src)); } else if (ggml_backend_buffer_is_host(dst->buffer)) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + ggml_backend_tensor_get(src, tensor_data(dst), 0, ggml_nbytes(src)); } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); @@ -1645,23 +1645,23 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); - GGML_ASSERT(tensor->view_src->data != NULL); + GGML_ASSERT(tensor_data(tensor->view_src) != NULL); tensor->buffer = tensor->view_src->buffer; - tensor->data = (char *)tensor->view_src->data + tensor->view_offs; + tensor_set_data(tensor, (char *)tensor_data(tensor->view_src) + tensor->view_offs); return ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { GGML_ASSERT(tensor->buffer == NULL); - GGML_ASSERT(tensor->data == NULL); + GGML_ASSERT(tensor_data(tensor) == NULL); GGML_ASSERT(tensor->view_src == NULL); GGML_ASSERT(addr >= 
ggml_backend_buffer_get_base(buffer)); GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <= (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer)); tensor->buffer = buffer; - tensor->data = addr; + tensor_set_data(tensor, addr); return ggml_backend_buffer_init_tensor(buffer, tensor); } @@ -1669,14 +1669,14 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) { GGML_ASSERT(src != NULL); - GGML_ASSERT(src->data && "graph must be allocated"); + GGML_ASSERT(tensor_data(src) != NULL && "graph must be allocated"); size_t id = ggml_hash_insert(&hash_set, src); if (id == GGML_HASHSET_ALREADY_EXISTS) { return node_copies[ggml_hash_find(&hash_set, src)]; } - struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); + struct ggml_tensor * dst = ggml_dup_tensor_layout(tensor_data(src) && !src->view_src ? ctx_allocated : ctx_unallocated, src); if (src->view_src != NULL) { dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); dst->view_offs = src->view_offs; @@ -1885,26 +1885,26 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); + memset((char *)tensor_data(tensor) + offset, value, size); GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *)tensor_data(tensor) + offset, data, size); GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *)tensor_data(tensor) + offset, size); GGML_UNUSED(buffer); } static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); + memcpy(tensor_data(dst), tensor_data(src), ggml_nbytes(src)); return true; } return false; diff --git a/ggml/src/ggml-cpu/binary-ops.cpp b/ggml/src/ggml-cpu/binary-ops.cpp index 14f5b43ae0eb1..d70e62d6a9be5 100644 --- a/ggml/src/ggml-cpu/binary-ops.cpp +++ b/ggml/src/ggml-cpu/binary-ops.cpp @@ -90,9 +90,9 @@ static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * ds const int64_t i12 = i02 % ne12; const int64_t i11 = i01 % ne11; - dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + dst_t * dst_ptr = (dst_t *) ((char *) tensor_data(dst) + i03*nb3 + i02*nb2 + i01*nb1 ); + const src0_t * src0_ptr = (const src0_t *) ((const char *) tensor_data(src0) + i03*nb03 + i02*nb02 + i01*nb01); + const src1_t * src1_ptr = (const src1_t *) ((const char *) tensor_data(src1) + i13*nb13 + i12*nb12 + 
i11*nb11); if (is_src1_contiguous) { // src1 is broadcastable across src0 and dst in i1, i2, i3 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c5271b7757228..0fafd89caede2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,11 @@ #include "ops.h" #include "ggml.h" +#ifdef GGML_NUMA_MIRROR +#include +#include +#endif + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) @@ -712,7 +717,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = tensor_data(tensor); switch (tensor->type) { case GGML_TYPE_I8: @@ -771,7 +776,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = tensor_data(tensor); switch (tensor->type) { case GGML_TYPE_I8: @@ -835,32 +840,32 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - return ((int8_t *)(tensor->data))[i]; + return ((int8_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - return ((int16_t *)(tensor->data))[i]; + return ((int16_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_I32: { GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - return ((int32_t *)(tensor->data))[i]; + return ((int32_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor_data(tensor)))[i]); } case GGML_TYPE_BF16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); - return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor_data(tensor)))[i]); } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; + return ((float *)(tensor_data(tensor)))[i]; } default: { @@ -880,32 +885,32 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - ((int8_t *)(tensor->data))[i] = value; + ((int8_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - ((int16_t *)(tensor->data))[i] = value; + ((int16_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_I32: { GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - ((int32_t *)(tensor->data))[i] = value; + ((int32_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor_data(tensor)))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); - ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + ((ggml_bf16_t *)(tensor_data(tensor)))[i] = GGML_FP32_TO_BF16(value); } break; case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); - ((float *)(tensor->data))[i] = value; + ((float *)(tensor_data(tensor)))[i] = value; } break; default: { @@ -915,7 +920,7 @@ void 
ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + void * data = (char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: return ((int8_t *) data)[0]; @@ -935,7 +940,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i } void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + void * data = (char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: { @@ -977,27 +982,27 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { case GGML_TYPE_I8: { - return ((int8_t *)(tensor->data))[i]; + return ((int8_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_I16: { - return ((int16_t *)(tensor->data))[i]; + return ((int16_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_I32: { - return ((int32_t *)(tensor->data))[i]; + return ((int32_t *)(tensor_data(tensor)))[i]; } case GGML_TYPE_F16: { - return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor_data(tensor)))[i]); } case GGML_TYPE_BF16: { - return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor_data(tensor)))[i]); } case GGML_TYPE_F32: { - return ((float *)(tensor->data))[i]; + return ((float *)(tensor_data(tensor)))[i]; } default: { @@ -1016,27 +1021,27 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { switch (tensor->type) { case GGML_TYPE_I8: { - ((int8_t *)(tensor->data))[i] = value; + ((int8_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_I16: { - ((int16_t *)(tensor->data))[i] = value; + ((int16_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_I32: { - ((int32_t *)(tensor->data))[i] = value; + ((int32_t *)(tensor_data(tensor)))[i] = value; } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor_data(tensor)))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { - ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + ((ggml_bf16_t *)(tensor_data(tensor)))[i] = GGML_FP32_TO_BF16(value); } break; case GGML_TYPE_F32: { - ((float *)(tensor->data))[i] = value; + ((float *)(tensor_data(tensor)))[i] = value; } break; default: { @@ -1046,7 +1051,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + void * data = (char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: return ((int8_t *) data)[0]; @@ -1066,7 +1071,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { - void * data = 
(char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + void * data = (char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: { @@ -1134,7 +1139,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( return; } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? tensor_data(src1) : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(ne12 % ne02 == 0); @@ -1165,7 +1170,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( const int64_t i2 = i12; const int64_t i3 = i13; - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + const char * src0_row = (const char*)tensor_data(src0) + (0 + i02 * nb02 + i03 * nb03); // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using @@ -1175,7 +1180,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( (src1_cont || src1->type != vec_dot_type ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + float * dst_col = (float*)((char*)tensor_data(dst) + (i1 * nb1 + i2 * nb2 + i3 * nb3)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); @@ -1240,11 +1245,11 @@ void ggml_compute_forward_mul_mat( for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(params, ne01, ne11, ne00/ggml_blck_size(src0->type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + (const char *)tensor_data(src0) + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), - (const char *)src1->data + i12*nb12 + i13*nb13, + (const char *)tensor_data(src1) + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, + (char *)tensor_data(dst) + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), src0->type, src1->type, @@ -1270,7 +1275,7 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + from_float((float *)((char *) tensor_data(src1) + i13*nb13 + i12*nb12 + i11*nb11), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), ne10); } @@ -1283,7 +1288,7 @@ UseGgmlGemm1:; size_t bs = ggml_blck_size(vec_dot_type); int64_t ne10_block_start = (ith * ne10/bs) / nth; int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + from_float((float *)((char *) tensor_data(src1) + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), (ne10_block_end - ne10_block_start) * bs); } @@ -1301,18 +1306,18 @@ UseGgmlGemm1:; #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void* wdata = (src1->type == vec_dot_type) ? 
tensor_data(src1) : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(params, ne01, ne11, ne00/ggml_blck_size(src0->type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + (const char *)tensor_data(src0) + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, + (char *)tensor_data(dst) + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), src0->type, vec_dot_type, @@ -1447,7 +1452,7 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( ? (i11 + i12*ne11)*row_size : (i11*nb11 + i12*nb12)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); + float * dst_col = (float *) ((char *) tensor_data(dst) + (i1*nb1 + i2*nb2)); for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); @@ -1533,7 +1538,7 @@ static void ggml_compute_forward_mul_mat_id( for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = ith; i12 < ne12; i12 += nth) { for (int64_t i11 = 0; i11 < ne11; ++i11) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + from_float((float *)((char *) tensor_data(src1) + i13*nb13 + i12*nb12 + i11*nb11), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), ne10); } @@ -1546,7 +1551,7 @@ static void ggml_compute_forward_mul_mat_id( size_t bs = ggml_blck_size(vec_dot_type); int64_t ne10_block_start = (ith * ne10/bs) / nth; int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + from_float((float *)((char *) tensor_data(src1) + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), (ne10_block_end - ne10_block_start) * bs); } @@ -1562,7 +1567,7 @@ static void ggml_compute_forward_mul_mat_id( // group rows by src0 matrix for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { for (int id = 0; id < n_ids; ++id) { - const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]); + const int32_t i02 = *(const int32_t *) ((const char *) tensor_data(ids) + iid1*ids->nb[1] + id*ids->nb[0]); assert(i02 >= 0 && i02 < n_as); @@ -1587,8 +1592,8 @@ static void ggml_compute_forward_mul_mat_id( continue; } - const char * src0_cur = (const char *) src0->data + cur_a * nb02; - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const char * src0_cur = (const char *) tensor_data(src0) + cur_a * nb02; + const void * wdata = (src1->type == vec_dot_type) ? 
tensor_data(src1) : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); const int64_t nr0 = ne01; @@ -2823,6 +2828,11 @@ struct ggml_cplan ggml_graph_plan( return cplan; } +#ifdef GGML_NUMA_MIRROR +static bool g_cpuset_isset = false; +static cpu_set_t g_cpuset; +#endif + static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_threadpool * tp = state->threadpool; @@ -2840,6 +2850,79 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; +#ifdef GGML_NUMA_MIRROR + if (GGML_UNLIKELY(ggml_current_numa_node == -1)) { + int thread_id = state->ith; + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + + // Distribute threads evenly across NUMA nodes first, then assign CPUs within each node + int num_numa_nodes = numa_num_configured_nodes(); + if (num_numa_nodes <= 0) num_numa_nodes = 1; + + // Calculate which NUMA node this thread should use + int target_numa_node = thread_id % num_numa_nodes; + + bool cpumask[GGML_MAX_N_THREADS]; + memset(cpumask, 0, sizeof(bool) * GGML_MAX_N_THREADS); + for (int i = 0; i < GGML_MAX_N_THREADS; ++i) { + if (CPU_ISSET(i, &g_cpuset)) { + cpumask[i] = true; + } + } + + int cpuid = -1; + + // Try to find a CPU on the target NUMA node + struct bitmask* node_cpus = numa_allocate_cpumask(); + if (numa_node_to_cpus(target_numa_node, node_cpus) == 0) { + // Find the first available CPU on the target NUMA node that's also in our allowed set + for (int i = 0; i < GGML_MAX_N_THREADS; ++i) { + if (cpumask[i] && numa_bitmask_isbitset(node_cpus, i)) { + cpuid = i; + break; + } + } + } + numa_free_cpumask(node_cpus); + + // Fallback: if we couldn't find a CPU on the target node, use the original algorithm + if (cpuid == -1) { + bool local_mask[GGML_MAX_N_THREADS]; + int iter = 0; + for (int j = 0; j < thread_id; ++j) { + ggml_thread_cpumask_next(cpumask, local_mask, true, &iter); + } + memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS); + ggml_thread_cpumask_next(cpumask, local_mask, true, &iter); + for (int i = 0; i < GGML_MAX_N_THREADS; ++i) { + if (local_mask[i]) { + cpuid = i; + break; + } + } + } + + if (cpuid != -1) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpuid, &cpuset); + sched_setaffinity(gettid(), sizeof(cpuset), &cpuset); + } + + unsigned int numa_node = 0; + getcpu(NULL, &numa_node); + ggml_current_numa_node = numa_node; + + struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes()); + numa_bitmask_setbit(mask, ggml_current_numa_node); + numa_set_membind(mask); + numa_bitmask_free(mask); + + GGML_LOG_INFO("thread_id = %02d, target_node = %d, actual_node = %d, cpuid = %02d, n_threads = %d\n", + thread_id, target_numa_node, ggml_current_numa_node, cpuid, n_threads); + } +#endif // GGML_NUMA_MIRROR + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; @@ -3106,6 +3189,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->abort = -1; threadpool->ec = GGML_STATUS_SUCCESS; } + +#ifdef GGML_NUMA_MIRROR + if (!g_cpuset_isset) { + CPU_ZERO(&g_cpuset); + sched_getaffinity(getpid(), sizeof(g_cpuset), &g_cpuset); + g_cpuset_isset = true; + } +#endif #ifdef GGML_USE_OPENMP if (n_threads > 1) { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6581d27adde2e..4d4db7684a55c 100644 --- 
a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -34,8 +34,8 @@ static void ggml_compute_forward_dup_same_cont( if (k0 < k1) { memcpy( - ((char *) dst->data + k0*nb0), - ((char *) src0->data + k0*nb0), + ((char *) tensor_data(dst) + k0*nb0), + ((char *) tensor_data(src0) + k0*nb0), (k1 - k0) * nb0); } } @@ -70,8 +70,8 @@ static void ggml_compute_forward_dup_f16( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03), rs); } } @@ -86,13 +86,13 @@ static void ggml_compute_forward_dup_f16( if (dst->type == GGML_TYPE_F16) { size_t id = 0; const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + const char * src0_ptr = (char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03; memcpy(dst_ptr + id, src0_ptr, rs); id += rs; } @@ -101,13 +101,13 @@ static void ggml_compute_forward_dup_f16( } } else if (dst->type == GGML_TYPE_F32) { size_t id = 0; - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); id++; @@ -122,13 +122,13 @@ static void ggml_compute_forward_dup_f16( size_t id = 0; size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); @@ -148,14 +148,14 @@ static void ggml_compute_forward_dup_f16( if (dst->type == GGML_TYPE_F32) { size_t id = 0; - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr); id++; @@ -166,14 +166,14 @@ static void ggml_compute_forward_dup_f16( } } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; 
i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -213,8 +213,8 @@ static void ggml_compute_forward_dup_f16( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); @@ -265,8 +265,8 @@ static void ggml_compute_forward_dup_f16( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); @@ -334,8 +334,8 @@ static void ggml_compute_forward_dup_bf16( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03), rs); } } @@ -350,13 +350,13 @@ static void ggml_compute_forward_dup_bf16( if (dst->type == GGML_TYPE_BF16) { size_t id = 0; const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + const char * src0_ptr = (char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03; memcpy(dst_ptr + id, src0_ptr, rs); id += rs; } @@ -365,13 +365,13 @@ static void ggml_compute_forward_dup_bf16( } } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); id++; @@ -382,13 +382,13 @@ static void ggml_compute_forward_dup_bf16( } } else if (dst->type == GGML_TYPE_F32) { size_t id = 0; - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += 
ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]); id++; @@ -403,13 +403,13 @@ static void ggml_compute_forward_dup_bf16( size_t id = 0; size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]); @@ -429,14 +429,14 @@ static void ggml_compute_forward_dup_bf16( if (dst->type == GGML_TYPE_F32) { size_t id = 0; - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr); id++; @@ -447,14 +447,14 @@ static void ggml_compute_forward_dup_bf16( } } else if (dst->type == GGML_TYPE_BF16) { size_t id = 0; - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -465,14 +465,14 @@ static void ggml_compute_forward_dup_bf16( } } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); id++; @@ -512,8 +512,8 @@ static void ggml_compute_forward_dup_bf16( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) 
tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); @@ -564,8 +564,8 @@ static void ggml_compute_forward_dup_bf16( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); @@ -616,8 +616,8 @@ static void ggml_compute_forward_dup_bf16( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr); @@ -685,8 +685,8 @@ static void ggml_compute_forward_dup_f32( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03), rs); } } @@ -702,13 +702,13 @@ static void ggml_compute_forward_dup_f32( size_t id = 0; size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * src0_ptr = (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); from_float(src0_ptr, dst_ptr + id, ne00); id += rs; } @@ -723,14 +723,14 @@ static void ggml_compute_forward_dup_f32( if (dst->type == GGML_TYPE_F32) { size_t id = 0; - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const float * src0_ptr = (float *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -741,14 +741,14 @@ static void ggml_compute_forward_dup_f32( } } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const float * src0_ptr = (float *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + 
i03*nb03); dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; @@ -759,14 +759,14 @@ static void ggml_compute_forward_dup_f32( } } else if (dst->type == GGML_TYPE_BF16) { size_t id = 0; - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) tensor_data(dst); for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const float * src0_ptr = (float *) ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr); id++; @@ -808,8 +808,8 @@ static void ggml_compute_forward_dup_f32( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, sizeof(float)); @@ -860,8 +860,8 @@ static void ggml_compute_forward_dup_f32( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr); @@ -912,8 +912,8 @@ static void ggml_compute_forward_dup_f32( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr); @@ -989,8 +989,8 @@ static void ggml_compute_forward_dup_bytes( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03), rs); } } @@ -1000,7 +1000,7 @@ static void ggml_compute_forward_dup_bytes( if (ggml_is_contiguous(dst)) { size_t id = 0; - char * dst_ptr = (char *) dst->data; + char * dst_ptr = (char *) tensor_data(dst); const size_t rs = ne00 * type_size; if (nb00 == type_size) { @@ -1009,7 +1009,7 @@ static void ggml_compute_forward_dup_bytes( for (int64_t i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int64_t i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + const char * src0_ptr = (char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03; memcpy(dst_ptr + id, src0_ptr, rs); id += rs; } @@ -1024,7 +1024,7 @@ static void 
ggml_compute_forward_dup_bytes( id += rs * ir0; for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + const char * src0_ptr = (char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; memcpy(dst_ptr + id, src0_ptr, type_size); id += type_size; @@ -1065,8 +1065,8 @@ static void ggml_compute_forward_dup_bytes( } for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t k00 = 0; k00 < nk00; k00++) { - const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + const char * src0_ptr = ((char *) tensor_data(src0) + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) tensor_data(dst) + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, type_size); @@ -1147,8 +1147,8 @@ static void ggml_compute_forward_dup_q( const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; dequantize_row_q( - (const void *) ((char *) src0->data + x_offset), - (float *) ((char *) dst->data + dst_offset), qk); + (const void *) ((char *) tensor_data(src0) + x_offset), + (float *) ((char *) tensor_data(dst) + dst_offset), qk); } } @@ -1246,9 +1246,9 @@ static void ggml_compute_forward_add_q_f32( const int i2 = i02; const int i1 = i01; - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); - void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + void * src0_row = (void *) ((char *) tensor_data(src0) + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) tensor_data(src1) + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) tensor_data(dst) + ( i1*nb1 + i2*nb2 + i3*nb3)); assert(ne00 % 32 == 0); @@ -1348,15 +1348,15 @@ static void ggml_compute_forward_add1_f32( GGML_UNUSED(ggml_vec_add1_f32); vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data), 0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + (float *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) tensor_data(src1)), 0, + (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else ggml_vec_add1_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - *(float *) src1->data); + (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) tensor_data(src1)); #endif } } @@ -1372,7 +1372,7 @@ static void ggml_compute_forward_add1_f16_f32( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = *(float *) src1->data; + const float v = *(float *) tensor_data(src1); const int ith = params->ith; const int nth = params->nth; @@ -1401,8 +1401,8 @@ static void ggml_compute_forward_add1_f16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = 
(ggml_fp16_t *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } @@ -1420,7 +1420,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) tensor_data(src1)); const int ith = params->ith; const int nth = params->nth; @@ -1449,8 +1449,8 @@ static void ggml_compute_forward_add1_f16_f16( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } @@ -1468,7 +1468,7 @@ static void ggml_compute_forward_add1_q_f32( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = *(float *) src1->data; + const float v = *(float *) tensor_data(src1); const int ith = params->ith; const int nth = params->nth; @@ -1508,8 +1508,8 @@ static void ggml_compute_forward_add1_q_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); - void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + void * src0_row = (void *) ((char *) tensor_data(src0) + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) tensor_data(dst) + (i1*nb1 + i2*nb2 + i3*nb0 )); assert(ne0 % 32 == 0); @@ -1533,7 +1533,7 @@ static void ggml_compute_forward_add1_bf16_f32( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = *(float *) src1->data; + const float v = *(float *) tensor_data(src1); const int ith = params->ith; const int nth = params->nth; @@ -1562,8 +1562,8 @@ static void ggml_compute_forward_add1_bf16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); } @@ -1581,7 +1581,7 @@ static void ggml_compute_forward_add1_bf16_bf16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); + const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) tensor_data(src1)); const int ith = params->ith; const int nth = params->nth; @@ -1610,8 +1610,8 @@ static void ggml_compute_forward_add1_bf16_bf16( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) 
((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); } @@ -1711,8 +1711,8 @@ static void ggml_compute_forward_acc_f32( // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( - ((char *) dst->data), - ((char *) src0->data), + ((char *) tensor_data(dst)), + ((char *) tensor_data(src0)), ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -1756,14 +1756,14 @@ static void ggml_compute_forward_acc_f32( #ifdef GGML_USE_ACCELERATE vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); + (float *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) tensor_data(src1) + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); #else ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) tensor_data(src1) + i3*nb13 + i2*nb12 + i1*nb11)); #endif } } @@ -1836,12 +1836,12 @@ static void ggml_compute_forward_sum_f32( for (int64_t i01 = 0; i01 < ne01; i01++) { ggml_vec_sum_f32_ggf(ne00, &row_sum, - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03)); sum += row_sum; } } } - ((float *) dst->data)[0] = sum; + ((float *) tensor_data(dst))[0] = sum; } static void ggml_compute_forward_sum_f16( @@ -1869,12 +1869,12 @@ static void ggml_compute_forward_sum_f16( for (int64_t i01 = 0; i01 < ne01; i01++) { ggml_vec_sum_f16_ggf(ne00, &row_sum, - (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + (ggml_fp16_t *) ((char *) tensor_data(src0) + i01 * nb01 + i02 * nb02 + i03 * nb03)); sum += row_sum; } } } - ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum); + ((ggml_fp16_t *) tensor_data(dst))[0] = GGML_CPU_FP32_TO_FP16(sum); } static void ggml_compute_forward_sum_bf16( @@ -1902,12 +1902,12 @@ static void ggml_compute_forward_sum_bf16( for (int64_t i01 = 0; i01 < ne01; i01++) { ggml_vec_sum_bf16_ggf(ne00, &row_sum, - (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + (ggml_bf16_t *) ((char *) tensor_data(src0) + i01 * nb01 + i02 * nb02 + i03 * nb03)); sum += row_sum; } } } - ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); + ((ggml_bf16_t *) tensor_data(dst))[0] = GGML_FP32_TO_BF16(sum); } void ggml_compute_forward_sum( @@ -1961,8 +1961,8 @@ static void ggml_compute_forward_sum_rows_f32( for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float * src_row = (float *) ((char *) tensor_data(src0) + i1*nb01 + i2*nb02 + i3*nb03); + float * 
dst_row = (float *) ((char *) tensor_data(dst) + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; @@ -2019,10 +2019,10 @@ static void ggml_compute_forward_mean_f32( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { ggml_vec_sum_f32(ne00, - (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + (float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03)); - *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + *(float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; } } } @@ -2068,8 +2068,8 @@ static void ggml_compute_forward_argmax_f32( const size_t nb0 = dst->nb[0]; for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src = (float *) ((char *) src0->data + i1*nb01); - int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + float * src = (float *) ((char *) tensor_data(src0) + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) tensor_data(dst) + i1*nb0); int v = 0; ggml_vec_argmax_f32(ne00, &v, src); dst_[0] = v; @@ -2131,8 +2131,8 @@ static void ggml_compute_forward_count_equal_i32( const int64_t i02 = (ir - i03*ne03) / ne01; const int64_t i01 = ir - i03*ne03 - i02*ne02; - const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; - const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; + const char * data0 = (const char *) tensor_data(src0) + i03*nb03 + i02*nb02 + i01*nb01; + const char * data1 = (const char *) tensor_data(src1) + i03*nb13 + i02*nb12 + i01*nb11; for (int64_t i00 = 0; i00 < ne00; ++i00) { const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); @@ -2153,7 +2153,7 @@ static void ggml_compute_forward_count_equal_i32( for (int ith_other = 1; ith_other < nth; ++ith_other) { sum_thread += sums[ith_other]; } - *((int64_t *) dst->data) = sum_thread; + *((int64_t *) tensor_data(dst)) = sum_thread; } void ggml_compute_forward_count_equal( @@ -2209,8 +2209,8 @@ static void ggml_compute_forward_repeat_f32( for (int k1 = 0; k1 < ne01; k1++) { for (int i0 = 0; i0 < nr0; i0++) { ggml_vec_cpy_f32(ne00, - (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), - (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + (float *) ((char *) tensor_data(dst) + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) tensor_data(src0) + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); } } } @@ -2252,8 +2252,8 @@ static void ggml_compute_forward_repeat_f16( for (int i1 = 0; i1 < nr1; i1++) { for (int k1 = 0; k1 < ne01; k1++) { for (int i0 = 0; i0 < nr0; i0++) { - ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); - ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) tensor_data(dst) + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) tensor_data(src0) + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); // ggml_vec_cpy_f16(ne00, y, x) for (int i = 0; i < ne00; ++i) { y[i] = x[i]; @@ -2325,13 +2325,13 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(nb00 == sizeof(float)); if 
(ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)tensor_data(dst), 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { for (int k1 = 0; k1 < ne1; k1++) { ggml_vec_set_f32(ne0, - (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + (float *) ((char *) tensor_data(dst) + k1*nb1 + k2*nb2 + k3*nb3), 0); } } @@ -2347,8 +2347,8 @@ static void ggml_compute_forward_repeat_back_f32( for (int k1 = 0; k1 < ne1; k1++) { for (int i0 = 0; i0 < nr0; i0++) { ggml_vec_acc_f32(ne0, - (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), - (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + (float *) ((char *) tensor_data(dst) + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) tensor_data(src0) + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); } } } @@ -2407,12 +2407,12 @@ static void ggml_compute_forward_concat_any( for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03; + x = (const char *)tensor_data(src0) + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03; } else { - x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13; + x = (const char *)tensor_data(src1) + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13; } - char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3; + char * y = (char *)tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3; memcpy(y, x, len); } @@ -2450,12 +2450,12 @@ static void ggml_compute_forward_concat_i8( for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + x = (const int8_t *) ((const char *)tensor_data(src0) + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); } else { - x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + x = (const int8_t *) ((const char *)tensor_data(src1) + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); } - int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + int8_t * y = (int8_t *)((char *)tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y = *x; } @@ -2493,12 +2493,12 @@ static void ggml_compute_forward_concat_f16( for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + x = (const ggml_fp16_t *) ((const char *)tensor_data(src0) + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); } else { - x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + x = (const ggml_fp16_t *) ((const char *)tensor_data(src1) + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); } - ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + ggml_fp16_t * y = (ggml_fp16_t *)((char *)tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y = *x; } @@ -2536,12 +2536,12 @@ static void 
ggml_compute_forward_concat_f32( for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + x = (const float *) ((const char *)tensor_data(src0) + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); } else { - x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + x = (const float *) ((const char *)tensor_data(src1) + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); } - float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + float * y = (float *)((char *)tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y = *x; } @@ -2606,12 +2606,12 @@ static void ggml_compute_forward_gelu_f32( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_gelu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); + (float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (float *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -2645,12 +2645,12 @@ static void ggml_compute_forward_gelu_f16( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_gelu_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -2709,12 +2709,12 @@ static void ggml_compute_forward_gelu_erf_f32( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_gelu_erf_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); + (float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (float *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -2748,12 +2748,12 @@ static void ggml_compute_forward_gelu_erf_f16( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_gelu_erf_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -2812,12 +2812,12 @@ static void ggml_compute_forward_gelu_quick_f32( for (int i1 = ir0; i1 < ir1; i1++) { 
ggml_vec_gelu_quick_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); + (float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (float *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -2851,12 +2851,12 @@ static void ggml_compute_forward_gelu_quick_f16( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_gelu_quick_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -2915,12 +2915,12 @@ static void ggml_compute_forward_silu_f32( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_silu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); + (float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (float *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -2954,12 +2954,12 @@ static void ggml_compute_forward_silu_f16( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_silu_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src0) + i1*(src0->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])))[k]; const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3017,8 +3017,8 @@ static void ggml_compute_forward_leaky_relu_f32( for (int i = 0; i < n; i++) { ggml_vec_leaky_relu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); + (float *) ((char *) tensor_data(dst) + i*( dst->nb[1])), + (float *) ((char *) tensor_data(src0) + i*(src0->nb[1])), negative_slope); } } @@ -3047,8 +3047,8 @@ static void ggml_compute_forward_leaky_relu_f16( for (int i = 0; i < n; i++) { ggml_vec_leaky_relu_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src0) + i*(src0->nb[1])), negative_slope); } } @@ -3104,13 +3104,13 @@ static void ggml_compute_forward_silu_back_f32( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_silu_backward_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src1->data + i1*(src1->nb[1])), - 
(float *) ((char *) grad->data + i1*(grad->nb[1]))); + (float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (float *) ((char *) tensor_data(src1) + i1*(src1->nb[1])), + (float *) ((char *) tensor_data(grad) + i1*(grad->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3147,13 +3147,13 @@ static void ggml_compute_forward_silu_back_f16( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_silu_backward_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])), - (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1]))); + (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(src1) + i1*(src1->nb[1])), + (ggml_fp16_t *) ((char *) tensor_data(grad) + i1*(grad->nb[1]))); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3193,8 +3193,8 @@ static void ggml_compute_forward_reglu_f32( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3233,11 +3233,11 @@ static void ggml_compute_forward_reglu_f32( src1_p += swapped ? 0 : nc; } - ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_reglu_f32(nc, (float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3252,8 +3252,8 @@ static void ggml_compute_forward_reglu_f16( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3292,11 +3292,11 @@ static void ggml_compute_forward_reglu_f16( src1_p += swapped ? 
0 : nc; } - ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3336,8 +3336,8 @@ static void ggml_compute_forward_geglu_f32( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3376,11 +3376,11 @@ static void ggml_compute_forward_geglu_f32( src1_p += swapped ? 0 : nc; } - ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_f32(nc, (float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3395,8 +3395,8 @@ static void ggml_compute_forward_geglu_f16( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3435,11 +3435,11 @@ static void ggml_compute_forward_geglu_f16( src1_p += swapped ? 0 : nc; } - ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3479,8 +3479,8 @@ static void ggml_compute_forward_swiglu_f32( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3519,11 +3519,11 @@ static void ggml_compute_forward_swiglu_f32( src1_p += swapped ? 
0 : nc; } - ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_swiglu_f32(nc, (float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3538,8 +3538,8 @@ static void ggml_compute_forward_swiglu_f16( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3578,11 +3578,11 @@ static void ggml_compute_forward_swiglu_f16( src1_p += swapped ? 0 : nc; } - ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3622,8 +3622,8 @@ static void ggml_compute_forward_geglu_erf_f32( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3662,11 +3662,11 @@ static void ggml_compute_forward_geglu_erf_f32( src1_p += swapped ? 0 : nc; } - ggml_vec_geglu_erf_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_erf_f32(nc, (float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3681,8 +3681,8 @@ static void ggml_compute_forward_geglu_erf_f16( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3721,11 +3721,11 @@ static void ggml_compute_forward_geglu_erf_f16( src1_p += swapped ? 
0 : nc; } - ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3765,8 +3765,8 @@ static void ggml_compute_forward_geglu_quick_f32( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3805,11 +3805,11 @@ static void ggml_compute_forward_geglu_quick_f32( src1_p += swapped ? 0 : nc; } - ggml_vec_geglu_quick_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_quick_f32(nc, (float *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; GGML_UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -3824,8 +3824,8 @@ static void ggml_compute_forward_geglu_quick_f16( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - char * src0_d = (char *) src0->data; - char * src1_d = (char *) (src1 ? src1->data : src0->data); + char * src0_d = (char *) tensor_data(src0); + char * src1_d = (char *) (src1 ? tensor_data(src1) : tensor_data(src0)); const size_t src0_o = src0->nb[1]; const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; @@ -3864,11 +3864,11 @@ static void ggml_compute_forward_geglu_quick_f16( src1_p += swapped ? 
0 : nc; } - ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) tensor_data(dst) + i1*(dst->nb[1])), src0_p, src1_p); #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) tensor_data(dst) + i1*( dst->nb[1])))[k]; const float v = GGML_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); @@ -3926,7 +3926,7 @@ static void ggml_compute_forward_norm_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * x = (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -3935,7 +3935,7 @@ static void ggml_compute_forward_norm_f32( float mean = sum/ne00; - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + float * y = (float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3); ggml_float sum2 = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -3997,7 +3997,7 @@ static void ggml_compute_forward_rms_norm_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * x = (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -4006,7 +4006,7 @@ static void ggml_compute_forward_rms_norm_f32( const float mean = sum/ne00; - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + float * y = (float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3); memcpy(y, x, ne00 * sizeof(float)); // for (int i00 = 0; i00 < ne00; i00++) { @@ -4071,8 +4071,8 @@ static void ggml_compute_forward_rms_norm_back_f32( const int64_t i12 = i02; const int64_t i13 = i03; - const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + const float * dz = (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); + const float * x = (float *) ((char *) tensor_data(src1) + i11*nb11 + i12*nb12 + i13*nb13); ggml_float sum_xx = 0.0; ggml_float sum_xdz = 0.0; @@ -4186,7 +4186,7 @@ static void ggml_compute_forward_rms_norm_back_f32( // dx := scale(dx,-mean_xdz/mean_eps) // dx := add(dx, dz) // dx := scale(dx, rrms) - float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + float * dx = (float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3); // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps) ggml_vec_cpy_f32 (ne00, dx, x); @@ -4254,7 +4254,7 @@ static void ggml_compute_forward_group_norm_f32( ggml_float sum = 0.0; for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + const float * x = (float *)((char *) tensor_data(src0) + i01 * nb01 + i02 * nb02 + i03 * nb03); ggml_float sumr = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -4268,9 +4268,9 @@ static void ggml_compute_forward_group_norm_f32( ggml_float sum2 = 0.0; for (int64_t 
i02 = start; i02 < end; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + const float * x = (float *)((char *) tensor_data(src0) + i01 * nb01 + i02 * nb02 + i03 * nb03); - float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + float * y = (float *)((char *) tensor_data(dst) + i01 * nb1 + i02 * nb2 + i03 * nb3); ggml_float sumr = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -4286,7 +4286,7 @@ static void ggml_compute_forward_group_norm_f32( for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + float * y = (float *)((char *) tensor_data(dst) + i01 * nb1 + i02 * nb2 + i03 * nb3); ggml_vec_scale_f32(ne00, y, scale); } } @@ -4338,14 +4338,14 @@ static void ggml_compute_forward_l2_norm_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * x = (float *) ((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { sum += (ggml_float)(x[i00] * x[i00]); } - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + float * y = (float *) ((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3); memcpy(y, x, ne00 * sizeof(float)); @@ -4414,7 +4414,7 @@ static void ggml_compute_forward_out_prod_f32( // compute by src0 rows if (ith == 0) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)tensor_data(dst), 0); } ggml_barrier(params->threadpool); @@ -4467,18 +4467,18 @@ static void ggml_compute_forward_out_prod_f32( for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) tensor_data(src0) + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) tensor_data(src1) + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) tensor_data(dst) + ( i1*nb1 + i2*nb2 + i3*nb3)); ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); } for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) tensor_data(src0) + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) tensor_data(src1) + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) tensor_data(dst) + ( i1*nb1 + i2*nb2 + i3*nb3)); ggml_vec_mad_f32(ne0, d, s0, *s1); } @@ -4486,9 +4486,9 @@ static void ggml_compute_forward_out_prod_f32( for (int64_t i01 = bi01; i01 < bne01; ++i01) { const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = 
(float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) tensor_data(src0) + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) tensor_data(src1) + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) tensor_data(dst) + ( i1*nb1 + i2*nb2 + i3*nb3)); ggml_vec_mad_f32(ne0, d, s0, *s1); } @@ -4536,7 +4536,7 @@ static void ggml_compute_forward_out_prod_q_f32( // compute by src0 rows if (ith == 0) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float *)tensor_data(dst), 0); } ggml_barrier(params->threadpool); @@ -4577,9 +4577,9 @@ static void ggml_compute_forward_out_prod_q_f32( for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) tensor_data(src0) + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) tensor_data(src1) + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) tensor_data(dst) + ( i1*nb1 + i2*nb2 + i3*nb3)); dequantize_row_q(s0, wdata, ne0); ggml_vec_mad_f32(ne0, d, wdata, *s1); @@ -4671,18 +4671,18 @@ static void ggml_compute_forward_scale_f32( if (b == 0.0f) { for (int i1 = ir0; i1 < ir1; i1++) { - if (dst->data != src0->data) { + if (tensor_data(dst) != tensor_data(src0)) { // src0 is same shape as dst => same indices // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy - memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + memcpy((char *)tensor_data(dst) + i1*nb1, (char *)tensor_data(src0) + i1*nb01, nc * sizeof(float)); } - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s); + ggml_vec_scale_f32(nc, (float *) ((char *) tensor_data(dst) + i1*nb1), s); } } else { for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_mad1_f32(nc, - (float *) ((char *) dst->data + i1*nb1), - (float *) ((char *) src0->data + i1*nb1), + (float *) ((char *) tensor_data(dst) + i1*nb1), + (float *) ((char *) tensor_data(src0) + i1*nb1), s, b); } } @@ -4731,8 +4731,8 @@ static void ggml_compute_forward_set_f32( // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( - ((char *) dst->data), - ((char *) src0->data), + ((char *) tensor_data(dst)), + ((char *) tensor_data(src0)), ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -4774,8 +4774,8 @@ static void ggml_compute_forward_set_f32( const int i1 = (ir - i3*ne12*ne11 - i2*ne11); ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) tensor_data(src1) + i3*nb13 + i2*nb12 + i1*nb11)); } } @@ -4802,8 +4802,8 @@ static void ggml_compute_forward_set_i32( // memcpy needs to be synchronized across threads to avoid race conditions. 
// => do it in INIT phase memcpy( - ((char *) dst->data), - ((char *) src0->data), + ((char *) tensor_data(dst)), + ((char *) tensor_data(src0)), ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -4845,8 +4845,8 @@ static void ggml_compute_forward_set_i32( const int i1 = (ir - i3*ne12*ne11 - i2*ne11); ggml_vec_cpy_i32(nc, - (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), - (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + (int32_t *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (int32_t *) ((char *) tensor_data(src1) + i3*nb13 + i2*nb12 + i1*nb11)); } } @@ -4988,13 +4988,13 @@ static void ggml_compute_forward_get_rows_q( const int64_t i12 = i/(ne11*ne10); const int64_t i11 = (i - i12*ne11*ne10)/ne10; const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t i01 = *(int32_t *) ((char *) tensor_data(src1) + i10*nb10 + i11*nb11 + i12*nb12); GGML_ASSERT(i01 >= 0 && i01 < ne01); dequantize_row_q( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + (const void *) ((char *) tensor_data(src0) + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) tensor_data(dst) + i10*nb1 + i11*nb2 + i12*nb3), nc); } } @@ -5029,13 +5029,13 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i12 = i/(ne11*ne10); const int64_t i11 = (i - i12*ne11*ne10)/ne10; const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t i01 = *(int32_t *) ((char *) tensor_data(src1) + i10*nb10 + i11*nb11 + i12*nb12); GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_cpu_fp16_to_fp32( - (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + (const ggml_fp16_t*) ((char *) tensor_data(src0) + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) tensor_data(dst) + i10*nb1 + i11*nb2 + i12*nb3), nc); } } @@ -5070,13 +5070,13 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i12 = i/(ne11*ne10); const int64_t i11 = (i - i12*ne11*ne10)/ne10; const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t i01 = *(int32_t *) ((char *) tensor_data(src1) + i10*nb10 + i11*nb11 + i12*nb12); GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_cpu_bf16_to_fp32( - (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + (const ggml_bf16_t *) ((char *) tensor_data(src0) + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) tensor_data(dst) + i10*nb1 + i11*nb2 + i12*nb3), nc); } } @@ -5111,13 +5111,13 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i12 = i/(ne11*ne10); const int64_t i11 = (i - i12*ne11*ne10)/ne10; const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t i01 = *(int32_t *) ((char *) tensor_data(src1) + i10*nb10 + i11*nb11 + i12*nb12); GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), - (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); + (float *) ((char *) tensor_data(dst) + i10*nb1 + 
i11*nb2 + i12*nb3), + (float *) ((char *) tensor_data(src0) + i01*nb01 + i11*nb02 + i12*nb03)); } } @@ -5180,7 +5180,7 @@ void ggml_compute_forward_get_rows( // for (int k = 0; k < dst->ne[1]; ++k) { // for (int j = 0; j < dst->ne[0]/16; ++j) { // for (int i = 0; i < 16; ++i) { - // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // printf("%8.4f ", ((float *) tensor_data(dst))[k*dst->ne[0] + j*16 + i]); // } // printf("\n"); // } @@ -5229,13 +5229,13 @@ static void ggml_compute_forward_set_rows_f32( const int64_t i11 = i02%ne11; const int64_t i10 = i; - const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int64_t i1 = *(int64_t *) ((char *) tensor_data(src1) + i10*nb10 + i11*nb11 + i12*nb12); GGML_ASSERT(i1 >= 0 && i1 < ne1); from_float( - (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03), - ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc); + (const float *) ((char *) tensor_data(src0) + i*nb01 + i02*nb02 + i03*nb03), + ((char *) tensor_data(dst) + i1*nb1 + i02*nb2 + i03*nb3), nc); } } } @@ -5276,7 +5276,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - memset(dst->data, 0, ggml_nbytes(dst)); + memset(tensor_data(dst), 0, ggml_nbytes(dst)); const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); @@ -5285,11 +5285,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16( GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; + const int r = ((int32_t *) tensor_data(src1))[i]; for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) tensor_data(src0) + i*src0->nb[1]))[j]; + ((float *) ((char *) tensor_data(dst) + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); } } } @@ -5309,7 +5309,7 @@ static void ggml_compute_forward_get_rows_back_f32( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - memset(dst->data, 0, ggml_nbytes(dst)); + memset(tensor_data(dst), 0, ggml_nbytes(dst)); const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); @@ -5318,12 +5318,12 @@ static void ggml_compute_forward_get_rows_back_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; + const int r = ((int32_t *) tensor_data(src1))[i]; ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + r*dst->nb[1]), - (float *) ((char *) dst->data + r*dst->nb[1]), - (float *) ((char *) src0->data + i*src0->nb[1])); + (float *) ((char *) tensor_data(dst) + r*dst->nb[1]), + (float *) ((char *) tensor_data(dst) + r*dst->nb[1]), + (float *) ((char *) tensor_data(src0) + i*src0->nb[1])); } } @@ -5356,7 +5356,7 @@ void ggml_compute_forward_get_rows_back( // for (int k = 0; k < dst->ne[1]; ++k) { // for (int j = 0; j < dst->ne[0]/16; ++j) { // for (int i = 0; i < 16; ++i) { - // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // printf("%8.4f ", ((float *) tensor_data(dst))[k*dst->ne[0] + j*16 + i]); // } // printf("\n"); // } @@ -5395,8 +5395,8 @@ static void ggml_compute_forward_diag_f32( for (int i3 = 0; i3 < ne3; i3++) { for (int i2 = 0; i2 < ne2; i2++) { for (int i1 = 0; i1 < ne1; i1++) { - float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - float * s = (float *)((char *) src0->data + i3*nb03 + 
i2*nb02); + float * d = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02); for (int i0 = 0; i0 < i1; i0++) { d[i0] = 0; } @@ -5440,7 +5440,7 @@ static void ggml_compute_forward_diag_mask_f32( const int nth = params->nth; const int n_past = ((int32_t *) dst->op_params)[0]; - const bool inplace = src0->data == dst->data; + const bool inplace = tensor_data(src0) == tensor_data(dst); GGML_ASSERT(n_past >= 0); @@ -5451,8 +5451,8 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); memcpy( - ((char *) dst->data), - ((char *) src0->data), + ((char *) tensor_data(dst)), + ((char *) tensor_data(src0)), ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -5472,7 +5472,7 @@ static void ggml_compute_forward_diag_mask_f32( for (int j = ith; j < nr; j += nth) { for (int i = n_past; i < nc; i++) { if (i > n_past + j) { - *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + *(float *)((char *) tensor_data(dst) + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; } } } @@ -5568,12 +5568,12 @@ static void ggml_compute_forward_soft_max_f32( const uint32_t h = i02; // head const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; - float * sp = (float *)((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - float * dp = (float *)((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + float * sp = (float *)((char *) tensor_data(src0) + i01*nb01 + i02*nb02 + i03*nb03); + float * dp = (float *)((char *) tensor_data(dst) + i01*nb1 + i02*nb2 + i03*nb3); // broadcast the mask across rows - ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL; - float * mp_f32 = src1 ? (float *)((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13) : NULL; + ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) tensor_data(src1) + i11*nb11 + i12*nb12 + i13*nb13) : NULL; + float * mp_f32 = src1 ? 
(float *)((char *) tensor_data(src1) + i11*nb11 + i12*nb12 + i13*nb13) : NULL; ggml_vec_cpy_f32 (ne00, wp, sp); ggml_vec_scale_f32(ne00, wp, scale); @@ -5674,9 +5674,9 @@ static void ggml_compute_forward_soft_max_ext_back_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); - float *y = (float *)((char *) src1->data + i1*src1->nb[1]); - float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + float *dy = (float *)((char *) tensor_data(src0) + i1*src0->nb[1]); + float *y = (float *)((char *) tensor_data(src1) + i1*src1->nb[1]); + float *dx = (float *)((char *) tensor_data(dst) + i1*dst->nb[1]); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -5768,8 +5768,8 @@ static void ggml_compute_forward_clamp_f32( GGML_ASSERT(nb00 == sizeof(float)); for (int j = ith; j < n; j += nth) { - float * dst_ptr = (float *) ((char *) dst->data + j*nb1); - float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + float * dst_ptr = (float *) ((char *) tensor_data(dst) + j*nb1); + float * src0_ptr = (float *) ((char *) tensor_data(src0) + j*nb01); for (int i = 0; i < nc; i++) { dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); @@ -5804,8 +5804,8 @@ static void ggml_compute_forward_clamp_f16( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) tensor_data(dst) + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) tensor_data(src0) + j*nb01); for (int i = 0; i < nc; i++) { float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]); @@ -6037,7 +6037,7 @@ static void ggml_compute_forward_rope_f32( if (src2 != NULL) { GGML_ASSERT(src2->type == GGML_TYPE_F32); GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; + freq_factors = (const float *) tensor_data(src2); } // backward process uses inverse rotation by cos and sin. @@ -6045,7 +6045,7 @@ static void ggml_compute_forward_rope_f32( // this essentially just switches the sign of sin. const float sin_sign = forward ? 
1.0f : -1.0f; - const int32_t * pos = (const int32_t *) src1->data; + const int32_t * pos = (const int32_t *) tensor_data(src1); for (int64_t i3 = 0; i3 < ne3; i3++) { // batch for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len @@ -6077,8 +6077,8 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const float * const src = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + float * dst_data = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = src[0]; const float x1 = src[n_dims]; @@ -6093,8 +6093,8 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const float * const src = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + float * dst_data = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -6108,8 +6108,8 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const float * const src = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[1]; @@ -6126,8 +6126,8 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const float * const src = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + float * dst_data = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = src[0]; const float x1 = src[n_dims]; @@ -6138,8 +6138,8 @@ static void ggml_compute_forward_rope_f32( } else { // fill the remain channels with data from src tensor for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const float * const src = (float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); dst_data[0] = src[0]; dst_data[1] = src[1]; @@ -6223,7 +6223,7 @@ static void ggml_compute_forward_rope_f16( if (src2 != NULL) { GGML_ASSERT(src2->type == GGML_TYPE_F32); GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; + freq_factors = (const float *) tensor_data(src2); } // backward process uses inverse rotation by cos and sin. 
@@ -6231,7 +6231,7 @@ static void ggml_compute_forward_rope_f16( // this essentially just switches the sign of sin. const float sin_sign = forward ? 1.0f : -1.0f; - const int32_t * pos = (const int32_t *) src1->data; + const int32_t * pos = (const int32_t *) tensor_data(src1); for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { @@ -6263,8 +6263,8 @@ static void ggml_compute_forward_rope_f16( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); @@ -6279,8 +6279,8 @@ static void ggml_compute_forward_rope_f16( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]); @@ -6294,8 +6294,8 @@ static void ggml_compute_forward_rope_f16( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); const float x1 = GGML_CPU_FP16_TO_FP32(src[1]); @@ -6312,8 +6312,8 @@ static void ggml_compute_forward_rope_f16( const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); @@ -6323,8 +6323,8 @@ static void ggml_compute_forward_rope_f16( } } else { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * 
dst_data = (ggml_fp16_t *)((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); dst_data[0] = src[0]; dst_data[1] = src[1]; @@ -6413,7 +6413,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i02*nb02 + i01*nb01); ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ne02 + i02] = src[i00]; @@ -6428,7 +6428,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( ggml_fp16_t * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); + const float * const src = (float *)((char *) tensor_data(src1) + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]); } @@ -6436,7 +6436,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( } // need to zero dst since we are accumulating into it - memset(dst->data, 0, ggml_nbytes(dst)); + memset(tensor_data(dst), 0, ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -6456,7 +6456,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( ggml_fp16_t * const wdata_src = wdata + nk; for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * dst_data = (float *)((char *) tensor_data(dst) + i1*nb1); ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; for (int i10 = 0; i10 < ne10; i10++) { const int i1n = i10*ne11; @@ -6501,7 +6501,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + const float * const src = (float *)((char *) tensor_data(src0) + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ne02 + i02] = src[i00]; @@ -6516,7 +6516,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( float * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); + const float * const src = (float *)((char *) tensor_data(src1) + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { dst_data[i10*ne11 + i11] = src[i10]; } @@ -6524,7 +6524,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( } // need to zero dst since we are accumulating into it - memset(dst->data, 0, ggml_nbytes(dst)); + memset(tensor_data(dst), 0, ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -6544,7 +6544,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( float * const wdata_src = wdata + nk; for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * dst_data = (float *)((char *) tensor_data(dst) + i1*nb1); float * wdata_kernel = wdata + i1*ne02*ne00; for (int i10 = 0; i10 < ne10; i10++) { const int i1n = i10*ne11; @@ -6626,7 +6626,7 @@ static void ggml_compute_forward_im2col_f32( // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { - float * const wdata = (float *) dst->data; + float * const wdata = (float *) tensor_data(dst); for (int64_t in = 0; in < N; in++) { for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 @@ -6635,7 +6635,7 @@ static void 
ggml_compute_forward_im2col_f32( // micro kernel float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + const float * const src_data = (float *)((char *) tensor_data(src1) + in*ofs0 + iic*ofs1); // [IH, IW] for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 for (int64_t ikw = 0; ikw < KW; ikw++) { @@ -6704,7 +6704,7 @@ static void ggml_compute_forward_im2col_f16( // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + ggml_fp16_t * const wdata = (ggml_fp16_t *) tensor_data(dst); for (int64_t in = 0; in < N; in++) { for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 @@ -6713,7 +6713,7 @@ static void ggml_compute_forward_im2col_f16( // micro kernel ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + const float * const src_data = (float *)((char *) tensor_data(src1) + in*ofs0 + iic*ofs1); // [IH, IW] for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 for (int64_t ikw = 0; ikw < KW; ikw++) { @@ -6797,7 +6797,7 @@ void ggml_compute_forward_im2col_back_f32( // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { - float * const wdata = (float *) dst->data; + float * const wdata = (float *) tensor_data(dst); for (int64_t in = 0; in < N; in++) { for (int64_t iic = ith; iic < IC; iic += nth) { @@ -6834,7 +6834,7 @@ void ggml_compute_forward_im2col_back_f32( continue; } - const float * const grad_in = (const float *) src0->data + const float * const grad_in = (const float *) tensor_data(src0) + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] grad += grad_in[iic*(KH*KW) + ikh*KW + ikw]; } @@ -6861,7 +6861,7 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params src1.nb[1] = k * traits->type_size; src1.nb[2] = src1.nb[1]; src1.nb[3] = src1.nb[2]; - src1.data = a; + tensor_set_data(&src1, a); struct ggml_tensor src0 = {}; src0.type = type; @@ -6873,7 +6873,7 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params src0.nb[1] = k * traits->type_size; src0.nb[2] = src0.nb[1]; src0.nb[3] = src0.nb[2]; - src0.data = b; + tensor_set_data(&src0, b); struct ggml_tensor dst = {}; dst.ne[0] = n; @@ -6884,7 +6884,7 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params dst.nb[1] = n * sizeof(float); dst.nb[2] = dst.nb[1]; dst.nb[3] = dst.nb[2]; - dst.data = c; + tensor_set_data(&dst, c); dst.src[0] = &src0; dst.src[1] = &src1; @@ -6923,9 +6923,9 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t dst_w = dst->ne[0]; const int64_t dst_h = dst->ne[1]; - const float * src_data = (float *) src->data; - void * knl_data = kernel->data; - float * dst_data = (float *) dst->data; + const float * src_data = (float *) tensor_data(src); + void * knl_data = tensor_data(kernel); + float * dst_data = (float *) tensor_data(dst); const int64_t knl_n = knl_w * knl_h * c_in; const int64_t patch_total = dst->ne[3] * dst_w * dst_h; @@ -7060,7 +7060,7 @@ void ggml_compute_forward_conv_transpose_2d( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) tensor_data(src0) + i03*nb03 + i02*nb02); ggml_fp16_t * dst_data = 
wdata + i02*ne01*ne00*ne03; for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { @@ -7076,7 +7076,7 @@ void ggml_compute_forward_conv_transpose_2d( ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; for (int i12 = 0; i12 < ne12; i12++) { for (int i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + const float * const src = (float *)((char *) tensor_data(src1) + i12*nb12 + i11*nb11); ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); @@ -7085,7 +7085,7 @@ void ggml_compute_forward_conv_transpose_2d( } } - memset(dst->data, 0, ggml_nbytes(dst)); + memset(tensor_data(dst), 0, ggml_nbytes(dst)); } ggml_barrier(params->threadpool); @@ -7105,7 +7105,7 @@ void ggml_compute_forward_conv_transpose_2d( ggml_fp16_t * const wdata_src = wdata + nk; for (int i2 = ip0; i2 < ip1; i2++) { // Cout - float * dst_data = (float *)((char *) dst->data + i2*nb2); + float * dst_data = (float *)((char *) tensor_data(dst) + i2*nb2); ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; for (int i11 = 0; i11 < ne11; i11++) { for (int i10 = 0; i10 < ne10; i10++) { @@ -7151,7 +7151,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( const ggml_conv_2d_dw_params & p) { const int64_t c = p.channels; - const float * knl_data = (const float *)kernel->data; + const float * knl_data = (const float *)tensor_data(kernel); const int64_t rows_total = p.dst_h * p.batch; const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; @@ -7168,9 +7168,9 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( for (int64_t row = row_start; row < row_end; ++row) { const int64_t dst_y = row % p.dst_h; - const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + const float * src_data = (const float *)tensor_data(src) + (row / p.dst_h) * p.src_w * p.src_h * c; for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { - float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + float * dst_data = (float *)tensor_data(dst) + (row * p.dst_w + dst_x) * c; const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; @@ -7232,9 +7232,9 @@ static void ggml_compute_forward_conv_2d_dw_whcn( const int64_t end = MIN(start + per_thread, n); for (int64_t i = start; i < end; ++i) { - const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; - const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; - float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + const float * knl_data = (const float *)tensor_data(kernel) + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)tensor_data(src) + i * p.src_w * p.src_h; + float * dst_data = (float *)tensor_data(dst) + i * p.dst_w * p.dst_h; for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { @@ -7312,9 +7312,9 @@ static void ggml_compute_forward_pool_1d_sk_p0( return; } - const char * cdata = (const char *)src->data; + const char * cdata = (const char *)tensor_data(src); const char * const data_end = cdata + ggml_nbytes(src); - float * drow = (float *)dst->data; + float * drow = (float *)tensor_data(dst); const int64_t rs = dst->ne[0]; @@ -7387,14 +7387,14 @@ void ggml_compute_forward_pool_2d( const int s1 = opts[4]; const int p0 = opts[5]; const int p1 = opts[6]; 
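// [editor's note] The mechanical change throughout this patch is to stop touching the `data`
// field of ggml_tensor directly and to go through the tensor_data()/tensor_set_data()
// accessors instead (the temporary src0/src1/dst tensors built in ggml_call_mul_mat above use
// tensor_set_data() for the same reason). The accessor definitions are outside these hunks;
// a minimal sketch of what they are assumed to look like, hypothetical and for illustration only:

static inline void * tensor_data(const struct ggml_tensor * t) {
    // single indirection point over the raw pointer, so that an extra offset or a remapped
    // allocation could later be introduced without editing every call site again
    return t->data;
}

static inline void tensor_set_data(struct ggml_tensor * t, void * data) {
    t->data = data;
}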
- const char * cdata = (const char*)src->data; + const char * cdata = (const char*)tensor_data(src); const char * const data_end = cdata + ggml_nbytes(src); const int64_t px = dst->ne[0]; const int64_t py = dst->ne[1]; const int64_t pa = px * py; - float * dplane = (float *)dst->data; + float * dplane = (float *)tensor_data(dst); const int ka = k0 * k1; const int offset0 = -p0; @@ -7465,8 +7465,8 @@ void ggml_compute_forward_pool_2d_back( const int p0 = opts[5]; const int p1 = opts[6]; - char * cdata = (char *) dst->data; - const char * cdataf = (const char *) dstf->data; + char * cdata = (char *) tensor_data(dst); + const char * cdataf = (const char *) tensor_data(dstf); const char * const data_end = cdata + ggml_nbytes(dst); GGML_ASSERT(params->ith == 0); @@ -7476,7 +7476,7 @@ void ggml_compute_forward_pool_2d_back( const int64_t py = src->ne[1]; const int64_t pa = px * py; - const float * splane = (const float *) src->data; + const float * splane = (const float *) tensor_data(src); const int ka = k0 * k1; const int offset0 = -p0; @@ -7596,8 +7596,8 @@ static void ggml_compute_forward_upscale_f32( for (int64_t i0 = 0; i0 < ne0; i0++) { const int64_t i00 = i0 / sf0; - const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + const float * x = (float *)((char *) tensor_data(src0) + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y = *x; } @@ -7639,14 +7639,14 @@ static void ggml_compute_forward_upscale_f32( dx = std::max(0.0f, std::min(dx, 1.0f)); // fetch the four surrounding pixel values and interpolate - const float a = *(const float *)((const char *)src0->data + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03); - const float b = *(const float *)((const char *)src0->data + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03); - const float c = *(const float *)((const char *)src0->data + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03); - const float d = *(const float *)((const char *)src0->data + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03); + const float a = *(const float *)((const char *)tensor_data(src0) + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03); + const float b = *(const float *)((const char *)tensor_data(src0) + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03); + const float c = *(const float *)((const char *)tensor_data(src0) + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03); + const float d = *(const float *)((const char *)tensor_data(src0) + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03); const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy; - float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + float * y_dst = (float *)((char *)tensor_data(dst) + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y_dst = val; } } @@ -7692,7 +7692,7 @@ static void ggml_compute_forward_pad_f32( GGML_TENSOR_UNARY_OP_LOCALS - float * dst_ptr = (float *) dst->data; + float * dst_ptr = (float *) tensor_data(dst); // TODO: optimize @@ -7702,7 +7702,7 @@ static void ggml_compute_forward_pad_f32( for (int64_t i3 = 0; i3 < ne3; ++i3) { const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + const float * src_ptr = (const float *)((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { dst_ptr[dst_idx] = *src_ptr; @@ -7756,10 
+7756,10 @@ void ggml_compute_forward_pad_reflect_1d( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); - float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); + float * left = (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); + float * right = (float *) ((char *) tensor_data(dst) + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); - ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01)); + ggml_vec_cpy_f32(ne00, left, (float *) ((char *) tensor_data(src0) + i3*nb03 + i2*nb02 + i1*nb01)); for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; } for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } @@ -7784,8 +7784,8 @@ static void ggml_compute_forward_roll_f32( ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src_data = (const float *) src0->data; - float * dst_data = (float *) dst->data; + const float * src_data = (const float *) tensor_data(src0); + float * dst_data = (float *) tensor_data(dst); GGML_TENSOR_UNARY_OP_LOCALS @@ -7856,7 +7856,7 @@ static void ggml_compute_forward_arange_f32( for (int64_t i = ith; i < steps; i+= nth) { float value = start + step * i; - ((float *)dst->data)[i] = value; + ((float *)tensor_data(dst))[i] = value; } } @@ -7894,9 +7894,9 @@ static void ggml_compute_forward_timestep_embedding_f32( int half = dim / 2; for (int64_t i = 0; i < ne00; i++) { - float * embed_data = (float *)((char *) dst->data + i*nb1); + float * embed_data = (float *)((char *) tensor_data(dst) + i*nb1); for (int64_t j = ith; j < half; j += nth) { - float timestep = ((float *)src0->data)[i]; + float timestep = ((float *)tensor_data(src0))[i]; float freq = (float)expf(-logf(max_period) * j / half); float arg = timestep * freq; embed_data[j] = cosf(arg); @@ -7946,8 +7946,8 @@ static void ggml_compute_forward_argsort_f32( ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0); for (int64_t i = ith; i < nr; i += nth) { - int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); - const float * src_data = (float *)((char *) src0->data + i*nb01); + int32_t * dst_data = (int32_t *)((char *) tensor_data(dst) + i*nb1); + const float * src_data = (float *)((char *) tensor_data(src0) + i*nb01); for (int64_t j = 0; j < ne0; j++) { dst_data[j] = j; @@ -8100,7 +8100,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( memset(VKQ32, 0, DV*sizeof(float)); } - const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL; + const ggml_fp16_t * mp = mask ? 
(ggml_fp16_t *)((char *) tensor_data(mask) + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL; // k indices const int ik3 = iq3 / rk3; @@ -8110,7 +8110,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( const int iv3 = iq3 / rv3; const int iv2 = iq2 / rv2; - const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); + const float * pq = (const float *) ((char *) tensor_data(q) + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); q_to_vec_dot(pq, Q_q, DK); // online softmax / attention @@ -8124,7 +8124,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( float s; // KQ value - const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); + const char * k_data = (const char *) tensor_data(k) + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1); s = s*scale; // scale KQ value @@ -8140,7 +8140,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value float vs = 1.0f; // post-softmax KQ value, expf(s - M) - const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); + const char * v_data = ((const char *) tensor_data(v) + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); if (v->type == GGML_TYPE_F16) { if (s > M) { @@ -8199,10 +8199,10 @@ static void ggml_compute_forward_flash_attn_ext_f16( const int i3 = iq3; // original - //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); + //memcpy((char *) tensor_data(dst) + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); // permute(0, 2, 1, 3) - memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); + memcpy((char *) tensor_data(dst) + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); } } @@ -8286,7 +8286,7 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb2 <= nb3); if (ith == 0) { - memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + memset(tensor_data(dst), 0, nb0*ne0*ne1*ne2*ne3); } ggml_barrier(params->threadpool); @@ -8301,9 +8301,9 @@ static void ggml_compute_forward_flash_attn_back_f32( const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + offs_k; - void * grad_v = (char *) dst->data + offs_v; + void * grad_q = (char *) tensor_data(dst); + void * grad_k = (char *) tensor_data(dst) + offs_k; + void * grad_v = (char *) tensor_data(dst) + offs_v; const size_t nbgq1 = nb0*neq0; const size_t nbgq2 = nb0*neq0*neq1; @@ -8373,8 +8373,8 @@ static void ggml_compute_forward_flash_attn_back_f32( ggml_vec_dot_f32(neq0, S + i1, 0, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); + (float *) ((char *) tensor_data(k) + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) tensor_data(q) + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -8482,8 +8482,8 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int64_t ic = 0; ic < D; ++ic) { ggml_vec_mad_f32(masked_begin, S, - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + (float *) ((char *) tensor_data(v) + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) tensor_data(d) + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); } // S = SM * (S - dot(SM, S)) @@ -8512,7 +8512,7 @@ 
static void ggml_compute_forward_flash_attn_back_f32( for (int64_t ic = 0; ic < masked_begin; ++ic) { ggml_vec_mad_f32(D, (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), - (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) tensor_data(k) + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), S[ic]); } @@ -8524,7 +8524,7 @@ static void ggml_compute_forward_flash_attn_back_f32( for (int64_t ic = 0; ic < masked_begin; ++ic) { ggml_vec_mad_f32(D, (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + (float *) ((char *) tensor_data(q) + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), S[ic]); } @@ -8537,7 +8537,7 @@ static void ggml_compute_forward_flash_attn_back_f32( ggml_vec_mad_f32(masked_begin, (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), SM, - *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + *(float *) ((char *) tensor_data(d) + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); } } } @@ -8597,9 +8597,9 @@ static void ggml_compute_forward_ssm_conv_f32( for (int i2 = 0; i2 < n_t; ++i2) { // {d_conv - 1 + n_t, d_inner, n_seqs} // sliding window - const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} - const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner} - float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} + const float * s = (const float *) ((const char *) tensor_data(src0) + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} + const float * c = (const float *) ((const char *) tensor_data(src1) + ir0*(src1->nb[1])); // {d_conv, d_inner} + float * x = (float *) ((char *) tensor_data(dst) + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} // TODO: transpose the output for smaller strides for big batches? 
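// [editor's note] The per-row loop that follows this ssm_conv hunk is unchanged and therefore
// not shown. For context, it is assumed to take, for each d_inner row owned by this thread, a
// dot product of a d_conv-wide sliding window of the state with the per-channel kernel, using
// the s, c, x pointers set up above. A rough sketch (nc = d_conv, ncs = state row stride,
// ir = rows per thread; these names are placeholders and may not match the source):
//
//     for (int i1 = 0; i1 < ir; ++i1) {
//         float sumf = 0.0f;
//         for (int i0 = 0; i0 < nc; ++i0) {            // d_conv taps
//             sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];  // window element * kernel weight
//         }
//         x[i1] = sumf;                                // one output per inner channel
//     }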
// d_inner @@ -8677,19 +8677,19 @@ static void ggml_compute_forward_ssm_scan_f32( const int ih0 = dh*ith; const int ih1 = MIN(ih0 + dh, nh); - const int32_t * ids = (const int32_t *) src6->data; + const int32_t * ids = (const int32_t *) tensor_data(src6); for (int i3 = 0; i3 < ns; ++i3) { - const float * s0 = (const float *) ((const char *) src0->data + ids[i3]*(src0->nb[3])); // {d_state, dim, nh, ns} - float * s = ( float *) (( char *) dst->data + i3*(src0->nb[3]) + s_off); // {d_state, dim, nh, ns} + const float * s0 = (const float *) ((const char *) tensor_data(src0) + ids[i3]*(src0->nb[3])); // {d_state, dim, nh, ns} + float * s = ( float *) (( char *) tensor_data(dst) + i3*(src0->nb[3]) + s_off); // {d_state, dim, nh, ns} for (int i2 = 0; i2 < nt; ++i2) { - const float * x = (const float *) ((const char *) src1->data + i2*(src1->nb[2]) + i3*(src1->nb[3])); // {dim, nh, nt, ns} - const float * dt = (const float *) ((const char *) src2->data + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {nh, nt, ns} - const float * A = (const float *) ((const char *) src3->data); // {d_state, nh} or {1, nh} - const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[2]) + i3*(src4->nb[3])); // {d_state, ng, nt, ns} - const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[2]) + i3*(src5->nb[3])); // {d_state, ng, nt, ns} - float * y = ( float *) (( char *) dst->data + i2*(nh*nr*sizeof(float)) + i3*(nt*nh*nr*sizeof(float))); // {dim, nh, nt, ns} + const float * x = (const float *) ((const char *) tensor_data(src1) + i2*(src1->nb[2]) + i3*(src1->nb[3])); // {dim, nh, nt, ns} + const float * dt = (const float *) ((const char *) tensor_data(src2) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {nh, nt, ns} + const float * A = (const float *) ((const char *) tensor_data(src3)); // {d_state, nh} or {1, nh} + const float * B = (const float *) ((const char *) tensor_data(src4) + i2*(src4->nb[2]) + i3*(src4->nb[3])); // {d_state, ng, nt, ns} + const float * C = (const float *) ((const char *) tensor_data(src5) + i2*(src5->nb[2]) + i3*(src5->nb[3])); // {d_state, ng, nt, ns} + float * y = ( float *) (( char *) tensor_data(dst) + i2*(nh*nr*sizeof(float)) + i3*(nt*nh*nr*sizeof(float))); // {dim, nh, nt, ns} if (src3->ne[0] == 1) { // Mamba-2 has a scalar decay factor per head; dA can be outside the state-wise loop @@ -8893,9 +8893,9 @@ static void ggml_compute_forward_win_part_f32( const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { - ((float *) dst->data)[i] = 0.0f; + ((float *) tensor_data(dst))[i] = 0.0f; } else { - ((float *) dst->data)[i] = ((float *) src0->data)[j]; + ((float *) tensor_data(dst))[i] = ((float *) tensor_data(src0))[j]; } } } @@ -8959,7 +8959,7 @@ static void ggml_compute_forward_win_unpart_f32( const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; - ((float *) dst->data)[j] = ((float *) src0->data)[i]; + ((float *) tensor_data(dst))[j] = ((float *) tensor_data(src0))[i]; } } } @@ -9110,8 +9110,8 @@ static void ggml_compute_forward_get_rel_pos_f16( const int64_t w = ne1; - ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; - ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + ggml_fp16_t * src0_data = (ggml_fp16_t *) tensor_data(src0); + ggml_fp16_t * dst_data = (ggml_fp16_t *) tensor_data(dst); for (int64_t i2 = 0; i2 < ne2; ++i2) { for (int64_t i1 = 0; i1 < ne1; ++i1) { @@ -9155,15 +9155,15 @@ static void ggml_compute_forward_add_rel_pos_f32( 
const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace) { if (params->ith == 0) { - memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + memcpy((char *) tensor_data(dst), (char *) tensor_data(src0), ggml_nbytes(dst)); } ggml_barrier(params->threadpool); } // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 - float * src1_data = (float *) src1->data; - float * src2_data = (float *) src2->data; - float * dst_data = (float *) dst->data; + float * src1_data = (float *) tensor_data(src1); + float * src2_data = (float *) tensor_data(src2); + float * dst_data = (float *) tensor_data(dst); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -9234,8 +9234,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32( const int64_t n_seqs = dst->src[5]->ne[1]; const int64_t head_size = C / HEADS; - float * dst_data = (float *) dst->data; - float * state = ((float *) dst->data) + C * T; + float * dst_data = (float *) tensor_data(dst); + float * state = ((float *) tensor_data(dst)) + C * T; const int ith = params->ith; const int nth = params->nth; @@ -9248,11 +9248,11 @@ static void ggml_compute_forward_rwkv_wkv6_f32( const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? (HEADS * (ith + 1)) / nth : HEADS; - float * k = (float *) dst->src[0]->data; - float * v = (float *) dst->src[1]->data; - float * r = (float *) dst->src[2]->data; - float * time_faaaa = (float *) dst->src[3]->data; - float * time_decay = (float *) dst->src[4]->data; + float * k = (float *) tensor_data(dst->src[0]); + float * v = (float *) tensor_data(dst->src[1]); + float * r = (float *) tensor_data(dst->src[2]); + float * time_faaaa = (float *) tensor_data(dst->src[3]); + float * time_decay = (float *) tensor_data(dst->src[4]); size_t t_stride = HEADS * head_size; // Same to C @@ -9313,7 +9313,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32( size_t t_offset = t * t_stride; size_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[5]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { size_t h_offset = h * h_stride; @@ -9385,7 +9385,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32( size_t t_offset = t * t_stride; size_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[5]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { size_t h_offset = h * h_stride; @@ -9451,8 +9451,8 @@ static void ggml_compute_forward_gla_f32( const int64_t head_size = C / HEADS; const float scale = ggml_get_op_params_f32(dst, 0); - float * dst_data = (float *) dst->data; - float * state = ((float *) dst->data) + C * T; + float * dst_data = (float *) tensor_data(dst); + float * state = ((float *) tensor_data(dst)) + C * T; const int ith = params->ith; const int nth = params->nth; @@ -9465,10 +9465,10 @@ static void ggml_compute_forward_gla_f32( const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? 
(HEADS * (ith + 1)) / nth : HEADS; - float * k = (float *) dst->src[0]->data; - float * v = (float *) dst->src[1]->data; - float * q = (float *) dst->src[2]->data; - float * g = (float *) dst->src[3]->data; + float * k = (float *) tensor_data(dst->src[0]); + float * v = (float *) tensor_data(dst->src[1]); + float * q = (float *) tensor_data(dst->src[2]); + float * g = (float *) tensor_data(dst->src[3]); size_t t_stride = HEADS * head_size; // Same to C @@ -9529,7 +9529,7 @@ static void ggml_compute_forward_gla_f32( size_t t_offset = t * t_stride; size_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[4]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { size_t h_offset = h * h_stride; @@ -9593,7 +9593,7 @@ static void ggml_compute_forward_gla_f32( size_t t_offset = t * t_stride; size_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[4]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { size_t h_offset = h * h_stride; @@ -9655,8 +9655,8 @@ static void ggml_compute_forward_rwkv_wkv7_f32( const int64_t n_seqs = dst->src[6]->ne[1]; const int64_t head_size = C / HEADS; - float * dst_data = (float *) dst->data; - float * state = ((float *) dst->data) + C * T; + float * dst_data = (float *) tensor_data(dst); + float * state = ((float *) tensor_data(dst)) + C * T; const int ith = params->ith; const int nth = params->nth; @@ -9669,12 +9669,12 @@ static void ggml_compute_forward_rwkv_wkv7_f32( const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? (HEADS * (ith + 1)) / nth : HEADS; - float * r = (float *) dst->src[0]->data; - float * w = (float *) dst->src[1]->data; - float * k = (float *) dst->src[2]->data; - float * v = (float *) dst->src[3]->data; - float * a = (float *) dst->src[4]->data; - float * b = (float *) dst->src[5]->data; + float * r = (float *) tensor_data(dst->src[0]); + float * w = (float *) tensor_data(dst->src[1]); + float * k = (float *) tensor_data(dst->src[2]); + float * v = (float *) tensor_data(dst->src[3]); + float * a = (float *) tensor_data(dst->src[4]); + float * b = (float *) tensor_data(dst->src[5]); int64_t t_stride = HEADS * head_size; // Same to C @@ -9689,7 +9689,7 @@ static void ggml_compute_forward_rwkv_wkv7_f32( int64_t t_offset = t * t_stride; int64_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[6]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { int64_t h_offset = h * h_stride; @@ -9729,7 +9729,7 @@ static void ggml_compute_forward_rwkv_wkv7_f32( int64_t t_offset = t * t_stride; int64_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? 
state_cur : (float*)tensor_data(dst->src[6]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { int64_t h_offset = h * h_stride; @@ -9808,7 +9808,7 @@ static void ggml_compute_forward_rwkv_wkv7_f32( int64_t t_offset = t * t_stride; int64_t state_offset = head_size * C * (t / (T / n_seqs)); float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)tensor_data(dst->src[6]) + state_offset; for (int64_t h = h_start; h < h_end; h++) { int64_t h_offset = h * h_stride; @@ -9960,8 +9960,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const int64_t ir1 = MIN(ir0 + dr, nr); for (int64_t i1 = ir0; i1 < ir1; ++i1) { - const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); - const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); + const float * s0 = (const float *)((const char *) tensor_data(src0) + i1*src0->nb[1]); + const float * s1 = (const float *)((const char *) tensor_data(src1) + i1*src1->nb[1]); #ifndef NDEBUG for (int64_t i = 0; i < nc; ++i) { @@ -9994,7 +9994,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( ggml_barrier(params->threadpool); if (ith == 0) { - float * dp = (float *) dst->data; + float * dp = (float *) tensor_data(dst); ggml_vec_sum_f32(nth, dp, sums); dp[0] *= -1.0f / (float) nr; } @@ -10048,12 +10048,12 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ir0 = dr*ith; const int64_t ir1 = MIN(ir0 + dr, nr); - const float d_by_nr = ((const float *) grad->data)[0] / (float) nr; + const float d_by_nr = ((const float *) tensor_data(grad))[0] / (float) nr; for (int64_t i1 = ir0; i1 < ir1; i1++) { - float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); - const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]); - const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]); + float * ds0 = (float *)((char *) tensor_data(dst) + i1*dst->nb[1]); + const float * s0 = (const float *)((const char *) tensor_data(src0f) + i1*src0f->nb[1]); + const float * s1 = (const float *)((const char *) tensor_data(src1f) + i1*src1f->nb[1]); #ifndef NDEBUG for (int64_t i = 0; i < nc; ++i) { @@ -10147,10 +10147,10 @@ static void ggml_compute_forward_opt_step_adamw_f32( const size_t offset = i03*nb03 + i02*nb02 + i01*nb01; - float * w = (float *) ((char *) src0->data + offset); // weight - const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad - float * m = (float *) ((char *) src0_grad_m->data + offset); - float * v = (float *) ((char *) src0_grad_v->data + offset); + float * w = (float *) ((char *) tensor_data(src0) + offset); // weight + const float * g = (const float *) ((const char *) tensor_data(src0_grad) + offset); // grad + float * m = (float *) ((char *) tensor_data(src0_grad_m) + offset); + float * v = (float *) ((char *) tensor_data(src0_grad_v) + offset); for (int i00 = 0; i00 < ne00; ++i00) { m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 72ee93a5abc7c..fdc00e04a5a20 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -920,7 +920,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 4 || interleave_block == 8); constexpr int nrows_interleaved = 4; - block_q4_0x4 * dst = (block_q4_0x4 *)t->data; 
+ block_q4_0x4 * dst = (block_q4_0x4 *)tensor_data(t); const block_q4_0 * src = (const block_q4_0 *)data; block_q4_0 dst_tmp[4]; int nrow = ggml_nrows(t); @@ -950,7 +950,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 8); constexpr int nrows_interleaved = 8; - block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; + block_q4_Kx8 * dst = (block_q4_Kx8*)tensor_data(t); const block_q4_K * src = (const block_q4_K*) data; block_q4_K dst_tmp[8]; int nrow = ggml_nrows(t); @@ -981,7 +981,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 8); constexpr int nrows_interleaved = 8; - block_q4_0x8 * dst = (block_q4_0x8*)t->data; + block_q4_0x8 * dst = (block_q4_0x8*)tensor_data(t); const block_q4_0 * src = (const block_q4_0*) data; block_q4_0 dst_tmp[8]; int nrow = ggml_nrows(t); @@ -1047,7 +1047,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b //GGML_ASSERT(interleave_block == 4 || interleave_block == 8); GGML_ASSERT(interleave_block == 4); - block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; + block_iq4_nlx4 * dst = (block_iq4_nlx4 *)tensor_data(t); const block_iq4_nl * src = (const block_iq4_nl *)data; block_iq4_nl dst_tmp[4]; int nrow = ggml_nrows(t); @@ -1239,12 +1239,12 @@ template ((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10); + ggml_quantize_mat_t((float *) ((char *) tensor_data(src1) + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10); } i11_processed = ne11 - ne11 % 4; for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { - from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); + from_float((float *) ((char *) tensor_data(src1) + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); } ggml_barrier(params->threadpool); @@ -1262,14 +1262,14 @@ template 3) { gemm(ne00, - (float *) ((char *) dst->data) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, + (float *) ((char *) tensor_data(dst) + src0_start), ne01, + (const char *) tensor_data(src0) + src0_start * nb01, (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); } for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { gemv(ne00, - (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, + (float *) ((char *) tensor_data(dst) + (iter * nb1)) + src0_start, ne01, + (const char *) tensor_data(src0) + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, src0_end - src0_start); } @@ -1332,7 +1332,7 @@ template param type for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), + from_float((float *)((char *) tensor_data(src1) + i12 * nb12 + i11 * nb11), (void *) (wdata + i12 * nbw2 + i11 * nbw1), ne10); } @@ -1348,7 +1348,7 @@ template ne[1]; ++iid1) { for (int32_t id = 0; id < n_ids; ++id) { const int32_t i02 = - *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + *(const int32_t *) ((const char *) tensor_data(ids) + iid1 * ids->nb[1] + id * ids->nb[0]); GGML_ASSERT(i02 >= 0 && i02 < n_as); @@ -1368,7 +1368,7 @@ template data + cur_a*nb02; + const auto * src0_cur = (const char *) tensor_data(src0) + cur_a*nb02; //const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows @@ -1397,7 +1397,7 @@ template (ne00, - (float 
*)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + (float *)((char *) tensor_data(dst) + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); } diff --git a/ggml/src/ggml-cpu/unary-ops.cpp b/ggml/src/ggml-cpu/unary-ops.cpp index 4fce569b3bfc8..7d4149d9b0ee0 100644 --- a/ggml/src/ggml-cpu/unary-ops.cpp +++ b/ggml/src/ggml-cpu/unary-ops.cpp @@ -92,8 +92,8 @@ static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst const int64_t i02 = (ir - i03*ne02*ne01)/ne01; const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + dst_t * dst_ptr = (dst_t *) ((char *) tensor_data(dst) + i03*nb3 + i02*nb2 + i01*nb1 ); + const src0_t * src0_ptr = (const src0_t *) ((const char *) tensor_data(src0) + i03*nb03 + i02*nb02 + i01*nb01); vec_unary_op(ne0, dst_ptr, src0_ptr); } diff --git a/ggml/src/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu index e084607c029a6..a8e711a437ee6 100644 --- a/ggml/src/ggml-cuda/acc.cu +++ b/ggml/src/ggml-cuda/acc.cu @@ -38,9 +38,9 @@ void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu index b5e495a246227..2757122bce716 100644 --- a/ggml/src/ggml-cuda/arange.cu +++ b/ggml/src/ggml-cuda/arange.cu @@ -15,7 +15,7 @@ static void arange_f32_cuda(float * dst, const int ne0, const float start, const } void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(dst->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index 5340eedc08916..12a539aae45ee 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -77,8 +77,8 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - const float * src0_d = (const float *) src0->data; - int32_t * dst_d = (int32_t *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + int32_t * dst_d = (int32_t *) tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 607ded8558b45..b2757fb81165d 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -87,8 +87,8 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == 
GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index e1fbf0e13665d..9d782a60f51d0 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -312,23 +312,23 @@ static void ggml_cuda_op_bin_bcast( } void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream()); + ggml_cuda_op_bin_bcast>(dst, dst->src[0], dst, nullptr, tensor_data(dst->src[0]), tensor_data(dst), ctx.stream()); } void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, tensor_data(dst->src[0]), tensor_data(dst->src[1]), tensor_data(dst), ctx.stream()); } void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, tensor_data(dst->src[0]), tensor_data(dst->src[1]), tensor_data(dst), ctx.stream()); } void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, tensor_data(dst->src[0]), tensor_data(dst->src[1]), tensor_data(dst), ctx.stream()); } void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, tensor_data(dst->src[0]), tensor_data(dst->src[1]), tensor_data(dst), ctx.stream()); } void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -352,8 +352,8 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst switch (dst->type) { case GGML_TYPE_F32: { - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream); } break; default: { diff --git a/ggml/src/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu index fe415e7f78dd6..5bb36fc07fece 100644 --- a/ggml/src/ggml-cuda/clamp.cu +++ b/ggml/src/ggml-cuda/clamp.cu @@ -24,8 +24,8 @@ static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const void * src0_d = src0->data; - void * dst_d = dst->data; + const void * src0_d = tensor_data(src0); + void * dst_d = tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index e9ffd274b9966..ae6a7efcd7ad6 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -167,10 +167,10 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->type == GGML_TYPE_F32); if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float * src0_d = (const 
float *)src0->data; - const float * src1_d = (const float *)src1->data; + const float * src0_d = (const float *)tensor_data(src0); + const float * src1_d = (const float *)tensor_data(src1); - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); if (dim != 3) { for (int i3 = 0; i3 < dst->ne[3]; i3++) { @@ -192,7 +192,7 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]); auto launch_kernel = [&](auto dim) { concat_f32_non_cont<<>>( - (const char *) src0->data, (const char *) src1->data, (char *) dst->data, + (const char *) tensor_data(src0), (const char *) tensor_data(src1), (char *) tensor_data(dst), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cu b/ggml/src/ggml-cuda/conv-transpose-1d.cu index fe4caf674d4d9..81b11bd7b0939 100644 --- a/ggml/src/ggml-cuda/conv-transpose-1d.cu +++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu @@ -59,12 +59,12 @@ static void conv_transpose_1d_f32_f32_cuda( void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; + const float * src0_d = (const float *)tensor_data(src0); const ggml_tensor * src1 = dst->src[1]; - const float * src1_d = (const float *)src1->data; + const float * src1_d = (const float *)tensor_data(src1); - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu index 7583233b1b7cd..0a3fd67b94189 100644 --- a/ggml/src/ggml-cuda/conv2d-dw.cu +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -121,9 +121,9 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const ggml_tensor * input = dst->src[1]; GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - const float * w_d = (const float *) kernel->data; - const float * x_d = (const float *) input->data; - float * y_d = (float *) dst->data; + const float * w_d = (const float *) tensor_data(kernel); + const float * x_d = (const float *) tensor_data(input); + float * y_d = (float *) tensor_data(dst); const int32_t * p = (const int32_t *) dst->op_params; const int stride_x = p[0]; diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu index 03224e404d32d..866d4bac58f6b 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cu +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -58,9 +58,9 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - const float * input_data = (const float *) input->data; - float * output_data = (float *) dst->data; - const half * kernel_data = (const half *) kernel->data; + const float * input_data = (const float *) tensor_data(input); + float * output_data = (float *) tensor_data(dst); + const half * kernel_data = (const half *) tensor_data(kernel); const int input_w = input->ne[0]; const int input_h = input->ne[1]; diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu index 08898115daed2..c91ad25e69f00 100644 --- a/ggml/src/ggml-cuda/count-equal.cu +++ 
b/ggml/src/ggml-cuda/count-equal.cu @@ -37,7 +37,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(dst)); - int64_t * dst_d = (int64_t *) dst->data; + int64_t * dst_d = (int64_t *) tensor_data(dst); cudaStream_t stream = ctx.stream(); const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; @@ -53,8 +53,8 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_I32: { - const int * src0_d = (const int *) src0->data; - const int * src1_d = (const int *) src1->data; + const int * src0_d = (const int *) tensor_data(src0); + const int * src1_d = (const int *) tensor_data(src1); count_equal<<>>(src0_d, src1_d, dst_d, dne, ne); } break; default: diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 0e5964907e186..54212528051a9 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -309,8 +309,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg cudaStream_t main_stream = ctx.stream(); - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; + char * src0_ddc = (char *) tensor_data(src0); + char * src1_ddc = (char *) tensor_data(src1); char ** dest_ptrs_d = nullptr; int graph_cpynode_index = -1; diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index 0c8b0819724e4..8b8dc4e587ed8 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -106,9 +106,9 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * const int64_t ne00 = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); ggml_cuda_pool & pool = ctx.pool(); cudaStream_t stream = ctx.stream(); @@ -154,10 +154,10 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten const int64_t ne00 = src0f->ne[0]; const int64_t nrows = ggml_nrows(src0f); - const float * grad_d = (const float *) grad->data; - const float * src0f_d = (const float *) src0f->data; - const float * src1f_d = (const float *) src1f->data; - float * dst_d = (float *) dst->data; + const float * grad_d = (const float *) tensor_data(grad); + const float * src0f_d = (const float *) tensor_data(src0f); + const float * src1f_d = (const float *) tensor_data(src1f); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu index 4b713ba22eb53..826d54a3d45d9 100644 --- a/ggml/src/ggml-cuda/diagmask.cu +++ b/ggml/src/ggml-cuda/diagmask.cu @@ -23,8 +23,8 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git 
a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 95e704e393c2a..5e96d1df463f8 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -714,12 +714,12 @@ void launch_fattn( ggml_cuda_pool_alloc dst_tmp(pool); ggml_cuda_pool_alloc dst_tmp_meta(pool); - const char * K_data = (const char *) K->data; + const char * K_data = (const char *) tensor_data(K); size_t nb11 = K->nb[1]; size_t nb12 = K->nb[2]; size_t nb13 = K->nb[3]; - const char * V_data = V ? (const char *) V->data : nullptr; + const char * V_data = V ? (const char *) tensor_data(V) : nullptr; size_t nb21 = V ? V->nb[1] : nb11; size_t nb22 = V ? V->nb[2] : nb12; size_t nb23 = V ? V->nb[3] : nb13; @@ -866,11 +866,12 @@ void launch_fattn( GGML_ASSERT(block_dim.x % warp_size == 0); fattn_kernel<<>>( - (const char *) Q->data, + (const char *) tensor_data(Q), K_data, V_data, - mask ? ((const char *) mask->data) : nullptr, - !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr, + mask ? ((const char *) tensor_data(mask)) : nullptr, + !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) tensor_data(KQV), + dst_tmp_meta.ptr, scale, max_bias, m0, m1, n_head_log2, logit_softcap, Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3], K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13, @@ -887,7 +888,7 @@ void launch_fattn( flash_attn_stream_k_fixup <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]); + ((float *) tensor_data(KQV), dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]); } } else if (parallel_blocks > 1) { const dim3 block_dim_combine(DV, 1, 1); @@ -896,7 +897,7 @@ void launch_fattn( flash_attn_combine_results <<>> - (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); + (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) tensor_data(KQV), parallel_blocks); } CUDA_CHECK(cudaGetLastError()); } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index f77b2629a19b0..5bae0ec3aa160 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -247,7 +247,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type, + get_rows_cuda(tensor_data(src0), src0->type, (const int32_t *) tensor_data(src1), tensor_data(dst), dst->type, ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); } @@ -257,9 +257,9 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d GGML_TENSOR_BINARY_OP_LOCALS - const float * src0_d = (const float *) src0->data; - const int32_t * src1_d = (const int32_t *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const int32_t * src1_d = (const int32_t *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 03c380897cd8a..a4a6f8f2e5980 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -589,7 +589,7 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer if (padded_size > original_size) { ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemset((char *)tensor->data + 
original_size, 0, padded_size - original_size)); + CUDA_CHECK(cudaMemset((char *)tensor_data(tensor) + original_size, 0, padded_size - original_size)); } } return GGML_STATUS_SUCCESS; @@ -599,7 +599,7 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread)); + CUDA_CHECK(cudaMemsetAsync((char *)tensor_data(tensor) + offset, value, size, cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } @@ -607,7 +607,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + CUDA_CHECK(cudaMemcpyAsync((char *)tensor_data(tensor) + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } @@ -615,7 +615,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor_data(tensor) + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } @@ -624,12 +624,12 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; if (src_ctx->device == dst_ctx->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + CUDA_CHECK(cudaMemcpyAsync(tensor_data(dst), tensor_data(src), ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { #ifdef GGML_CUDA_NO_PEER_COPY return false; #else - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); + CUDA_CHECK(cudaMemcpyPeerAsync(tensor_data(dst), dst_ctx->device, tensor_data(src), src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); #endif } CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); @@ -1172,7 +1172,7 @@ typedef void (*ggml_cuda_op_mul_mat_t)( static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { - const char * src_ptr = (const char *) src->data; + const char * src_ptr = (const char *) tensor_data(src); char * dst_ptr = (char *) dst; const int64_t ne0 = src->ne[0]; @@ -1556,7 +1556,7 @@ static void ggml_cuda_op_mul_mat( cudaStream_t stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data; + dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) tensor_data(src0); } else { // If src0 is not contiguous it will be copied to a temporary buffer. 
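The ggml_cuda_op_mul_mat changes in this hunk follow one pattern: when a tensor is already resident and contiguous on the target device, its pointer is now taken with tensor_data(); otherwise the data is staged through a per-device pool allocation. A condensed, hypothetical sketch of that selection (the real function also handles split buffers and quantized staging):

// Hedged sketch of the pointer-selection pattern; select_src1_ptr is an
// illustrative helper, not a function from this patch.
static float * select_src1_ptr(ggml_backend_cuda_context & ctx, int id,
                               const ggml_tensor * src1,
                               bool src1_on_device, bool src1_is_contiguous,
                               ggml_cuda_pool_alloc<float> & src1_ddf_alloc) {
    if (src1_on_device && src1_is_contiguous) {
        // reuse the tensor's own device buffer via the new accessor
        return (float *) tensor_data(src1);
    }
    // otherwise stage src1 into a temporary buffer from the per-device pool
    return src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
}

The dst_dd selection a few lines further down has the same shape, returning tensor_data(dst) when the result stays on the device and falling back to dst_dd_alloc when it must be copied back afterwards.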
// This buffer needs to be cleared entirely because multiple regions will function as padding. @@ -1576,7 +1576,7 @@ static void ggml_cuda_op_mul_mat( } if (src1_on_device && src1_is_contiguous) { - dev[id].src1_ddf = (float *) src1->data; + dev[id].src1_ddf = (float *) tensor_data(src1); } else { dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } @@ -1598,7 +1598,7 @@ static void ggml_cuda_op_mul_mat( } if (dst_on_device) { - dev[id].dst_dd = (float *) dst->data; + dev[id].dst_dd = (float *) tensor_data(dst); } else { const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); @@ -1673,7 +1673,7 @@ static void ggml_cuda_op_mul_mat( src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); } } else { - float * src1_ddf_i_source = (float *) src1->data; + float * src1_ddf_i_source = (float *) tensor_data(src1); src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device, src1_ncols*ne10*sizeof(float), stream)); @@ -1705,7 +1705,7 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { - void * dst_off_device = dst->data; + void * dst_off_device = tensor_data(dst); if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. @@ -1837,7 +1837,7 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct cudaStream_t main_stream = ctx.stream(); CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - float * dst_ddf = (float *) dst->data; + float * dst_ddf = (float *) tensor_data(dst); const size_t ts_src1 = ggml_type_size(src1->type); GGML_ASSERT(nb10 == ts_src1); int64_t s11 = nb11 / ts_src1; @@ -1851,11 +1851,11 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct ggml_cuda_pool_alloc src1_alloc(ctx.pool()); // Handle src0 - src0_ptr = (const cuda_t *) src0->data; + src0_ptr = (const cuda_t *) tensor_data(src0); // Handle src1 - convert if necessary if (src1->type == src0_type) { - src1_ptr = (const cuda_t *) src1->data; + src1_ptr = (const cuda_t *) tensor_data(src1); } else { // Convert src1 to target type using traits conversion functions const int64_t ne_src1 = ggml_nelements(src1); @@ -1863,7 +1863,7 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct const auto convert_func = traits::get_nc_converter(src1->type); GGML_ASSERT(convert_func != nullptr); - convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + convert_func(tensor_data(src1), src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); src1_ptr = src1_alloc.get(); s11 = ne10; s12 = ne11*s11; @@ -2119,7 +2119,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_pool_alloc dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); std::vector ids_host(ggml_nbytes(ids)); - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), tensor_data(ids), ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices @@ -2146,7 +2146,7 @@ static void 
ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; - get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, + get_rows_cuda(tensor_data(src1), src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, ne10, nb11, nb12, nb13, ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream); @@ -2164,7 +2164,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * src0_slice.nb[3] = src0_slice.nb[2]; src0_slice.op = GGML_OP_VIEW; src0_slice.view_src = dst->src[0]; // non-const pointer to src0 - src0_slice.data = (char *) src0->data + i02*nb02; + tensor_set_data(&src0_slice, (char *) tensor_data(src0) + i02*nb02); ggml_tensor src1_slice; memset(&src1_slice, 0, sizeof(src1_slice)); @@ -2178,7 +2178,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1]; src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2]; - src1_slice.data = src1_data_cur; + tensor_set_data(&src1_slice, src1_data_cur); ggml_tensor dst_slice; memset(&dst_slice, 0, sizeof(dst_slice)); @@ -2192,7 +2192,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1]; dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; - dst_slice.data = dst_data_cur; + tensor_set_data(&dst_slice, dst_data_cur); ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); CUDA_CHECK(cudaGetLastError()); @@ -2201,7 +2201,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_data_cur += dst_slice.nb[2]; } - get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type, + get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, tensor_data(dst), dst->type, ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), nb1, nb2, nb3, stream); @@ -2509,7 +2509,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); + CUDA_CHECK(cudaMemcpyAsync((char *)tensor_data(tensor) + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); } static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -2518,7 +2518,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor_data(tensor) + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); } static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t 
backend_dst, const ggml_tensor * src, ggml_tensor * dst) { @@ -2550,12 +2550,12 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ if (backend_src != backend_dst) { // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); + CUDA_CHECK(cudaMemcpyAsync(tensor_data(dst), tensor_data(src), ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } else { #ifdef GGML_CUDA_NO_PEER_COPY return false; #else - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + CUDA_CHECK(cudaMemcpyPeerAsync(tensor_data(dst), cuda_ctx_dst->device, tensor_data(src), cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); #endif } @@ -2571,7 +2571,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); } else { // src and dst are on the same backend - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); + CUDA_CHECK(cudaMemcpyAsync(tensor_data(dst), tensor_data(src), ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; } @@ -2631,7 +2631,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Store the pointers which are updated for each token, such that these can be sent // to the device and accessed using indirection from CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data); + cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) tensor_data(node->src[1])); // store a pointer to each copy op CUDA kernel to identify it later void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); @@ -2658,20 +2658,20 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud } static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { - graph_node_properties->node_address = node->data; + graph_node_properties->node_address = tensor_data(node); graph_node_properties->node_op = node->op; for (int i = 0; i < GGML_MAX_DIMS; i++) { graph_node_properties->ne[i] = node->ne[i]; graph_node_properties->nb[i] = node->nb[i]; } for (int i = 0; i < GGML_MAX_SRC; i++) { - graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr; + graph_node_properties->src_address[i] = node->src[i] ? 
tensor_data(node->src[i]) : nullptr; } memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS); } static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { - if (node->data != graph_node_properties->node_address && + if (tensor_data(node) != graph_node_properties->node_address && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW) { return false; @@ -2692,7 +2692,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra for (int i = 0; i < GGML_MAX_SRC; i++) { if (node->src[i] && - node->src[i]->data != graph_node_properties->src_address[i] && + tensor_data(node->src[i]) != graph_node_properties->src_address[i] && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW ) { diff --git a/ggml/src/ggml-cuda/gla.cu b/ggml/src/ggml-cuda/gla.cu index f7d615a8282fc..804eb3a20aa8a 100644 --- a/ggml/src/ggml-cuda/gla.cu +++ b/ggml/src/ggml-cuda/gla.cu @@ -62,11 +62,11 @@ static __global__ void gated_linear_attn_f32(const int B, const int T, const int } void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const float * k_d = (const float *)dst->src[0]->data; - const float * v_d = (const float *)dst->src[1]->data; - const float * r_d = (const float *)dst->src[2]->data; - const float * td_d = (const float *)dst->src[3]->data; - const float * s_d = (const float *)dst->src[4]->data; + const float * k_d = (const float *)tensor_data(dst->src[0]); + const float * v_d = (const float *)tensor_data(dst->src[1]); + const float * r_d = (const float *)tensor_data(dst->src[2]); + const float * td_d = (const float *)tensor_data(dst->src[3]); + const float * s_d = (const float *)tensor_data(dst->src[4]); const int64_t B = dst->src[4]->ne[1]; const int64_t T = dst->src[0]->ne[2]; @@ -76,7 +76,7 @@ void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor float scale; memcpy(&scale, (float*)dst->op_params, sizeof(float)); - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 5bb85b4807bcf..5712aeec73e09 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -65,8 +65,8 @@ static void im2col_cuda_f32(const float * x, float * dst, void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; + const float * src1_d = (const float *)tensor_data(src1); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src1->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 4b238a3998ba3..ded402dcbd3f1 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -2,8 +2,8 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 2db5b4ab0f09c..8d38e6531b917 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ 
b/ggml/src/ggml-cuda/mmq.cu @@ -85,9 +85,9 @@ void ggml_cuda_mul_mat_q( GGML_ASSERT( nb0 == ts_dst); GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - const char * src0_d = (const char *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const char * src0_d = (const char *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); // If src0 is a temporary compute buffer, clear any potential padding. if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -96,7 +96,7 @@ void ggml_cuda_mul_mat_q( if (size_alloc > size_data) { GGML_ASSERT(ggml_is_contiguously_allocated(src0)); GGML_ASSERT(!src0->view_src); - CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + CUDA_CHECK(cudaMemsetAsync((char *) tensor_data(src0) + size_data, 0, size_alloc - size_data, stream)); } } @@ -154,7 +154,7 @@ void ggml_cuda_mul_mat_q( std::vector expert_bounds_host(ne02 + 1); ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), tensor_data(ids), ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index e14c93516bddf..b7c954e84648f 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -329,9 +329,9 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; - const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) tensor_data(src1); + const int32_t * ids_d = ids ? 
(const int32_t *) tensor_data(ids) : nullptr; + float * dst_d = (float *) tensor_data(dst); const int64_t s01 = src0->nb[1] / ts_src0; const int64_t s11 = src1->nb[1] / ts_src1; @@ -354,19 +354,19 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * switch (src0->type) { case GGML_TYPE_F32: { - const float * src0_d = (const float *) src0->data; + const float * src0_d = (const float *) tensor_data(src0); mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { - const half * src0_d = (const half *) src0->data; + const half * src0_d = (const half *) tensor_data(src0); mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { - const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; + const nv_bfloat16 * src0_d = (const nv_bfloat16 *) tensor_data(src0); mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index dc7adf509fac0..13ebc281e04a9 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -509,9 +509,9 @@ void ggml_cuda_mul_mat_vec_q( GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. - const float * src1_d = (const float *) src1->data; - const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) tensor_data(src1); + const int32_t * ids_d = ids ? (const int32_t *) tensor_data(ids) : nullptr; + float * dst_d = (float *) tensor_data(dst); // If src0 is a temporary compute buffer, clear any potential padding. if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -520,7 +520,7 @@ void ggml_cuda_mul_mat_vec_q( if (size_alloc > size_data) { GGML_ASSERT(ggml_is_contiguously_allocated(src0)); GGML_ASSERT(!src0->view_src); - CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + CUDA_CHECK(cudaMemsetAsync((char *) tensor_data(src0) + size_data, 0, size_alloc - size_data, stream)); } } @@ -554,7 +554,7 @@ void ggml_cuda_mul_mat_vec_q( const int64_t stride_channel_y = ids ? 
s11 : s12; mul_mat_vec_q_switch_type( - src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, + tensor_data(src0), src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, stream); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index bddcca51b7bfc..608e9ac7b7c73 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -376,8 +376,8 @@ static void l2_norm_f32_cuda( void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -400,8 +400,8 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -419,8 +419,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -447,21 +447,21 @@ void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * memcpy(&eps, dst->op_params, sizeof(float)); - const float * src0_d = (const float *) rms_norm_src->data; + const float * src0_d = (const float *) tensor_data(rms_norm_src); const float * mul_d = nullptr; const ggml_tensor * mul_src = nullptr; if (mul_tensor->src[0] == dst) { - mul_d = (float *) mul_tensor->src[1]->data; + mul_d = (float *) tensor_data(mul_tensor->src[1]); mul_src = mul_tensor->src[1]; } else if(mul_tensor->src[1] == dst) { - mul_d = (float *) mul_tensor->src[0]->data; + mul_d = (float *) tensor_data(mul_tensor->src[0]); mul_src = mul_tensor->src[0]; } else { GGML_ASSERT(false); } - float * dst_d = (float *) mul_tensor->data; + float * dst_d = (float *) tensor_data(mul_tensor); cudaStream_t stream = ctx.stream(); GGML_ASSERT(rms_norm_src->type == GGML_TYPE_F32); @@ -498,9 +498,9 @@ void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * d const ggml_tensor * grad = dst->src[0]; // gradients const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass - const float * grad_d = (const float *) grad->data; - const float * src0f_d = (const float *) src0f->data; - float * dst_d = (float *) dst->data; + const float * grad_d = (const float *) tensor_data(grad); + const float * src0f_d = (const float *) tensor_data(src0f); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); @@ -522,8 +522,8 @@ void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * d void 
ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996652..cbb357896bd83 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -64,11 +64,11 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); GGML_ASSERT(ggml_nelements(adamw_params) == 7); - float * src0_d = (float *) src0->data; - const float * src0_grad_d = (const float *) src0_grad->data; - float * src0_grad_m_d = (float *) src0_grad_m->data; - float * src0_grad_v_d = (float *) src0_grad_v->data; - const float * adamw_params_d = (const float *) adamw_params->data; + float * src0_d = (float *) tensor_data(src0); + const float * src0_grad_d = (const float *) tensor_data(src0_grad); + float * src0_grad_m_d = (float *) tensor_data(src0_grad_m); + float * src0_grad_v_d = (float *) tensor_data(src0_grad_v); + const float * adamw_params_d = (const float *) tensor_data(adamw_params); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index c9b2b699c6a55..a9db2c74a1a5d 100644 --- a/ggml/src/ggml-cuda/out-prod.cu +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -22,9 +22,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ne2 == src1->ne[2]); GGML_ASSERT(ne3 == src1->ne[3]); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); cublasHandle_t handle = ctx.cublas_handle(); diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 77432b04689be..d1f0fe832bf2a 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -35,8 +35,8 @@ static void pad_f32_cuda(const float * x, float * dst, void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu index c6d51e4d655a3..6ee4bcbb9cde3 100644 --- a/ggml/src/ggml-cuda/pool2d.cu +++ b/ggml/src/ggml-cuda/pool2d.cu @@ -64,8 +64,8 @@ static void pool2d_nchw_kernel_f32_f32_cuda( void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 
d058504cd6cc0..ac9ad349f3645 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -326,10 +326,9 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + const float * src1_d = (const float *)tensor_data(src1); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); @@ -383,7 +382,7 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const float * freq_factors = nullptr; if (src2 != nullptr) { - freq_factors = (const float *) src2->data; + freq_factors = (const float *) tensor_data(src2); } rope_corr_dims corr_dims; diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 2ee9e588992f4..002002ce79f0f 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -17,8 +17,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index b2acdf855e900..e0b944e12061d 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -169,8 +169,8 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS - const float * src0_d = (const float *)src0->data; - const int64_t * src1_d = (const int64_t *)src1->data; + const float * src0_d = (const float *)tensor_data(src0); + const int64_t * src1_d = (const int64_t *)tensor_data(src1); cudaStream_t stream = ctx.stream(); @@ -178,7 +178,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (dst->type == GGML_TYPE_F32) { set_rows_cuda( - src0_d, src1_d, (float*)dst->data, + src0_d, src1_d, (float*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -188,7 +188,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_F16) { set_rows_cuda( - src0_d, src1_d, (half*)dst->data, + src0_d, src1_d, (half*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -198,7 +198,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_BF16) { set_rows_cuda( - src0_d, src1_d, (nv_bfloat16*)dst->data, + src0_d, src1_d, (nv_bfloat16*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -208,7 +208,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_Q4_0) { set_rows_cuda_quant( - src0_d, src1_d, (block_q4_0*)dst->data, + src0_d, src1_d, (block_q4_0*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -218,7 +218,7 @@ void 
ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_Q4_1) { set_rows_cuda_quant( - src0_d, src1_d, (block_q4_1*)dst->data, + src0_d, src1_d, (block_q4_1*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -228,7 +228,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_Q5_0) { set_rows_cuda_quant( - src0_d, src1_d, (block_q5_0*)dst->data, + src0_d, src1_d, (block_q5_0*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -238,7 +238,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_Q5_1) { set_rows_cuda_quant( - src0_d, src1_d, (block_q5_1*)dst->data, + src0_d, src1_d, (block_q5_1*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -248,7 +248,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_Q8_0) { set_rows_cuda_quant( - src0_d, src1_d, (block_q8_0*)dst->data, + src0_d, src1_d, (block_q8_0*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, @@ -258,7 +258,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ); } else if (dst->type == GGML_TYPE_IQ4_NL) { set_rows_cuda_quant( - src0_d, src1_d, (block_iq4_nl*)dst->data, + src0_d, src1_d, (block_iq4_nl*)tensor_data(dst), ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb01, nb02, nb03, diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index 14543e978cf0f..ed78f128f8377 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -250,9 +250,9 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *) src0->data; - const void * src1_d = src1 ? (const void *) src1->data : nullptr; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const void * src1_d = src1 ? 
(const void *) tensor_data(src1) : nullptr; + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); @@ -319,9 +319,9 @@ void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * d const ggml_tensor * src0 = dst->src[0]; // grad const ggml_tensor * src1 = dst->src[1]; // forward pass output - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 41979733601d2..00e5def43d7a8 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -144,9 +144,9 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float)); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index c9184398b422c..5783349f03eac 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -274,14 +274,14 @@ void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src5->nb[0] == sizeof(float)); GGML_ASSERT(src6->nb[0] == sizeof(int32_t)); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - const float * src2_d = (const float *) src2->data; - const float * src3_d = (const float *) src3->data; - const float * src4_d = (const float *) src4->data; - const float * src5_d = (const float *) src5->data; - const int32_t * src6_d = (const int32_t *) src6->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + const float * src2_d = (const float *) tensor_data(src2); + const float * src3_d = (const float *) tensor_data(src3); + const float * src4_d = (const float *) tensor_data(src4); + const float * src5_d = (const float *) tensor_data(src5); + const int32_t * src6_d = (const int32_t *) tensor_data(src6); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index eb3d7cdba98a7..10d181ee85dc9 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -33,8 +33,8 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguously_allocated(src0)); - const float * src0_d = (const float *) src0->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + float * dst_d = (float *) tensor_data(dst); const int64_t ne = ggml_nelements(src0); diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 2eee08fa07375..89b046b7a6131 100644 --- 
a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -8,8 +8,8 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu index 153ddbcda92dc..42529129a6ce0 100644 --- a/ggml/src/ggml-cuda/tsembd.cu +++ b/ggml/src/ggml-cuda/tsembd.cu @@ -33,8 +33,8 @@ static void timestep_embedding_f32_cuda(const float * x, float * dst, const int void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 91c830c4dacc3..68c3262ef4a9d 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -107,8 +107,8 @@ static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) { template void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const void * src0_d = src0->data; - void * dst_d = dst->data; + const void * src0_d = tensor_data(src0); + void * dst_d = tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(ggml_is_contiguous(src0)); @@ -230,11 +230,11 @@ template void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - void * src0_d = src0->data; - void * src1_d = src1 ? src1->data : src0->data; + void * src0_d = tensor_data(src0); + void * src1_d = src1 ? tensor_data(src1) : src0_d; const int64_t src0_o = src0->nb[1]; const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; - void * dst_d = dst->data; + void * dst_d = tensor_data(dst); const int64_t nc = src1 ? 
src0->ne[0] : src0->ne[0] / 2; cudaStream_t stream = ctx.stream(); @@ -328,9 +328,9 @@ void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const ggml_tensor * src0 = dst->src[0]; // input from forward pass const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src0_d = (const float *) tensor_data(src0); + const float * src1_d = (const float *) tensor_data(src1); + float * dst_d = (float *) tensor_data(dst); cudaStream_t stream = ctx.stream(); @@ -372,8 +372,8 @@ static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negat void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const void * src0_d = src0->data; - void * dst_d = dst->data; + const void * src0_d = tensor_data(src0); + void * dst_d = tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index ef48aa5f97bcd..4f0a43ef4a7ee 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -106,8 +106,8 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst, void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = (const float *)tensor_data(src0); + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-cuda/wkv.cu b/ggml/src/ggml-cuda/wkv.cu index d2fced705e095..06ce24bce2d18 100644 --- a/ggml/src/ggml-cuda/wkv.cu +++ b/ggml/src/ggml-cuda/wkv.cu @@ -142,19 +142,19 @@ static __global__ void rwkv_wkv7_f32(const int B, const int T, const int C, cons } void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const float * k_d = (const float *)dst->src[0]->data; - const float * v_d = (const float *)dst->src[1]->data; - const float * r_d = (const float *)dst->src[2]->data; - const float * tf_d = (const float *)dst->src[3]->data; - const float * td_d = (const float *)dst->src[4]->data; - const float * s_d = (const float *)dst->src[5]->data; + const float * k_d = (const float *)tensor_data(dst->src[0]); + const float * v_d = (const float *)tensor_data(dst->src[1]); + const float * r_d = (const float *)tensor_data(dst->src[2]); + const float * tf_d = (const float *)tensor_data(dst->src[3]); + const float * td_d = (const float *)tensor_data(dst->src[4]); + const float * s_d = (const float *)tensor_data(dst->src[5]); const int64_t B = dst->src[5]->ne[1]; const int64_t T = dst->src[0]->ne[2]; const int64_t C = dst->ne[0]; const int64_t H = dst->src[0]->ne[1]; - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); @@ -170,20 +170,20 @@ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) } void ggml_cuda_op_rwkv_wkv7(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const float * r_d = (const float *)dst->src[0]->data; - const float * w_d = (const float *)dst->src[1]->data; - const float * k_d = (const float *)dst->src[2]->data; - const float * v_d = (const float *)dst->src[3]->data; - const float * a_d = (const float 
*)dst->src[4]->data; - const float * b_d = (const float *)dst->src[5]->data; - const float * s_d = (const float *)dst->src[6]->data; + const float * r_d = (const float *)tensor_data(dst->src[0]); + const float * w_d = (const float *)tensor_data(dst->src[1]); + const float * k_d = (const float *)tensor_data(dst->src[2]); + const float * v_d = (const float *)tensor_data(dst->src[3]); + const float * a_d = (const float *)tensor_data(dst->src[4]); + const float * b_d = (const float *)tensor_data(dst->src[5]); + const float * s_d = (const float *)tensor_data(dst->src[6]); const int64_t B = dst->src[6]->ne[1]; const int64_t T = dst->src[0]->ne[2]; const int64_t C = dst->ne[0]; const int64_t H = dst->src[0]->ne[1]; - float * dst_d = (float *)dst->data; + float * dst_d = (float *)tensor_data(dst); cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index a3c82d6757714..9b02bb8a026a9 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -179,14 +179,14 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) { const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch]; - const char * ptr_data = (const char *) dataset->data->data + ishard*dataset->nbs_data; + const char * ptr_data = (const char *) tensor_data(dataset->data) + ishard*dataset->nbs_data; ggml_backend_tensor_set(data_batch, ptr_data, ishard_batch*dataset->nbs_data, dataset->nbs_data); if (!labels_batch) { continue; } - const char * ptr_labels = (const char *) dataset->labels->data + ishard*dataset->nbs_labels; + const char * ptr_labels = (const char *) tensor_data(dataset->labels) + ishard*dataset->nbs_labels; ggml_backend_tensor_set(labels_batch, ptr_labels, ishard_batch*dataset->nbs_labels, dataset->nbs_labels); } } @@ -202,7 +202,7 @@ void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_bat for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) { const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch]; - const char * ptr_data = (const char *) dataset->data->data + ishard *dataset->nbs_data; + const char * ptr_data = (const char *) tensor_data(dataset->data) + ishard *dataset->nbs_data; char * ptr_data_batch = (char *) data_batch + ishard_batch*dataset->nbs_data; memcpy(ptr_data_batch, ptr_data, dataset->nbs_data); @@ -210,7 +210,7 @@ void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_bat continue; } - const char * ptr_labels = (const char *) dataset->labels->data + ishard *dataset->nbs_labels; + const char * ptr_labels = (const char *) tensor_data(dataset->labels) + ishard *dataset->nbs_labels; char * ptr_labels_batch = (char *) labels_batch + ishard_batch*dataset->nbs_labels; memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels); } @@ -271,7 +271,7 @@ static ggml_tensor * map_tensor(std::map & tensor_ new_tensor->flags = tensor->flags; memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params)); strcpy(new_tensor->name, tensor->name); - new_tensor->data = tensor->data; + tensor_set_data(new_tensor, tensor_data(tensor)); new_tensor->buffer = tensor->buffer; new_tensor->extra = tensor->extra; new_tensor->view_offs = tensor->view_offs; @@ -314,7 +314,7 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { static void ggml_opt_build(ggml_opt_context_t opt_ctx) { GGML_ASSERT(opt_ctx->ctx_compute && "no compute 
context set, either use static graphs or set one with ggml_opt_prepare_alloc"); - GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically"); + GGML_ASSERT((!opt_ctx->static_graphs || tensor_data(opt_ctx->inputs)) && "when using static graphs the inputs must be allocated statically"); const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD && !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5ae1c527df639..111b2ef65aeeb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -61,6 +61,10 @@ #define m512i(p) (__m512i)(p) #endif +#ifdef GGML_NUMA_MIRROR +__thread int ggml_current_numa_node = -1; +#endif + #if defined(__linux__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH) @@ -1633,7 +1637,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)); - void * data = view_src != NULL ? view_src->data : NULL; + void * data = view_src != NULL ? tensor_data(view_src) : NULL; if (data != NULL) { data = (char *) data + view_offs; } @@ -1661,14 +1665,18 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.src =*/ { NULL }, /*.view_src =*/ view_src, /*.view_offs =*/ view_offs, - /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, + #ifdef GGML_NUMA_MIRROR + /*.data =*/ { .__data = { NULL, NULL } }, +#else + /*.data =*/ NULL, +#endif /*.name =*/ { 0 }, /*.extra =*/ NULL, /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //GGML_ASSERT_ALIGNED(result->data); + //GGML_ASSERT_ALIGNED(tensor_data(result)); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -1765,12 +1773,12 @@ void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * } void * ggml_get_data(const struct ggml_tensor * tensor) { - return tensor->data; + return tensor_data(tensor); } float * ggml_get_data_f32(const struct ggml_tensor * tensor) { assert(tensor->type == GGML_TYPE_F32); - return (float *)(tensor->data); + return (float *)(tensor_data(tensor)); } enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { @@ -6475,8 +6483,8 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { if (tensor->buffer) { ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); } else { - GGML_ASSERT(tensor->data); - memset(tensor->data, 0, ggml_nbytes(tensor)); + GGML_ASSERT(tensor_data(tensor)); + memset(tensor_data(tensor), 0, ggml_nbytes(tensor)); } return tensor; } @@ -6507,8 +6515,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { if (grad_acc->buffer) { ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float)); } else { - GGML_ASSERT(grad_acc->data); - *((float *) grad_acc->data) = onef; + GGML_ASSERT(tensor_data(grad_acc)); + *((float *) tensor_data(grad_acc)) = onef; } } else { ggml_set_zero(grad_acc); @@ -6728,7 +6736,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (ggml_nelements(node) < 5 && node->data != NULL) { + if (ggml_nelements(node) < 5 && tensor_data(node) != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { // FIXME: use ggml-backend to obtain the tensor 
data diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 53504399c57f4..f430ba512f1ad 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -681,7 +681,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par } // read the binary blob with the tensor data - ok = ok && gr.read(data->data, ctx->size); + ok = ok && gr.read(tensor_data(data), ctx->size); if (!ok) { GGML_LOG_ERROR("%s: failed to read tensor data binary blob\n", __func__); @@ -691,7 +691,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par return nullptr; } - ctx->data = data->data; + ctx->data = tensor_data(data); } ggml_set_no_alloc(ctx_data, true); @@ -712,7 +712,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par // point the data member to the appropriate location in the binary blob using the tensor info if (!params.no_alloc) { - cur->data = (char *) data->data + info.offset; + tensor_set_data(cur, (char *) tensor_data(data) + info.offset); } } @@ -1163,7 +1163,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo GGML_ABORT("tensor not found: %s", name); } - ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const + tensor_set_data(&ctx->info[tensor_id].t, (void *)(uintptr_t)data); // double cast suppresses warning about casting away const } struct gguf_writer { @@ -1281,8 +1281,8 @@ struct gguf_writer { if (info.t.buffer) { ggml_backend_tensor_get(&info.t, buf.data() + offset, 0, nbytes); } else { - GGML_ASSERT(info.t.data); - memcpy(buf.data() + offset, info.t.data, nbytes); + GGML_ASSERT(tensor_data(&info.t)); + memcpy(buf.data() + offset, tensor_data(&info.t), nbytes); } pad(alignment); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b63a41053b488..35a09f6b35e94 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -90,7 +90,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) tensor_data(pos_bucket); for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -114,7 +114,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; + int32_t * data = (int32_t *) tensor_data(out_ids); if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { @@ -152,8 +152,8 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(mean); GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); - float * data = (float *) mean->data; - memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean)); + float * data = (float *) tensor_data(mean); + memset(tensor_data(mean), 0, n_tokens*n_seqs_unq*ggml_element_size(mean)); std::vector sums(n_seqs_unq, 0); for (int i = 0; i < n_tokens; i += n_seq_tokens) { @@ -198,8 +198,8 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cls); GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls)); + uint32_t * data = (uint32_t *) tensor_data(cls); + memset(tensor_data(cls), 0, n_seqs_unq*ggml_element_size(cls)); 
for (int i = 0; i < n_tokens; i += n_seq_tokens) { for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { @@ -215,8 +215,8 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cls); GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls)); + uint32_t * data = (uint32_t *) tensor_data(cls); + memset(tensor_data(cls), 0, n_seqs_unq*ggml_element_size(cls)); std::vector last_pos(n_seqs_unq, -1); std::vector last_row(n_seqs_unq, -1); @@ -250,7 +250,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) { if (s_copy) { GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; + int32_t * data = (int32_t *) tensor_data(s_copy); // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_rs; ++i) { @@ -276,7 +276,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(kq_mask); GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer)); - float * data = (float *) kq_mask->data; + float * data = (float *) tensor_data(kq_mask); for (int h = 0; h < 1; ++h) { for (int i1 = 0; i1 < n_tokens; ++i1) { @@ -375,7 +375,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer)); GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing - float * data = (float *) cross_kq_mask->data; + float * data = (float *) tensor_data(cross_kq_mask); for (int h = 0; h < 1; ++h) { for (int i = 0; i < n_tokens; ++i) { diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 321dc79fc36ab..2ca4366d25392 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -1204,7 +1204,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - int64_t * data = (int64_t *) dst->data; + int64_t * data = (int64_t *) tensor_data(dst); for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { const int64_t offs = sinfo.strm[s]*get_size(); @@ -1224,7 +1224,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - int64_t * data = (int64_t *) dst->data; + int64_t * data = (int64_t *) tensor_data(dst); if (!v_trans) { for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { @@ -1255,7 +1255,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - int32_t * data = (int32_t *) dst->data; + int32_t * data = (int32_t *) tensor_data(dst); for (uint32_t s = 0; s < n_stream; ++s) { const auto & cells = v_cells[s]; @@ -1270,7 +1270,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - float * data = (float *) dst->data; + float * data = (float *) tensor_data(dst); const int64_t n_kv = dst->ne[0]; const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch @@ -1347,7 +1347,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, 
const llama GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) dst->data; + int32_t * data = (int32_t *) tensor_data(dst); const int32_t n_kv = dst->ne[0]; diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 47497cf953fd3..97298a2edd739 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -10,6 +10,11 @@ #include #include +#ifdef GGML_NUMA_MIRROR +#include +#include +#endif + #ifdef __has_include #if __has_include() #include @@ -269,13 +274,31 @@ void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } // llama_mmap +#ifdef GGML_NUMA_MIRROR +static uintptr_t base_address_offset = 0; +static int file_name_offset = 0; +#endif + struct llama_mmap::impl { #ifdef _POSIX_MAPPED_FILES std::vector> mapped_fragments; +#ifdef GGML_NUMA_MIRROR + struct numa_mapping { + void* addr; + size_t size; + std::string path; + }; + std::vector numa_mappings; +#endif impl(struct llama_file * file, size_t prefetch, bool numa) { +#ifdef GGML_NUMA_MIRROR + GGML_UNUSED(prefetch); + GGML_UNUSED(numa); +#endif size = file->size(); int fd = file->file_id(); +#ifndef GGML_NUMA_MIRROR int flags = MAP_SHARED; if (numa) { prefetch = 0; } #ifdef __linux__ @@ -285,6 +308,146 @@ struct llama_mmap::impl { } if (prefetch) { flags |= MAP_POPULATE; } #endif +#endif // ifndef GGML_NUMA_MIRROR + +#ifdef GGML_NUMA_MIRROR + int oldpolicy; + struct bitmask* oldmask = numa_allocate_nodemask(); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + LLAMA_LOG_WARN("get_mempolicy failed, errno=%d %s\n", errno, strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + + // Get the number of NUMA nodes + int num_nodes = numa_num_configured_nodes(); + if (num_nodes <= 0) { + LLAMA_LOG_WARN("numa_num_configured_nodes returned %d, defaulting to 1\n", num_nodes); + num_nodes = 1; + } + LLAMA_LOG_INFO("Detected %d NUMA nodes\n", num_nodes); + + size_t total_size = file->size(); + char path[128]; + std::vector is_new_mem(num_nodes, false); + int i; + + // Set addr to the first mapping for node 0 + addr = (void*)(GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + base_address_offset); + + // Calculate number of hugepages needed and total mapping size + size_t hugepages_needed = (total_size + GGML_MMAP_HUGEPAGESZ - 1) / GGML_MMAP_HUGEPAGESZ; + size_t total_mapping_size = hugepages_needed * GGML_MMAP_HUGEPAGESZ; + + LLAMA_LOG_INFO("Creating %zu hugepages (%zu bytes total) for %zu bytes of model data\n", + hugepages_needed, total_mapping_size, total_size); + + for (int node = 0; node < num_nodes; ++node) { + numa_set_preferred(node); + LLAMA_LOG_INFO("numa_set_preferred(%d) - creating single large mapping\n", node); + + // Create one large hugepage file for this entire NUMA node + sprintf(path, "/dev/hugepages/llama-node%d-unified-%d", node, file_name_offset); + if (!is_new_mem[node]) { + is_new_mem[node] = access(path, F_OK) != 0; + } + + int hugefd = open(path, O_CREAT | O_RDWR, 0600); + if (hugefd < 0) { + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("failed to open hugepage fd %s: %d %s\n", + path, errno, strerror(errno)); + throw std::runtime_error(format("failed to open hugepage fd: %s", strerror(errno))); + } + + // Resize the hugepage file to accommodate the entire mapping + if (ftruncate(hugefd, total_mapping_size) != 0) { + 
close(hugefd); + unlink(path); + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("failed to resize hugepage file %s: %d %s\n", + path, errno, strerror(errno)); + throw std::runtime_error(format("ftruncate failed: %s", strerror(errno))); + } + + // Create one large mapping for the entire model on this NUMA node + uintptr_t address = GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + + node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT + + base_address_offset; + + void* mm = mmap((void*)address, total_mapping_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HUGETLB | MAP_POPULATE, hugefd, 0); + close(hugefd); + + LLAMA_LOG_INFO("mmap(%s) desire=%p size=%zu result=%p is_new_mem[%d]=%s\n", + path, (void*)address, total_mapping_size, mm, node, is_new_mem[node] ? "yes" : "no"); + + if (((uintptr_t)mm) != address) { + // If mmap failed completely, delete the file we just created + if (mm == MAP_FAILED) { + unlink(path); + } + + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("unable to mmap memory: %d %s\n", errno, strerror(errno)); + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } + + // Store the single large mapping + numa_mappings.push_back({mm, total_mapping_size, std::string(path)}); + + if (is_new_mem[node]) { + memset(mm, 0, total_mapping_size); + } + } + + // Update global offset tracking + i = hugepages_needed; + base_address_offset += i * GGML_MMAP_HUGEPAGESZ; + file_name_offset += i; + if (is_new_mem[0]) { + LLAMA_LOG_INFO("begin to copy from disk to mem ...\n"); + size_t n = 0; + while (n < total_size) { + int nn = read(fd, (void*)((uintptr_t)addr + n), 1024 * 1024); + if (nn < 0) { + LLAMA_LOG_WARN("unable to read from file: %d %s\n", errno, strerror(errno)); + throw std::runtime_error(format("read failed: %s", strerror(errno))); + } + n += nn; + } + } + for (int node = 1; node < num_nodes; ++node) { + if (is_new_mem[node]) { + LLAMA_LOG_INFO("begin to copy from numa0 to numa%d ...\n", node); + memcpy((void*)((uintptr_t)addr + \ + node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT), \ + addr, total_size); + } + } + + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else { + set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1); + } + numa_free_cpumask(oldmask); +#endif // GGML_NUMA_MIRROR + +#ifndef GGML_NUMA_MIRROR addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { throw std::runtime_error(format("mmap failed: %s", strerror(errno))); @@ -302,8 +465,217 @@ struct llama_mmap::impl { strerror(errno)); } } - + mapped_fragments.emplace_back(0, file->size()); +#endif // ifndef GGML_NUMA_MIRROR + } + + // Constructor for unified multi-part file mapping (NUMA-aware) + impl(const std::vector & files, size_t prefetch, bool numa) { +#ifdef GGML_NUMA_MIRROR + GGML_UNUSED(prefetch); + GGML_UNUSED(numa); + + if (files.empty()) { + throw std::runtime_error("Cannot create unified mapping with empty file list"); + } + + // Calculate total size across all files + size_t total_size = 0; + for (const auto * file : files) { + total_size += file->size(); + } + size = total_size; + + int oldpolicy; + struct bitmask* oldmask = numa_allocate_nodemask(); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + 
LLAMA_LOG_WARN("get_mempolicy failed, errno=%d %s\n", errno, strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + + // Get the number of NUMA nodes + int num_nodes = numa_num_configured_nodes(); + if (num_nodes <= 0) { + LLAMA_LOG_WARN("numa_num_configured_nodes returned %d, defaulting to 1\n", num_nodes); + num_nodes = 1; + } + LLAMA_LOG_INFO("Detected %d NUMA nodes for unified multi-part mapping\n", num_nodes); + LLAMA_LOG_INFO("Total unified model size: %zu bytes across %zu files\n", total_size, files.size()); + + char path[128]; + std::vector is_new_mem(num_nodes, false); + int i; + + // Set addr to the first mapping for node 0 + addr = (void*)(GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + base_address_offset); + + // Calculate number of hugepages needed and total mapping size + size_t hugepages_needed = (total_size + GGML_MMAP_HUGEPAGESZ - 1) / GGML_MMAP_HUGEPAGESZ; + size_t total_mapping_size = hugepages_needed * GGML_MMAP_HUGEPAGESZ; + + LLAMA_LOG_INFO("Creating unified mapping: %zu hugepages (%zu bytes total) for %zu bytes across %zu files\n", + hugepages_needed, total_mapping_size, total_size, files.size()); + + for (int node = 0; node < num_nodes; ++node) { + numa_set_preferred(node); + LLAMA_LOG_INFO("numa_set_preferred(%d) - creating single unified mapping\n", node); + + // Create one large hugepage file for this entire unified mapping + sprintf(path, "/dev/hugepages/llama-unified-node%d-%d", node, file_name_offset); + if (!is_new_mem[node]) { + is_new_mem[node] = access(path, F_OK) != 0; + } + + int hugefd = open(path, O_CREAT | O_RDWR, 0600); + if (hugefd < 0) { + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("failed to open hugepage fd %s: %d %s\n", + path, errno, strerror(errno)); + throw std::runtime_error(format("failed to open hugepage fd: %s", strerror(errno))); + } + + // Resize the hugepage file to accommodate the entire unified mapping + if (ftruncate(hugefd, total_mapping_size) != 0) { + close(hugefd); + unlink(path); + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("failed to resize hugepage file %s: %d %s\n", + path, errno, strerror(errno)); + throw std::runtime_error(format("ftruncate failed: %s", strerror(errno))); + } + + // Create one large mapping for the entire unified model on this NUMA node + uintptr_t address = GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + + node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT + + base_address_offset; + + void* mm = mmap((void*)address, total_mapping_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HUGETLB | MAP_POPULATE, hugefd, 0); + close(hugefd); + + LLAMA_LOG_INFO("mmap(%s) desire=%p size=%zu result=%p is_new_mem[%d]=%s\n", + path, (void*)address, total_mapping_size, mm, node, is_new_mem[node] ? 
"yes" : "no"); + + if (((uintptr_t)mm) != address) { + // If mmap failed completely, delete the file we just created + if (mm == MAP_FAILED) { + unlink(path); + } + + // Clean up any mappings we've already created before throwing + for (const auto& mapping : numa_mappings) { + munmap(mapping.addr, mapping.size); + unlink(mapping.path.c_str()); + } + LLAMA_LOG_WARN("unable to mmap memory: %d %s\n", errno, strerror(errno)); + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } + + // Store the single large mapping + numa_mappings.push_back({mm, total_mapping_size, std::string(path)}); + + if (is_new_mem[node]) { + memset(mm, 0, total_mapping_size); + } + } + + // Update global offset tracking + i = hugepages_needed; + base_address_offset += i * GGML_MMAP_HUGEPAGESZ; + file_name_offset += i; + + if (is_new_mem[0]) { + LLAMA_LOG_INFO("begin to copy unified model data from disk to mem...\n"); + size_t offset = 0; + for (const auto * file : files) { + LLAMA_LOG_INFO("copying file data at offset %zu, size %zu\n", offset, file->size()); + int fd = file->file_id(); + size_t file_size = file->size(); + size_t n = 0; + while (n < file_size) { + int nn = read(fd, (void*)((uintptr_t)addr + offset + n), std::min(size_t(1024 * 1024), file_size - n)); + if (nn < 0) { + LLAMA_LOG_WARN("unable to read from file: %d %s\n", errno, strerror(errno)); + throw std::runtime_error(format("read failed: %s", strerror(errno))); + } + n += nn; + } + offset += file_size; + } + } + + for (int node = 1; node < num_nodes; ++node) { + if (is_new_mem[node]) { + LLAMA_LOG_INFO("begin to copy unified model from numa0 to numa%d...\n", node); + memcpy((void*)((uintptr_t)addr + \ + node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT), \ + addr, total_size); + } + } + + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else { + set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1); + } + numa_free_cpumask(oldmask); +#else + // For non-NUMA case, fall back to individual file mappings + // This is a simplified version - in practice you'd want to create + // one large mapping and read all files into it + if (files.empty()) { + throw std::runtime_error("Cannot create mapping with empty file list"); + } + + // For now, just use the first file for non-NUMA case + // This is a limitation that could be improved later + struct llama_file * first_file = files[0]; + size = first_file->size(); + int fd = first_file->file_id(); + + int flags = MAP_SHARED; + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { + LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + strerror(errno)); + } + if (prefetch) { flags |= MAP_POPULATE; } +#endif + + addr = mmap(NULL, first_file->size(), PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } + + if (prefetch > 0) { + if (posix_madvise(addr, std::min(first_file->size(), prefetch), POSIX_MADV_WILLNEED)) { + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + if (posix_madvise(addr, first_file->size(), POSIX_MADV_RANDOM)) { + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + + mapped_fragments.emplace_back(0, first_file->size()); + + LLAMA_LOG_WARN("Multi-part unified mapping not fully supported in non-NUMA mode\n"); +#endif // GGML_NUMA_MIRROR } static void align_range(size_t * first, size_t * 
last, size_t page_size) { @@ -355,11 +727,26 @@ struct llama_mmap::impl { } ~impl() { +#ifdef GGML_NUMA_MIRROR + // Unmap all NUMA hugepage mappings + for (const auto& mapping : numa_mappings) { + if (munmap(mapping.addr, mapping.size)) { + LLAMA_LOG_WARN("warning: failed to munmap NUMA hugepage: %s\n", strerror(errno)); + } + // Delete the hugepage file + if (unlink(mapping.path.c_str())) { + LLAMA_LOG_WARN("warning: failed to unlink hugepage file %s: %s\n", + mapping.path.c_str(), strerror(errno)); + } + } +#else + // Only unmap fragments if not using NUMA mirroring for (const auto & frag : mapped_fragments) { if (munmap((char *) addr + frag.first, frag.second - frag.first)) { LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); } } +#endif } #elif defined(_WIN32) impl(struct llama_file * file, size_t prefetch, bool numa) { @@ -406,6 +793,60 @@ struct llama_mmap::impl { } } + // Constructor for unified multi-part file mapping (Windows) + impl(const std::vector & files, size_t prefetch, bool numa) { + GGML_UNUSED(numa); + + if (files.empty()) { + throw std::runtime_error("Cannot create mapping with empty file list"); + } + + // For Windows, we currently only support the first file in multi-part scenarios + // This is a limitation that could be improved by creating multiple mappings + struct llama_file * first_file = files[0]; + size = first_file->size(); + + HANDLE hFile = (HANDLE) _get_osfhandle(first_file->file_id()); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + + if (hMapping == NULL) { + DWORD error = GetLastError(); + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + DWORD error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); + } + + if (prefetch > 0) { +#if _WIN32_WINNT >= 0x602 + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory"); + + if (pPrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T) std::min(size, prefetch); + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + LLAMA_LOG_DEBUG("skipping PrefetchVirtualMemory because _WIN32_WINNT < 0x602\n"); +#endif + } + + LLAMA_LOG_WARN("Multi-part unified mapping not fully supported on Windows - using first file only\n"); + } + void unmap_fragment(size_t first, size_t last) { GGML_UNUSED(first); GGML_UNUSED(last); @@ -426,6 +867,15 @@ struct llama_mmap::impl { throw std::runtime_error("mmap not supported"); } + // Constructor for unified multi-part file mapping (unsupported platforms) + impl(const std::vector & files, size_t prefetch, bool numa) { + GGML_UNUSED(files); + GGML_UNUSED(prefetch); + GGML_UNUSED(numa); + + throw std::runtime_error("mmap not supported"); + } + void unmap_fragment(size_t first, size_t last) { GGML_UNUSED(first); GGML_UNUSED(last); @@ -439,6 +889,7 @@ struct llama_mmap::impl { }; llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique(file, prefetch, 
numa)) {} +llama_mmap::llama_mmap(const std::vector & files, size_t prefetch, bool numa) : pimpl(std::make_unique(files, prefetch, numa)) {} llama_mmap::~llama_mmap() = default; size_t llama_mmap::size() const { return pimpl->size; } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440d7..422ed4d475a6e 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -37,6 +37,10 @@ struct llama_file { struct llama_mmap { llama_mmap(const llama_mmap &) = delete; llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false); + + // Constructor for unified multi-part file mapping (NUMA-aware) + llama_mmap(const std::vector & files, size_t prefetch = (size_t) -1, bool numa = false); + ~llama_mmap(); size_t size() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bd9e6da8832b7..faeade6138859 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -846,27 +846,69 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps if (use_mmap) { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); - for (const auto & file : files) { - bool is_numa = false; - - auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (dev) { - auto * reg = ggml_backend_dev_backend_reg(dev); - auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); - if (is_numa_fn) { - is_numa = is_numa_fn(); - } + + bool is_numa = false; + auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (dev) { + auto * reg = ggml_backend_dev_backend_reg(dev); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + if (is_numa_fn) { + is_numa = is_numa_fn(); } + } - std::unique_ptr mapping = std::make_unique(file.get(), prefetch ? -1 : 0, is_numa); - mmaps_used.emplace_back(mapping->size(), 0); - if (mlock_mmaps) { - std::unique_ptr mlock_mmap(new llama_mlock()); - mlock_mmap->init(mapping->addr()); - mlock_mmaps->emplace_back(std::move(mlock_mmap)); +#ifdef GGML_NUMA_MIRROR + // For NUMA mirroring with multiple files, create a unified mapping + if (is_numa && files.size() > 1) { + LLAMA_LOG_INFO("Creating unified NUMA mapping for %zu multi-part GGUF files\n", files.size()); + + // Create vector of file pointers + std::vector file_ptrs; + file_ptrs.reserve(files.size()); + for (const auto & file : files) { + file_ptrs.push_back(file.get()); + } + + // Create one unified mapping for all files + std::unique_ptr unified_mapping = std::make_unique(file_ptrs, prefetch ? 
-1 : 0, is_numa); + + // The unified mapping represents all files, so we need to store it + // for each file index to maintain compatibility with existing code + for (size_t i = 0; i < files.size(); ++i) { + // For mmaps_used, store the individual file size, not the total unified size + mmaps_used.emplace_back(files[i]->size(), 0); + if (mlock_mmaps && i == 0) { // Only lock once for the unified mapping + std::unique_ptr mlock_mmap(new llama_mlock()); + mlock_mmap->init(unified_mapping->addr()); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); + } else if (mlock_mmaps) { + // Add empty entries for consistency + mlock_mmaps->emplace_back(nullptr); + } + // Store the unified mapping only in the first slot + // Other slots remain nullptr - access code will check for unified mapping + if (i == 0) { + mappings.emplace_back(std::move(unified_mapping)); + } else { + mappings.emplace_back(nullptr); + } + } + } else { +#endif + // Original per-file mapping logic + for (const auto & file : files) { + std::unique_ptr mapping = std::make_unique(file.get(), prefetch ? -1 : 0, is_numa); + mmaps_used.emplace_back(mapping->size(), 0); + if (mlock_mmaps) { + std::unique_ptr mlock_mmap(new llama_mlock()); + mlock_mmap->init(mapping->addr()); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); + } + mappings.emplace_back(std::move(mapping)); } - mappings.emplace_back(std::move(mapping)); +#ifdef GGML_NUMA_MIRROR } +#endif } // compute the total size of all tensors for progress reporting @@ -877,40 +919,114 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { GGML_ASSERT(!mappings.empty()); - const auto & mapping = mappings.at(idx); - - *first = mapping->size(); - *last = 0; - *addr = mapping->addr(); - for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto * weight = get_weight(ggml_get_name(tensor)); - if (!weight || weight->idx != idx) { - continue; + +#ifdef GGML_NUMA_MIRROR + // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists + bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1]; + + if (is_unified_mapping) { + // For unified mapping, use the first (and only real) mapping + const auto & mapping = mappings[0]; + + // Calculate the offset for this file within the unified mapping + size_t file_offset = 0; + for (int i = 0; i < idx; ++i) { + file_offset += files[i]->size(); + } + + *first = mapping->size(); // Start with full mapping size + *last = 0; + *addr = (uint8_t*)mapping->addr() + file_offset; // Adjust address to file start + + // Find the actual range used by tensors in this file + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; + } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); + } + + // Adjust first and last to be relative to this file's start + if (*first != mapping->size()) { + *first = std::min(*first, files[idx]->size()); + } + if (*last != 0) { + *last = std::min(*last, files[idx]->size()); + } + } else { +#endif + // Original per-file mapping logic + const auto & mapping = mappings.at(idx); + + *first = mapping->size(); + *last = 0; + *addr = mapping->addr(); + for 
(ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; + } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); } - *first = std::min(*first, weight->offs); - *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); +#ifdef GGML_NUMA_MIRROR } +#endif } void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { const auto & w = require_weight(ggml_get_name(cur)); if (use_mmap) { - const auto & mapping = mappings.at(w.idx); - if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr() + w.offs; +#ifdef GGML_NUMA_MIRROR + // Check if this is a unified mapping by comparing if all mappings point to the same object + bool is_unified_mapping = mappings.size() > 1; + if (is_unified_mapping) { + llama_mmap * first_ptr = mappings[0].get(); + for (size_t i = 1; i < mappings.size(); ++i) { + if (mappings[i].get() != first_ptr) { + is_unified_mapping = false; + break; + } + } + } + + if (is_unified_mapping) { + // For unified mapping, calculate offset within the unified mapping + size_t unified_offset = w.offs; + for (int i = 0; i < w.idx; ++i) { + unified_offset += files[i]->size(); + } + + const auto & mapping = mappings[0]; + if (tensor_data(cur) == nullptr) { + tensor_set_data(cur, (uint8_t *)mapping->addr() + unified_offset); + } else { + memcpy(tensor_data(cur), (uint8_t *)mapping->addr() + unified_offset, ggml_nbytes(cur)); + } } else { - memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); +#endif + // Original per-file mapping logic + const auto & mapping = mappings.at(w.idx); + if (tensor_data(cur) == nullptr) { + tensor_set_data(cur, (uint8_t *)mapping->addr() + w.offs); + } else { + memcpy(tensor_data(cur), (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); + } +#ifdef GGML_NUMA_MIRROR } +#endif } else { - GGML_ASSERT(cur->data != nullptr); + GGML_ASSERT(tensor_data(cur) != nullptr); GGML_ASSERT(w.idx < files.size()); const auto & file = files.at(w.idx); file->seek(w.offs, SEEK_SET); - file->read_raw(cur->data, ggml_nbytes(cur)); + file->read_raw(tensor_data(cur), ggml_nbytes(cur)); } - if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { + if (check_tensors && !ggml_validate_row_data(cur->type, tensor_data(cur), ggml_nbytes(cur))) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } } @@ -1031,12 +1147,34 @@ bool llama_model_loader::load_all_data( size_t n_size = ggml_nbytes(cur); if (use_mmap) { - const auto & mapping = mappings.at(weight->idx); + // Check if this is a unified mapping and get the appropriate mapping + std::unique_ptr * mapping_ptr; + size_t file_offset = 0; + +#ifdef GGML_NUMA_MIRROR + // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists + bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1]; + if (is_unified_mapping) { + // For unified mapping, always use mappings[0] and calculate the file offset + mapping_ptr = &mappings[0]; + // Calculate offset for this file within the unified mapping + for (int i = 0; i < weight->idx; ++i) { + file_offset += files[i]->size(); + } + } else { + // Standard per-file mapping + mapping_ptr = &mappings.at(weight->idx); + } +#else + mapping_ptr = &mappings.at(weight->idx); +#endif + + const auto & mapping = *mapping_ptr; ggml_backend_buffer_t 
buf_mmap = nullptr; if (bufs.count(weight->idx)) { buf_mmap = bufs.at(weight->idx); } - uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; + uint8_t * data = (uint8_t *) mapping->addr() + file_offset + weight->offs; if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { @@ -1044,8 +1182,8 @@ bool llama_model_loader::load_all_data( })); } - GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated - if (buf_mmap && cur->data == nullptr) { + GGML_ASSERT(buf_mmap || tensor_data(cur)); // either we have a buffer to allocate the tensor in, or it is already allocated + if (buf_mmap && tensor_data(cur) == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, data); if (lmlocks) { const auto & lmlock = lmlocks->at(weight->idx); @@ -1062,10 +1200,10 @@ bool llama_model_loader::load_all_data( const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); + file->read_raw(tensor_data(cur), n_size); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { - return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); + return std::make_pair(cur, ggml_validate_row_data(cur->type, tensor_data(cur), n_size)); })); } } else { @@ -1129,12 +1267,23 @@ bool llama_model_loader::load_all_data( if (size_done >= size_data) { // unmap offloaded tensors and metadata if (use_mmap) { - for (uint32_t idx = 0; idx < mappings.size(); idx++) { - const auto & mmap_used = mmaps_used.at(idx); - auto & mapping = mappings.at(idx); - mapping->unmap_fragment(0, mmap_used.first); - if (mmap_used.second != 0) { - mapping->unmap_fragment(mmap_used.second, mapping->size()); + // Check if this is a unified mapping by seeing if mappings[1] is null but mappings[0] exists + bool is_unified_mapping = mappings.size() > 1 && mappings[0] && !mappings[1]; + + if (is_unified_mapping) { + // For unified mappings, skip unmap_fragment calls entirely + // Cleanup will be handled by the unified mapping destructor + LLAMA_LOG_DEBUG("Skipping unmap_fragment calls for unified mapping\n"); + } else { + // Original per-file mapping cleanup + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used.at(idx); + auto & mapping = mappings.at(idx); + + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size()); + } } } } diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a00af7a1d1758..0670d203885b4 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -124,11 +124,11 @@ static void llama_tensor_dequantize_impl( if (nthread < 2) { if (tensor->type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor_data(tensor), f32_output, nelements); } else if (tensor->type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); + ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor_data(tensor), f32_output, nelements); } else if (ggml_is_quantized(tensor->type)) { - qtype->to_float(tensor->data, f32_output, nelements); + qtype->to_float(tensor_data(tensor), f32_output, nelements); } else { GGML_ABORT("fatal error"); // unreachable } @@ -167,7 +167,7 @@ static void llama_tensor_dequantize_impl( qtype->to_float(inbuf, outbuf, nels); } }; - 
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); + workers.emplace_back(compute, tensor->type, (uint8_t *) tensor_data(tensor) + in_buff_offs, f32_output + out_buff_offs, thr_elems); in_buff_offs += thr_block_bytes; out_buff_offs += thr_elems; } @@ -804,7 +804,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (read_data.size() < ggml_nbytes(tensor)) { read_data.resize(ggml_nbytes(tensor)); } - tensor->data = read_data.data(); + tensor_set_data(tensor, read_data.data()); } ml.load_data_for(tensor); @@ -905,7 +905,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!quantize) { new_type = tensor->type; - new_data = tensor->data; + new_data = tensor_data(tensor); new_size = ggml_nbytes(tensor); LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); } else { @@ -950,7 +950,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: float * f32_data; if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; + f32_data = (float *) tensor_data(tensor); } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 3f0c312e2f003..96d1856010f1a 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -1056,7 +1056,7 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml } std::vector data_orig(nbytes); ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes); - if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast(t_read->data))) { + if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast(tensor_data(t_read)))) { ok = false; } diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 322b8bb99ec6c..9f301ad37ef22 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -76,13 +76,13 @@ static struct ggml_tensor * get_random_tensor_f32( switch (ndims) { case 1: for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + ((float *)tensor_data(result))[i0] = frand()*(fmax - fmin) + fmin; } break; case 2: for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + ((float *)tensor_data(result))[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; } } break; @@ -90,7 +90,7 @@ static struct ggml_tensor * get_random_tensor_f32( for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + ((float *)tensor_data(result))[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; } } } @@ -100,7 +100,7 @@ static struct ggml_tensor * get_random_tensor_f32( for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + ((float *)tensor_data(result))[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; } } } @@ -159,9 +159,9 @@ int main(int /*argc*/, const char ** /*argv*/) { struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); for (int i = 0; i < 
ne[2]; ++i) { - ((int32_t *) p0->data)[i] = n_past_0 + i; - ((int32_t *) p1->data)[i] = n_past_2 - n_past_0; - ((int32_t *) p2->data)[i] = n_past_2 + i; + ((int32_t *) tensor_data(p0))[i] = n_past_0 + i; + ((int32_t *) tensor_data(p1))[i] = n_past_2 - n_past_0; + ((int32_t *) tensor_data(p2))[i] = n_past_2 + i; } // test mode 0, 2, 4 (standard, GPT-NeoX, GLM) mode = m == 0 ? 0 : m == 1 ? 2 : 4; @@ -184,9 +184,9 @@ int main(int /*argc*/, const char ** /*argv*/) { for (int i = 0; i < ne[2]; ++i) { for (int j = 0; j < 4; ++j) { - ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j; - ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0; - ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j; + ((int32_t *) tensor_data(p0))[i + ne[2] * j] = n_past_0 + i + j; + ((int32_t *) tensor_data(p1))[i + ne[2] * j] = n_past_2 - n_past_0; + ((int32_t *) tensor_data(p2))[i + ne[2] * j] = n_past_2 + i + j; } } @@ -225,8 +225,8 @@ int main(int /*argc*/, const char ** /*argv*/) { double sum1 = 0.0f; double diff = 0.0f; - const float * r1_data = (float *) r1->data; - const float * r2_data = (float *) r2->data; + const float * r1_data = (float *) tensor_data(r1); + const float * r2_data = (float *) tensor_data(r2); const int n_elements = ggml_nelements(r1); diff --git a/tools/cvector-generator/cvector-generator.cpp b/tools/cvector-generator/cvector-generator.cpp index d2d97e05cebb0..215f09a8c2079 100644 --- a/tools/cvector-generator/cvector-generator.cpp +++ b/tools/cvector-generator/cvector-generator.cpp @@ -81,8 +81,8 @@ struct callback_data { // copy tensor data auto n_bytes = ggml_nbytes(t); struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); - t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow - ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); + tensor_set_data(t_layer, malloc(n_bytes)); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, tensor_data(t_layer), 0, n_bytes); // @dbsanfte: speculative refactor with tensor_data(), and above ggml_set_name(t_layer, ggml_get_name(t)); //print_debug_tensor(t_layer); @@ -98,8 +98,8 @@ struct callback_data { // NOTE: final layer is ignored. we only have (n_layers - 1) to process std::vector calc_diff() { for (float il = 0; il < v_pos.size(); il++) { - float * a = (float *) v_pos[il]->data; - float * b = (float *) v_neg[il]->data; + float * a = (float *) tensor_data(v_pos[il]); + float * b = (float *) tensor_data(v_neg[il]); size_t n_elem = ggml_nelements(v_pos[il]); for (size_t j = 0; j < n_elem; j++) { a[j] -= b[j]; @@ -141,7 +141,7 @@ struct callback_data { struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); - diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); + tensor_set_data(diff_filtered, malloc(ggml_nbytes(diff_filtered))); // copy non-zero rows for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { @@ -159,9 +159,9 @@ struct callback_data { // we don't implement destructor, because we want to reuse callback_data. 
we just want to free the tensors void reset() { - for (auto ptr : v_pos) free(ptr->data); - for (auto ptr : v_neg) free(ptr->data); - for (auto ptr : v_diff_filtered) free(ptr->data); + for (auto ptr : v_pos) free(tensor_data(ptr)); + for (auto ptr : v_neg) free(tensor_data(ptr)); + for (auto ptr : v_diff_filtered) free(tensor_data(ptr)); v_pos.clear(); v_neg.clear(); v_diff_filtered.clear(); @@ -208,7 +208,7 @@ struct train_context { std::vector empty; v_diff_tmp.push_back(empty); auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); - t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible + tensor_set_data(t, malloc(ggml_nbytes(t))); // TODO: get rid of malloc if possible v_final.push_back(t); } } @@ -221,7 +221,7 @@ struct train_context { auto & diff_tmp = v_diff_tmp[il]; size_t curr_size = diff_tmp.size(); diff_tmp.resize(curr_size + ggml_nbytes(t)); - memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); + memcpy(diff_tmp.data() + curr_size, tensor_data(t), ggml_nbytes(t)); } } @@ -238,7 +238,7 @@ struct train_context { ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); - diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible + tensor_set_data(diff, malloc(ggml_nbytes(diff))); // TODO: get rid of this malloc if possible if (transpose) { // copy data & transpose float * arr = (float *) diff_tmp.data(); @@ -250,7 +250,7 @@ struct train_context { } } else { // only copy - memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); + memcpy(tensor_data(diff), diff_tmp.data(), ggml_nbytes(diff)); } v_diff.push_back(diff); print_debug_tensor(diff); @@ -260,8 +260,8 @@ struct train_context { } ~train_context() { - for (auto ptr : v_final) free(ptr->data); - for (auto ptr : v_diff) free(ptr->data); + for (auto ptr : v_final) free(tensor_data(ptr)); + for (auto ptr : v_diff) free(tensor_data(ptr)); // no need to free v_diff_tmp, since we didn't use malloc ggml_free(ctx_ggml); } diff --git a/tools/cvector-generator/pca.hpp b/tools/cvector-generator/pca.hpp index e88bbdde93fde..ade5a65f26a93 100644 --- a/tools/cvector-generator/pca.hpp +++ b/tools/cvector-generator/pca.hpp @@ -102,7 +102,7 @@ struct pca_model { ggml_set_name(dev_square, "dev_square"); ggml_set_name(dev_eigenvector, "dev_eigenvector"); buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input)); + ggml_backend_tensor_set(dev_input, tensor_data(t_input), 0, ggml_nbytes(t_input)); // initialize eigenvector to random normalized vector { @@ -285,7 +285,7 @@ static void power_iteration( // get output tensor GGML_ASSERT(last_eigenvector); - ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); + ggml_backend_tensor_get(last_eigenvector, tensor_data(output), 0, ggml_nbytes(last_eigenvector)); //print_debug_tensor(output); ggml_gallocr_free(allocr); diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 9aad3711bae54..1bd07bb545734 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -247,7 +247,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes); } - const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); + const char * data = is_host ? 
(const char *) tensor_data(src1) : m_src1_data.data(); GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); // TODO: 4d? (is that even used in practice?) @@ -576,10 +576,10 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const { ggml_format_name(counts, "%s.counts", name.c_str()); for (int32_t j = 0; j < nval; ++j) { - ((float *) in_sum2->data)[j] = (float) stat.values[j]; + ((float *) tensor_data(in_sum2))[j] = (float) stat.values[j]; } for (int32_t j = 0; j < nmat; ++j) { - ((float *) counts->data)[j] = (float) stat.counts[j]; + ((float *) tensor_data(counts))[j] = (float) stat.counts[j]; } gguf_add_tensor(ctx_gguf, in_sum2); @@ -786,10 +786,10 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { // Recreate the state as expected by save_imatrix() for (int64_t j = 0; j < nval; j++) { - e.values[j] += ((const float *) in_sum2->data)[j]; + e.values[j] += ((const float *) tensor_data(in_sum2))[j]; } for (int64_t j = 0; j < ncounts; j++) { - e.counts[j] += std::lround(((const float *) counts->data)[j]); + e.counts[j] += std::lround(((const float *) tensor_data(counts))[j]); } } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index be191404cfc75..81b1f144b8c59 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2586,7 +2586,7 @@ struct clip_model_loader { size_t num_bytes = ggml_nbytes(cur); if (ggml_backend_buft_is_host(buft)) { // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); + fin.read(reinterpret_cast(tensor_data(cur)), num_bytes); } else { // read into a temporary buffer first, then copy to device memory read_buf.resize(num_bytes); @@ -3356,7 +3356,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_f32_ptr img_f32(clip_image_f32_init()); // clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - // res_imgs->data[0] = *res; + // tensor_data(res_imgs)[0] = *res; res_imgs->entries.push_back(std::move(img_f32)); return true; } diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 45c59ecb6fffe..0e77322765f27 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -286,10 +286,10 @@ static int load_imatrix(const std::string & imatrix_file, std::vectordata)[j]; + const float count = ((const float *) tensor_data(counts))[j]; if (count > 0.0f) { for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; + e[j*ne0 + i] = ((const float *) tensor_data(sums))[j*ne0 + i] / count; } } else { // Partial imatrix data, this tensor never got any input during calibration
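Finally, the unified multi-part mapping added to llama_model_loader above repeats one piece of arithmetic in get_mapping_range, load_data_for and load_all_data: a tensor's address inside the single large mapping is the mapping base, plus the sizes of all GGUF parts that precede the tensor's file, plus the tensor's offset within its own file. A minimal sketch with invented part sizes:

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative only: mirrors the offset arithmetic used by the unified-mapping branches above,
// with invented sizes. unified_base stands in for mapping->addr().
static uint8_t * unified_tensor_ptr(uint8_t * unified_base,
                                    const std::vector<size_t> & part_sizes,
                                    int file_idx, size_t offs_in_file) {
    size_t file_offset = 0;
    for (int i = 0; i < file_idx; ++i) {
        file_offset += part_sizes[i]; // skip every preceding GGUF part
    }
    return unified_base + file_offset + offs_in_file;
}

int main() {
    std::vector<size_t> part_sizes = { 4096, 8192, 2048 }; // three GGUF parts (made-up sizes)
    static uint8_t fake_mapping[4096 + 8192 + 2048];       // stands in for the single hugepage mapping

    // A tensor stored at offset 128 inside the third part (idx == 2):
    uint8_t * p = unified_tensor_ptr(fake_mapping, part_sizes, 2, 128);
    printf("tensor offset within unified mapping: %zu\n", (size_t) (p - fake_mapping)); // 4096 + 8192 + 128
}

This is the same value that load_all_data passes to ggml_backend_tensor_alloc as `data` when a tensor is placed directly in the mmapped buffer.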