From 48bc8f6b28635055e6341f6fa052f7c5c6ebce9a Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Thu, 18 Sep 2025 07:47:35 -0300 Subject: [PATCH 1/2] refactor: simplify weight conversion code --- model.cpp | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/model.cpp b/model.cpp index 0585e980..6a65bf72 100644 --- a/model.cpp +++ b/model.cpp @@ -2211,26 +2211,23 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); } - if (tensor_storage.type == dst_tensor->type) { - // copy to device memory - t1 = ggml_time_ms(); - convert_time_ms.fetch_add(t1 - t0); - t0 = ggml_time_ms(); - ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - t1 = ggml_time_ms(); - copy_to_backend_time_ms.fetch_add(t1 - t0); - } else { - // convert first, then copy to device memory + auto* tensor_buffer = &read_buffer; + if (tensor_storage.type != dst_tensor->type) { + // convert first convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - t1 = ggml_time_ms(); - convert_time_ms.fetch_add(t1 - t0); - t0 = ggml_time_ms(); - ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); - t1 = ggml_time_ms(); - copy_to_backend_time_ms.fetch_add(t1 - t0); + tensor_buffer = &convert_buffer; } + + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + + // copy to device memory + t0 = ggml_time_ms(); + ggml_backend_tensor_set(dst_tensor, tensor_buffer->data(), 0, ggml_nbytes(dst_tensor)); + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); } } if (zip != NULL) { From cc6b7baf7ef0a560f0385fa1a58a09334606e861 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Thu, 18 Sep 2025 07:48:29 -0300 Subject: [PATCH 2/2] fix: serialize calls to ggml_backend_tensor_set --- model.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/model.cpp b/model.cpp index 6a65bf72..2e7127d9 100644 --- a/model.cpp +++ b/model.cpp @@ -2024,6 +2024,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread const size_t total_tensors_to_process = processed_tensor_storages.size(); const int64_t t_start = ggml_time_ms(); int last_n_threads = 1; + std::mutex tensor_backend_mutex; for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; @@ -2224,6 +2225,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread convert_time_ms.fetch_add(t1 - t0); // copy to device memory + std::lock_guard lock(tensor_backend_mutex); t0 = ggml_time_ms(); ggml_backend_tensor_set(dst_tensor, tensor_buffer->data(), 0, ggml_nbytes(dst_tensor)); t1 = ggml_time_ms();