diff --git a/benchmark/sparse_buffer_mult.jl b/benchmark/sparse_buffer_mult.jl
new file mode 100644
index 000000000..e25ae849b
--- /dev/null
+++ b/benchmark/sparse_buffer_mult.jl
@@ -0,0 +1,372 @@
+using SparseArrays
+using BenchmarkTools
+using Base.Threads
+using OhMyThreads: tforeach, TaskLocalValue
+using LoopVectorization  # only needed by the simd variants, commented out below
+
+"""
+    spmm_buffer!(C, A, B)
+
+Multiply sparse matrices `A` and `B`, storing the result in the
+preallocated sparse matrix `C`.
+
+The sparsity pattern of `C` should match that of the expected result.
+This avoids allocations when repeatedly multiplying same-sized
+matrices.
+"""
+function spmm_buffer!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    fill!(C.nzval, zero(eltype(C)))
+    # Delegates to the internal (version-dependent) SparseArrays kernel.
+    SparseArrays._spmatmul!(C, A, B, one(eltype(C)), zero(eltype(C)))
+    return C
+end
+
+
+"""
+    _col_map!(colmap, C, j)
+
+Fill `colmap` with a row-to-slot lookup for column `j` of `C`: `colmap[i]`
+is the position of entry `(i, j)` in `C.nzval`, or zero if that entry is
+not stored. All entries are reset first, so the same vector can be reused
+across columns without additional allocations.
+"""
+function _col_map!(colmap::Vector{I}, C::SparseMatrixCSC, j::Int) where I <: Integer
+    fill!(colmap, 0)
+    @inbounds for p in C.colptr[j]:(C.colptr[j+1]-1)
+        colmap[C.rowval[p]] = p
+    end
+    return colmap
+end
+
+
+"""
+    spmm_buffer_fast!(C, A, B)
+
+Like [`spmm_buffer!`](@ref), but uses an explicit row-to-slot lookup table
+built internally with [`_col_map!`](@ref) instead of the SparseArrays
+kernel. This version is convenient when the map is not available beforehand.
+"""
+function spmm_buffer_fast!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    colmap = zeros(UInt, n)
+    fill!(C.nzval, zero(eltype(C)))
+    @inbounds for j in 1:size(B,2)
+        _col_map!(colmap, C, j)
+        for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+"""
+    spmm_buffer_col!(C, A, B)
+
+Column-oriented buffered multiplication that builds the lookup table on each
+call. Identical to [`spmm_buffer_fast!`](@ref); kept under a separate name
+for benchmark labelling. Useful for quick tests but allocates the map.
+"""
+function spmm_buffer_col!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    colmap = zeros(UInt, n)
+    fill!(C.nzval, zero(eltype(C)))
+    @inbounds for j in 1:size(B,2)
+        _col_map!(colmap, C, j)
+        for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+    return C
+end
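+
+# A minimal usage sketch (illustrative sizes, not part of the benchmark):
+# precompute the product pattern once with an ordinary multiply, then reuse
+# it so that repeated products only write into `C.nzval`.
+#
+#     A = sprand(100, 100, 0.05)
+#     B = sprand(100, 100, 0.05)
+#     C = A * B                    # fixes the pattern of the product
+#     fill!(C.nzval, 0.0)
+#     spmm_buffer_fast!(C, A, B)   # in-place multiply on the fixed pattern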
+
+"""
+    spmm_buffer_threaded!(C, A, B[, α, β])
+
+Threaded buffered multiplication with internal map creation, computing
+`C = β*C + α*(A*B)` restricted to the stored pattern of `C`.
+"""
+function spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α, β)
+    n = size(C, 1)
+    # NOTE: one map per thread id; this is only safe as long as the
+    # `@threads` iterations do not migrate between threads.
+    maps = [zeros(UInt, n) for _ in 1:nthreads()]
+    if β == zero(β)
+        fill!(C.nzval, zero(eltype(C)))
+    else
+        @. C.nzval *= β
+    end
+    @threads for j in 1:size(B,2)
+        colmap = maps[threadid()]
+        _col_map!(colmap, C, j)
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += α * a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) =
+    spmm_buffer_threaded!(C, A, B, one(eltype(C)), zero(eltype(C)))
+
+
+function spmm_buffer_tasks!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α, β)
+    n = size(C, 1)
+    nB = size(B, 2)
+    nth = nthreads()
+    sz = cld(nB, nth)  # columns per chunk
+
+    # scale or zero C.nzval
+    if β == zero(β)
+        fill!(C.nzval, zero(eltype(C)))
+    else
+        @. C.nzval *= β
+    end
+
+    # one task per contiguous chunk of columns; each task owns its map, and
+    # distinct columns of C write to disjoint slots of C.nzval, so no locking
+    tasks = Vector{Task}(undef, nth)
+    for t in 1:nth
+        j0 = (t-1)*sz + 1
+        j1 = min(t*sz, nB)
+        tasks[t] = @spawn begin
+            colmap = zeros(UInt, n)
+            for j in j0:j1
+                _col_map!(colmap, C, j)
+                @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+                    k = B.rowval[pb]; b = B.nzval[pb]
+                    @inbounds @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                        i = A.rowval[pa]
+                        a = A.nzval[pa]
+                        idx = colmap[i]
+                        if idx != 0
+                            C.nzval[idx] += α * a * b
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+    # wait for all tasks to finish
+    foreach(fetch, tasks)
+
+    return C
+end
+
+spmm_buffer_tasks!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) =
+    spmm_buffer_tasks!(C, A, B, one(eltype(C)), zero(eltype(C)))
+
+
+# function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+#     n = size(C, 1)
+#     maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
+#     fill!(C.nzval, zero(eltype(C)))
+#     tforeach(1:size(B,2)) do j
+#         colmap = maps[Threads.threadid()]
+#         _col_map!(colmap, C, j)
+#         @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+#             k = B.rowval[pb]; b = B.nzval[pb]
+#             for pa in A.colptr[k]:(A.colptr[k+1]-1)
+#                 i = A.rowval[pa]; a = A.nzval[pa]
+#                 idx = colmap[i]
+#                 if idx != 0
+#                     C.nzval[idx] += a * b
+#                 end
+#             end
+#         end
+#     end
+#     return C
+# end
+
+
+function spmm_buffer_omt_fast!(C::SparseMatrixCSC{Tv,Int},
+                               A::SparseMatrixCSC{Tv,Int},
+                               B::SparseMatrixCSC{Tv,Int}) where {Tv<:Number}
+    n = size(C,1)
+    # zero result
+    fill!(C.nzval, zero(Tv))
+
+    # task-local map buffer (element type must match the declared Vector{Int})
+    tlv = TaskLocalValue{Vector{Int}}(() -> zeros(Int, n))
+
+    # hoist field accesses
+    Acolptr, Arowval, Anzval = A.colptr, A.rowval, A.nzval
+    Bcolptr, Browval, Bnzval = B.colptr, B.rowval, B.nzval
+    Ccolptr, Cnzval = C.colptr, C.nzval
+
+    # parallel over columns of B
+    tforeach(1:size(B,2)) do j
+        colmap = tlv[]  # this task's Int[n] buffer
+        _col_map!(colmap, C, j)
+
+        # precompute B's column slice
+        pb_start = Bcolptr[j]
+        pb_end   = Bcolptr[j+1] - 1
+
+        @inbounds for pb in pb_start:pb_end
+            k = Browval[pb]; b = Bnzval[pb]
+            # precompute A's column slice
+            pa_start = Acolptr[k]
+            pa_end   = Acolptr[k+1] - 1
+
+            @inbounds @simd for pa in pa_start:pa_end
+                i = Arowval[pa]
+                a = Anzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    Cnzval[idx] += a * b
+                end
+            end
+        end
+    end
+
+    return C
+end
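+
+# Design note: `TaskLocalValue` (from OhMyThreads) lazily creates one buffer
+# per task, so unlike indexing a preallocated array with `threadid()` it stays
+# correct even if tasks migrate between threads. A minimal sketch of the
+# pattern, with illustrative sizes:
+#
+#     tlv = TaskLocalValue{Vector{Int}}(() -> zeros(Int, 8))
+#     tforeach(1:100) do j
+#         buf = tlv[]   # this task's private buffer
+#         buf[1] = j    # safe: no other task sees this buffer
+#     end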
+
+function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    # zero out result
+    fill!(C.nzval, zero(eltype(C)))
+
+    # create a task-local map vector, initialised once per task
+    tlv = TaskLocalValue{Vector{UInt}}(() -> zeros(UInt, n))
+
+    # loop over columns of B in parallel
+    tforeach(1:size(B, 2)) do j
+        colmap = tlv[]  # this task's UInt[n] buffer
+        _col_map!(colmap, C, j)
+
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1] - 1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1] - 1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+
+    return C
+end
+
+spmm_no_buffer(A::SparseMatrixCSC, B::SparseMatrixCSC) = A * B
+
+function random_sparse(n, m, nnz; T=Float64)
+    I = rand(1:n, nnz)
+    J = rand(1:m, nnz)
+    V = rand(T, nnz)
+    sparse(I, J, V, n, m)
+end
+
+function expand_pattern(pattern::SparseMatrixCSC, extra::Integer)
+    C = copy(pattern)
+    n, m = size(pattern)
+    added = 0
+    while added < extra
+        i = rand(1:n)
+        j = rand(1:m)
+        if C[i, j] == 0
+            C[i, j] = one(eltype(C))
+            added += 1
+        end
+    end
+    fill!(C.nzval, zero(eltype(C)))
+    return C
+end
+
+function bench_case(n, p, m, nnzA, nnzB; extra_ratio=0.0)
+    A = random_sparse(n, p, nnzA)
+    B = random_sparse(p, m, nnzB)
+
+    pattern = A * B
+    if extra_ratio > 0
+        extra = round(Int, nnz(pattern) * extra_ratio)
+        pattern = expand_pattern(pattern, extra)
+    end
+
+    C1 = copy(pattern); fill!(C1.nzval, zero(eltype(C1)))
+    C2 = copy(pattern); fill!(C2.nzval, zero(eltype(C2)))
+    C3 = copy(pattern); fill!(C3.nzval, zero(eltype(C3)))
+    C4 = copy(pattern); fill!(C4.nzval, zero(eltype(C4)))
+    C5 = copy(pattern); fill!(C5.nzval, zero(eltype(C5)))
+    C6 = copy(pattern); fill!(C6.nzval, zero(eltype(C6)))
+    C7 = copy(pattern); fill!(C7.nzval, zero(eltype(C7)))
+
+    println("non zeros in A: $(nnz(A)) B: $(nnz(B)) pattern: $(nnz(pattern))")
+    println("buffer basic")
+    @btime spmm_buffer_fast!($C1, $A, $B)
+    # println("buffer threaded")
+    # @btime spmm_buffer_threaded!($C2, $A, $B)
+    # println("buffer SparseArrays")
+    # @btime spmm_buffer!($C3, $A, $B)
+    # println("buffer simd")
+    # @btime spmm_buffer_simd!($C3, $A, $B)
+    # println("buffer omt")
+    # @btime spmm_buffer_omt!($C3, $A, $B)
+    println("buffer tasks")
+    @btime spmm_buffer_tasks!($C4, $A, $B)
+    # println("buffer simd threaded")
+    # @btime spmm_buffer_simd_threaded!($C5, $A, $B)
+    # println("buffer simd omt")
+    # @btime spmm_buffer_simd_omt!($C6, $A, $B)
+    # println("buffer threaded five args")
+    # @btime spmm_buffer_threaded!($C7, $A, $B, 1.0, 0.0)
+    println("no buffer")
+    @btime spmm_no_buffer($A, $B)
+
+    C_ref = spmm_no_buffer(A, B)
+    @assert isapprox(C1, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C2, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C3, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C4, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C5, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C6, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C7, C_ref; atol=1e-12, rtol=1e-8)
+end
+
+function run()
+    n = 1000
+    p = 100000
+    m = 100000
+    nnz_values = round.(Int, 10 .^ range(4, 6; length=3))
+    for nnzA in nnz_values
+        nnzB = nnzA
+        println("\n---- nnz = $nnzA ----")
+        bench_case(n, p, m, nnzA, nnzB)
+    end
+end
+
+run()
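+
+# `extra_ratio` in `bench_case` pads the target pattern with extra structural
+# zeros, so the kernels must still land every contribution in the right slot.
+# A sketch of exercising it (sizes mirror `run()` above):
+#
+#     bench_case(1000, 100000, 100000, 10^5, 10^5; extra_ratio=0.1)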
+
+
+# Interactive scratch area below: `@profview` assumes a profiler frontend
+# (e.g. the VS Code Julia extension or ProfileView.jl) is loaded.
+A = random_sparse(100, 1000000, Int(1e5))
+B = random_sparse(1000000, 1000000, Int(1e5))
+C = A * B
+C.nzval .= 0
+
+@profview for i in 1:100 spmm_buffer_tasks!(C,A,B) end
+@profview for i in 1:1000 spmm_buffer_fast!(C,A,B) end
+
+using ThreadedSparseArrays
+thA = ThreadedSparseMatrixCSC(A)
+thB = ThreadedSparseMatrixCSC(B)
+
+@benchmark $thA * $thB
+@benchmark $A * $B
+@benchmark spmm_buffer_tasks!($C,$A,$B)
+@benchmark spmm_buffer_fast!($C,$A,$B)
\ No newline at end of file
diff --git a/benchmark/sparse_buffer_mult_output.txt b/benchmark/sparse_buffer_mult_output.txt
new file mode 100644
index 000000000..9b852ba33
--- /dev/null
+++ b/benchmark/sparse_buffer_mult_output.txt
@@ -0,0 +1,37 @@
+
+---- nnz = 1000 ----
+non zeros in A: 999 B: 999 pattern: 983
+buffer basic
+  52.428 μs (3 allocations: 7.88 KiB)
+buffer threaded
+  19.601 μs (44 allocations: 42.80 KiB)
+buffer simd
+  52.234 μs (3 allocations: 7.88 KiB)
+buffer omt
+  21.612 μs (64 allocations: 43.69 KiB)
+buffer simd threaded
+  20.130 μs (44 allocations: 42.73 KiB)
+buffer simd omt
+  21.000 μs (64 allocations: 43.69 KiB)
+buffer threaded five args
+  18.944 μs (44 allocations: 42.80 KiB)
+no buffer
+  14.522 μs (11 allocations: 42.05 KiB)
+
+---- nnz = 2418 ----
+non zeros in A: 2414 B: 2414 pattern: 5860
+buffer basic
+  65.425 μs (3 allocations: 7.88 KiB)
+buffer threaded
+  23.428 μs (44 allocations: 42.80 KiB)
+buffer simd
+  64.872 μs (3 allocations: 7.88 KiB)
+buffer omt
+  25.715 μs (64 allocations: 43.69 KiB)
+buffer simd threaded
+  22.563 μs (44 allocations: 42.73 KiB)
+buffer simd omt
+  25.541 μs (64 allocations: 43.69 KiB)
+buffer threaded five args
+  22.690 μs (44 allocations: 42.80 KiB)
+no buffer