Skip to content

Commit 2c9f414

Browse files
authored
Merge pull request #177 from HRvanElderen/master
HPCG benchmark
2 parents 114bc8b + 618c3e6 commit 2c9f414

24 files changed

+1496
-2
lines changed

.github/workflows/ci.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,32 @@ jobs:
6262
using Pkg
6363
Pkg.develop(path=".")
6464
Pkg.test("PartitionedSolvers")'
65+
# CI job: build the package and run the HPCG benchmark test suite
# with the HPCG subproject environment.
HPCG:
  name: HPCG / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
  runs-on: ${{ matrix.os }}
  strategy:
    fail-fast: false
    matrix:
      version:
        - '1'
      os:
        - ubuntu-latest
      arch:
        - x64
  steps:
    - uses: actions/checkout@v2
    - uses: julia-actions/setup-julia@v1
      with:
        version: ${{ matrix.version }}
        arch: ${{ matrix.arch }}
    - uses: julia-actions/cache@v1
    - uses: julia-actions/julia-buildpkg@v1
    # Develop the root package and the PartitionedSolvers subpackage into
    # the HPCG environment, then run its tests.
    - run: |
        julia --project=HPCG -e '
        using Pkg
        Pkg.develop(path=".")
        Pkg.develop(path="./PartitionedSolvers")
        Pkg.test("HPCG")'
6591
docs:
6692
name: Documentation
6793
runs-on: ubuntu-latest

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ Manifest.toml
88
docs/build/
99
tmp/
1010
docs/src/examples.md
11+
12+
HPCG/src/results/
13+
14+
HPCG/results/

HPCG/Project.toml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name = "HPCG"
uuid = "f200b046-3a0e-49a8-9a9e-044aa3bf7874"
authors = ["Reinier van Elderen <reiniervanelderen@hotmail.com>"]
version = "0.1.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
PartitionedSolvers = "11b65f7f-80ac-401b-9ef2-3db765482d62"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

# Standard libraries (Dates, LinearAlgebra, Statistics, Test) intentionally
# carry no [compat] entry; they ship with Julia itself.
[compat]
DataStructures = "0.18"
DelimitedFiles = "1.9"
JSON = "0.21"
MPI = "0.20"
PartitionedArrays = "0.5"
PartitionedSolvers = "0.2"
Primes = "0.5"
SparseMatricesCSR = "0.6"
julia = "1.1"

HPCG/src/HPCG.jl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""
HPCG: a Julia implementation of the High Performance Conjugate Gradient
benchmark, built on top of PartitionedArrays and PartitionedSolvers.
"""
module HPCG

using PartitionedArrays
using PartitionedSolvers
using LinearAlgebra
using DelimitedFiles
using Dates
using Statistics
using Primes
using DataStructures
using JSON
using SparseMatricesCSR
import Base: iterate

# Benchmark entry points.
export hpcg_benchmark_mpi
export hpcg_benchmark_debug
export hpcg_benchmark
include("hpcg_benchmark.jl")

# Problem setup, reference CG solver and preconditioner API.
export build_matrix
export build_p_matrix
export ref_cg!
export pc_setup
export pc_solve!
# NOTE: include order is load-bearing; later files use definitions
# from earlier ones.
include("hpcg_utils.jl")
include("compute_optimal_xyz.jl")
include("sparse_matrix.jl")
include("ref_cg.jl")
include("opt_cg.jl")
include("report_results.jl")
include("mg_preconditioner.jl")

end # module HPCG

HPCG/src/compute_optimal_xyz.jl

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
include("mixed_base_counter.jl")

"""
    compute_optimal_shape_XYZ(np)

Calculate the optimal way to partition a 3D shape over `np` processors:
the factorisation `np = x * y * z` that is as cube-like as possible,
i.e. with minimal surface area `x*y + y*z + x*z`.
"""
function compute_optimal_shape_XYZ(np)
    np == 1 && return 1, 1, 1

    factors = Primes.factor(DataStructures.SortedDict, np)
    primes = collect(keys(factors))

    z = 0
    x = primes[1]
    if length(primes) > 1
        y = primes[2]
    end

    if length(primes) == 1
        # np = p^k: spread the exponent k as evenly as possible over the axes.
        k = factors[x]
        z = x^(floor(Int, k / 3))
        y = x^(floor(Int, k / 3 + ((k % 3) >= 2 ? 1 : 0)))
        x = x^(floor(Int, k / 3 + ((k % 3) >= 1 ? 1 : 0)))
    elseif length(primes) == 2 && factors[x] == 1 && factors[y] == 1
        # Two distinct prime factors: the third axis stays 1.
        z = 1
    elseif length(primes) == 2 && factors[x] + factors[y] == 3
        # Three prime factors with one repeated: the squared prime goes to z.
        z = factors[x] == 2 ? x : y
    elseif length(primes) == 3 && factors[x] == 1 && factors[y] == 1 && factors[primes[3]] == 1
        # Three distinct, single prime factors: one per axis.
        z = primes[3]
    else
        # General case (3 or more prime factors): enumerate all 3-way splits
        # of the prime multiset with a pair of mixed-base counters and keep
        # the split with the smallest surface area.
        powers = collect(values(factors))
        l, m, c = mixedbasecounter(powers, length(primes))
        c_main = Mixed_base_counter(l, m, c)
        c1 = Mixed_base_counter(l, m, c)

        min_area = 2.0 * np + 1.0  # upper bound: area of the (np, 1, 1) split

        # NOTE(review): upstream HPCG iterates while the counter is NOT
        # exhausted; confirm that `is_zero` in mixed_base_counter.jl follows
        # the convention this loop condition assumes.
        c1 = next(c1)
        while is_zero(c1)
            c2 = mixedbasecounter1(c_main, c1)
            c2 = next(c2)
            while is_zero(c2)
                f1 = product(c1, primes)
                f2 = product(c2, primes)
                # The third dimension is derived; its factors are not tracked.
                f3 = np / f1 / f2

                area = f1 * f2 + f2 * f3 + f1 * f3
                if area < min_area
                    min_area = area
                    x = f1
                    y = f2
                    z = f3
                end
                c2 = next(c2)
            end
            c1 = next(c1)
        end
    end
    # z may be a Float64 here (derived by division above); truncate to Int.
    return x, y, floor(Int, z)
end

HPCG/src/hpcg_benchmark.jl

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""
    hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

High performance conjugate gradient benchmark.

Consists of 3 phases:
- Reference phase: get the tolerance of the reference algorithm after 50 iterations.
- Optimisation phase: run the optimised version until the reference tolerance is achieved.
- Measuring phase: run the optimised version repeatedly until the set total runtime is filled.

# Arguments

- `distribute`: method of distribution (mpi or debug).
- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official time requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    ranks = distribute(LinearIndices((np,)))
    timing_data = zeros(Float64, 10)
    ref_times = zeros(Float64, 10)
    opt_times = zeros(Float64, 10)
    ref_max_iters = 50

    # Slot 10 of the timing vector holds the CG setup time.
    timing_data[10] = @elapsed begin
        l = 4
        S, geom = pc_setup(np, ranks, l, nx, ny, nz)
        x = similar(S.x[l])
        b = S.r[l]
    end

    ### Reference CG timing phase ###
    ncg_sets = 2
    total_ref_iters = 0
    cg_state = CGStateVariables(zero(x), similar(x), similar(x))
    niters = 0
    normr0 = 0
    normr = 0
    for _ in 1:ncg_sets
        x .= 0
        x, ref_times, normr0, normr, niters = ref_cg!(x, S.A_vec[l], b, ref_times, maxiter = ref_max_iters, tolerance = 0.0, Pl = S, statevars = cg_state)
        total_ref_iters += niters
    end

    # Relative residual reached by the reference run; the optimised solver
    # must converge at least this far.
    ref_tol = normr / normr0

    ### Optimised CG setup phase ###
    # Only relevant after optimising the algorithm with potential convergence loss.
    opt_max_iters = 10 * ref_max_iters
    opt_worst_time = 0.0
    opt_n_iters = ref_max_iters
    for _ in 1:ncg_sets
        prev_total = opt_times[1]
        x .= 0
        # Replace the opt_cg! calls with your own optimised version.
        x, opt_times, normr0, normr, niters = opt_cg!(x, S.A_vec[l], b, opt_times, maxiter = opt_max_iters, tolerance = ref_tol, Pl = S, statevars = cg_state)

        # Take the largest number of iterations to guarantee convergence.
        if niters > opt_n_iters
            opt_n_iters = niters
        end

        # Save the worst (slowest) set time.
        set_time = opt_times[1] - prev_total
        if set_time > opt_worst_time
            opt_worst_time = set_time
        end
    end

    # All-reduce the worst time across ranks.
    r = reduction(max, map(rank -> opt_worst_time, ranks); destination = :all)
    map(r) do r
        opt_worst_time = r
    end

    ### Optimised CG timing phase ###
    # Enough sets to fill the requested total runtime.
    ncg_sets = Int64(div(total_runtime, opt_worst_time, RoundUp))
    opt_tolerance = 0.0
    norm_data = zeros(Float64, ncg_sets)
    for i in 1:ncg_sets
        x .= 0
        x, timing_data, normr0, normr, niters = opt_cg!(x, S.A_vec[l], b, timing_data, maxiter = opt_n_iters, tolerance = opt_tolerance, Pl = S, statevars = cg_state)
        norm_data[i] = normr / normr0
    end

    # Collect the timing data of every process on the main rank.
    gathered_times = gather(map(rank -> timing_data, ranks); destination = MAIN)
    all_timing_data = zeros(Float64, (4, 10))
    map_main(gathered_times) do t
        all_timing_data = t
    end

    # Only the main rank writes the report.
    map_main(ranks) do _
        report_results(np, all_timing_data, l, ref_max_iters, opt_n_iters, ncg_sets, norm_data, geom, output_type = output_type, output_folder = output_folder)
    end
end
101+
"""
    hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the benchmark using MPI.

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    with_mpi() do distribute
        hpcg_benchmark(distribute, np, nx, ny, nz,
            total_runtime = total_runtime,
            output_type = output_type,
            output_folder = output_folder)
    end
end
126+
"""
    hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the benchmark using a debug array (sequential, for development/testing).

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    with_debug() do distribute
        hpcg_benchmark(distribute, np, nx, ny, nz,
            total_runtime = total_runtime,
            output_type = output_type,
            output_folder = output_folder)
    end
end

HPCG/src/hpcg_utils.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
"""
    mul_no_lat!(c::PVector, a::PSparseMatrix, b::PVector)

In-place sparse matrix-vector product `c = a * b` specialised for HPCG,
without latency hiding. The version with latency hiding uses a slower
CSR mul.
"""
function mul_no_lat!(c::PVector, a::PSparseMatrix, b::PVector)
    # Row/column index layouts of the operands must agree.
    @boundscheck @assert PartitionedArrays.matching_own_indices(axes(c, 1), axes(a, 1))
    @boundscheck @assert PartitionedArrays.matching_own_indices(axes(a, 2), axes(b, 1))
    @boundscheck @assert PartitionedArrays.matching_ghost_indices(axes(a, 2), axes(b, 1))
    if !a.assembled
        # Sub-assembled matrix: fall back to the generic 5-arg mul!
        # (c = 1*a*b + 0*c).
        @boundscheck @assert PartitionedArrays.matching_ghost_indices(axes(a, 1), axes(c, 1))
        return mul!(c, a, b, 1, 0)
    end
    # Fetch the ghost entries of b before forming the local products.
    wait(consistent!(b))
    foreach(spmv!, own_values(c), partition(a), partition(b))
    return c
end
18+

0 commit comments

Comments
 (0)