Skip to content

Commit 2c9f414

Browse files
authored
Merge pull request #177 from HRvanElderen/master
HPCG benchmark
2 parents 114bc8b + 618c3e6 commit 2c9f414

24 files changed

+1496
-2
lines changed

.github/workflows/ci.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,32 @@ jobs:
6262
using Pkg
6363
Pkg.develop(path=".")
6464
Pkg.test("PartitionedSolvers")'
65+
# CI job: build the package and run the HPCG benchmark test suite
# with the HPCG subproject environment.
HPCG:
  name: HPCG / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
  runs-on: ${{ matrix.os }}
  strategy:
    fail-fast: false
    matrix:
      version:
        - '1'
      os:
        - ubuntu-latest
      arch:
        - x64
  steps:
    - uses: actions/checkout@v2
    - uses: julia-actions/setup-julia@v1
      with:
        version: ${{ matrix.version }}
        arch: ${{ matrix.arch }}
    - uses: julia-actions/cache@v1
    - uses: julia-actions/julia-buildpkg@v1
    # Develop the root package and the PartitionedSolvers subpackage into
    # the HPCG environment, then run its tests.
    - run: |
        julia --project=HPCG -e '
        using Pkg
        Pkg.develop(path=".")
        Pkg.develop(path="./PartitionedSolvers")
        Pkg.test("HPCG")'
6591
docs:
6692
name: Documentation
6793
runs-on: ubuntu-latest

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ Manifest.toml
88
docs/build/
99
tmp/
1010
docs/src/examples.md
11+
12+
HPCG/src/results/
13+
14+
HPCG/results/

HPCG/Project.toml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name = "HPCG"
uuid = "f200b046-3a0e-49a8-9a9e-044aa3bf7874"
authors = ["Reinier van Elderen <reiniervanelderen@hotmail.com>"]
version = "0.1.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
PartitionedSolvers = "11b65f7f-80ac-401b-9ef2-3db765482d62"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

# Standard libraries (Dates, LinearAlgebra, Statistics, Test) intentionally
# carry no [compat] entry; they ship with Julia itself.
[compat]
DataStructures = "0.18"
DelimitedFiles = "1.9"
JSON = "0.21"
MPI = "0.20"
PartitionedArrays = "0.5"
PartitionedSolvers = "0.2"
Primes = "0.5"
SparseMatricesCSR = "0.6"
julia = "1.1"

HPCG/src/HPCG.jl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""
HPCG: a Julia implementation of the High Performance Conjugate Gradient
benchmark, built on top of PartitionedArrays and PartitionedSolvers.
"""
module HPCG

using PartitionedArrays
using PartitionedSolvers
using LinearAlgebra
using DelimitedFiles
using Dates
using Statistics
using Primes
using DataStructures
using JSON
using SparseMatricesCSR
import Base: iterate

# Benchmark entry points.
export hpcg_benchmark_mpi
export hpcg_benchmark_debug
export hpcg_benchmark
include("hpcg_benchmark.jl")

# Problem setup, reference CG solver and preconditioner API.
export build_matrix
export build_p_matrix
export ref_cg!
export pc_setup
export pc_solve!
# NOTE: include order is load-bearing; later files use definitions
# from earlier ones.
include("hpcg_utils.jl")
include("compute_optimal_xyz.jl")
include("sparse_matrix.jl")
include("ref_cg.jl")
include("opt_cg.jl")
include("report_results.jl")
include("mg_preconditioner.jl")

end # module HPCG

HPCG/src/compute_optimal_xyz.jl

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
include("mixed_base_counter.jl")

"""
    compute_optimal_shape_XYZ(np)

Calculate the optimal way to partition a 3D shape over `np` processors:
the factorisation `np = x * y * z` that is as cube-like as possible,
i.e. with minimal surface area `x*y + y*z + x*z`.
"""
function compute_optimal_shape_XYZ(np)
    np == 1 && return 1, 1, 1

    factors = Primes.factor(DataStructures.SortedDict, np)
    primes = collect(keys(factors))

    z = 0
    x = primes[1]
    if length(primes) > 1
        y = primes[2]
    end

    if length(primes) == 1
        # np = p^k: spread the exponent k as evenly as possible over the axes.
        k = factors[x]
        z = x^(floor(Int, k / 3))
        y = x^(floor(Int, k / 3 + ((k % 3) >= 2 ? 1 : 0)))
        x = x^(floor(Int, k / 3 + ((k % 3) >= 1 ? 1 : 0)))
    elseif length(primes) == 2 && factors[x] == 1 && factors[y] == 1
        # Two distinct prime factors: the third axis stays 1.
        z = 1
    elseif length(primes) == 2 && factors[x] + factors[y] == 3
        # Three prime factors with one repeated: the squared prime goes to z.
        z = factors[x] == 2 ? x : y
    elseif length(primes) == 3 && factors[x] == 1 && factors[y] == 1 && factors[primes[3]] == 1
        # Three distinct, single prime factors: one per axis.
        z = primes[3]
    else
        # General case (3 or more prime factors): enumerate all 3-way splits
        # of the prime multiset with a pair of mixed-base counters and keep
        # the split with the smallest surface area.
        powers = collect(values(factors))
        l, m, c = mixedbasecounter(powers, length(primes))
        c_main = Mixed_base_counter(l, m, c)
        c1 = Mixed_base_counter(l, m, c)

        min_area = 2.0 * np + 1.0  # upper bound: area of the (np, 1, 1) split

        # NOTE(review): upstream HPCG iterates while the counter is NOT
        # exhausted; confirm that `is_zero` in mixed_base_counter.jl follows
        # the convention this loop condition assumes.
        c1 = next(c1)
        while is_zero(c1)
            c2 = mixedbasecounter1(c_main, c1)
            c2 = next(c2)
            while is_zero(c2)
                f1 = product(c1, primes)
                f2 = product(c2, primes)
                # The third dimension is derived; its factors are not tracked.
                f3 = np / f1 / f2

                area = f1 * f2 + f2 * f3 + f1 * f3
                if area < min_area
                    min_area = area
                    x = f1
                    y = f2
                    z = f3
                end
                c2 = next(c2)
            end
            c1 = next(c1)
        end
    end
    # z may be a Float64 here (derived by division above); truncate to Int.
    return x, y, floor(Int, z)
end

HPCG/src/hpcg_benchmark.jl

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""
    hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

High performance conjugate gradient benchmark.

Consists of 3 phases:
- Reference phase: get the tolerance of the reference algorithm after 50 iterations.
- Optimisation phase: run the optimised version until the reference tolerance is achieved.
- Measuring phase: run the optimised version repeatedly until the set total runtime is filled.

# Arguments

- `distribute`: method of distribution (mpi or debug).
- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official time requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    ranks = distribute(LinearIndices((np,)))
    timing_data = zeros(Float64, 10)
    ref_times = zeros(Float64, 10)
    opt_times = zeros(Float64, 10)
    ref_max_iters = 50

    # Slot 10 of the timing vector holds the CG setup time.
    timing_data[10] = @elapsed begin
        l = 4
        S, geom = pc_setup(np, ranks, l, nx, ny, nz)
        x = similar(S.x[l])
        b = S.r[l]
    end

    ### Reference CG timing phase ###
    ncg_sets = 2
    total_ref_iters = 0
    cg_state = CGStateVariables(zero(x), similar(x), similar(x))
    niters = 0
    normr0 = 0
    normr = 0
    for _ in 1:ncg_sets
        x .= 0
        x, ref_times, normr0, normr, niters = ref_cg!(x, S.A_vec[l], b, ref_times, maxiter = ref_max_iters, tolerance = 0.0, Pl = S, statevars = cg_state)
        total_ref_iters += niters
    end

    # Relative residual reached by the reference run; the optimised solver
    # must converge at least this far.
    ref_tol = normr / normr0

    ### Optimised CG setup phase ###
    # Only relevant after optimising the algorithm with potential convergence loss.
    opt_max_iters = 10 * ref_max_iters
    opt_worst_time = 0.0
    opt_n_iters = ref_max_iters
    for _ in 1:ncg_sets
        prev_total = opt_times[1]
        x .= 0
        # Replace the opt_cg! calls with your own optimised version.
        x, opt_times, normr0, normr, niters = opt_cg!(x, S.A_vec[l], b, opt_times, maxiter = opt_max_iters, tolerance = ref_tol, Pl = S, statevars = cg_state)

        # Take the largest number of iterations to guarantee convergence.
        if niters > opt_n_iters
            opt_n_iters = niters
        end

        # Save the worst (slowest) set time.
        set_time = opt_times[1] - prev_total
        if set_time > opt_worst_time
            opt_worst_time = set_time
        end
    end

    # All-reduce the worst time across ranks.
    r = reduction(max, map(rank -> opt_worst_time, ranks); destination = :all)
    map(r) do r
        opt_worst_time = r
    end

    ### Optimised CG timing phase ###
    # Enough sets to fill the requested total runtime.
    ncg_sets = Int64(div(total_runtime, opt_worst_time, RoundUp))
    opt_tolerance = 0.0
    norm_data = zeros(Float64, ncg_sets)
    for i in 1:ncg_sets
        x .= 0
        x, timing_data, normr0, normr, niters = opt_cg!(x, S.A_vec[l], b, timing_data, maxiter = opt_n_iters, tolerance = opt_tolerance, Pl = S, statevars = cg_state)
        norm_data[i] = normr / normr0
    end

    # Collect the timing data of every process on the main rank.
    gathered_times = gather(map(rank -> timing_data, ranks); destination = MAIN)
    all_timing_data = zeros(Float64, (4, 10))
    map_main(gathered_times) do t
        all_timing_data = t
    end

    # Only the main rank writes the report.
    map_main(ranks) do _
        report_results(np, all_timing_data, l, ref_max_iters, opt_n_iters, ncg_sets, norm_data, geom, output_type = output_type, output_folder = output_folder)
    end
end
101+
"""
    hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the benchmark using MPI.

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    with_mpi() do distribute
        hpcg_benchmark(distribute, np, nx, ny, nz,
            total_runtime = total_runtime,
            output_type = output_type,
            output_folder = output_folder)
    end
end
126+
"""
    hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the benchmark using a debug array (sequential, for development/testing).

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (the official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    with_debug() do distribute
        hpcg_benchmark(distribute, np, nx, ny, nz,
            total_runtime = total_runtime,
            output_type = output_type,
            output_folder = output_folder)
    end
end

HPCG/src/hpcg_utils.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
"""
    mul_no_lat!(c::PVector, a::PSparseMatrix, b::PVector)

In-place sparse matrix-vector product `c = a * b` specialised for HPCG,
without latency hiding. The version with latency hiding uses a slower
CSR mul.
"""
function mul_no_lat!(c::PVector, a::PSparseMatrix, b::PVector)
    # Row/column index layouts of the operands must agree.
    @boundscheck @assert PartitionedArrays.matching_own_indices(axes(c, 1), axes(a, 1))
    @boundscheck @assert PartitionedArrays.matching_own_indices(axes(a, 2), axes(b, 1))
    @boundscheck @assert PartitionedArrays.matching_ghost_indices(axes(a, 2), axes(b, 1))
    if !a.assembled
        # Sub-assembled matrix: fall back to the generic 5-arg mul!
        # (c = 1*a*b + 0*c).
        @boundscheck @assert PartitionedArrays.matching_ghost_indices(axes(a, 1), axes(c, 1))
        return mul!(c, a, b, 1, 0)
    end
    # Fetch the ghost entries of b before forming the local products.
    wait(consistent!(b))
    foreach(spmv!, own_values(c), partition(a), partition(b))
    return c
end
18+

0 commit comments

Comments
 (0)