|
"""
    hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

High performance conjugate gradient benchmark.

Consists of 3 phases:
- Reference phase: get the tolerance reached by the reference algorithm after 50 iterations.
- Optimisation phase: run the optimised version until the reference tolerance is achieved.
- Measuring phase: run the optimised version multiple times until the set total runtime.

# Arguments

- `distribute`: method of distribution (mpi or debug).
- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (official time requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.

# Errors

Throws an error when the worst-case time measured in the optimisation phase is
not a positive finite number, because the number of timing sets is derived by
dividing `total_runtime` by that time.
"""
function hpcg_benchmark(distribute, np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    ranks = distribute(LinearIndices((np,)))
    timing_data = zeros(Float64, 10)
    ref_timing_data = zeros(Float64, 10)
    opt_timing_data = zeros(Float64, 10)
    ref_max_iters = 50

    # Slot 10 of the timing vector holds the CG setup time.
    # `begin` does not open a new scope, so `l`, `S`, `geom`, `x` and `b`
    # remain visible in the rest of the function.
    timing_data[10] = @elapsed begin
        l = 4
        S, geom = pc_setup(np, ranks, l, nx, ny, nz)
        x = similar(S.x[l])
        b = S.r[l]
    end

    ### Reference CG Timing Phase ###
    nr_of_cg_sets = 2
    statevars = CGStateVariables(zero(x), similar(x), similar(x))
    iters = 0
    # Start as floats so the bindings do not change type once the solver
    # results are assigned to them.
    normr0 = 0.0
    normr = 0.0
    for _ in 1:nr_of_cg_sets
        x .= 0
        x, ref_timing_data, normr0, normr, iters = ref_cg!(x, S.A_vec[l], b, ref_timing_data, maxiter = ref_max_iters, tolerance = 0.0, Pl = S, statevars = statevars)
    end

    # Relative residual reached by the reference run; the optimised solver
    # has to reach the same value.
    ref_tol = normr / normr0

    ### Optimized CG Setup Phase ### (only relevant after optimising the algorithm with potential convergence loss)
    opt_max_iters = 10 * ref_max_iters
    opt_worst_time = 0.0
    opt_n_iters = ref_max_iters
    for _ in 1:nr_of_cg_sets
        # Entry 1 of the timing vector is cumulative across calls, so the
        # duration of this set is the difference before/after the call.
        last_cumulative_time = opt_timing_data[1]
        x .= 0
        # Replace `opt_cg!` with your own optimised solver to benchmark it.
        x, opt_timing_data, normr0, normr, iters = opt_cg!(x, S.A_vec[l], b, opt_timing_data, maxiter = opt_max_iters, tolerance = ref_tol, Pl = S, statevars = statevars)

        # Take the largest number of iterations to guarantee convergence.
        opt_n_iters = max(opt_n_iters, iters)

        # Save the worst (longest) time of a single set.
        current_time = opt_timing_data[1] - last_cumulative_time
        opt_worst_time = max(opt_worst_time, current_time)
    end

    # All-reduce so every rank uses the same worst time.
    reduced_worst_time = reduction(max, map(rank -> opt_worst_time, ranks); destination = :all)
    map(reduced_worst_time) do worst
        opt_worst_time = worst
    end

    # Checked after the all-reduce so that all ranks take the same branch.
    # Without this check a zero or non-finite time surfaces as an opaque
    # `InexactError` from the `Int64` conversion below.
    if !(isfinite(opt_worst_time) && opt_worst_time > 0)
        error("hpcg_benchmark: worst-case optimised CG time is $(opt_worst_time) s; cannot derive the number of timing sets from it.")
    end

    ### Optimized CG Timing Phase ###
    nr_of_cg_sets = Int64(div(total_runtime, opt_worst_time, RoundUp))
    opt_tolerance = 0.0
    norm_data = zeros(Float64, nr_of_cg_sets)
    for i in 1:nr_of_cg_sets
        x .= 0
        x, timing_data, normr0, normr, iters = opt_cg!(x, S.A_vec[l], b, timing_data, maxiter = opt_n_iters, tolerance = opt_tolerance, Pl = S, statevars = statevars)
        norm_data[i] = normr / normr0
    end

    # Collect the timing data of all processes on the main process and
    # write the report from there.
    timing_data_buf = gather(map(rank -> timing_data, ranks); destination = MAIN)
    map_main(timing_data_buf) do all_timing_data
        report_results(np, all_timing_data, l, ref_max_iters, opt_n_iters, nr_of_cg_sets, norm_data, geom, output_type = output_type, output_folder = output_folder)
    end
end
| 101 | + |
"""
    hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the HPCG benchmark with the MPI back-end: `hpcg_benchmark` is called with
the `distribute` function supplied by `with_mpi`, and all arguments are
forwarded unchanged.

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_mpi(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    # Named closure instead of a do-block; `with_mpi` passes in `distribute`.
    run_benchmark(distribute) = hpcg_benchmark(
        distribute, np, nx, ny, nz;
        total_runtime = total_runtime,
        output_type = output_type,
        output_folder = output_folder,
    )
    return with_mpi(run_benchmark)
end
| 126 | + |
"""
    hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results") -> output to file

Run the HPCG benchmark with debug arrays: `hpcg_benchmark` is called with the
`distribute` function supplied by `with_debug`, and all arguments are
forwarded unchanged.

# Arguments

- `np`: number of processes.
- `nx`: points in the x direction for each process.
- `ny`: points in the y direction for each process.
- `nz`: points in the z direction for each process.
- `total_runtime`: desired total runtime (official requirement is 1800).
- `output_type`: output results to txt or json.
- `output_folder`: location of output.

# Output

- file output.
"""
function hpcg_benchmark_debug(np, nx, ny, nz; total_runtime = 60, output_type = "txt", output_folder = "results")
    # Named closure instead of a do-block; `with_debug` passes in `distribute`.
    run_benchmark(distribute) = hpcg_benchmark(
        distribute, np, nx, ny, nz;
        total_runtime = total_runtime,
        output_type = output_type,
        output_folder = output_folder,
    )
    return with_debug(run_benchmark)
end
0 commit comments