Skip to content
Open
37 changes: 37 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
CXX = dpcpp
CXXFLAGS = -std=c++20 -Wall -Weverything -Wno-c++98-compat -Wno-c++98-c++11-compat-binary-literal -Wno-c++98-compat-pedantic
OPTFLAGS = -O3
SYCLFLAGS = -fsycl
SYCLCUDAFLAGS = -fsycl-targets=nvptx64-nvidia-cuda
IFLAGS = -I ./include

# Actually compiled code to be executed on host CPU, to be used only for testing functional correctness
Expand Down Expand Up @@ -63,3 +65,38 @@ bench/fpga_emu_bench.out: bench/acorn_fpga.cpp include/*.hpp

fpga_hw_bench: bench/acorn_fpga.cpp include/*.hpp
$(CXX) $(CXXFLAGS) -Wno-padded $(FPGA_HW_FLAGS) $(OPTFLAGS) $(IFLAGS) -reuse-exe=bench/$@.out $< -o bench/$@.out

accel_test: test/accel_test.out
./$<

test/accel_test.out: test/accel_acorn.cpp include/*.hpp
$(CXX) $(CXXFLAGS) $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) $< -o $@

aot_cpu:
@if lscpu | grep -q 'avx512'; then \
echo "Using avx512"; \
$(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx512" bench/accel_acorn.cpp -o bench/a.out; \
elif lscpu | grep -q 'avx2'; then \
echo "Using avx2"; \
$(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx2" bench/accel_acorn.cpp -o bench/a.out; \
elif lscpu | grep -q 'avx'; then \
echo "Using avx"; \
$(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx" bench/accel_acorn.cpp -o bench/a.out; \
elif lscpu | grep -q 'sse4.2'; then \
echo "Using sse4.2"; \
$(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=sse4.2" bench/accel_acorn.cpp -o bench/a.out; \
else \
echo "Can't AOT compile using avx, avx2, avx512 or sse4.2"; \
fi
./bench/a.out

aot_gpu:
# you may want to replace `device` identifier with `0x3e96` if you're targeting *Intel(R) UHD Graphics P630*
#
# otherwise, let it be what it's if you're targeting *Intel(R) Iris(R) Xe MAX Graphics*
$(CXX) -std=c++20 -Wall -DSYCL_TARGET_GPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_gen -Xs "-device 0x4905" bench/accel_acorn.cpp -o bench/a.out
./bench/a.out

cuda:
clang++ -std=c++20 -Wall -DSYCL_TARGET_GPU $(SYCLFLAGS) $(SYCLCUDAFLAGS) $(OPTFLAGS) $(IFLAGS) bench/accel_acorn.cpp -o bench/a.out
./bench/a.out
130 changes: 130 additions & 0 deletions bench/accel_acorn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#include "bench_utils.hpp"
#include "table.hpp"
#include <iostream>

int
main()
{
// associated data byte length, same for all cases
constexpr size_t dt_len = 32ul;
// min # -of work-items to be dispatched
constexpr size_t min_wi_cnt = 1ul << 16;
// max # -of work-items to be dispatched
constexpr size_t max_wi_cnt = 1ul << 18;
// # -of work-items to be grouped during execution
//
// @note, consider taking better decision about appropriate work-group size
// for certain kernel at runtime based on SYCL runtime heuristics
constexpr size_t wg_size = 32ul;
constexpr size_t min_ct_len = 64ul; // bytes
constexpr size_t max_ct_len = 4096ul; // bytes

#if defined SYCL_TARGET_CPU
sycl::cpu_selector s{};
#pragma message("Selecting default CPU accelerator !")
#elif defined SYCL_TARGET_GPU
sycl::gpu_selector s{};
#pragma message("Selecting default GPU accelerator !")
#else
sycl::default_selector s{};
#pragma message("Selecting default SYCL accelerator !")
#endif

sycl::device d{ s };
sycl::context c{ d };
sycl::queue q{ c, d, sycl::property::queue::enable_profiling{} };

std::cout << "running on " << d.get_info<sycl::info::device::name>()
<< std::endl
<< std::endl;

uint64_t* ts = static_cast<uint64_t*>(std::malloc(sizeof(uint64_t) * 3));
size_t* io = static_cast<size_t*>(std::malloc(sizeof(size_t) * 3));

std::cout << "Benchmarking Acorn-128 encrypt" << std::endl << std::endl;

TextTable t0('-', '|', '+');

t0.add("invocation count");
t0.add("plain text len ( bytes )");
t0.add("associated data len ( bytes )");
t0.add("host-to-device b/w");
t0.add("kernel b/w");
t0.add("device-to-host b/w");
t0.endOfRow();

for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) {
for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) {
bench_acorn::exec_kernel(q,
ct_len,
dt_len,
wi,
wg_size,
bench_acorn::acorn_type::accel_acorn_encrypt,
ts,
io);

t0.add(std::to_string(wi));
t0.add(std::to_string(ct_len));
t0.add(std::to_string(dt_len));
t0.add(bench_acorn::to_readable_bandwidth(io[0], ts[0]));
t0.add(bench_acorn::to_readable_bandwidth(io[1], ts[1]));
t0.add(bench_acorn::to_readable_bandwidth(io[2], ts[2]));
t0.endOfRow();
}
}

t0.setAlignment(1, TextTable::Alignment::RIGHT);
t0.setAlignment(2, TextTable::Alignment::RIGHT);
t0.setAlignment(3, TextTable::Alignment::RIGHT);
t0.setAlignment(4, TextTable::Alignment::RIGHT);
t0.setAlignment(5, TextTable::Alignment::RIGHT);
std::cout << t0;

std::cout << std::endl
<< "Benchmarking Acorn-128 decrypt" << std::endl
<< std::endl;

TextTable t1('-', '|', '+');

t1.add("invocation count");
t1.add("cipher text len ( bytes )");
t1.add("associated data len ( bytes )");
t1.add("host-to-device b/w");
t1.add("kernel b/w");
t1.add("device-to-host b/w");
t1.endOfRow();

for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) {
for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) {
bench_acorn::exec_kernel(q,
ct_len,
dt_len,
wi,
wg_size,
bench_acorn::acorn_type::accel_acorn_decrypt,
ts,
io);

t1.add(std::to_string(wi));
t1.add(std::to_string(ct_len));
t1.add(std::to_string(dt_len));
t1.add(bench_acorn::to_readable_bandwidth(io[0], ts[0]));
t1.add(bench_acorn::to_readable_bandwidth(io[1], ts[1]));
t1.add(bench_acorn::to_readable_bandwidth(io[2], ts[2]));
t1.endOfRow();
}
}

t1.setAlignment(1, TextTable::Alignment::RIGHT);
t1.setAlignment(2, TextTable::Alignment::RIGHT);
t1.setAlignment(3, TextTable::Alignment::RIGHT);
t1.setAlignment(4, TextTable::Alignment::RIGHT);
t1.setAlignment(5, TextTable::Alignment::RIGHT);
std::cout << t1;

std::free(ts);
std::free(io);

return EXIT_SUCCESS;
}
42 changes: 22 additions & 20 deletions bench/acorn_fpga.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,21 @@ main()

for (size_t invk = min_invk_cnt; invk <= max_invk_cnt; invk <<= 1) {
for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) {
bench_acorn_fpga::exec_kernel(q,
ct_len,
dt_len,
invk,
bench_acorn_fpga::acorn_type::acorn_encrypt,
ts,
io);
bench_acorn::exec_kernel(q,
ct_len,
dt_len,
invk,
0,
bench_acorn::acorn_type::acorn_encrypt_fpga,
ts,
io);

t0.add(std::to_string(invk));
t0.add(std::to_string(ct_len));
t0.add(std::to_string(dt_len));
t0.add(bench_acorn_fpga::to_readable_bandwidth(io[0], ts[0]));
t0.add(bench_acorn_fpga::to_readable_bandwidth(io[1], ts[1]));
t0.add(bench_acorn_fpga::to_readable_bandwidth(io[2], ts[2]));
t0.add(bench_acorn::to_readable_bandwidth(io[0], ts[0]));
t0.add(bench_acorn::to_readable_bandwidth(io[1], ts[1]));
t0.add(bench_acorn::to_readable_bandwidth(io[2], ts[2]));
t0.endOfRow();
}
}
Expand Down Expand Up @@ -90,20 +91,21 @@ main()

for (size_t invk = min_invk_cnt; invk <= max_invk_cnt; invk <<= 1) {
for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) {
bench_acorn_fpga::exec_kernel(q,
ct_len,
dt_len,
invk,
bench_acorn_fpga::acorn_type::acorn_decrypt,
ts,
io);
bench_acorn::exec_kernel(q,
ct_len,
dt_len,
invk,
0,
bench_acorn::acorn_type::acorn_decrypt_fpga,
ts,
io);

t1.add(std::to_string(invk));
t1.add(std::to_string(ct_len));
t1.add(std::to_string(dt_len));
t1.add(bench_acorn_fpga::to_readable_bandwidth(io[0], ts[0]));
t1.add(bench_acorn_fpga::to_readable_bandwidth(io[1], ts[1]));
t1.add(bench_acorn_fpga::to_readable_bandwidth(io[2], ts[2]));
t1.add(bench_acorn::to_readable_bandwidth(io[0], ts[0]));
t1.add(bench_acorn::to_readable_bandwidth(io[1], ts[1]));
t1.add(bench_acorn::to_readable_bandwidth(io[2], ts[2]));
t1.endOfRow();
}
}
Expand Down
Loading