From e11f14611d9538e76325576c408a39086e9dbef0 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 06:14:09 +0000 Subject: [PATCH 1/9] accelerated Acorn128 on multi-core CPU, GPGPU using SYCL --- kernels written ! --- include/accel_acorn.hpp | 193 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 include/accel_acorn.hpp diff --git a/include/accel_acorn.hpp b/include/accel_acorn.hpp new file mode 100644 index 0000000..da9df1d --- /dev/null +++ b/include/accel_acorn.hpp @@ -0,0 +1,193 @@ +#pragma once +#include "acorn.hpp" +#include + +// Accelerated Acorn-128: A lightweight AEAD ( authenticated encryption with +// associated data ) scheme, targeting accelerators ( i.e. multi-core CPUs, +// GPGPUs ) using SYCL +namespace accel_acorn { + +class kernelAcorn128Encrypt; +class kernelAcorn128Decrypt; + +// Encrypt N -many independent, non-overlapping, equal-length plain text +// byteslices along with N -many independent, non-overlapping, equal-length +// associated data byteslices on multi-core CPU/ GPGPU, using Acorn-128 AEAD +// +// Input: +// +// - N -many secret keys, each of 128 -bit +// - N -many public message nonces, each of 128 -bit +// - N -many plain text byteslices, each of same length +// - N -many associated data byteslices, each of same length +// +// Note, avoid nonce reuse i.e. under same secret key don't use same nonce twice +// +// Note, associated data bytes are never encrypted +// +// Output: +// +// - N -many encrypted text byteslices, each of same length +// +// assert enc_len == text_len +// +// - N -many authentication tags, each of 128 -bit +// - SYCL event, resulting from submission of compute job to SYCL queue +static inline sycl::event +encrypt( + sycl::queue& q, // SYCL job submission queue + const uint8_t* const __restrict key, // secret keys + const size_t key_len, // = wi_cnt * 16 + const uint8_t* const __restrict nonce, // public message nonces + const size_t nonce_len, // = wi_cnt * 16 + const uint8_t* const __restrict text, // plain text + const size_t text_len, // text_len % wi_cnt == 0 + const uint8_t* const __restrict data, // associated data + const size_t data_len, // data_len % wi_cnt == 0 + uint8_t* const __restrict enc, // encrypted data bytes + const size_t enc_len, // = text_len + uint8_t* const __restrict tag, // authentication tags + const size_t tag_len, // = wi_cnt * 16 + const size_t wi_cnt, // # -of work items to be dispatched + const size_t wg_size, // # -of work items to be grouped + const std::vector evts // forms SYCL runtime dependency graph +) +{ + // all work groups to have same number of effective work-items + assert(wi_cnt % wg_size == 0); + // each secret key of 128 -bit + assert(wi_cnt << 4 == key_len); + // each public message nonce of 128 -bit + assert(wi_cnt << 4 == nonce_len); + // each authentication tag of 128 -bit + assert(wi_cnt << 4 == tag_len); + // independent, non-overlapping plain text byteslices + assert(text_len % wi_cnt == 0); + // independent, non-overlapping associated data byteslices + assert(data_len % wi_cnt == 0); + // encrypted bytes length must be same as plain text length + assert(text_len == enc_len); + + // each work item to consume these many plain text bytes during encryption + const size_t per_wi_ct_len = text_len / wi_cnt; + // each work item to consume these many associated data bytes during + // encryption, though note that associated data bytes are never encrypted ! + const size_t per_wi_ad_len = data_len / wi_cnt; + + sycl::event evt = q.submit([&](sycl::handler& h) { + // SYCL dependency graph + h.depends_on(evts); + h.parallel_for( + sycl::nd_range<1>{ wi_cnt, wg_size }, [=](sycl::nd_item<1> it) { + const size_t idx = it.get_global_linear_id(); + + const size_t knt_off = idx << 4; + const size_t ct_off = idx * per_wi_ct_len; + const size_t ad_off = idx * per_wi_ad_len; + + acorn::encrypt(key + knt_off, + nonce + knt_off, + text + ct_off, + per_wi_ct_len, + data + ad_off, + per_wi_ad_len, + enc + ct_off, + tag + knt_off); + }); + }); + return evt; +} + +// Decrypt N -many independent, non-overlapping, equal-length cipher text +// byteslices along with N -many independent, non-overlapping, equal-length +// associated data byteslices on multi-core CPU/ GPGPU, using Acorn-128 AEAD +// +// Input: +// +// - N -many secret keys, each of 128 -bit +// - N -many public message nonces, each of 128 -bit +// - N -many authentication tags, each of 128 -bit +// - N -many cipher text byteslices, each of same length +// - N -many associated data byteslices, each of same length +// +// Note, associated data bytes are never encrypted +// +// Output: +// +// - N -many decrypted text byteslices, each of same length +// +// assert text_len == enc_len +// +// - N -many verification flags, each a boolean value +// - SYCL event, resulting from submission of compute job to SYCL queue +static inline sycl::event +decrypt( + sycl::queue& q, // SYCL job submission queue + const uint8_t* const __restrict key, // secret keys + const size_t key_len, // = wi_cnt * 16 + const uint8_t* const __restrict nonce, // public message nonces + const size_t nonce_len, // = wi_cnt * 16 + const uint8_t* const __restrict tag, // authentication tags + const size_t tag_len, // = wi_cnt * 16 + const uint8_t* const __restrict enc, // encrypted data bytes + const size_t enc_len, // enc_len % wi_cnt == 0 + const uint8_t* const __restrict data, // associated data + const size_t data_len, // data_len % wi_cnt == 0 + uint8_t* const __restrict text, // plain text bytes + const size_t text_len, // = enc_len + bool* const __restrict flag, // verification flags + const size_t flag_len, // wi_cnt * sizeof(bool) + const size_t wi_cnt, // # -of work items to be dispatched + const size_t wg_size, // # -of work items to be grouped + const std::vector evts // forms SYCL runtime dependency graph +) +{ + // all work groups to have same number of effective work-items + assert(wi_cnt % wg_size == 0); + // each secret key of 128 -bit + assert(wi_cnt << 4 == key_len); + // each public message nonce of 128 -bit + assert(wi_cnt << 4 == nonce_len); + // each authentication tag of 128 -bit + assert(wi_cnt << 4 == tag_len); + // independent, non-overlapping cipher text byteslices + assert(enc_len % wi_cnt == 0); + // independent, non-overlapping associated data byteslices + assert(data_len % wi_cnt == 0); + // decrypted bytes length must be same as cipher text length + assert(enc_len == text_len); + // each verification flag is of boolean type + assert(wi_cnt * sizeof(bool) == flag_len); + + // each work item to consume these many cipher text bytes during decryption + const size_t per_wi_ct_len = enc_len / wi_cnt; + // each work item to consume these many associated data bytes during + // decryption, though note that associated data bytes are never encrypted in + // first place ! + const size_t per_wi_ad_len = data_len / wi_cnt; + + sycl::event evt = q.submit([&](sycl::handler& h) { + // SYCL dependency graph + h.depends_on(evts); + h.parallel_for( + sycl::nd_range<1>{ wi_cnt, wg_size }, [=](sycl::nd_item<1> it) { + const size_t idx = it.get_global_linear_id(); + + const size_t knt_off = idx << 4; + const size_t ct_off = idx * per_wi_ct_len; + const size_t ad_off = idx * per_wi_ad_len; + + acorn::decrypt(key + knt_off, + nonce + knt_off, + tag + knt_off, + enc + ct_off, + per_wi_ct_len, + data + ad_off, + per_wi_ad_len, + text + ct_off); + }); + }); + return evt; +} + +} From 7d79ca2cd4db5db6ef1249d403f56432d3bfda45 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 06:43:41 +0000 Subject: [PATCH 2/9] during decryption, write verification flag at proper memory offset --- include/accel_acorn.hpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/accel_acorn.hpp b/include/accel_acorn.hpp index da9df1d..330b0f0 100644 --- a/include/accel_acorn.hpp +++ b/include/accel_acorn.hpp @@ -177,14 +177,16 @@ decrypt( const size_t ct_off = idx * per_wi_ct_len; const size_t ad_off = idx * per_wi_ad_len; - acorn::decrypt(key + knt_off, - nonce + knt_off, - tag + knt_off, - enc + ct_off, - per_wi_ct_len, - data + ad_off, - per_wi_ad_len, - text + ct_off); + const bool flg = acorn::decrypt(key + knt_off, + nonce + knt_off, + tag + knt_off, + enc + ct_off, + per_wi_ct_len, + data + ad_off, + per_wi_ad_len, + text + ct_off); + + flag[idx] = flg; }); }); return evt; From 7843b7caa24c3c87daaf1c9947cb5dd901a6b4b2 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 06:45:21 +0000 Subject: [PATCH 3/9] test accelerated acorn128 implementation --- include/test_accel_acorn.hpp | 190 +++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 include/test_accel_acorn.hpp diff --git a/include/test_accel_acorn.hpp b/include/test_accel_acorn.hpp new file mode 100644 index 0000000..980710e --- /dev/null +++ b/include/test_accel_acorn.hpp @@ -0,0 +1,190 @@ +#pragma once +#include "accel_acorn.hpp" +#include "utils.hpp" + +// Tests Acorn-128 AEAD implementation, targeting multi-core CPUs, GPGPUs using +// SYCL +namespace test_accel_acorn { + +// Test that accelerated Acorn128 AEAD works as expected on muti-core CPUs & +// GPGPUs, while executing encrypt -> decrypt -> byte-by-byte compare +static inline void +encrypt_decrypt( + sycl::queue& q, // SYCL job submission queue + const size_t per_wi_ct_len, // plain/ cipher text length in bytes + const size_t per_wi_ad_len, // associated data length in bytes + const size_t wi_cnt, // # -of work items to be dispatched + const size_t wg_size // # -of work items to be grouped together +) +{ + const size_t ct_len = wi_cnt * per_wi_ct_len; // alloc memory of bytes + const size_t ad_len = wi_cnt * per_wi_ad_len; // alloc memory of bytes + const size_t knt_len = wi_cnt << 4; // alloc memory of bytes + const size_t flg_len = wi_cnt * sizeof(bool); // alloc memory of bytes + + // plain text on host + uint8_t* txt_h = static_cast(std::malloc(ct_len)); + // encrypted text on host + uint8_t* enc_h = static_cast(std::malloc(ct_len)); + // decrypted text on host + uint8_t* dec_h = static_cast(std::malloc(ct_len)); + // associated data on host + uint8_t* data_h = static_cast(std::malloc(ad_len)); + // secret keys on host + uint8_t* keys_h = static_cast(std::malloc(knt_len)); + // public message nonces on host + uint8_t* nonces_h = static_cast(std::malloc(knt_len)); + // authentication tags on host + uint8_t* tags_h = static_cast(std::malloc(knt_len)); + // boolean verification flags on host + bool* flags_h = static_cast(std::malloc(flg_len)); + + // plain text on accelerator + uint8_t* txt_d = static_cast(sycl::malloc_device(ct_len, q)); + // encrypted text on accelerator + uint8_t* enc_d = static_cast(sycl::malloc_device(ct_len, q)); + // decrypted text on accelerator + uint8_t* dec_d = static_cast(sycl::malloc_device(ct_len, q)); + // associated data on accelerator + uint8_t* data_d = static_cast(sycl::malloc_device(ad_len, q)); + // secret keys on accelerator + uint8_t* keys_d = static_cast(sycl::malloc_device(knt_len, q)); + // public message nonces on accelerator + uint8_t* nonces_d = static_cast(sycl::malloc_device(knt_len, q)); + // authentication tags on accelerator + uint8_t* tags_d = static_cast(sycl::malloc_device(knt_len, q)); + // boolean verification flags on accelerator + bool* flags_d = static_cast(sycl::malloc_device(flg_len, q)); + + // prepare random plain text on host + random_data(txt_h, ct_len); + // prepare random associated data on host + random_data(data_h, ad_len); + // prepare random secret keys on host + random_data(keys_h, knt_len); + // prepare random public message nonces on host + random_data(nonces_h, knt_len); + + // zero out to-be-transferred host memory allocations + memset(enc_h, 0, ct_len); + memset(dec_h, 0, ct_len); + memset(tags_h, 0, knt_len); + memset(flags_h, 0, flg_len); + + // transfer prepared ( on host ) random input bytes to accelerator + sycl::event evt0 = q.memcpy(txt_d, txt_h, ct_len); + sycl::event evt1 = q.memcpy(data_d, data_h, ad_len); + sycl::event evt2 = q.memcpy(keys_d, keys_h, knt_len); + sycl::event evt3 = q.memcpy(nonces_d, nonces_h, knt_len); + + // zero out to-be-computed accelerator memory allocations + sycl::event evt4 = q.memset(enc_d, 0, ct_len); + sycl::event evt5 = q.memset(dec_d, 0, ct_len); + sycl::event evt6 = q.memset(tags_d, 0, knt_len); + sycl::event evt7 = q.memset(flags_d, 0, flg_len); + + std::vector evts0{ evt0, evt1, evt2, evt3, evt4, evt6 }; + + // Acorn-128 authenticated encryption on accelerator + sycl::event evt8 = accel_acorn::encrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + txt_d, + ct_len, + data_d, + ad_len, + enc_d, + ct_len, + tags_d, + knt_len, + wi_cnt, + wg_size, + evts0); + + // Acorn-128 verified decryption on accelerator + sycl::event evt9 = accel_acorn::decrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + tags_d, + knt_len, + enc_d, + ct_len, + data_d, + ad_len, + dec_d, + ct_len, + flags_d, + flg_len, + wi_cnt, + wg_size, + { evt5, evt7, evt8 }); + + // transfer deciphered text back to host + sycl::event evt10 = q.submit([&](sycl::handler& h) { + h.depends_on(evt9); + h.memcpy(dec_h, dec_d, ct_len); + }); + + // transfer verification flags back to host + sycl::event evt11 = q.submit([&](sycl::handler& h) { + h.depends_on(evt9); + h.memcpy(flags_h, flags_d, flg_len); + }); + + // transfer encrypted data bytes back to host + sycl::event evt12 = q.submit([&](sycl::handler& h) { + h.depends_on(evt8); + h.memcpy(enc_h, enc_d, ct_len); + }); + + // transfer authentication tags back to host + sycl::event evt13 = q.submit([&](sycl::handler& h) { + h.depends_on(evt8); + h.memcpy(tags_h, tags_d, knt_len); + }); + + std::vector evts1{ evt10, evt11, evt12, evt13 }; + sycl::event evt14 = q.ext_oneapi_submit_barrier(evts1); + + // host synchronization i.e. blocking call ! + evt14.wait(); + + // test on host that everything worked as expected ! + for (size_t i = 0; i < wi_cnt; i++) { + // to be sure that authentication passed during decryption ! + assert(flags_h[i]); + + // now do a byte-by-byte comparison that decrypted bytes are indeed same as + // original input plain text bytes + const size_t ct_off = i * per_wi_ct_len; + for (size_t j = 0; j < per_wi_ct_len; j++) { + assert(txt_h[ct_off + j] == dec_h[ct_off + j]); + } + } + + // deallocate host memory resources + std::free(txt_h); + std::free(enc_h); + std::free(dec_h); + std::free(data_h); + std::free(keys_h); + std::free(nonces_h); + std::free(tags_h); + std::free(flags_h); + + // deallocate SYCL runtime managed accelerator memory resources + sycl::free(txt_d, q); + sycl::free(enc_d, q); + sycl::free(dec_d, q); + sycl::free(data_d, q); + sycl::free(keys_d, q); + sycl::free(nonces_d, q); + sycl::free(tags_d, q); + sycl::free(flags_d, q); +} + +} From 2e5f9229c11aac9cdba69960d666c79a23060fa3 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 06:45:51 +0000 Subject: [PATCH 4/9] added test runner program; added new recipes in Makefile for easily running tests --- Makefile | 7 +++++++ test/accel_acorn.cpp | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 test/accel_acorn.cpp diff --git a/Makefile b/Makefile index 929fead..d183214 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ CXX = dpcpp CXXFLAGS = -std=c++20 -Wall -Weverything -Wno-c++98-compat -Wno-c++98-c++11-compat-binary-literal -Wno-c++98-compat-pedantic OPTFLAGS = -O3 +SYCLFLAGS = -fsycl IFLAGS = -I ./include # Actually compiled code to be executed on host CPU, to be used only for testing functional correctness @@ -63,3 +64,9 @@ bench/fpga_emu_bench.out: bench/acorn_fpga.cpp include/*.hpp fpga_hw_bench: bench/acorn_fpga.cpp include/*.hpp $(CXX) $(CXXFLAGS) -Wno-padded $(FPGA_HW_FLAGS) $(OPTFLAGS) $(IFLAGS) -reuse-exe=bench/$@.out $< -o bench/$@.out + +accel_test: test/accel_test.out + ./$< + +test/accel_test.out: test/accel_acorn.cpp include/*.hpp + $(CXX) $(CXXFLAGS) $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) $< -o $@ diff --git a/test/accel_acorn.cpp b/test/accel_acorn.cpp new file mode 100644 index 0000000..7a3a4b5 --- /dev/null +++ b/test/accel_acorn.cpp @@ -0,0 +1,35 @@ +#include "test_accel_acorn.hpp" +#include + +int +main() +{ + // total work items to be dispatched during each testing round + constexpr size_t wi_cnt = 1ul << 10; + // # -of work-items to be grouped together + constexpr size_t wg_size = 32ul; + // associated data byte length for each work-item + constexpr size_t dt_len = 32ul; + // plain text byte length for each work-item + constexpr size_t ct_len = 32ul; + + sycl::default_selector s{}; + sycl::device d{ s }; + sycl::context c{ d }; + sycl::queue q{ c, d }; + + std::cout << "running on " << d.get_info() + << std::endl + << std::endl; + + for (size_t i = 0; i < ct_len; i++) { + for (size_t j = 0; j < dt_len; j++) { + test_accel_acorn::encrypt_decrypt(q, i, j, wi_cnt, wg_size); + } + } + + std::cout << "[test] passed accelerated Acorn-128 encrypt/ decrypt !" + << std::endl; + + return EXIT_SUCCESS; +} From d8252ec05f1b30b55460637ddf7f95f5f567310c Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 07:35:38 +0000 Subject: [PATCH 5/9] extend to support benchmarking accelerated Acorn128 on multi-core CPU/ GPU --- include/bench_utils.hpp | 152 ++++++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 45 deletions(-) diff --git a/include/bench_utils.hpp b/include/bench_utils.hpp index 52d743d..63d530f 100644 --- a/include/bench_utils.hpp +++ b/include/bench_utils.hpp @@ -1,4 +1,5 @@ #pragma once +#include "accel_acorn.hpp" #include "acorn_fpga.hpp" #include "utils.hpp" @@ -6,17 +7,22 @@ #define MB 1048576. // 1 << 20 bytes #define KB 1024. // 1 << 10 bytes -// Benchmark Acorn-128 AEAD implementation, targeting FPGA using SYCL/ DPC++ -namespace bench_acorn_fpga { +// Benchmark Acorn-128 AEAD implementation, targeting multi-core CPUs/ GPGPUs/ +// FPGAs using SYCL/ DPC++ +namespace bench_acorn { // Which one to benchmark // // 0) Acorn-128 single work-item encrypt routine on FPGA // 1) Acorn-128 single work-item decrypt routine on FPGA +// 2) Accelerated Acorn-128 encrypt routine on multi-core CPU/ GPGPU +// 3) Accelerated Acorn-128 decrypt routine on multi-core CPU/ GPGPU enum acorn_type { - acorn_encrypt, - acorn_decrypt, + acorn_encrypt_fpga, + acorn_decrypt_fpga, + accel_acorn_encrypt, + accel_acorn_decrypt, }; // Time execution of SYCL command, whose submission resulted into given SYCL @@ -59,10 +65,12 @@ to_readable_bandwidth(const size_t bytes, // bytes } // Executes accelerated Acorn-128 encrypt/ decrypt kernels ( chosen using -// `type` parameter ) on FPGA, on `invk_cnt` -many ( read single work-item SYCL -// FPGA kernel is iterated those many times ) independent input byte slices ( -// plain text/ cipher text/ associated data ), while returning how much time -// spent on following +// `type` parameter ) on multi-core CPU/ GPGPU/ FPGA, on `invk_cnt` -many ( read +// single work-item SYCL FPGA kernel is iterated those many times/ when it's +// multi work-item SYCL kernel these many work-items to be dispatched ) +// independent, non-overlapping & equal-length input byte slices ( plain text/ +// cipher text/ associated data ), while returning how much time spent on +// following // // - host -> device input tx time ( total ) // - kernel execution time @@ -74,11 +82,19 @@ to_readable_bandwidth(const size_t bytes, // bytes // - bytes of data transferred from host -> device // - bytes of data consumed during encryption/ decryption // - bytes of data transferred from device -> host +// +// With values returned from this function, one should be able to compute +// following +// +// - host -> device data transfer bandwidth ( say bytes/ sec ) +// - SYCL kernel data processing bandwidth ( say bytes/ sec ) +// - device -> host data transfer bandwidth ( say bytes/ sec ) static inline void exec_kernel(sycl::queue& q, // SYCL job submission queue const size_t per_invk_ct_len, // bytes const size_t per_invk_dt_len, // bytes - const size_t invk_cnt, // to be invoked these many times + const size_t invk_cnt, // # -of work items to be dispatched + const size_t wg_size, // # -of work-items to be grouped acorn_type type, // which Acorn routine to benchmark uint64_t* const __restrict ts, // time spent on activities size_t* const __restrict io // processed bytes during activities @@ -86,6 +102,14 @@ exec_kernel(sycl::queue& q, // SYCL job submission queue { // SYCL queue must have profiling enabled ! assert(q.has_property()); + if (type == acorn_encrypt_fpga || type == acorn_decrypt_fpga) { + // because when offloading compute job to FPGA, it's single work-item kernel + assert(wg_size == 0); + } else { + // while offloading to multi-core CPU/ GPGPU, it's multi work-item kernel ( + // i.e. SYCL `parallel_for` ) + assert(wg_size > 0); + } const size_t ct_len = invk_cnt * per_invk_ct_len; // alloc memory of bytes const size_t dt_len = invk_cnt * per_invk_dt_len; // alloc memory of bytes @@ -156,40 +180,78 @@ exec_kernel(sycl::queue& q, // SYCL job submission queue std::vector evts0{ evt0, evt1, evt2, evt3, evt4, evt6 }; // Acorn-128 authenticated encryption on accelerator - sycl::event evt8 = acorn_fpga::encrypt(q, - keys_d, - knt_len, - nonces_d, - knt_len, - txt_d, - ct_len, - data_d, - dt_len, - enc_d, - ct_len, - tags_d, - knt_len, - invk_cnt, - evts0); + sycl::event evt8 = type == acorn_encrypt_fpga || type == acorn_decrypt_fpga + ? acorn_fpga::encrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + txt_d, + ct_len, + data_d, + dt_len, + enc_d, + ct_len, + tags_d, + knt_len, + invk_cnt, + evts0) + : accel_acorn::encrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + txt_d, + ct_len, + data_d, + dt_len, + enc_d, + ct_len, + tags_d, + knt_len, + invk_cnt, + wg_size, + evts0); + + std::vector evts1{ evt5, evt7, evt8 }; // Acorn-128 verified decryption on accelerator - sycl::event evt9 = acorn_fpga::decrypt(q, - keys_d, - knt_len, - nonces_d, - knt_len, - tags_d, - knt_len, - enc_d, - ct_len, - data_d, - dt_len, - dec_d, - ct_len, - flags_d, - flg_len, - invk_cnt, - { evt5, evt7, evt8 }); + sycl::event evt9 = type == acorn_encrypt_fpga || type == acorn_decrypt_fpga + ? acorn_fpga::decrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + tags_d, + knt_len, + enc_d, + ct_len, + data_d, + dt_len, + dec_d, + ct_len, + flags_d, + flg_len, + invk_cnt, + evts1) + : accel_acorn::decrypt(q, + keys_d, + knt_len, + nonces_d, + knt_len, + tags_d, + knt_len, + enc_d, + ct_len, + data_d, + dt_len, + dec_d, + ct_len, + flags_d, + flg_len, + invk_cnt, + wg_size, + evts1); // transfer deciphered text back to host sycl::event evt10 = q.submit([&](sycl::handler& h) { @@ -215,8 +277,8 @@ exec_kernel(sycl::queue& q, // SYCL job submission queue h.memcpy(tags_h, tags_d, knt_len); }); - std::vector evts1{ evt10, evt11, evt12, evt13 }; - sycl::event evt14 = q.ext_oneapi_submit_barrier(evts1); + std::vector evts2{ evt10, evt11, evt12, evt13 }; + sycl::event evt14 = q.ext_oneapi_submit_barrier(evts2); // host synchronization i.e. blocking call ! evt14.wait(); @@ -231,7 +293,7 @@ exec_kernel(sycl::queue& q, // SYCL job submission queue } } - if (type == acorn_encrypt) { + if (type == acorn_encrypt_fpga || type == accel_acorn_encrypt) { const uint64_t t0 = time_event(evt0) + time_event(evt1); const uint64_t t1 = time_event(evt2) + time_event(evt3); @@ -242,7 +304,7 @@ exec_kernel(sycl::queue& q, // SYCL job submission queue io[0] = ct_len + dt_len + 2 * knt_len; io[1] = ct_len + dt_len; io[2] = ct_len + knt_len; - } else if (type == acorn_decrypt) { + } else if (type == acorn_decrypt_fpga || type == accel_acorn_decrypt) { const uint64_t t0 = time_event(evt0) + time_event(evt1); const uint64_t t1 = time_event(evt2) + time_event(evt3) * 2; From e35749b8ff85cb0001fe394b7bc09631650f1952 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 07:36:45 +0000 Subject: [PATCH 6/9] benchmark runners added/ updated --- bench/accel_acorn.cpp | 130 ++++++++++++++++++++++++++++++++++++++++++ bench/acorn_fpga.cpp | 42 +++++++------- 2 files changed, 152 insertions(+), 20 deletions(-) create mode 100644 bench/accel_acorn.cpp diff --git a/bench/accel_acorn.cpp b/bench/accel_acorn.cpp new file mode 100644 index 0000000..151f89f --- /dev/null +++ b/bench/accel_acorn.cpp @@ -0,0 +1,130 @@ +#include "bench_utils.hpp" +#include "table.hpp" +#include + +int +main() +{ + // associated data byte length, same for all cases + constexpr size_t dt_len = 32ul; + // min # -of work-items to be dispatched + constexpr size_t min_wi_cnt = 1ul << 16; + // max # -of work-items to be dispatched + constexpr size_t max_wi_cnt = 1ul << 18; + // # -of work-items to be grouped during execution + // + // @note, consider taking better decision about appropriate work-group size + // for certain kernel at runtime based on SYCL runtime heuristics + constexpr size_t wg_size = 32ul; + constexpr size_t min_ct_len = 64ul; // bytes + constexpr size_t max_ct_len = 4096ul; // bytes + +#if defined SYCL_TARGET_CPU + sycl::cpu_selector s{}; +#pragma message("Selecting default CPU accelerator !") +#elif defined SYCL_TARGET_GPU + sycl::gpu_selector s{}; +#pragma message("Selecting default GPU accelerator !") +#else + sycl::default_selector s{}; +#pragma message("Selecting default SYCL accelerator !") +#endif + + sycl::device d{ s }; + sycl::context c{ d }; + sycl::queue q{ c, d, sycl::property::queue::enable_profiling{} }; + + std::cout << "running on " << d.get_info() + << std::endl + << std::endl; + + uint64_t* ts = static_cast(std::malloc(sizeof(uint64_t) * 3)); + size_t* io = static_cast(std::malloc(sizeof(size_t) * 3)); + + std::cout << "Benchmarking Acorn-128 encrypt" << std::endl << std::endl; + + TextTable t0('-', '|', '+'); + + t0.add("invocation count"); + t0.add("plain text len ( bytes )"); + t0.add("associated data len ( bytes )"); + t0.add("host-to-device b/w"); + t0.add("kernel b/w"); + t0.add("device-to-host b/w"); + t0.endOfRow(); + + for (size_t invk = min_wi_cnt; invk <= max_wi_cnt; invk <<= 1) { + for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { + bench_acorn::exec_kernel(q, + ct_len, + dt_len, + invk, + wg_size, + bench_acorn::acorn_type::accel_acorn_encrypt, + ts, + io); + + t0.add(std::to_string(invk)); + t0.add(std::to_string(ct_len)); + t0.add(std::to_string(dt_len)); + t0.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); + t0.add(bench_acorn::to_readable_bandwidth(io[1], ts[1])); + t0.add(bench_acorn::to_readable_bandwidth(io[2], ts[2])); + t0.endOfRow(); + } + } + + t0.setAlignment(1, TextTable::Alignment::RIGHT); + t0.setAlignment(2, TextTable::Alignment::RIGHT); + t0.setAlignment(3, TextTable::Alignment::RIGHT); + t0.setAlignment(4, TextTable::Alignment::RIGHT); + t0.setAlignment(5, TextTable::Alignment::RIGHT); + std::cout << t0; + + std::cout << std::endl + << "Benchmarking Acorn-128 decrypt" << std::endl + << std::endl; + + TextTable t1('-', '|', '+'); + + t1.add("invocation count"); + t1.add("cipher text len ( bytes )"); + t1.add("associated data len ( bytes )"); + t1.add("host-to-device b/w"); + t1.add("kernel b/w"); + t1.add("device-to-host b/w"); + t1.endOfRow(); + + for (size_t invk = min_wi_cnt; invk <= max_wi_cnt; invk <<= 1) { + for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { + bench_acorn::exec_kernel(q, + ct_len, + dt_len, + invk, + wg_size, + bench_acorn::acorn_type::accel_acorn_decrypt, + ts, + io); + + t1.add(std::to_string(invk)); + t1.add(std::to_string(ct_len)); + t1.add(std::to_string(dt_len)); + t1.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); + t1.add(bench_acorn::to_readable_bandwidth(io[1], ts[1])); + t1.add(bench_acorn::to_readable_bandwidth(io[2], ts[2])); + t1.endOfRow(); + } + } + + t1.setAlignment(1, TextTable::Alignment::RIGHT); + t1.setAlignment(2, TextTable::Alignment::RIGHT); + t1.setAlignment(3, TextTable::Alignment::RIGHT); + t1.setAlignment(4, TextTable::Alignment::RIGHT); + t1.setAlignment(5, TextTable::Alignment::RIGHT); + std::cout << t1; + + std::free(ts); + std::free(io); + + return EXIT_SUCCESS; +} diff --git a/bench/acorn_fpga.cpp b/bench/acorn_fpga.cpp index 7e7ee85..4a6403f 100644 --- a/bench/acorn_fpga.cpp +++ b/bench/acorn_fpga.cpp @@ -49,20 +49,21 @@ main() for (size_t invk = min_invk_cnt; invk <= max_invk_cnt; invk <<= 1) { for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { - bench_acorn_fpga::exec_kernel(q, - ct_len, - dt_len, - invk, - bench_acorn_fpga::acorn_type::acorn_encrypt, - ts, - io); + bench_acorn::exec_kernel(q, + ct_len, + dt_len, + invk, + 0, + bench_acorn::acorn_type::acorn_encrypt_fpga, + ts, + io); t0.add(std::to_string(invk)); t0.add(std::to_string(ct_len)); t0.add(std::to_string(dt_len)); - t0.add(bench_acorn_fpga::to_readable_bandwidth(io[0], ts[0])); - t0.add(bench_acorn_fpga::to_readable_bandwidth(io[1], ts[1])); - t0.add(bench_acorn_fpga::to_readable_bandwidth(io[2], ts[2])); + t0.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); + t0.add(bench_acorn::to_readable_bandwidth(io[1], ts[1])); + t0.add(bench_acorn::to_readable_bandwidth(io[2], ts[2])); t0.endOfRow(); } } @@ -90,20 +91,21 @@ main() for (size_t invk = min_invk_cnt; invk <= max_invk_cnt; invk <<= 1) { for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { - bench_acorn_fpga::exec_kernel(q, - ct_len, - dt_len, - invk, - bench_acorn_fpga::acorn_type::acorn_decrypt, - ts, - io); + bench_acorn::exec_kernel(q, + ct_len, + dt_len, + invk, + 0, + bench_acorn::acorn_type::acorn_decrypt_fpga, + ts, + io); t1.add(std::to_string(invk)); t1.add(std::to_string(ct_len)); t1.add(std::to_string(dt_len)); - t1.add(bench_acorn_fpga::to_readable_bandwidth(io[0], ts[0])); - t1.add(bench_acorn_fpga::to_readable_bandwidth(io[1], ts[1])); - t1.add(bench_acorn_fpga::to_readable_bandwidth(io[2], ts[2])); + t1.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); + t1.add(bench_acorn::to_readable_bandwidth(io[1], ts[1])); + t1.add(bench_acorn::to_readable_bandwidth(io[2], ts[2])); t1.endOfRow(); } } From 55ddfada9903d16e906b759ba826cecfbb619f4a Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 07:37:20 +0000 Subject: [PATCH 7/9] added new Make recipes to easily build & execute benchmark on accelerated Acorn128 --- Makefile | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Makefile b/Makefile index d183214..997376e 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ CXX = dpcpp CXXFLAGS = -std=c++20 -Wall -Weverything -Wno-c++98-compat -Wno-c++98-c++11-compat-binary-literal -Wno-c++98-compat-pedantic OPTFLAGS = -O3 SYCLFLAGS = -fsycl +SYCLCUDAFLAGS = -fsycl-targets=nvptx64-nvidia-cuda IFLAGS = -I ./include # Actually compiled code to be executed on host CPU, to be used only for testing functional correctness @@ -70,3 +71,32 @@ accel_test: test/accel_test.out test/accel_test.out: test/accel_acorn.cpp include/*.hpp $(CXX) $(CXXFLAGS) $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) $< -o $@ + +aot_cpu: + @if lscpu | grep -q 'avx512'; then \ + echo "Using avx512"; \ + $(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx512" bench/accel_acorn.cpp -o bench/a.out; \ + elif lscpu | grep -q 'avx2'; then \ + echo "Using avx2"; \ + $(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx2" bench/accel_acorn.cpp -o bench/a.out; \ + elif lscpu | grep -q 'avx'; then \ + echo "Using avx"; \ + $(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=avx" bench/accel_acorn.cpp -o bench/a.out; \ + elif lscpu | grep -q 'sse4.2'; then \ + echo "Using sse4.2"; \ + $(CXX) -std=c++20 -Wall -DSYCL_TARGET_CPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_x86_64 -Xs "-march=sse4.2" bench/accel_acorn.cpp -o bench/a.out; \ + else \ + echo "Can't AOT compile using avx, avx2, avx512 or sse4.2"; \ + fi + ./bench/a.out + +aot_gpu: + # you may want to replace `device` identifier with `0x3e96` if you're targeting *Intel(R) UHD Graphics P630* + # + # otherwise, let it be what it's if you're targeting *Intel(R) Iris(R) Xe MAX Graphics* + $(CXX) -std=c++20 -Wall -DSYCL_TARGET_GPU $(SYCLFLAGS) $(OPTFLAGS) $(IFLAGS) -fsycl-targets=spir64_gen -Xs "-device 0x4905" bench/accel_acorn.cpp -o bench/a.out + ./bench/a.out + +cuda: + clang++ -std=c++20 -Wall -DSYCL_TARGET_GPU $(SYCLFLAGS) $(SYCLCUDAFLAGS) $(OPTFLAGS) $(IFLAGS) bench/accel_acorn.cpp -o bench/a.out + ./bench/a.out From 2992f356e1df586f48e38bf185abb56dbacfe8d7 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 13:17:06 +0000 Subject: [PATCH 8/9] rename variable --- bench/accel_acorn.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bench/accel_acorn.cpp b/bench/accel_acorn.cpp index 151f89f..fba60f9 100644 --- a/bench/accel_acorn.cpp +++ b/bench/accel_acorn.cpp @@ -53,18 +53,18 @@ main() t0.add("device-to-host b/w"); t0.endOfRow(); - for (size_t invk = min_wi_cnt; invk <= max_wi_cnt; invk <<= 1) { + for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) { for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { bench_acorn::exec_kernel(q, ct_len, dt_len, - invk, + wi, wg_size, bench_acorn::acorn_type::accel_acorn_encrypt, ts, io); - t0.add(std::to_string(invk)); + t0.add(std::to_string(wi)); t0.add(std::to_string(ct_len)); t0.add(std::to_string(dt_len)); t0.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); @@ -95,18 +95,18 @@ main() t1.add("device-to-host b/w"); t1.endOfRow(); - for (size_t invk = min_wi_cnt; invk <= max_wi_cnt; invk <<= 1) { + for (size_t wi = min_wi_cnt; wi <= max_wi_cnt; wi <<= 1) { for (size_t ct_len = min_ct_len; ct_len <= max_ct_len; ct_len <<= 1) { bench_acorn::exec_kernel(q, ct_len, dt_len, - invk, + wi, wg_size, bench_acorn::acorn_type::accel_acorn_decrypt, ts, io); - t1.add(std::to_string(invk)); + t1.add(std::to_string(wi)); t1.add(std::to_string(ct_len)); t1.add(std::to_string(dt_len)); t1.add(bench_acorn::to_readable_bandwidth(io[0], ts[0])); From 6db52448cf395c4dfb45fb638f9e328b2d6c18fb Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 21 Apr 2022 13:17:20 +0000 Subject: [PATCH 9/9] added benchmark results on multi-core CPUs, GPGPUs --- results/cpu.md | 249 +++++++++++++++++++++++++++++++++++++++++++++++++ results/gpu.md | 234 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 483 insertions(+) create mode 100644 results/cpu.md create mode 100644 results/gpu.md diff --git a/results/cpu.md b/results/cpu.md new file mode 100644 index 0000000..1d95839 --- /dev/null +++ b/results/cpu.md @@ -0,0 +1,249 @@ +# Benchmarking Acorn-128 encrypt/ decrypt kernel on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz + +```bash +make aot_cpu +``` + +Execute above command to generate AOT compiled Acorn-128 encrypt/ decrypt SYCL kernels, to be offloaded to following multi-core CPU. + +```bash +$ lscpu | grep -i cpu\(s\) + +CPU(s): 12 +On-line CPU(s) list: 0-11 +NUMA node0 CPU(s): 0-11 +``` + +## Acorn-128 Authenticated Encryption + +```bash +running on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz + +Benchmarking Acorn-128 encrypt + ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|invocation count|plain text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 64| 32| 1.744760 GB/ s| 722.502286 MB/ s| 2.360613 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 128| 32| 2.633632 GB/ s| 986.853435 MB/ s| 1.047313 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 256| 32| 3.334411 GB/ s| 1.260552 GB/ s| 2.697129 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 512| 32| 5.723309 GB/ s| 1.528530 GB/ s| 3.274888 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 1024| 32| 4.228275 GB/ s| 1.833048 GB/ s| 3.428119 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 2048| 32| 4.967789 GB/ s| 1.170243 GB/ s| 6.304786 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 4096| 32| 5.444293 GB/ s| 1.850683 GB/ s| 3.801584 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 64| 32| 2.328998 GB/ s| 829.789785 MB/ s| 1.632007 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 128| 32| 3.129148 GB/ s| 1.059678 GB/ s| 6.393036 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 256| 32| 4.202825 GB/ s| 1.419446 GB/ s| 5.882276 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 512| 32| 4.398933 GB/ s| 1.600087 GB/ s| 3.600971 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 1024| 32| 4.230430 GB/ s|1006.882722 MB/ s| 6.644130 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 2048| 32| 5.564777 GB/ s| 1.251939 GB/ s| 2.675761 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 4096| 32| 5.536258 GB/ s| 1.698288 GB/ s| 3.084840 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 64| 32| 2.122314 GB/ s| 302.407016 MB/ s| 6.049733 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 128| 32| 2.475158 GB/ s| 1.098923 GB/ s| 5.742393 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 256| 32| 3.441199 GB/ s| 1.364658 GB/ s| 3.915572 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 512| 32| 4.593164 GB/ s| 1.437058 GB/ s| 2.949942 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 1024| 32| 5.000044 GB/ s| 1.861638 GB/ s| 3.055511 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 2048| 32| 4.760129 GB/ s| 1.973006 GB/ s| 4.455322 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 4096| 32| 5.896175 GB/ s| 1.773362 GB/ s| 3.827487 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +``` + +## Acorn-128 Verified Decryption + +```bash +running on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz + +Benchmarking Acorn-128 decrypt + ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|invocation count|cipher text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 64| 32| 3.018709 GB/ s|823.645885 MB/ s| 9.128086 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 128| 32| 1.737919 GB/ s| 1.052830 GB/ s| 9.152198 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 256| 32| 4.208722 GB/ s| 1.397641 GB/ s| 9.614522 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 512| 32| 7.873316 GB/ s| 1.512515 GB/ s| 9.699479 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 1024| 32| 3.658948 GB/ s| 1.612205 GB/ s| 9.979726 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 2048| 32| 6.345428 GB/ s| 1.796593 GB/ s| 9.962872 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 4096| 32| 4.540194 GB/ s| 1.691370 GB/ s| 9.987377 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 64| 32| 4.329870 GB/ s|860.562363 MB/ s| 8.719739 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 128| 32| 2.701259 GB/ s| 1.064063 GB/ s| 7.538714 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 256| 32| 4.541140 GB/ s| 1.388114 GB/ s| 9.553401 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 512| 32| 5.474911 GB/ s|626.457850 MB/ s| 9.933608 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 1024| 32| 4.188122 GB/ s| 1.780403 GB/ s| 9.965354 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 2048| 32| 5.049153 GB/ s| 1.714761 GB/ s| 10.009037 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 4096| 32| 2.973463 GB/ s| 1.422591 GB/ s| 12.218516 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 64| 32| 1.928450 GB/ s|821.189340 MB/ s| 8.818462 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 128| 32| 2.690804 GB/ s|388.099130 MB/ s| 9.969690 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 256| 32| 3.217995 GB/ s| 1.359359 GB/ s| 9.485938 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 512| 32| 4.368475 GB/ s|938.351273 MB/ s| 9.962083 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 1024| 32| 2.650337 GB/ s| 1.726032 GB/ s| 8.857380 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 2048| 32| 5.019398 GB/ s| 1.845092 GB/ s| 12.214536 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 4096| 32| 5.127787 GB/ s| 1.597458 GB/ s| 11.988787 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +``` + +--- + +# Benchmarking Acorn-128 encrypt/ decrypt kernel on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + +```bash +make aot_cpu +``` + +Execute above command to generate AOT compiled Acorn-128 encrypt/ decrypt SYCL kernels, to be offloaded to following multi-core CPU. + +```bash +$ lscpu | grep -i cpu\(s\) + +CPU(s): 4 +On-line CPU(s) list: 0-3 +NUMA node0 CPU(s): 0-3 +``` + +## Acorn-128 Authenticated Encryption + +```bash +running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + +Benchmarking Acorn-128 encrypt + ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|invocation count|plain text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 64| 32| 916.728403 MB/ s|180.077504 MB/ s| 4.023965 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 128| 32| 1.346736 GB/ s|247.083574 MB/ s| 3.263991 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 256| 32| 1010.898446 MB/ s|308.438418 MB/ s| 2.750420 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 512| 32| 1.046790 GB/ s|365.556973 MB/ s| 2.541908 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 1024| 32| 1.059440 GB/ s|408.143916 MB/ s| 3.266465 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 2048| 32| 1.082405 GB/ s|355.643484 MB/ s| 1.092773 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 4096| 32| 1013.483577 MB/ s|323.612539 MB/ s| 5.622608 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 64| 32| 1.283246 GB/ s|181.643105 MB/ s| 3.629014 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 128| 32| 1.068575 GB/ s|240.896068 MB/ s| 3.365300 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 256| 32| 1008.052474 MB/ s|308.848677 MB/ s| 2.858016 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 512| 32| 1.478106 GB/ s|365.628546 MB/ s| 2.963070 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 1024| 32| 1.783158 GB/ s|408.139537 MB/ s| 2.991937 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 2048| 32| 1.773467 GB/ s|436.102593 MB/ s| 3.416502 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 4096| 32| 1.074002 GB/ s|392.885497 MB/ s| 3.399872 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 64| 32| 1.104651 GB/ s|186.289332 MB/ s| 3.290440 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 128| 32| 998.124088 MB/ s|246.606695 MB/ s| 2.886964 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 256| 32| 1.623997 GB/ s|309.586456 MB/ s| 915.547218 MB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 512| 32| 1.056367 GB/ s|367.621737 MB/ s| 1.056735 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 1024| 32| 1.571051 GB/ s|408.355499 MB/ s| 3.370581 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 2048| 32| 1.076246 GB/ s|436.155603 MB/ s| 3.401738 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 4096| 32| 1.115083 GB/ s|389.433352 MB/ s| 3.496724 GB/ s| ++----------------+------------------------+-----------------------------+------------------+----------------+------------------+ +``` + +## Acorn-128 Verified Decryption + +```bash +running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz + +Benchmarking Acorn-128 decrypt + ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|invocation count|cipher text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 64| 32| 1.131957 GB/ s|187.409393 MB/ s| 7.702061 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 128| 32| 922.244109 MB/ s|245.410258 MB/ s| 9.014356 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 256| 32| 1.326662 GB/ s|302.532591 MB/ s| 8.105435 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 512| 32| 1.034770 GB/ s|347.279499 MB/ s| 5.789368 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 1024| 32| 1.817787 GB/ s|393.065465 MB/ s| 6.966629 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 2048| 32| 1.688513 GB/ s|410.610579 MB/ s| 6.163327 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|65536 | 4096| 32| 1.702856 GB/ s|372.969528 MB/ s| 6.605669 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 64| 32| 1.021246 GB/ s|182.369482 MB/ s| 7.330853 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 128| 32| 1.013463 GB/ s|238.289004 MB/ s| 7.556977 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 256| 32| 1003.553871 MB/ s|294.023002 MB/ s| 4.754144 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 512| 32| 1.076742 GB/ s|352.866340 MB/ s| 6.854882 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 1024| 32| 1.681887 GB/ s|399.748029 MB/ s| 5.645492 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 2048| 32| 2.024216 GB/ s|410.871669 MB/ s| 7.459973 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|131072 | 4096| 32| 1.087032 GB/ s|372.045419 MB/ s| 7.463435 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 64| 32| 1013.398394 MB/ s|181.728751 MB/ s| 6.597591 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 128| 32| 1.294747 GB/ s|238.857130 MB/ s| 5.717015 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 256| 32| 1.271128 GB/ s|300.135781 MB/ s| 6.122214 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 512| 32| 1.712555 GB/ s|355.997644 MB/ s| 6.987948 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 1024| 32| 1.045637 GB/ s|390.171557 MB/ s| 7.421163 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 2048| 32| 1.781212 GB/ s|419.096479 MB/ s| 7.310198 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +|262144 | 4096| 32| 1.070717 GB/ s|370.164649 MB/ s| 6.107808 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+----------------+------------------+ +``` diff --git a/results/gpu.md b/results/gpu.md new file mode 100644 index 0000000..df2df1f --- /dev/null +++ b/results/gpu.md @@ -0,0 +1,234 @@ +# Benchmarking Acorn-128 encrypt/ decrypt kernel on Tesla V100-SXM2-16GB + +Run following command to generate binary which can offload Acorn-128 encrypt/ decrypt kernel to CUDA backend. + +```bash +make cuda +``` + +## Acorn-128 Authenticated Encryption + +```bash +running on Tesla V100-SXM2-16GB + +Benchmarking Acorn-128 encrypt + ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|invocation count|plain text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 64| 32| 8.743020 GB/ s|30.436256 GB/ s| 5.138683 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 128| 32| 8.576507 GB/ s|23.316831 GB/ s| 5.704664 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 256| 32| 7.384249 GB/ s|21.218600 GB/ s| 7.096271 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 512| 32| 7.221481 GB/ s|12.480787 GB/ s| 10.075568 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 1024| 32| 7.374272 GB/ s| 4.503621 GB/ s| 10.584596 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 2048| 32| 7.566963 GB/ s| 4.147789 GB/ s| 10.728766 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 4096| 32| 7.681173 GB/ s| 3.640237 GB/ s| 10.866118 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 64| 32| 8.184142 GB/ s|37.914696 GB/ s| 5.829204 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 128| 32| 7.326007 GB/ s|26.455035 GB/ s| 7.421151 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 256| 32| 7.188427 GB/ s|14.402880 GB/ s| 9.828010 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 512| 32| 7.271626 GB/ s| 7.958801 GB/ s| 10.546502 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 1024| 32| 7.519421 GB/ s| 3.355577 GB/ s| 10.726959 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 2048| 32| 7.677990 GB/ s| 3.019218 GB/ s| 10.865445 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 4096| 32| 7.754019 GB/ s| 2.889801 GB/ s| 10.923620 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 64| 32| 7.200721 GB/ s|40.955605 GB/ s| 8.090613 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 128| 32| 7.278241 GB/ s|28.228653 GB/ s| 10.106678 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 256| 32| 7.410152 GB/ s|18.633538 GB/ s| 10.552450 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 512| 32| 7.573367 GB/ s| 8.029283 GB/ s| 10.747435 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 1024| 32| 7.693613 GB/ s| 2.567194 GB/ s| 10.850513 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 2048| 32| 7.752849 GB/ s| 2.356481 GB/ s| 10.896190 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 4096| 32| 7.546294 GB/ s| 2.149910 GB/ s| 10.923620 GB/ s| ++----------------+------------------------+-----------------------------+------------------+---------------+------------------+ + +``` + +## Acorn-128 Verified Decryption + +```bash +running on Tesla V100-SXM2-16GB + +Benchmarking Acorn-128 decrypt + ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|invocation count|cipher text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 64| 32| 1.705837 GB/ s|35.714177 GB/ s| 5.289714 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 128| 32| 9.129220 GB/ s|27.472508 GB/ s| 4.940260 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 256| 32| 7.597684 GB/ s|26.315780 GB/ s| 6.947447 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 512| 32| 7.198445 GB/ s|16.732283 GB/ s| 8.857045 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 1024| 32| 7.384417 GB/ s| 4.400000 GB/ s| 10.698481 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 2048| 32| 7.549954 GB/ s| 3.982843 GB/ s| 10.827063 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|65536 | 4096| 32| 7.680085 GB/ s| 3.794787 GB/ s| 10.890715 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 64| 32| 8.620688 GB/ s|39.999966 GB/ s| 4.978554 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 128| 32| 7.295171 GB/ s|27.472547 GB/ s| 7.059982 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 256| 32| 7.226426 GB/ s|22.556387 GB/ s| 9.316994 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 512| 32| 7.388180 GB/ s| 8.952081 GB/ s| 10.637858 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 1024| 32| 7.562472 GB/ s| 3.467115 GB/ s| 10.795837 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 2048| 32| 7.693197 GB/ s| 3.207026 GB/ s| 10.897081 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|131072 | 4096| 32| 7.757698 GB/ s| 3.035437 GB/ s| 10.906487 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 64| 32| 7.246376 GB/ s|42.857143 GB/ s| 7.827552 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 128| 32| 7.311584 GB/ s|28.409091 GB/ s| 10.027985 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 256| 32| 7.399577 GB/ s|25.139665 GB/ s| 10.567434 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 512| 32| 7.568011 GB/ s|12.018380 GB/ s| 10.817308 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 1024| 32| 7.687166 GB/ s| 2.760351 GB/ s| 10.872793 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 2048| 32| 7.760079 GB/ s| 2.426188 GB/ s| 10.904504 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +|262144 | 4096| 32| 7.800825 GB/ s| 2.244298 GB/ s| 10.939102 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+---------------+------------------+ +``` + +--- + +# Benchmarking Acorn-128 encrypt/ decrypt kernel on Intel(R) UHD Graphics P630 [0x3e96] + +Run following command to generate binary ( along with AOT compiled SYCL kernels ) which can offload Acorn-128 encrypt/ decrypt kernel to Intel Integrated Graphics. + +```bash +make aot_gpu +``` + +## Acorn-128 Authenticated Encryption + +```bash +running on Intel(R) UHD Graphics P630 [0x3e96] + +Benchmarking Acorn-128 encrypt + ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|invocation count|plain text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 64| 32| 9.798720 GB/ s| 482.062688 MB/ s| 19.888122 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 128| 32| 8.658761 GB/ s| 703.026840 MB/ s| 13.984723 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 256| 32| 6.150451 GB/ s|1001.142415 MB/ s| 15.970842 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 512| 32| 9.462698 GB/ s| 1.188586 GB/ s| 16.086834 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 1024| 32| 9.517681 GB/ s| 1.233725 GB/ s| 16.433406 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 2048| 32| 9.618308 GB/ s| 901.265967 MB/ s| 16.204624 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 4096| 32| 9.668983 GB/ s| 639.348362 MB/ s| 16.342644 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 64| 32| 10.093995 GB/ s| 487.850212 MB/ s| 12.398117 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 128| 32| 9.140271 GB/ s| 710.284025 MB/ s| 13.927702 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 256| 32| 9.284891 GB/ s| 1.018523 GB/ s| 16.161831 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 512| 32| 9.479852 GB/ s| 1.285675 GB/ s| 16.376559 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 1024| 32| 9.630179 GB/ s| 1.388897 GB/ s| 16.528590 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 2048| 32| 9.794456 GB/ s| 1.030221 GB/ s| 16.378262 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 4096| 32| 9.843429 GB/ s| 679.699062 MB/ s| 16.525109 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 64| 32| 10.000691 GB/ s| 492.528567 MB/ s| 16.204122 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 128| 32| 9.360544 GB/ s| 716.815124 MB/ s| 16.176645 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 256| 32| 9.510033 GB/ s| 1.029032 GB/ s| 16.398346 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 512| 32| 9.643670 GB/ s| 1.355006 GB/ s| 15.000457 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 1024| 32| 9.776031 GB/ s| 1.532205 GB/ s| 16.547544 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 2048| 32| 9.818233 GB/ s|1006.160282 MB/ s| 16.512075 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 4096| 32| 9.855083 GB/ s| 801.346873 MB/ s| 16.469396 GB/ s| ++----------------+------------------------+-----------------------------+------------------+-----------------+------------------+ +``` + +## Acorn-128 Verified Decryption + +```bash +running on Intel(R) UHD Graphics P630 [0x3e96] + +Benchmarking Acorn-128 decrypt + ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|invocation count|cipher text len ( bytes )|associated data len ( bytes )|host-to-device b/w| kernel b/w|device-to-host b/w| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 64| 32| 10.789925 GB/ s| 485.233770 MB/ s| 24.613088 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 128| 32| 10.763939 GB/ s| 705.686919 MB/ s| 16.595848 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 256| 32| 9.335830 GB/ s|1014.556176 MB/ s| 14.235340 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 512| 32| 9.158779 GB/ s| 1.105730 GB/ s| 14.962771 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 1024| 32| 9.501886 GB/ s| 990.477043 MB/ s| 15.660006 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 2048| 32| 9.503080 GB/ s| 687.493278 MB/ s| 16.006198 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|65536 | 4096| 32| 9.698753 GB/ s| 568.693186 MB/ s| 16.332814 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 64| 32| 10.049570 GB/ s| 488.949019 MB/ s| 14.752659 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 128| 32| 9.303867 GB/ s| 716.650970 MB/ s| 14.290731 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 256| 32| 9.143061 GB/ s| 1.009075 GB/ s| 15.153013 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 512| 32| 6.387896 GB/ s| 1.215706 GB/ s| 15.714463 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 1024| 32| 9.659371 GB/ s| 1.052234 GB/ s| 16.175184 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 2048| 32| 9.721819 GB/ s| 758.871043 MB/ s| 16.305862 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|131072 | 4096| 32| 9.783073 GB/ s| 595.716943 MB/ s| 16.432319 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 64| 32| 9.677160 GB/ s| 493.869475 MB/ s| 14.563868 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 128| 32| 9.487676 GB/ s| 720.874559 MB/ s| 15.433477 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 256| 32| 9.512303 GB/ s| 1.011941 GB/ s| 15.830824 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 512| 32| 6.518066 GB/ s| 1.097036 GB/ s| 16.204527 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 1024| 32| 9.788530 GB/ s| 1.559364 GB/ s| 16.341762 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 2048| 32| 9.829264 GB/ s| 735.993784 MB/ s| 16.462639 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +|262144 | 4096| 32| 9.869760 GB/ s| 691.507268 MB/ s| 16.506594 GB/ s| ++----------------+-------------------------+-----------------------------+------------------+-----------------+------------------+ +```