Skip to content

Commit 4fb2e8e

Browse files
committed
Rewrite some of the x86 backend implementations to broadcast
16-bit constants instead of loading them from memory. This significantly simplifies the formal verification effort. Signed-off-by: Dusan Kostic <dkostic@protonmail.com>
1 parent 18b4bd7 commit 4fb2e8e

31 files changed

+313
-697
lines changed

dev/x86_64/meta.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
8484
return MLK_NATIVE_FUNC_FALLBACK;
8585
}
8686

87-
mlk_reduce_avx2(data, mlk_qdata);
87+
mlk_reduce_avx2(data);
8888
return MLK_NATIVE_FUNC_SUCCESS;
8989
}
9090

@@ -121,7 +121,7 @@ static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
121121
return MLK_NATIVE_FUNC_FALLBACK;
122122
}
123123

124-
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache, mlk_qdata);
124+
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
125125
return MLK_NATIVE_FUNC_SUCCESS;
126126
}
127127
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -136,7 +136,7 @@ static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
136136
return MLK_NATIVE_FUNC_FALLBACK;
137137
}
138138

139-
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache, mlk_qdata);
139+
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
140140
return MLK_NATIVE_FUNC_SUCCESS;
141141
}
142142
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -151,7 +151,7 @@ static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
151151
return MLK_NATIVE_FUNC_FALLBACK;
152152
}
153153

154-
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache, mlk_qdata);
154+
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
155155
return MLK_NATIVE_FUNC_SUCCESS;
156156
}
157157
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
@@ -164,7 +164,7 @@ static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
164164
return MLK_NATIVE_FUNC_FALLBACK;
165165
}
166166

167-
mlk_ntttobytes_avx2(r, a, mlk_qdata);
167+
mlk_ntttobytes_avx2(r, a);
168168
return MLK_NATIVE_FUNC_SUCCESS;
169169
}
170170

@@ -176,7 +176,7 @@ static MLK_INLINE int mlk_poly_frombytes_native(
176176
return MLK_NATIVE_FUNC_FALLBACK;
177177
}
178178

179-
mlk_nttfrombytes_avx2(r, a, mlk_qdata);
179+
mlk_nttfrombytes_avx2(r, a);
180180
return MLK_NATIVE_FUNC_SUCCESS;
181181
}
182182

dev/x86_64/src/arith_native_x86_64.h

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ void mlk_invntt_avx2(int16_t *r, const int16_t *mlk_qdata);
3131
void mlk_nttunpack_avx2(int16_t *r);
3232

3333
#define mlk_reduce_avx2 MLK_NAMESPACE(reduce_avx2)
34-
void mlk_reduce_avx2(int16_t *r, const int16_t *mlk_qdata);
34+
void mlk_reduce_avx2(int16_t *r);
3535

3636
#define mlk_poly_mulcache_compute_avx2 MLK_NAMESPACE(poly_mulcache_compute_avx2)
3737
void mlk_poly_mulcache_compute_avx2(int16_t *out, const int16_t *in,
@@ -42,32 +42,27 @@ void mlk_poly_mulcache_compute_avx2(int16_t *out, const int16_t *in,
4242
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
4343
const int16_t *a,
4444
const int16_t *b,
45-
const int16_t *b_cache,
46-
const int16_t *qdata);
45+
const int16_t *b_cache);
4746

4847
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
4948
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
5049
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
5150
const int16_t *a,
5251
const int16_t *b,
53-
const int16_t *b_cache,
54-
const int16_t *qdata);
52+
const int16_t *b_cache);
5553

5654
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
5755
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
5856
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
5957
const int16_t *a,
6058
const int16_t *b,
61-
const int16_t *b_cache,
62-
const int16_t *qdata);
59+
const int16_t *b_cache);
6360

6461
#define mlk_ntttobytes_avx2 MLK_NAMESPACE(ntttobytes_avx2)
65-
void mlk_ntttobytes_avx2(uint8_t *r, const int16_t *a,
66-
const int16_t *mlk_qdata);
62+
void mlk_ntttobytes_avx2(uint8_t *r, const int16_t *a);
6763

6864
#define mlk_nttfrombytes_avx2 MLK_NAMESPACE(nttfrombytes_avx2)
69-
void mlk_nttfrombytes_avx2(int16_t *r, const uint8_t *a,
70-
const int16_t *mlk_qdata);
65+
void mlk_nttfrombytes_avx2(int16_t *r, const uint8_t *a);
7166

7267
#define mlk_tomont_avx2 MLK_NAMESPACE(tomont_avx2)
7368
void mlk_tomont_avx2(int16_t *r, const int16_t *mlk_qdata);

dev/x86_64/src/compress_avx2.c

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,16 @@
2828
#include "arith_native_x86_64.h"
2929
#include "consts.h"
3030

31+
/* check-magic: 20159 == round(2^26/MLKEM_Q) */
32+
#define MLK_AVX2_V 20159
33+
3134
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
3235
void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
3336
const int16_t *MLK_RESTRICT a)
3437
{
3538
unsigned int i;
3639
__m256i f0, f1, f2, f3;
37-
const __m256i v = _mm256_load_si256(
38-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
40+
const __m256i v = _mm256_set1_epi16(MLK_AVX2_V);
3941
const __m256i shift1 = _mm256_set1_epi16(1 << 9);
4042
const __m256i mask = _mm256_set1_epi16(15);
4143
const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
@@ -75,8 +77,8 @@ void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
7577
unsigned int i;
7678
__m128i t;
7779
__m256i f;
78-
const __m256i q = _mm256_load_si256(
79-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
80+
const __m256i q = _mm256_set1_epi16(MLKEM_Q);
81+
8082
const __m256i shufbidx =
8183
_mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3,
8284
3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
@@ -101,8 +103,7 @@ void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
101103
unsigned int i;
102104
__m256i f0, f1, f2;
103105
__m128i t0, t1;
104-
const __m256i v = _mm256_load_si256(
105-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
106+
const __m256i v = _mm256_set1_epi16(MLK_AVX2_V);
106107
const __m256i v8 = _mm256_slli_epi16(v, 3);
107108
const __m256i off = _mm256_set1_epi16(15);
108109
const __m256i shift1 = _mm256_set1_epi16(1 << 12);
@@ -187,8 +188,7 @@ void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
187188
unsigned int i;
188189
__m256i f0, f1;
189190
__m128i t0, t1;
190-
const __m256i v = _mm256_load_si256(
191-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
191+
const __m256i v = _mm256_set1_epi16(MLK_AVX2_V);
192192
const __m256i shift1 = _mm256_set1_epi16(1 << 10);
193193
const __m256i mask = _mm256_set1_epi16(31);
194194
const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
@@ -230,8 +230,7 @@ void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
230230
__m128i t;
231231
__m256i f;
232232
int16_t ti;
233-
const __m256i q = _mm256_load_si256(
234-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
233+
const __m256i q = _mm256_set1_epi16(MLKEM_Q);
235234
const __m256i shufbidx =
236235
_mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4,
237236
3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
@@ -262,8 +261,7 @@ void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
262261
unsigned int i;
263262
__m256i f0, f1, f2;
264263
__m128i t0, t1;
265-
const __m256i v = _mm256_load_si256(
266-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
264+
const __m256i v = _mm256_set1_epi16(MLK_AVX2_V);
267265
const __m256i v8 = _mm256_slli_epi16(v, 3);
268266
const __m256i off = _mm256_set1_epi16(36);
269267
const __m256i shift1 = _mm256_set1_epi16(1 << 13);
@@ -334,8 +332,7 @@ void mlk_poly_decompress_d11_avx2(
334332
{
335333
unsigned int i;
336334
__m256i f;
337-
const __m256i q = _mm256_load_si256(
338-
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
335+
const __m256i q = _mm256_set1_epi16(MLKEM_Q);
339336
const __m256i shufbidx =
340337
_mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 10,
341338
9, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1, 0);
@@ -385,3 +382,7 @@ MLK_EMPTY_CU(avx2_poly_compress)
385382

386383
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
387384
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */
385+
386+
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
387+
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
388+
#undef MLK_AVX2_V

0 commit comments

Comments
 (0)