28
28
#include "arith_native_x86_64.h"
29
29
#include "consts.h"
30
30
31
+ /* check-magic: 20159 == round(2^26/MLKEM_Q) */
32
+ #define MLK_AVX2_V 20159 /* broadcast via _mm256_set1_epi16 into the constant `v` of the compress_d4/d5/d10/d11 routines */
33
+
31
34
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED ) || (MLKEM_K == 2 || MLKEM_K == 3 )
32
35
void mlk_poly_compress_d4_avx2 (uint8_t r [MLKEM_POLYCOMPRESSEDBYTES_D4 ],
33
36
const int16_t * MLK_RESTRICT a )
34
37
{
35
38
unsigned int i ;
36
39
__m256i f0 , f1 , f2 , f3 ;
37
- const __m256i v = _mm256_load_si256 (
38
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XV ]);
40
+ const __m256i v = _mm256_set1_epi16 (MLK_AVX2_V );
39
41
const __m256i shift1 = _mm256_set1_epi16 (1 << 9 );
40
42
const __m256i mask = _mm256_set1_epi16 (15 );
41
43
const __m256i shift2 = _mm256_set1_epi16 ((16 << 8 ) + 1 );
@@ -75,8 +77,8 @@ void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
75
77
unsigned int i ;
76
78
__m128i t ;
77
79
__m256i f ;
78
- const __m256i q = _mm256_load_si256 (
79
- ( __m256i * ) & mlk_qdata [ MLK_AVX2_BACKEND_DATA_OFFSET_16XQ ]);
80
+ const __m256i q = _mm256_set1_epi16 ( MLKEM_Q );
81
+
80
82
const __m256i shufbidx =
81
83
_mm256_set_epi8 (7 , 7 , 7 , 7 , 6 , 6 , 6 , 6 , 5 , 5 , 5 , 5 , 4 , 4 , 4 , 4 , 3 , 3 , 3 ,
82
84
3 , 2 , 2 , 2 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 );
@@ -101,8 +103,7 @@ void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
101
103
unsigned int i ;
102
104
__m256i f0 , f1 , f2 ;
103
105
__m128i t0 , t1 ;
104
- const __m256i v = _mm256_load_si256 (
105
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XV ]);
106
+ const __m256i v = _mm256_set1_epi16 (MLK_AVX2_V );
106
107
const __m256i v8 = _mm256_slli_epi16 (v , 3 );
107
108
const __m256i off = _mm256_set1_epi16 (15 );
108
109
const __m256i shift1 = _mm256_set1_epi16 (1 << 12 );
@@ -187,8 +188,7 @@ void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
187
188
unsigned int i ;
188
189
__m256i f0 , f1 ;
189
190
__m128i t0 , t1 ;
190
- const __m256i v = _mm256_load_si256 (
191
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XV ]);
191
+ const __m256i v = _mm256_set1_epi16 (MLK_AVX2_V );
192
192
const __m256i shift1 = _mm256_set1_epi16 (1 << 10 );
193
193
const __m256i mask = _mm256_set1_epi16 (31 );
194
194
const __m256i shift2 = _mm256_set1_epi16 ((32 << 8 ) + 1 );
@@ -230,8 +230,7 @@ void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
230
230
__m128i t ;
231
231
__m256i f ;
232
232
int16_t ti ;
233
- const __m256i q = _mm256_load_si256 (
234
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XQ ]);
233
+ const __m256i q = _mm256_set1_epi16 (MLKEM_Q );
235
234
const __m256i shufbidx =
236
235
_mm256_set_epi8 (9 , 9 , 9 , 8 , 8 , 8 , 8 , 7 , 7 , 6 , 6 , 6 , 6 , 5 , 5 , 5 , 4 , 4 , 4 ,
237
236
3 , 3 , 3 , 3 , 2 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 );
@@ -262,8 +261,7 @@ void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
262
261
unsigned int i ;
263
262
__m256i f0 , f1 , f2 ;
264
263
__m128i t0 , t1 ;
265
- const __m256i v = _mm256_load_si256 (
266
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XV ]);
264
+ const __m256i v = _mm256_set1_epi16 (MLK_AVX2_V );
267
265
const __m256i v8 = _mm256_slli_epi16 (v , 3 );
268
266
const __m256i off = _mm256_set1_epi16 (36 );
269
267
const __m256i shift1 = _mm256_set1_epi16 (1 << 13 );
@@ -334,8 +332,7 @@ void mlk_poly_decompress_d11_avx2(
334
332
{
335
333
unsigned int i ;
336
334
__m256i f ;
337
- const __m256i q = _mm256_load_si256 (
338
- (__m256i * )& mlk_qdata [MLK_AVX2_BACKEND_DATA_OFFSET_16XQ ]);
335
+ const __m256i q = _mm256_set1_epi16 (MLKEM_Q );
339
336
const __m256i shufbidx =
340
337
_mm256_set_epi8 (13 , 12 , 12 , 11 , 10 , 9 , 9 , 8 , 8 , 7 , 6 , 5 , 5 , 4 , 4 , 3 , 10 ,
341
338
9 , 9 , 8 , 7 , 6 , 6 , 5 , 5 , 4 , 3 , 2 , 2 , 1 , 1 , 0 );
@@ -385,3 +382,7 @@ MLK_EMPTY_CU(avx2_poly_compress)
385
382
386
383
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
387
384
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */
385
+
386
+ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
387
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
388
+ #undef MLK_AVX2_V
0 commit comments