mirror of https://github.com/oxen-io/lokinet
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
885 lines
27 KiB
C
885 lines
27 KiB
C
#if __AVX2__
|
|
#include <string.h>
|
|
#include <immintrin.h>
|
|
#include "mod3.h"
|
|
#include "rq.h"
|
|
#include "r3.h"
|
|
|
|
// 32-bit hosts: use compiler builtins and let compiler
|
|
// perform register allocation and/or spilling to core
|
|
//
|
|
// Confirmed working 3/10/18 -despair
|
|
// Less than 5% performance hit,
|
|
// all in register loads/stores to/from core
|
|
#ifndef __amd64__
|
|
#define MULSTEP_gcc(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
h0 += f0 * gj; \
|
|
_mm256_storeu_ps((float *)&h[i + j], h0); \
|
|
h1 += f1 * gj; \
|
|
h2 += f2 * gj; \
|
|
h3 += f3 * gj; \
|
|
h4 += f4 * gj; \
|
|
h0 = _mm256_loadu_ps((float *)&h[i + j + 5]); \
|
|
h0 += f5 * gj;
|
|
|
|
#define MULSTEP MULSTEP_gcc
|
|
|
|
#define MULSTEP_noload(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
h0 += gj * f0; \
|
|
_mm256_storeu_ps((float *)&h[i + j], h0); \
|
|
h1 += gj * f1; \
|
|
h2 += gj * f2; \
|
|
h3 += gj * f3; \
|
|
h4 += gj * f4; \
|
|
h0 = gj * f5;
|
|
|
|
#define MULSTEP_fromzero(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
h0 = gj * f0; \
|
|
_mm256_storeu_ps((float *)&h[i + j], h0); \
|
|
h1 = gj * f1; \
|
|
h2 = gj * f2; \
|
|
h3 = gj * f3; \
|
|
h4 = gj * f4; \
|
|
h0 = gj * f5;
|
|
#else
|
|
// 64-bit hosts: use inline asm as before
|
|
#define MULSTEP MULSTEP_asm
|
|
#define MULSTEP_asm(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
__asm__( \
|
|
"vfmadd231ps %5,%6,%0 \n\t" \
|
|
"vmovups %0,%12 \n\t" \
|
|
"vmovups %13,%0 \n\t" \
|
|
"vfmadd231ps %5,%7,%1 \n\t" \
|
|
"vfmadd231ps %5,%8,%2 \n\t" \
|
|
"vfmadd231ps %5,%9,%3 \n\t" \
|
|
"vfmadd231ps %5,%10,%4 \n\t" \
|
|
"vfmadd231ps %5,%11,%0 \n\t" \
|
|
: "+x"(h0), "+x"(h1), "+x"(h2), "+x"(h3), "+x"(h4) \
|
|
: "x"(gj), "x"(f0), "x"(f1), "x"(f2), "x"(f3), "x"(f4), "x"(f5), \
|
|
"m"(h[i + j]), "m"(h[i + j + 5]));
|
|
|
|
#define MULSTEP_noload(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
__asm__( \
|
|
"vfmadd231ps %5,%6,%0 \n\t" \
|
|
"vmovups %0,%12 \n\t" \
|
|
"vfmadd231ps %5,%7,%1 \n\t" \
|
|
"vfmadd231ps %5,%8,%2 \n\t" \
|
|
"vfmadd231ps %5,%9,%3 \n\t" \
|
|
"vfmadd231ps %5,%10,%4 \n\t" \
|
|
"vmulps %5,%11,%0 \n\t" \
|
|
: "+x"(h0), "+x"(h1), "+x"(h2), "+x"(h3), "+x"(h4) \
|
|
: "x"(gj), "x"(f0), "x"(f1), "x"(f2), "x"(f3), "x"(f4), "x"(f5), \
|
|
"m"(h[i + j]));
|
|
|
|
#define MULSTEP_fromzero(j, h0, h1, h2, h3, h4) \
|
|
gj = g[j]; \
|
|
__asm__( \
|
|
"vmulps %5,%6,%0 \n\t" \
|
|
"vmovups %0,%12 \n\t" \
|
|
"vmulps %5,%7,%1 \n\t" \
|
|
"vmulps %5,%8,%2 \n\t" \
|
|
"vmulps %5,%9,%3 \n\t" \
|
|
"vmulps %5,%10,%4 \n\t" \
|
|
"vmulps %5,%11,%0 \n\t" \
|
|
: "=&x"(h0), "=&x"(h1), "=&x"(h2), "=&x"(h3), "=&x"(h4) \
|
|
: "x"(gj), "x"(f0), "x"(f1), "x"(f2), "x"(f3), "x"(f4), "x"(f5), \
|
|
"m"(h[i + j]));
|
|
#endif
|
|
|
|
static inline __m128i
|
|
_mm_load_cvtepi8_epi16(const long long *x)
|
|
{
|
|
__m128i result;
|
|
__asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x));
|
|
return result;
|
|
}
|
|
|
|
#define v0 _mm256_set1_epi32(0)
|
|
#define v0_128 _mm_set1_epi32(0)
|
|
#define v7 _mm256_set1_epi16(7)
|
|
#define v4591_16 _mm256_set1_epi16(4591)
|
|
#define v2296_16 _mm256_set1_epi16(2296)
|
|
|
|
#define alpha_32 _mm256_set1_epi32(0x4b400000)
|
|
#define alpha_32_128 _mm_set1_epi32(0x4b400000)
|
|
#define alpha_float _mm256_set1_ps(12582912.0)
|
|
|
|
#define v0_float _mm256_set1_ps(0)
|
|
#define v1_float _mm256_set1_ps(1)
|
|
#define vm1_float _mm256_set1_ps(-1)
|
|
#define vm4591_float _mm256_set1_ps(-4591)
|
|
#define recip4591_float \
|
|
_mm256_set1_ps(0.00021781746896101067305597908952297974298)
|
|
|
|
static inline __m256
|
|
add(__m256 x, __m256 y)
|
|
{
|
|
return x + y;
|
|
}
|
|
|
|
static inline __m256
|
|
fastadd(__m256 x, __m256 y)
|
|
{
|
|
return _mm256_fmadd_ps(y, v1_float, x);
|
|
}
|
|
|
|
static inline __m256
|
|
fastsub(__m256 x, __m256 y)
|
|
{
|
|
return _mm256_fmadd_ps(y, vm1_float, x);
|
|
}
|
|
|
|
static inline __m256
|
|
reduce(__m256 x)
|
|
{
|
|
__m256 q = x * recip4591_float;
|
|
q = _mm256_round_ps(q, 8);
|
|
return _mm256_fmadd_ps(q, vm4591_float, x);
|
|
}
|
|
|
|
static inline __m256i
|
|
squeeze(__m256i x)
|
|
{
|
|
__m256i q = _mm256_mulhrs_epi16(x, v7);
|
|
q = _mm256_mullo_epi16(q, v4591_16);
|
|
return _mm256_sub_epi16(x, q);
|
|
}
|
|
|
|
static inline __m256i
|
|
squeezeadd16(__m256i x, __m256i y)
|
|
{
|
|
__m256i q;
|
|
x = _mm256_add_epi16(x, y);
|
|
q = _mm256_mulhrs_epi16(x, v7);
|
|
q = _mm256_mullo_epi16(q, v4591_16);
|
|
return _mm256_sub_epi16(x, q);
|
|
}
|
|
|
|
static inline __m256i
|
|
freeze(__m256i x)
|
|
{
|
|
__m256i mask, x2296, x4591;
|
|
x4591 = _mm256_add_epi16(x, v4591_16);
|
|
mask = _mm256_srai_epi16(x, 15);
|
|
x = _mm256_blendv_epi8(x, x4591, mask);
|
|
x2296 = _mm256_sub_epi16(x, v2296_16);
|
|
mask = _mm256_srai_epi16(x2296, 15);
|
|
x4591 = _mm256_sub_epi16(x, v4591_16);
|
|
x = _mm256_blendv_epi8(x4591, x, mask);
|
|
return x;
|
|
}
|
|
|
|
/* 24*8*float32 f inputs between -10000 and 10000 */
|
|
/* 24*8*float32 g inputs between -32 and 32 */
|
|
/* 48*8*float32 h outputs between -7680000 and 7680000 */
|
|
static void
|
|
mult24x8_float(__m256 h[48], const __m256 f[24], const __m256 g[24])
|
|
{
|
|
int i, j;
|
|
__m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4;
|
|
|
|
i = 0;
|
|
f0 = f[i];
|
|
f1 = f[i + 1];
|
|
f2 = f[i + 2];
|
|
f3 = f[i + 3];
|
|
f4 = f[i + 4];
|
|
f5 = f[i + 5];
|
|
MULSTEP_fromzero(0, h0, h1, h2, h3, h4);
|
|
for(j = 0; j < 20; j += 5)
|
|
{
|
|
MULSTEP_noload(j + 1, h1, h2, h3, h4, h0);
|
|
MULSTEP_noload(j + 2, h2, h3, h4, h0, h1);
|
|
MULSTEP_noload(j + 3, h3, h4, h0, h1, h2);
|
|
MULSTEP_noload(j + 4, h4, h0, h1, h2, h3);
|
|
MULSTEP_noload(j + 5, h0, h1, h2, h3, h4);
|
|
}
|
|
MULSTEP_noload(j + 1, h1, h2, h3, h4, h0);
|
|
MULSTEP_noload(j + 2, h2, h3, h4, h0, h1);
|
|
MULSTEP_noload(j + 3, h3, h4, h0, h1, h2);
|
|
|
|
h[i + j + 4] = h4;
|
|
h[i + j + 5] = h0;
|
|
h[i + j + 6] = h1;
|
|
h[i + j + 7] = h2;
|
|
h[i + j + 8] = h3;
|
|
|
|
for(i = 6; i < 24; i += 6)
|
|
{
|
|
f0 = f[i];
|
|
f1 = f[i + 1];
|
|
f2 = f[i + 2];
|
|
f3 = f[i + 3];
|
|
f4 = f[i + 4];
|
|
f5 = f[i + 5];
|
|
h0 = h[i];
|
|
h1 = h[i + 1];
|
|
h2 = h[i + 2];
|
|
h3 = h[i + 3];
|
|
h4 = h[i + 4];
|
|
for(j = 0; j < 15; j += 5)
|
|
{
|
|
MULSTEP(j + 0, h0, h1, h2, h3, h4);
|
|
MULSTEP(j + 1, h1, h2, h3, h4, h0);
|
|
MULSTEP(j + 2, h2, h3, h4, h0, h1);
|
|
MULSTEP(j + 3, h3, h4, h0, h1, h2);
|
|
MULSTEP(j + 4, h4, h0, h1, h2, h3);
|
|
}
|
|
MULSTEP(j + 0, h0, h1, h2, h3, h4);
|
|
MULSTEP(j + 1, h1, h2, h3, h4, h0);
|
|
MULSTEP(j + 2, h2, h3, h4, h0, h1);
|
|
MULSTEP_noload(j + 3, h3, h4, h0, h1, h2);
|
|
MULSTEP_noload(j + 4, h4, h0, h1, h2, h3);
|
|
MULSTEP_noload(j + 5, h0, h1, h2, h3, h4);
|
|
MULSTEP_noload(j + 6, h1, h2, h3, h4, h0);
|
|
MULSTEP_noload(j + 7, h2, h3, h4, h0, h1);
|
|
MULSTEP_noload(j + 8, h3, h4, h0, h1, h2);
|
|
h[i + j + 9] = h4;
|
|
h[i + j + 10] = h0;
|
|
h[i + j + 11] = h1;
|
|
h[i + j + 12] = h2;
|
|
h[i + j + 13] = h3;
|
|
}
|
|
|
|
h[47] = v0_float;
|
|
}
|
|
|
|
/* 48*8*float32 f inputs between -5000 and 5000 */
|
|
/* 48*8*float32 g inputs between -16 and 16 */
|
|
/* 96*8*float32 h outputs between -3840000 and 3840000 */
|
|
static void
|
|
mult48x8_float(__m256 h[96], const __m256 f[48], const __m256 g[48])
|
|
{
|
|
__m256 h01[48];
|
|
__m256 g01[24];
|
|
__m256 *f01 = h01 + 24;
|
|
int i;
|
|
|
|
for(i = 24; i > 0;)
|
|
{
|
|
i -= 2;
|
|
f01[i] = f[i] + f[i + 24];
|
|
g01[i] = g[i] + g[i + 24];
|
|
f01[i + 1] = f[i + 1] + f[i + 1 + 24];
|
|
g01[i + 1] = g[i + 1] + g[i + 1 + 24];
|
|
}
|
|
|
|
mult24x8_float(h, f, g);
|
|
mult24x8_float(h + 48, f + 24, g + 24);
|
|
mult24x8_float(h01, f01, g01);
|
|
|
|
for(i = 0; i < 24; ++i)
|
|
{
|
|
__m256 h0i = h[i];
|
|
__m256 h0itop = h[i + 24];
|
|
__m256 h1i = h[i + 48];
|
|
__m256 h1itop = h[i + 72];
|
|
__m256 h01i = h01[i];
|
|
__m256 h01itop = h01[i + 24];
|
|
__m256 c = fastsub(h0itop, h1i);
|
|
h[i + 24] = c + fastsub(h01i, h0i);
|
|
h[i + 48] = fastsub(h01itop, h1itop) - c;
|
|
}
|
|
}
|
|
|
|
/* 96*8*float32 f inputs between -2500 and 2500 */
|
|
/* 96*8*float32 g inputs between -8 and 8 */
|
|
/* 192*8*float32 h outputs between -1920000 and 1920000 */
|
|
static void
|
|
mult96x8_float(__m256 h[192], const __m256 f[96], const __m256 g[96])
|
|
{
|
|
__m256 h01[96];
|
|
__m256 g01[48];
|
|
__m256 *f01 = h01 + 48;
|
|
int i;
|
|
|
|
for(i = 48; i > 0;)
|
|
{
|
|
i -= 4;
|
|
f01[i] = f[i] + f[i + 48];
|
|
g01[i] = g[i] + g[i + 48];
|
|
f01[i + 1] = f[i + 1] + f[i + 1 + 48];
|
|
g01[i + 1] = g[i + 1] + g[i + 1 + 48];
|
|
f01[i + 2] = f[i + 2] + f[i + 2 + 48];
|
|
g01[i + 2] = g[i + 2] + g[i + 2 + 48];
|
|
f01[i + 3] = f[i + 3] + f[i + 3 + 48];
|
|
g01[i + 3] = g[i + 3] + g[i + 3 + 48];
|
|
}
|
|
|
|
mult48x8_float(h, f, g);
|
|
mult48x8_float(h + 96, f + 48, g + 48);
|
|
mult48x8_float(h01, f01, g01);
|
|
|
|
for(i = 0; i < 48; ++i)
|
|
{
|
|
__m256 h0i = h[i];
|
|
__m256 h0itop = h[i + 48];
|
|
__m256 h1i = h[i + 96];
|
|
__m256 h1itop = h[i + 144];
|
|
__m256 h01i = h01[i];
|
|
__m256 h01itop = h01[i + 48];
|
|
__m256 c = fastsub(h0itop, h1i);
|
|
h[i + 48] = c + fastsub(h01i, h0i);
|
|
h[i + 96] = fastsub(h01itop, h1itop) - c;
|
|
}
|
|
}
|
|
|
|
/* 96*16*int16 f inputs between -2500 and 2500 */
|
|
/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */
|
|
/* 192*16*int16 h outputs between -2400 and 2400 */
|
|
static void
|
|
mult96x16(__m256i * h, const __m256i * f, const __m256i * g)
|
|
{
|
|
__m256 hfloat[192];
|
|
__m256 gfloat[96];
|
|
__m256 *ffloat = hfloat + 96;
|
|
int i, p;
|
|
|
|
for(p = 0; p < 2; ++p)
|
|
{
|
|
for(i = 96; i > 0;)
|
|
{
|
|
i -= 2;
|
|
__m256i fi =
|
|
_mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *)&f[i]));
|
|
__m256i gi = _mm256_cvtepi16_epi32(
|
|
_mm_load_cvtepi8_epi16(p + (const long long *)&g[i]));
|
|
__m256 storage;
|
|
*(__m256i *)&storage = _mm256_add_epi32(fi, alpha_32);
|
|
ffloat[i] = storage - alpha_float;
|
|
*(__m256i *)&storage = _mm256_add_epi32(gi, alpha_32);
|
|
gfloat[i] = storage - alpha_float;
|
|
fi = _mm256_cvtepi16_epi32(
|
|
_mm_loadu_si128(p + (const __m128i *)&f[i + 1]));
|
|
gi = _mm256_cvtepi16_epi32(
|
|
_mm_load_cvtepi8_epi16(p + (const long long *)&g[i + 1]));
|
|
*(__m256i *)&storage = _mm256_add_epi32(fi, alpha_32);
|
|
ffloat[i + 1] = storage - alpha_float;
|
|
*(__m256i *)&storage = _mm256_add_epi32(gi, alpha_32);
|
|
gfloat[i + 1] = storage - alpha_float;
|
|
}
|
|
mult96x8_float(hfloat, ffloat, gfloat);
|
|
for(i = 192; i > 0;)
|
|
{
|
|
__m128i h0, h1;
|
|
i -= 4;
|
|
hfloat[i] = add(alpha_float, reduce(hfloat[i]));
|
|
hfloat[i + 1] = fastadd(alpha_float, reduce(hfloat[i + 1]));
|
|
hfloat[i + 2] = add(alpha_float, reduce(hfloat[i + 2]));
|
|
hfloat[i + 3] = fastadd(alpha_float, reduce(hfloat[i + 3]));
|
|
h0 = 0 [(__m128i *)&hfloat[i]];
|
|
h0 = _mm_sub_epi32(h0, alpha_32_128);
|
|
h1 = 1 [(__m128i *)&hfloat[i]];
|
|
h1 = _mm_sub_epi32(h1, alpha_32_128);
|
|
_mm_storeu_si128(p + (__m128i *)&h[i], _mm_packs_epi32(h0, h1));
|
|
h0 = 0 [(__m128i *)&hfloat[i + 1]];
|
|
h0 = _mm_sub_epi32(h0, alpha_32_128);
|
|
h1 = 1 [(__m128i *)&hfloat[i + 1]];
|
|
h1 = _mm_sub_epi32(h1, alpha_32_128);
|
|
_mm_storeu_si128(p + (__m128i *)&h[i + 1], _mm_packs_epi32(h0, h1));
|
|
h0 = 0 [(__m128i *)&hfloat[i + 2]];
|
|
h0 = _mm_sub_epi32(h0, alpha_32_128);
|
|
h1 = 1 [(__m128i *)&hfloat[i + 2]];
|
|
h1 = _mm_sub_epi32(h1, alpha_32_128);
|
|
_mm_storeu_si128(p + (__m128i *)&h[i + 2], _mm_packs_epi32(h0, h1));
|
|
h0 = 0 [(__m128i *)&hfloat[i + 3]];
|
|
h0 = _mm_sub_epi32(h0, alpha_32_128);
|
|
h1 = 1 [(__m128i *)&hfloat[i + 3]];
|
|
h1 = _mm_sub_epi32(h1, alpha_32_128);
|
|
_mm_storeu_si128(p + (__m128i *)&h[i + 3], _mm_packs_epi32(h0, h1));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* int16 i of output x[j] is int16 j of input x[i] */
|
|
static void
|
|
transpose16(__m256i x[16])
|
|
{
|
|
static const int rev[4] = {0, 4, 2, 6};
|
|
int i;
|
|
__m256i y[16];
|
|
|
|
for(i = 0; i < 16; i += 4)
|
|
{
|
|
__m256i a0 = x[i];
|
|
__m256i a1 = x[i + 1];
|
|
__m256i a2 = x[i + 2];
|
|
__m256i a3 = x[i + 3];
|
|
__m256i b0 = _mm256_unpacklo_epi16(a0, a1);
|
|
__m256i b1 = _mm256_unpackhi_epi16(a0, a1);
|
|
__m256i b2 = _mm256_unpacklo_epi16(a2, a3);
|
|
__m256i b3 = _mm256_unpackhi_epi16(a2, a3);
|
|
__m256i c0 = _mm256_unpacklo_epi32(b0, b2);
|
|
__m256i c2 = _mm256_unpackhi_epi32(b0, b2);
|
|
__m256i c1 = _mm256_unpacklo_epi32(b1, b3);
|
|
__m256i c3 = _mm256_unpackhi_epi32(b1, b3);
|
|
y[i] = c0;
|
|
y[i + 2] = c2;
|
|
y[i + 1] = c1;
|
|
y[i + 3] = c3;
|
|
}
|
|
for(i = 0; i < 4; ++i)
|
|
{
|
|
int r = rev[i];
|
|
__m256i c0 = y[i];
|
|
__m256i c4 = y[i + 4];
|
|
__m256i c8 = y[i + 8];
|
|
__m256i c12 = y[i + 12];
|
|
__m256i d0 = _mm256_unpacklo_epi64(c0, c4);
|
|
__m256i d4 = _mm256_unpackhi_epi64(c0, c4);
|
|
__m256i d8 = _mm256_unpacklo_epi64(c8, c12);
|
|
__m256i d12 = _mm256_unpackhi_epi64(c8, c12);
|
|
__m256i e0 = _mm256_permute2x128_si256(d0, d8, 0x20);
|
|
__m256i e8 = _mm256_permute2x128_si256(d0, d8, 0x31);
|
|
__m256i e4 = _mm256_permute2x128_si256(d4, d12, 0x20);
|
|
__m256i e12 = _mm256_permute2x128_si256(d4, d12, 0x31);
|
|
x[r] = e0;
|
|
x[r + 8] = e8;
|
|
x[r + 1] = e4;
|
|
x[r + 9] = e12;
|
|
}
|
|
}
|
|
|
|
/* byte i of output x[j] is byte j of input x[i] */
|
|
static void
|
|
transpose32(__m256i x[32])
|
|
{
|
|
static const int rev[4] = {0, 8, 4, 12};
|
|
int i;
|
|
__m256i y[32];
|
|
|
|
for(i = 0; i < 32; i += 4)
|
|
{
|
|
__m256i a0 = x[i];
|
|
__m256i a1 = x[i + 1];
|
|
__m256i a2 = x[i + 2];
|
|
__m256i a3 = x[i + 3];
|
|
__m256i b0 = _mm256_unpacklo_epi8(a0, a1);
|
|
__m256i b1 = _mm256_unpackhi_epi8(a0, a1);
|
|
__m256i b2 = _mm256_unpacklo_epi8(a2, a3);
|
|
__m256i b3 = _mm256_unpackhi_epi8(a2, a3);
|
|
__m256i c0 = _mm256_unpacklo_epi16(b0, b2);
|
|
__m256i c2 = _mm256_unpackhi_epi16(b0, b2);
|
|
__m256i c1 = _mm256_unpacklo_epi16(b1, b3);
|
|
__m256i c3 = _mm256_unpackhi_epi16(b1, b3);
|
|
y[i] = c0;
|
|
y[i + 2] = c2;
|
|
y[i + 1] = c1;
|
|
y[i + 3] = c3;
|
|
}
|
|
for(i = 0; i < 4; ++i)
|
|
{
|
|
int r = rev[i];
|
|
__m256i c0 = y[i];
|
|
__m256i c8 = y[i + 8];
|
|
__m256i c16 = y[i + 16];
|
|
__m256i c24 = y[i + 24];
|
|
__m256i c4 = y[i + 4];
|
|
__m256i c12 = y[i + 12];
|
|
__m256i c20 = y[i + 20];
|
|
__m256i c28 = y[i + 28];
|
|
__m256i d0 = _mm256_unpacklo_epi32(c0, c4);
|
|
__m256i d4 = _mm256_unpackhi_epi32(c0, c4);
|
|
__m256i d8 = _mm256_unpacklo_epi32(c8, c12);
|
|
__m256i d12 = _mm256_unpackhi_epi32(c8, c12);
|
|
__m256i d16 = _mm256_unpacklo_epi32(c16, c20);
|
|
__m256i d20 = _mm256_unpackhi_epi32(c16, c20);
|
|
__m256i d24 = _mm256_unpacklo_epi32(c24, c28);
|
|
__m256i d28 = _mm256_unpackhi_epi32(c24, c28);
|
|
__m256i e0 = _mm256_unpacklo_epi64(d0, d8);
|
|
__m256i e8 = _mm256_unpackhi_epi64(d0, d8);
|
|
__m256i e16 = _mm256_unpacklo_epi64(d16, d24);
|
|
__m256i e24 = _mm256_unpackhi_epi64(d16, d24);
|
|
__m256i e4 = _mm256_unpacklo_epi64(d4, d12);
|
|
__m256i e12 = _mm256_unpackhi_epi64(d4, d12);
|
|
__m256i e20 = _mm256_unpacklo_epi64(d20, d28);
|
|
__m256i e28 = _mm256_unpackhi_epi64(d20, d28);
|
|
__m256i f0 = _mm256_permute2x128_si256(e0, e16, 0x20);
|
|
__m256i f16 = _mm256_permute2x128_si256(e0, e16, 0x31);
|
|
__m256i f8 = _mm256_permute2x128_si256(e8, e24, 0x20);
|
|
__m256i f24 = _mm256_permute2x128_si256(e8, e24, 0x31);
|
|
__m256i f4 = _mm256_permute2x128_si256(e4, e20, 0x20);
|
|
__m256i f20 = _mm256_permute2x128_si256(e4, e20, 0x31);
|
|
__m256i f12 = _mm256_permute2x128_si256(e12, e28, 0x20);
|
|
__m256i f28 = _mm256_permute2x128_si256(e12, e28, 0x31);
|
|
x[r] = f0;
|
|
x[r + 16] = f16;
|
|
x[r + 1] = f8;
|
|
x[r + 17] = f24;
|
|
x[r + 2] = f4;
|
|
x[r + 18] = f20;
|
|
x[r + 3] = f12;
|
|
x[r + 19] = f28;
|
|
}
|
|
}
|
|
|
|
/* 48*16*int16 f inputs between -2295 and 2295 */
|
|
/* 24*32*int8 g inputs between -1 and 1 */
|
|
/* 96*16*int16 h outputs between -2295 and 2295 */
|
|
static void
|
|
mult768_mix2_m256i(__m256i h[96], const __m256i f[48], const __m256i g[24])
|
|
{
|
|
__m256i hkara[24][16];
|
|
__m256i gkara[3][32];
|
|
#define fkara hkara
|
|
int i;
|
|
|
|
for(i = 6; i-- > 0;)
|
|
{
|
|
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
|
|
__m256i f01, f23, f45, f67;
|
|
__m256i f02, f46, f04, f26, f0426;
|
|
__m256i f13, f57, f15, f37, f1537;
|
|
__m256i f0213, f4657, f04261537, f0415, f2637;
|
|
|
|
f0 = _mm256_loadu_si256(&f[i + 0]);
|
|
f1 = _mm256_loadu_si256(&f[i + 6]);
|
|
f2 = _mm256_loadu_si256(&f[i + 12]);
|
|
f3 = _mm256_loadu_si256(&f[i + 18]);
|
|
f4 = _mm256_loadu_si256(&f[i + 24]);
|
|
f5 = _mm256_loadu_si256(&f[i + 30]);
|
|
f6 = _mm256_loadu_si256(&f[i + 36]);
|
|
f7 = _mm256_loadu_si256(&f[i + 42]);
|
|
f01 = squeezeadd16(f0, f1);
|
|
fkara[i][8] = f01;
|
|
f23 = squeezeadd16(f2, f3);
|
|
fkara[i][9] = f23;
|
|
f45 = squeezeadd16(f4, f5);
|
|
fkara[i][10] = f45;
|
|
f67 = squeezeadd16(f6, f7);
|
|
fkara[i][11] = f67;
|
|
|
|
fkara[i][0] = f0;
|
|
fkara[i][2] = f2;
|
|
fkara[i][4] = f4;
|
|
fkara[i][6] = f6;
|
|
|
|
f02 = squeezeadd16(f0, f2);
|
|
fkara[i + 6][0] = f02;
|
|
f04 = squeezeadd16(f0, f4);
|
|
fkara[i + 6][6] = f04;
|
|
f46 = squeezeadd16(f4, f6);
|
|
fkara[i + 6][3] = f46;
|
|
f26 = squeezeadd16(f2, f6);
|
|
fkara[i + 6][8] = f26;
|
|
|
|
fkara[i][1] = f1;
|
|
fkara[i][3] = f3;
|
|
fkara[i][5] = f5;
|
|
fkara[i][7] = f7;
|
|
|
|
f13 = squeezeadd16(f1, f3);
|
|
fkara[i + 6][1] = f13;
|
|
f15 = squeezeadd16(f1, f5);
|
|
fkara[i + 6][7] = f15;
|
|
f57 = squeezeadd16(f5, f7);
|
|
fkara[i + 6][4] = f57;
|
|
f37 = squeezeadd16(f3, f7);
|
|
fkara[i + 6][9] = f37;
|
|
|
|
f0426 = squeezeadd16(f04, f26);
|
|
fkara[i + 6][12] = f0426;
|
|
f1537 = squeezeadd16(f15, f37);
|
|
fkara[i + 6][13] = f1537;
|
|
f0213 = squeezeadd16(f02, f13);
|
|
fkara[i + 6][2] = f0213;
|
|
f4657 = squeezeadd16(f46, f57);
|
|
fkara[i + 6][5] = f4657;
|
|
f0415 = squeezeadd16(f04, f15);
|
|
fkara[i + 6][10] = f0415;
|
|
f2637 = squeezeadd16(f26, f37);
|
|
fkara[i + 6][11] = f2637;
|
|
f04261537 = squeezeadd16(f0426, f1537);
|
|
fkara[i + 6][14] = f04261537;
|
|
|
|
fkara[i][12] = v0;
|
|
fkara[i][13] = v0;
|
|
fkara[i][14] = v0;
|
|
fkara[i][15] = v0;
|
|
fkara[i + 6][15] = v0;
|
|
}
|
|
|
|
for(i = 3; i-- > 0;)
|
|
{
|
|
__m256i g0, g1, g2, g3, g4, g5, g6, g7;
|
|
__m256i g01, g23, g45, g67;
|
|
__m256i g02, g46, g04, g26, g0426;
|
|
__m256i g13, g57, g15, g37, g1537;
|
|
__m256i g0213, g4657, g04261537, g0415, g2637;
|
|
|
|
g0 = _mm256_loadu_si256(&g[i + 0]);
|
|
g1 = _mm256_loadu_si256(&g[i + 3]);
|
|
g2 = _mm256_loadu_si256(&g[i + 6]);
|
|
g3 = _mm256_loadu_si256(&g[i + 9]);
|
|
g4 = _mm256_loadu_si256(&g[i + 12]);
|
|
g5 = _mm256_loadu_si256(&g[i + 15]);
|
|
g6 = _mm256_loadu_si256(&g[i + 18]);
|
|
g7 = _mm256_loadu_si256(&g[i + 21]);
|
|
g01 = _mm256_add_epi8(g0, g1);
|
|
gkara[i][8] = g01;
|
|
g23 = _mm256_add_epi8(g2, g3);
|
|
gkara[i][9] = g23;
|
|
g45 = _mm256_add_epi8(g4, g5);
|
|
gkara[i][10] = g45;
|
|
g67 = _mm256_add_epi8(g6, g7);
|
|
gkara[i][11] = g67;
|
|
|
|
gkara[i][0] = g0;
|
|
gkara[i][2] = g2;
|
|
gkara[i][4] = g4;
|
|
gkara[i][6] = g6;
|
|
|
|
g02 = _mm256_add_epi8(g0, g2);
|
|
gkara[i][16] = g02;
|
|
g04 = _mm256_add_epi8(g0, g4);
|
|
gkara[i][22] = g04;
|
|
g46 = _mm256_add_epi8(g4, g6);
|
|
gkara[i][19] = g46;
|
|
g26 = _mm256_add_epi8(g2, g6);
|
|
gkara[i][24] = g26;
|
|
|
|
gkara[i][1] = g1;
|
|
gkara[i][3] = g3;
|
|
gkara[i][5] = g5;
|
|
gkara[i][7] = g7;
|
|
|
|
g13 = _mm256_add_epi8(g1, g3);
|
|
gkara[i][17] = g13;
|
|
g15 = _mm256_add_epi8(g1, g5);
|
|
gkara[i][23] = g15;
|
|
g57 = _mm256_add_epi8(g5, g7);
|
|
gkara[i][20] = g57;
|
|
g37 = _mm256_add_epi8(g3, g7);
|
|
gkara[i][25] = g37;
|
|
|
|
g0426 = _mm256_add_epi8(g04, g26);
|
|
gkara[i][28] = g0426;
|
|
g1537 = _mm256_add_epi8(g15, g37);
|
|
gkara[i][29] = g1537;
|
|
g0213 = _mm256_add_epi8(g02, g13);
|
|
gkara[i][18] = g0213;
|
|
g4657 = _mm256_add_epi8(g46, g57);
|
|
gkara[i][21] = g4657;
|
|
g0415 = _mm256_add_epi8(g04, g15);
|
|
gkara[i][26] = g0415;
|
|
g2637 = _mm256_add_epi8(g26, g37);
|
|
gkara[i][27] = g2637;
|
|
g04261537 = _mm256_add_epi8(g0426, g1537);
|
|
gkara[i][30] = g04261537;
|
|
|
|
gkara[i][12] = v0;
|
|
gkara[i][13] = v0;
|
|
gkara[i][14] = v0;
|
|
gkara[i][15] = v0;
|
|
gkara[i][31] = v0;
|
|
}
|
|
|
|
for(i = 12; i-- > 0;)
|
|
transpose16(fkara[i]);
|
|
for(i = 3; i-- > 0;)
|
|
transpose32(gkara[i]);
|
|
|
|
mult96x16(hkara[12], fkara[6], (__m256i *)(1 + (__m128i *)gkara));
|
|
mult96x16(hkara[0], fkara[0], gkara[0]);
|
|
|
|
for(i = 24; i-- > 0;)
|
|
transpose16(hkara[i]);
|
|
|
|
for(i = 6; i-- > 0;)
|
|
{
|
|
__m256i h0, h1, h2, h3, h4, h5, h6, h7, h8, h9;
|
|
__m256i h10, h11, h12, h13, h14, h15, h16, h17, h18, h19;
|
|
__m256i h20, h21, h22, h23;
|
|
__m256i h32, h33, h34, h35, h36, h37, h38, h39;
|
|
__m256i h40, h41, h42, h43, h44, h45, h46, h47, h48, h49;
|
|
__m256i h50, h51, h52, h53, h54, h55, h56, h57, h58, h59;
|
|
__m256i h60, h61;
|
|
__m256i c;
|
|
|
|
#define COMBINE(h0, h1, h2, h3, x0, x1) \
|
|
c = _mm256_sub_epi16(h1, h2); \
|
|
h1 = _mm256_sub_epi16(_mm256_add_epi16(c, x0), h0); \
|
|
h2 = _mm256_sub_epi16(x1, _mm256_add_epi16(c, h3)); \
|
|
h1 = squeeze(h1); \
|
|
h2 = squeeze(h2);
|
|
|
|
h56 = hkara[i + 12][12];
|
|
h57 = hkara[i + 18][12];
|
|
h58 = hkara[i + 12][13];
|
|
h59 = hkara[i + 18][13];
|
|
h60 = hkara[i + 12][14];
|
|
h61 = hkara[i + 18][14];
|
|
COMBINE(h56, h57, h58, h59, h60, h61)
|
|
|
|
h44 = hkara[i + 12][6];
|
|
h45 = hkara[i + 18][6];
|
|
h46 = hkara[i + 12][7];
|
|
h47 = hkara[i + 18][7];
|
|
h52 = hkara[i + 12][10];
|
|
h53 = hkara[i + 18][10];
|
|
COMBINE(h44, h45, h46, h47, h52, h53)
|
|
|
|
h48 = hkara[i + 12][8];
|
|
h49 = hkara[i + 18][8];
|
|
h50 = hkara[i + 12][9];
|
|
h51 = hkara[i + 18][9];
|
|
h54 = hkara[i + 12][11];
|
|
h55 = hkara[i + 18][11];
|
|
COMBINE(h48, h49, h50, h51, h54, h55)
|
|
COMBINE(h44, h46, h48, h50, h56, h58)
|
|
COMBINE(h45, h47, h49, h51, h57, h59)
|
|
|
|
h0 = hkara[i][0];
|
|
h1 = hkara[i + 6][0];
|
|
h2 = hkara[i][1];
|
|
h3 = hkara[i + 6][1];
|
|
h16 = hkara[i][8];
|
|
h17 = hkara[i + 6][8];
|
|
COMBINE(h0, h1, h2, h3, h16, h17)
|
|
|
|
h4 = hkara[i][2];
|
|
h5 = hkara[i + 6][2];
|
|
h6 = hkara[i][3];
|
|
h7 = hkara[i + 6][3];
|
|
h18 = hkara[i][9];
|
|
h19 = hkara[i + 6][9];
|
|
COMBINE(h4, h5, h6, h7, h18, h19)
|
|
|
|
h32 = hkara[i + 12][0];
|
|
h33 = hkara[i + 18][0];
|
|
h34 = hkara[i + 12][1];
|
|
h35 = hkara[i + 18][1];
|
|
h36 = hkara[i + 12][2];
|
|
h37 = hkara[i + 18][2];
|
|
COMBINE(h32, h33, h34, h35, h36, h37)
|
|
COMBINE(h1, h3, h5, h7, h33, h35)
|
|
COMBINE(h0, h2, h4, h6, h32, h34)
|
|
|
|
h8 = hkara[i][4];
|
|
h9 = hkara[i + 6][4];
|
|
h10 = hkara[i][5];
|
|
h11 = hkara[i + 6][5];
|
|
h20 = hkara[i][10];
|
|
h21 = hkara[i + 6][10];
|
|
COMBINE(h8, h9, h10, h11, h20, h21)
|
|
|
|
h12 = hkara[i][6];
|
|
h13 = hkara[i + 6][6];
|
|
h14 = hkara[i][7];
|
|
h15 = hkara[i + 6][7];
|
|
h22 = hkara[i][11];
|
|
h23 = hkara[i + 6][11];
|
|
COMBINE(h12, h13, h14, h15, h22, h23)
|
|
|
|
h38 = hkara[i + 12][3];
|
|
h39 = hkara[i + 18][3];
|
|
h40 = hkara[i + 12][4];
|
|
h41 = hkara[i + 18][4];
|
|
h42 = hkara[i + 12][5];
|
|
h43 = hkara[i + 18][5];
|
|
COMBINE(h38, h39, h40, h41, h42, h43)
|
|
COMBINE(h8, h10, h12, h14, h38, h40)
|
|
COMBINE(h9, h11, h13, h15, h39, h41)
|
|
|
|
COMBINE(h0, h4, h8, h12, h44, h48)
|
|
h0 = freeze(h0);
|
|
h4 = freeze(h4);
|
|
h8 = freeze(h8);
|
|
h12 = freeze(h12);
|
|
_mm256_storeu_si256(&h[i + 0], h0);
|
|
_mm256_storeu_si256(&h[i + 24], h4);
|
|
_mm256_storeu_si256(&h[i + 48], h8);
|
|
_mm256_storeu_si256(&h[i + 72], h12);
|
|
|
|
COMBINE(h1, h5, h9, h13, h45, h49)
|
|
h1 = freeze(h1);
|
|
h5 = freeze(h5);
|
|
h9 = freeze(h9);
|
|
h13 = freeze(h13);
|
|
_mm256_storeu_si256(&h[i + 6], h1);
|
|
_mm256_storeu_si256(&h[i + 30], h5);
|
|
_mm256_storeu_si256(&h[i + 54], h9);
|
|
_mm256_storeu_si256(&h[i + 78], h13);
|
|
|
|
COMBINE(h2, h6, h10, h14, h46, h50)
|
|
h2 = freeze(h2);
|
|
h6 = freeze(h6);
|
|
h10 = freeze(h10);
|
|
h14 = freeze(h14);
|
|
_mm256_storeu_si256(&h[i + 12], h2);
|
|
_mm256_storeu_si256(&h[i + 36], h6);
|
|
_mm256_storeu_si256(&h[i + 60], h10);
|
|
_mm256_storeu_si256(&h[i + 84], h14);
|
|
|
|
COMBINE(h3, h7, h11, h15, h47, h51)
|
|
h3 = freeze(h3);
|
|
h7 = freeze(h7);
|
|
h11 = freeze(h11);
|
|
h15 = freeze(h15);
|
|
_mm256_storeu_si256(&h[i + 18], h3);
|
|
_mm256_storeu_si256(&h[i + 42], h7);
|
|
_mm256_storeu_si256(&h[i + 66], h11);
|
|
_mm256_storeu_si256(&h[i + 90], h15);
|
|
}
|
|
}
|
|
|
|
#define p 761
|
|
|
|
/* 761 f inputs between -2295 and 2295 */
|
|
/* 761 g inputs between -1 and 1 */
|
|
/* 761 h outputs between -2295 and 2295 */
|
|
void
|
|
rq_mult(modq *h, const modq *f, const small *g)
|
|
{
|
|
__m256i fgvec[96];
|
|
modq *fg;
|
|
int i;
|
|
|
|
mult768_mix2_m256i(fgvec, (__m256i *)f, (__m256i *)g);
|
|
fg = (modq *)fgvec;
|
|
|
|
h[0] = modq_freeze(fg[0] + fg[p]);
|
|
for(i = 1; i < 9; ++i)
|
|
h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
|
|
for(i = 9; i < 761; i += 16)
|
|
{
|
|
__m256i fgi = _mm256_loadu_si256((__m256i *)&fg[i]);
|
|
__m256i fgip = _mm256_loadu_si256((__m256i *)&fg[i + p]);
|
|
__m256i fgip1 = _mm256_loadu_si256((__m256i *)&fg[i + p - 1]);
|
|
__m256i x = _mm256_add_epi16(fgi, _mm256_add_epi16(fgip, fgip1));
|
|
x = freeze(squeeze(x));
|
|
_mm256_storeu_si256((__m256i *)&h[i], x);
|
|
}
|
|
for(i = 761; i < 768; ++i)
|
|
h[i] = 0;
|
|
}
|
|
|
|
void
|
|
r3_mult(small *h, const small *f, const small *g)
|
|
{
|
|
__m256i fgvec[96];
|
|
__m256i fvec[48];
|
|
modq *fg;
|
|
int i;
|
|
|
|
memset(fvec, 0, sizeof fvec);
|
|
|
|
for(i = 0; i < 761; ++i)
|
|
i[(modq *)fvec] = f[i];
|
|
|
|
mult768_mix2_m256i(fgvec, fvec, (__m256i *)g);
|
|
fg = (modq *)fgvec;
|
|
|
|
h[0] = mod3_freeze(fg[0] + fg[p]);
|
|
for(i = 1; i < p; ++i)
|
|
h[i] = mod3_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
|
|
for(i = p; i < 768; ++i)
|
|
h[i] = 0;
|
|
}
|
|
#endif
|