Merge pull request #787 from despair86/master

make build internally consistent, bump windows version to 0.5.0
Jeff 5 years ago committed by GitHub
commit 325b697e90
No known key found for this signature in database

@ -14,6 +14,7 @@ option(USE_AVX2 "enable avx2 code" )
# User-facing build switches. Each one is a boolean cache entry; the default is
# spelled out as OFF here (option() already defaults to OFF when no value is
# given, so this is the same behaviour, just explicit).

# Network-namespace support (help text notes it is Linux only).
option(USE_NETNS
  "enable networking namespace support. Linux only"
  OFF)

# Workaround switch for the Ryzen FPU/CPUID mismatch described in the help text.
option(AMD_RYZEN_HACK
  "hack for AMD Ryzen FPU bug (support FMA3 and FMA4 in FPU, but does not show in CPUID)"
  OFF)

# Tune the build for the machine doing the compiling.
option(NATIVE_BUILD
  "optimise for host system and FPU, may not be portable"
  OFF)

# Tune the build for old or resource-constrained targets.
option(EMBEDDED_CFG
  "optimise for older hardware or embedded systems"
  OFF)

# Link the compiler runtime, standard library and pthreads statically.
option(STATIC_LINK_RUNTIME
  "link statically against compiler runtime, standard library and pthreads"
  OFF)
@ -31,6 +32,7 @@ option(WARNINGS_AS_ERRORS "treat all warnings as errors. turn off for developmen
@ -111,8 +113,6 @@ endif(WITH_SHELLHOOKS)
# Path to the vendored Abseil checkout; a relative path here is resolved
# against the current source directory by include_directories().
set(ABSEIL_DIR vendor/abseil-cpp)
# Directory-scoped include: applies to every target defined after this point.
# SYSTEM asks CMake to pass it as a system include directory where the
# compiler supports that (typically this keeps Abseil's own warnings quiet).
# NOTE(review): a target_include_directories() call would scope this more
# tightly, but the consuming targets are not visible from here.
include_directories(SYSTEM ${ABSEIL_DIR})
@ -163,7 +163,21 @@ if(NATIVE_BUILD)
set(CRYPTO_FLAGS -march=native -mfpmath=sse -mtune=native)
add_compile_options(${OPTIMIZE_FLAGS} ${CRYPTO_FLAGS})
message(WARNING "This configuration is optimised for older hardware and/or constrained node operation, may result in poor performance on desktop systems")
message(WARNING "For deployment on such systems, all external code (currently, libuv) must also be compiled for the target!")
set(CRYPTO_FLAGS -march=i486 -mtune=i486)
add_compile_options(${OPTIMIZE_FLAGS} ${CRYPTO_FLAGS})
if (FS_LIB STREQUAL "cppbackport")

@ -0,0 +1,49 @@
function(check_working_cxx_atomics64 varname)
# CMAKE_CXX_STANDARD does not propagate to cmake compile tests
#include <atomic>
#include <cstdint>
std::atomic<uint64_t> x (0);
int main() {
uint64_t i = x.load(std::memory_order_relaxed);
return 0;
" ${varname})
message(STATUS "Have working 64bit atomics")
check_library_exists(atomic __atomic_load_8 "" HAVE_CXX_LIBATOMICS64)
message(STATUS "Have 64bit atomics via library")
message(STATUS "Can link with libatomic")
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic! (What does MSVC do to inline atomics?)")
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic!")

@ -24,53 +24,16 @@ endif()
function(check_working_cxx_atomics64 varname)
#include <atomic>
#include <cstdint>
std::atomic<uint64_t> x (0);
int main() {
uint64_t i = x.load(std::memory_order_relaxed);
return 0;
" ${varname})
message(STATUS "Have working 64bit atomics")
check_library_exists(atomic __atomic_load_8 "" HAVE_CXX_LIBATOMICS64)
message(STATUS "Have 64bit atomics via library")
message(STATUS "Can link with libatomic")
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic!")
set(FS_LIB stdc++fs)
get_filename_component(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-unix-linux.c ABSOLUTE)
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Android")
find_library(FS_LIB NAMES c++fs c++experimental stdc++fs)
set(FS_LIB cppbackport)
elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
find_library(FS_LIB NAMES c++fs c++experimental stdc++fs)
set(FS_LIB cppbackport)
@ -96,8 +58,10 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
set(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-unix-sunos.c)
# Apple C++ screws up name decorations in stdc++fs, causing link to fail
# Samsung does not build c++experimental or c++fs in their Apple libc++ pkgsrc build
link_libraries(-lkstat -lsendfile)
set(FS_LIB cppbackport)

@ -8,7 +8,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_compile_options(/EHca /arch:AVX2 /MD)
@ -25,11 +24,13 @@ if(NOT MSVC_VERSION)
add_definitions(-DWINVER=0x0500 -D_WIN32_WINNT=0x0500)
# Wait a minute, if we're not Microsoft C++, nor a Clang paired with Microsoft C++,
# then the only possible option has to be GNU or a GNU-linked Clang!
set(FS_LIB stdc++fs)
set(FS_LIB stdc++fs)
get_filename_component(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-windows.c ABSOLUTE)
get_filename_component(EV_SRC "llarp/ev/ev_win32.cpp" ABSOLUTE)
add_definitions(-DWIN32_LEAN_AND_MEAN -DWIN32 -DWINVER=0x0500)

@ -26,9 +26,9 @@
<method_credential user="lokinet" group="lokinet"/>
<exec_method type="method" name="start" exec="/usr/bin/lokinet" timeout_seconds="60"/>
<exec_method type="method" name="start" exec="/usr/bin/lokinet %{config_file}" timeout_seconds="60"/>
<exec_method type="method" name="stop" exec="/usr/bin/kill -INT &lt;&lt;&lt; /path/to/" timeout_seconds="60"/>
<exec_method type="method" name="stop" exec="/usr/bin/kill -INT &lt;&lt;&lt; `pgrep lokinet`" timeout_seconds="60"/>
<property_group name="startd" type="framework">
<propval name="duration" type="astring" value="child"/>
@ -38,7 +38,7 @@
<property_group name="application" type="application">
<propval name="config_file" type="astring" value="/etc/lokinet.ini"/>
<propval name="config_file" type="astring" value="/etc/loki/lokinet.ini"/>
@ -50,11 +50,11 @@
<loctext xml:lang="C">
LokiNET: Anonymous Network layer thingydoo.

@ -2,30 +2,30 @@
#ifndef blake2b_compress_avx2_H
#define blake2b_compress_avx2_H
#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
#define LOADU128(p) _mm_loadu_si128((__m128i *)(p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
#define LOAD(p) _mm256_load_si256((__m256i *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
#define LOAD(p) _mm256_load_si256((__m256i *)(p))
#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *)(p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
static inline uint64_t
LOADU64(const void *p)
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, \
4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, \
5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ADD(a, b) _mm256_add_epi64(a, b)
#define SUB(a, b) _mm256_sub_epi64(a, b)
@ -40,98 +40,104 @@ LOADU64(const void *p)
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
#define BLAKE2B_G1_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while(0)
#define BLAKE2B_G2_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while (0)
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while (0)
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while(0)
/*
 * BLAKE2B_DIAG_V1(a, b, c, d)
 *
 * Re-orders the four 64-bit lanes of rows d, c and b in place using
 * _mm256_permute4x64_epi64; row a is accepted but not modified.
 * With the immediates below, output lanes (low to high) are taken from:
 *   d: source lanes 3, 0, 1, 2
 *   c: source lanes 2, 3, 0, 1
 *   b: source lanes 1, 2, 3, 0
 * Going by the macro name this is the diagonalisation step between the two
 * halves of a round; BLAKE2B_UNDIAG_V1 applies the inverse permutations.
 * b, c and d must be assignable __m256i lvalues and are evaluated twice.
 */
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while(0)
/*
 * BLAKE2B_UNDIAG_V1(a, b, c, d)
 *
 * Inverse of BLAKE2B_DIAG_V1: the immediates used for d and b are swapped
 * relative to that macro, and c uses the same (self-inverse) two-lane swap.
 * With the immediates below, output lanes (low to high) are taken from:
 *   d: source lanes 1, 2, 3, 0
 *   c: source lanes 2, 3, 0, 1
 *   b: source lanes 3, 0, 1, 2
 * Row a is accepted but not modified. b, c and d must be assignable
 * __m256i lvalues and are evaluated twice.
 */
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while(0)
#include "blake2b-load-avx2.h"
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
do { \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while (0)
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do { \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while (0)
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do { \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = \
XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while (0)
do \
{ \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while(0)
/*
 * BLAKE2B_ROUNDS_V1(a, b, c, d, m)
 *
 * Expands BLAKE2B_ROUND_V1 twelve times, in order, with the literal round
 * indices 0 through 11. The index has to be a literal because
 * BLAKE2B_ROUND_V1 token-pastes it into the BLAKE2B_LOAD_MSG_<r>_<n> macro
 * names. m is forwarded parenthesised; a, b, c, d are forwarded unchanged
 * and are updated in place by each round.
 */
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do \
{ \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while(0)
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
/*
 * BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)
 *
 * Runs the twelve rounds over one message block and folds the result back
 * into a and b, which are both inputs and outputs (assignable __m256i
 * lvalues, evaluated several times).
 *
 * Steps, as written below:
 *   - keep copies of the incoming a and b as iv0 / iv1;
 *   - load working row c from blake2b_IV[0..3];
 *   - load working row d from blake2b_IV[4..7] and XOR it with the vector
 *     whose 64-bit lanes, low to high, are t0, t1, f0, f1;
 *   - run BLAKE2B_ROUNDS_V1 on (a, b, c, d) with message m;
 *   - set a = a ^ c ^ iv0 and b = b ^ d ^ iv1.
 *
 * LOAD is the aligned 256-bit load, so blake2b_IV must be 32-byte aligned.
 * XOR and blake2b_IV come from outside this view.
 * NOTE(review): t0/t1 look like the block counter words and f0/f1 the
 * finalisation flags of BLAKE2b - confirm against the caller.
 */
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do \
{ \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while(0)

@ -2,102 +2,99 @@
#ifndef blake2b_compress_sse41_H
#define blake2b_compress_sse41_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
/*
 * _mm_roti_epi64(x, c)
 *
 * Rotates each 64-bit lane of the __m128i x right by -(c) bits. The count is
 * passed negated: the G1/G2 macros in this header call it with -32, -24, -16
 * and -63. The branch is chosen by comparing -(c) with constants:
 *   32    : swap the two 32-bit halves of each lane (_mm_shuffle_epi32);
 *   24/16 : byte shuffle with the masks r24 / r16, which must be in scope at
 *           the expansion site (not defined in this view - presumably the
 *           byte permutations for those rotations; confirm at the caller);
 *   63    : (x >> 63) ^ (x + x), where x + x stands in for x << 1;
 *   other : (x >> n) ^ (x << (64 - n)) with n = -(c).
 * XOR is equivalent to OR here because the two shifted halves never overlap.
 *
 * NOTE(review): the expansion is a bare conditional expression with no outer
 * parentheses, so it is only safe where the whole macro is the full
 * right-hand side of an assignment, as in G1/G2. x and c are evaluated more
 * than once.
 */
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse41.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

@ -2,102 +2,99 @@
#ifndef blake2b_compress_ssse3_H
#define blake2b_compress_ssse3_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
/*
 * _mm_roti_epi64(x, c)
 *
 * Rotates each 64-bit lane of the __m128i x right by -(c) bits. The count is
 * passed negated: the G1/G2 macros in this header call it with -32, -24, -16
 * and -63. The branch is chosen by comparing -(c) with constants:
 *   32    : swap the two 32-bit halves of each lane (_mm_shuffle_epi32);
 *   24/16 : byte shuffle with the masks r24 / r16, which must be in scope at
 *           the expansion site (not defined in this view - presumably the
 *           byte permutations for those rotations; confirm at the caller);
 *   63    : (x >> 63) ^ (x + x), where x + x stands in for x << 1;
 *   other : (x >> n) ^ (x << (64 - n)) with n = -(c).
 * XOR is equivalent to OR here because the two shifted halves never overlap.
 *
 * NOTE(review): the expansion is a bare conditional expression with no outer
 * parentheses, so it is only safe where the whole macro is the full
 * right-hand side of an assignment, as in G1/G2. x and c are evaluated more
 * than once.
 */
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse2.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

@ -1,340 +1,388 @@
#ifndef blake2b_load_avx2_H
#define blake2b_load_avx2_H
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do { \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do { \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do { \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do { \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do { \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_1_1 .. BLAKE2B_LOAD_MSG_1_4: message-load macros of
 * group 1. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_2_1 .. BLAKE2B_LOAD_MSG_2_4: message-load macros of
 * group 2. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_3_1 .. BLAKE2B_LOAD_MSG_3_4: message-load macros of
 * group 3. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_4_1 .. BLAKE2B_LOAD_MSG_4_4: message-load macros of
 * group 4. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_5_1 .. BLAKE2B_LOAD_MSG_5_4: message-load macros of
 * group 5. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_6_1 .. BLAKE2B_LOAD_MSG_6_4: message-load macros of
 * group 6. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_7_1 .. BLAKE2B_LOAD_MSG_7_4: message-load macros of
 * group 7. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_8_1 .. BLAKE2B_LOAD_MSG_8_4: message-load macros of
 * group 8. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded. _8_3 copies m6 into t0 and _8_4 copies m2 into t1
 * directly, without an intrinsic call.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do \
{ \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_9_1 .. BLAKE2B_LOAD_MSG_9_4: message-load macros of
 * group 9. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_10_1 .. BLAKE2B_LOAD_MSG_10_4: message-load macros of
 * group 10; the bodies use the same operands as the BLAKE2B_LOAD_MSG_0_*
 * macros. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
/*
 * BLAKE2B_LOAD_MSG_11_1 .. BLAKE2B_LOAD_MSG_11_4: message-load macros of
 * group 11; the bodies use the same operands as the BLAKE2B_LOAD_MSG_1_*
 * macros. Each expands to a single do/while(0) statement that overwrites
 * t0 and t1, then sets b0 = _mm256_blend_epi32(t0, t1, 0xF0): 32-bit
 * elements 0-3 come from t0 and elements 4-7 from t1. Only b0 is a
 * parameter; t0, t1 and the m0..m7 operands are resolved in the scope where
 * the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)

@ -16,149 +16,149 @@
#ifndef blake2b_load_sse2_H
#define blake2b_load_sse2_H
/*
 * LOAD_MSG_<g>_<i>(b0, b1) -- SSE2 message-load macros, groups 0..11.
 *
 * Each macro assigns both arguments with _mm_set_epi64x(hi, lo): the second
 * argument lands in the low 64 bits of the vector, the first in the high
 * 64 bits. m0..m15 are not parameters; they must be 64-bit integers visible
 * in the scope where the macro is expanded.
 *
 * Every body is wrapped in do { ... } while (0) so that the two assignments
 * form ONE statement. Without the wrapper, `if (c) LOAD_MSG_0_1(a, b);`
 * would guard only the first assignment, and an `else` following it would
 * not compile. Call sites keep writing `LOAD_MSG_x_y(b0, b1);`.
 */
#define LOAD_MSG_0_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m2, m0); \
        b1 = _mm_set_epi64x(m6, m4); \
    } while (0)
#define LOAD_MSG_0_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m3, m1); \
        b1 = _mm_set_epi64x(m7, m5); \
    } while (0)
#define LOAD_MSG_0_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m10, m8); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while (0)
#define LOAD_MSG_0_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m11, m9); \
        b1 = _mm_set_epi64x(m15, m13); \
    } while (0)
#define LOAD_MSG_1_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m4, m14); \
        b1 = _mm_set_epi64x(m13, m9); \
    } while (0)
#define LOAD_MSG_1_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m6, m15); \
    } while (0)
#define LOAD_MSG_1_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m0, m1); \
        b1 = _mm_set_epi64x(m5, m11); \
    } while (0)
#define LOAD_MSG_1_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m2, m12); \
        b1 = _mm_set_epi64x(m3, m7); \
    } while (0)
#define LOAD_MSG_2_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m12, m11); \
        b1 = _mm_set_epi64x(m15, m5); \
    } while (0)
#define LOAD_MSG_2_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m0, m8); \
        b1 = _mm_set_epi64x(m13, m2); \
    } while (0)
#define LOAD_MSG_2_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m3, m10); \
        b1 = _mm_set_epi64x(m9, m7); \
    } while (0)
#define LOAD_MSG_2_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m6, m14); \
        b1 = _mm_set_epi64x(m4, m1); \
    } while (0)
#define LOAD_MSG_3_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m3, m7); \
        b1 = _mm_set_epi64x(m11, m13); \
    } while (0)
#define LOAD_MSG_3_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m1, m9); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while (0)
#define LOAD_MSG_3_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m5, m2); \
        b1 = _mm_set_epi64x(m15, m4); \
    } while (0)
#define LOAD_MSG_3_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m10, m6); \
        b1 = _mm_set_epi64x(m8, m0); \
    } while (0)
#define LOAD_MSG_4_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m5, m9); \
        b1 = _mm_set_epi64x(m10, m2); \
    } while (0)
#define LOAD_MSG_4_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m7, m0); \
        b1 = _mm_set_epi64x(m15, m4); \
    } while (0)
#define LOAD_MSG_4_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m11, m14); \
        b1 = _mm_set_epi64x(m3, m6); \
    } while (0)
#define LOAD_MSG_4_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m12, m1); \
        b1 = _mm_set_epi64x(m13, m8); \
    } while (0)
#define LOAD_MSG_5_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m6, m2); \
        b1 = _mm_set_epi64x(m8, m0); \
    } while (0)
#define LOAD_MSG_5_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m10, m12); \
        b1 = _mm_set_epi64x(m3, m11); \
    } while (0)
#define LOAD_MSG_5_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m7, m4); \
        b1 = _mm_set_epi64x(m1, m15); \
    } while (0)
#define LOAD_MSG_5_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m5, m13); \
        b1 = _mm_set_epi64x(m9, m14); \
    } while (0)
#define LOAD_MSG_6_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m1, m12); \
        b1 = _mm_set_epi64x(m4, m14); \
    } while (0)
#define LOAD_MSG_6_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m15, m5); \
        b1 = _mm_set_epi64x(m10, m13); \
    } while (0)
#define LOAD_MSG_6_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m6, m0); \
        b1 = _mm_set_epi64x(m8, m9); \
    } while (0)
#define LOAD_MSG_6_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m3, m7); \
        b1 = _mm_set_epi64x(m11, m2); \
    } while (0)
#define LOAD_MSG_7_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m7, m13); \
        b1 = _mm_set_epi64x(m3, m12); \
    } while (0)
#define LOAD_MSG_7_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m14, m11); \
        b1 = _mm_set_epi64x(m9, m1); \
    } while (0)
#define LOAD_MSG_7_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m15, m5); \
        b1 = _mm_set_epi64x(m2, m8); \
    } while (0)
#define LOAD_MSG_7_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m4, m0); \
        b1 = _mm_set_epi64x(m10, m6); \
    } while (0)
#define LOAD_MSG_8_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m14, m6); \
        b1 = _mm_set_epi64x(m0, m11); \
    } while (0)
#define LOAD_MSG_8_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m9, m15); \
        b1 = _mm_set_epi64x(m8, m3); \
    } while (0)
#define LOAD_MSG_8_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m13, m12); \
        b1 = _mm_set_epi64x(m10, m1); \
    } while (0)
#define LOAD_MSG_8_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m7, m2); \
        b1 = _mm_set_epi64x(m5, m4); \
    } while (0)
#define LOAD_MSG_9_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m1, m7); \
    } while (0)
#define LOAD_MSG_9_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m4, m2); \
        b1 = _mm_set_epi64x(m5, m6); \
    } while (0)
#define LOAD_MSG_9_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m9, m15); \
        b1 = _mm_set_epi64x(m13, m3); \
    } while (0)
#define LOAD_MSG_9_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m14, m11); \
        b1 = _mm_set_epi64x(m0, m12); \
    } while (0)
#define LOAD_MSG_10_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m2, m0); \
        b1 = _mm_set_epi64x(m6, m4); \
    } while (0)
#define LOAD_MSG_10_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m3, m1); \
        b1 = _mm_set_epi64x(m7, m5); \
    } while (0)
#define LOAD_MSG_10_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m10, m8); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while (0)
#define LOAD_MSG_10_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m11, m9); \
        b1 = _mm_set_epi64x(m15, m13); \
    } while (0)
#define LOAD_MSG_11_1(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m4, m14); \
        b1 = _mm_set_epi64x(m13, m9); \
    } while (0)
#define LOAD_MSG_11_2(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m6, m15); \
    } while (0)
#define LOAD_MSG_11_3(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m0, m1); \
        b1 = _mm_set_epi64x(m5, m11); \
    } while (0)
#define LOAD_MSG_11_4(b0, b1) \
    do { \
        b0 = _mm_set_epi64x(m2, m12); \
        b1 = _mm_set_epi64x(m3, m7); \
    } while (0)
/*
 * LOAD_MSG_<g>_<i>(b0, b1) -- SSE2 message-load macros, groups 0..11.
 *
 * Each macro assigns both arguments with _mm_set_epi64x(hi, lo): the second
 * argument lands in the low 64 bits of the vector, the first in the high
 * 64 bits. m0..m15 are not parameters; they must be 64-bit integers visible
 * in the scope where the macro is expanded.
 *
 * Every body is wrapped in do { ... } while(0) so that the two assignments
 * form ONE statement. Without the wrapper, `if (c) LOAD_MSG_0_1(a, b);`
 * would guard only the first assignment, and an `else` following it would
 * not compile. Call sites keep writing `LOAD_MSG_x_y(b0, b1);`.
 */
#define LOAD_MSG_0_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m2, m0); \
        b1 = _mm_set_epi64x(m6, m4); \
    } while(0)
#define LOAD_MSG_0_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m3, m1); \
        b1 = _mm_set_epi64x(m7, m5); \
    } while(0)
#define LOAD_MSG_0_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m10, m8); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while(0)
#define LOAD_MSG_0_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m11, m9); \
        b1 = _mm_set_epi64x(m15, m13); \
    } while(0)
#define LOAD_MSG_1_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m4, m14); \
        b1 = _mm_set_epi64x(m13, m9); \
    } while(0)
#define LOAD_MSG_1_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m6, m15); \
    } while(0)
#define LOAD_MSG_1_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m0, m1); \
        b1 = _mm_set_epi64x(m5, m11); \
    } while(0)
#define LOAD_MSG_1_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m2, m12); \
        b1 = _mm_set_epi64x(m3, m7); \
    } while(0)
#define LOAD_MSG_2_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m12, m11); \
        b1 = _mm_set_epi64x(m15, m5); \
    } while(0)
#define LOAD_MSG_2_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m0, m8); \
        b1 = _mm_set_epi64x(m13, m2); \
    } while(0)
#define LOAD_MSG_2_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m3, m10); \
        b1 = _mm_set_epi64x(m9, m7); \
    } while(0)
#define LOAD_MSG_2_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m6, m14); \
        b1 = _mm_set_epi64x(m4, m1); \
    } while(0)
#define LOAD_MSG_3_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m3, m7); \
        b1 = _mm_set_epi64x(m11, m13); \
    } while(0)
#define LOAD_MSG_3_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m1, m9); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while(0)
#define LOAD_MSG_3_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m5, m2); \
        b1 = _mm_set_epi64x(m15, m4); \
    } while(0)
#define LOAD_MSG_3_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m10, m6); \
        b1 = _mm_set_epi64x(m8, m0); \
    } while(0)
#define LOAD_MSG_4_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m5, m9); \
        b1 = _mm_set_epi64x(m10, m2); \
    } while(0)
#define LOAD_MSG_4_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m7, m0); \
        b1 = _mm_set_epi64x(m15, m4); \
    } while(0)
#define LOAD_MSG_4_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m11, m14); \
        b1 = _mm_set_epi64x(m3, m6); \
    } while(0)
#define LOAD_MSG_4_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m12, m1); \
        b1 = _mm_set_epi64x(m13, m8); \
    } while(0)
#define LOAD_MSG_5_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m6, m2); \
        b1 = _mm_set_epi64x(m8, m0); \
    } while(0)
#define LOAD_MSG_5_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m10, m12); \
        b1 = _mm_set_epi64x(m3, m11); \
    } while(0)
#define LOAD_MSG_5_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m7, m4); \
        b1 = _mm_set_epi64x(m1, m15); \
    } while(0)
#define LOAD_MSG_5_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m5, m13); \
        b1 = _mm_set_epi64x(m9, m14); \
    } while(0)
#define LOAD_MSG_6_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m1, m12); \
        b1 = _mm_set_epi64x(m4, m14); \
    } while(0)
#define LOAD_MSG_6_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m15, m5); \
        b1 = _mm_set_epi64x(m10, m13); \
    } while(0)
#define LOAD_MSG_6_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m6, m0); \
        b1 = _mm_set_epi64x(m8, m9); \
    } while(0)
#define LOAD_MSG_6_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m3, m7); \
        b1 = _mm_set_epi64x(m11, m2); \
    } while(0)
#define LOAD_MSG_7_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m7, m13); \
        b1 = _mm_set_epi64x(m3, m12); \
    } while(0)
#define LOAD_MSG_7_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m14, m11); \
        b1 = _mm_set_epi64x(m9, m1); \
    } while(0)
#define LOAD_MSG_7_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m15, m5); \
        b1 = _mm_set_epi64x(m2, m8); \
    } while(0)
#define LOAD_MSG_7_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m4, m0); \
        b1 = _mm_set_epi64x(m10, m6); \
    } while(0)
#define LOAD_MSG_8_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m14, m6); \
        b1 = _mm_set_epi64x(m0, m11); \
    } while(0)
#define LOAD_MSG_8_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m9, m15); \
        b1 = _mm_set_epi64x(m8, m3); \
    } while(0)
#define LOAD_MSG_8_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m13, m12); \
        b1 = _mm_set_epi64x(m10, m1); \
    } while(0)
#define LOAD_MSG_8_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m7, m2); \
        b1 = _mm_set_epi64x(m5, m4); \
    } while(0)
#define LOAD_MSG_9_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m1, m7); \
    } while(0)
#define LOAD_MSG_9_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m4, m2); \
        b1 = _mm_set_epi64x(m5, m6); \
    } while(0)
#define LOAD_MSG_9_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m9, m15); \
        b1 = _mm_set_epi64x(m13, m3); \
    } while(0)
#define LOAD_MSG_9_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m14, m11); \
        b1 = _mm_set_epi64x(m0, m12); \
    } while(0)
#define LOAD_MSG_10_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m2, m0); \
        b1 = _mm_set_epi64x(m6, m4); \
    } while(0)
#define LOAD_MSG_10_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m3, m1); \
        b1 = _mm_set_epi64x(m7, m5); \
    } while(0)
#define LOAD_MSG_10_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m10, m8); \
        b1 = _mm_set_epi64x(m14, m12); \
    } while(0)
#define LOAD_MSG_10_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m11, m9); \
        b1 = _mm_set_epi64x(m15, m13); \
    } while(0)
#define LOAD_MSG_11_1(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m4, m14); \
        b1 = _mm_set_epi64x(m13, m9); \
    } while(0)
#define LOAD_MSG_11_2(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m8, m10); \
        b1 = _mm_set_epi64x(m6, m15); \
    } while(0)
#define LOAD_MSG_11_3(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m0, m1); \
        b1 = _mm_set_epi64x(m5, m11); \
    } while(0)
#define LOAD_MSG_11_4(b0, b1) \
    do \
    { \
        b0 = _mm_set_epi64x(m2, m12); \
        b1 = _mm_set_epi64x(m3, m7); \
    } while(0)

@ -16,292 +16,340 @@
#ifndef blake2b_load_sse41_H
#define blake2b_load_sse41_H
/*
 * LOAD_MSG_0_1 .. LOAD_MSG_0_4: message-load macros of group 0
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_0_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_0_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
/*
 * LOAD_MSG_1_1 .. LOAD_MSG_1_4: message-load macros of group 1
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_1_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_1_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_1_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_1_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
/*
 * LOAD_MSG_2_1 .. LOAD_MSG_2_4: message-load macros of group 2
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_2_1(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while (0)
#define LOAD_MSG_2_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while (0)
#define LOAD_MSG_2_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while (0)
#define LOAD_MSG_2_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while (0)
/*
 * LOAD_MSG_3_1 .. LOAD_MSG_3_4: message-load macros of group 3
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_3_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while (0)
#define LOAD_MSG_3_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_3_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_3_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
/*
 * LOAD_MSG_4_1 .. LOAD_MSG_4_4: message-load macros of group 4
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_4_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while (0)
#define LOAD_MSG_4_2(b0, b1) \
do { \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_4_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while (0)
#define LOAD_MSG_4_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while (0)
/*
 * LOAD_MSG_5_1 .. LOAD_MSG_5_4: message-load macros of group 5
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_5_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_5_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while (0)
#define LOAD_MSG_5_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while (0)
#define LOAD_MSG_5_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while (0)
/*
 * LOAD_MSG_6_1 .. LOAD_MSG_6_4: message-load macros of group 6
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_6_1(b0, b1) \
do { \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while (0)
#define LOAD_MSG_6_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while (0)
#define LOAD_MSG_6_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while (0)
#define LOAD_MSG_6_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while (0)
/*
 * LOAD_MSG_7_1 .. LOAD_MSG_7_4: message-load macros of group 7
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_7_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while (0)
#define LOAD_MSG_7_2(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_7_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while (0)
#define LOAD_MSG_7_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while (0)
/*
 * LOAD_MSG_8_1 .. LOAD_MSG_8_4: message-load macros of group 8
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * _8_3 copies m6 into b0 and _8_4 copies m2 into b1 without an intrinsic call.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_8_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while (0)
#define LOAD_MSG_8_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while (0)
#define LOAD_MSG_8_3(b0, b1) \
do { \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while (0)
#define LOAD_MSG_8_4(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while (0)
/*
 * LOAD_MSG_9_1 .. LOAD_MSG_9_4: message-load macros of group 9
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_9_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while (0)
#define LOAD_MSG_9_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while (0)
#define LOAD_MSG_9_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while (0)
#define LOAD_MSG_9_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while (0)
/*
 * LOAD_MSG_10_1 .. LOAD_MSG_10_4: message-load macros of group 10
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_10_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_10_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
/*
 * LOAD_MSG_11_1 .. LOAD_MSG_11_4: message-load macros of group 11
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_11_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_11_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_11_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_11_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
/*
 * LOAD_MSG_0_1 .. LOAD_MSG_0_4: message-load macros of group 0
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
/*
 * LOAD_MSG_1_1 .. LOAD_MSG_1_4: message-load macros of group 1
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
/*
 * LOAD_MSG_2_1 .. LOAD_MSG_2_4: message-load macros of group 2
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
/*
 * LOAD_MSG_3_1 .. LOAD_MSG_3_4: message-load macros of group 3
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
/*
 * LOAD_MSG_4_1 .. LOAD_MSG_4_4: message-load macros of group 4
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
/*
 * LOAD_MSG_5_1 .. LOAD_MSG_5_4: message-load macros of group 5
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
/*
 * LOAD_MSG_6_1 .. LOAD_MSG_6_4: message-load macros of group 6
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
/*
 * LOAD_MSG_7_1 .. LOAD_MSG_7_4: message-load macros of group 7
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
/*
 * LOAD_MSG_8_1 .. LOAD_MSG_8_4: message-load macros of group 8
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * _8_3 copies m6 into b0 and _8_4 copies m2 into b1 without an intrinsic call.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
/*
 * LOAD_MSG_9_1 .. LOAD_MSG_9_4: message-load macros of group 9
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
/*
 * LOAD_MSG_10_1 .. LOAD_MSG_10_4: message-load macros of group 10
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
/*
 * LOAD_MSG_11_1 .. LOAD_MSG_11_4: message-load macros of group 11
 * (blake2b_load_sse41_H section). Each expands to a single do/while(0)
 * statement that assigns both arguments, b0 and b1. The m0..m7 operands are
 * not parameters: they are resolved in the scope where the macro is expanded.
 * NOTE(review): the first index presumably names the BLAKE2b round -- confirm.
 */
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)

@ -24,15 +24,17 @@
#ifndef __amd64__
#ifdef __clang__
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __target__("sse2")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)
return (__m128i){ __a, 0 };
return (__m128i){__a, 0};

@ -1,86 +1,89 @@
if (bytes > 0) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
_mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
_mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
_mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
_mm_storeu_si128((__m128i*) (partialblock + 48), x_3);
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
sodium_memzero(partialblock, sizeof partialblock);
if(bytes > 0)
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
_mm_storeu_si128((__m128i*)(partialblock + 0), x_0);
_mm_storeu_si128((__m128i*)(partialblock + 16), x_1);
_mm_storeu_si128((__m128i*)(partialblock + 32), x_2);
_mm_storeu_si128((__m128i*)(partialblock + 48), x_3);
for(i = 0; i < bytes; i++)
c[i] = m[i] ^ partialblock[i];
sodium_memzero(partialblock, sizeof partialblock);

@ -1,98 +1,101 @@
while (bytes >= 64) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48)));
_mm_storeu_si128((__m128i*) (c + 0), x_0);
_mm_storeu_si128((__m128i*) (c + 16), x_1);
_mm_storeu_si128((__m128i*) (c + 32), x_2);
_mm_storeu_si128((__m128i*) (c + 48), x_3);
in12 = x[12];
in13 = x[13];
if (in12 == 0) {
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;
while(bytes >= 64)
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
_mm_storeu_si128((__m128i*)(c + 0), x_0);
_mm_storeu_si128((__m128i*)(c + 16), x_1);
_mm_storeu_si128((__m128i*)(c + 32), x_2);
_mm_storeu_si128((__m128i*)(c + 48), x_3);
in12 = x[12];
in13 = x[13];
if(in12 == 0)
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;

@ -1,174 +1,177 @@
/*
 * VEC4_ROT(A, IMM): rotate every 32-bit lane of the 128-bit vector A left by
 * IMM bits, built as (A << IMM) | (A >> (32 - IMM)).
 *
 * Both arguments are expanded twice and IMM is pasted unparenthesized into
 * `32 - IMM`, so pass a side-effect-free vector expression and a plain
 * integer literal.
 */
#define VEC4_ROT(A, IMM) \
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
if (bytes >= 256) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while (bytes >= 256) {
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
if(bytes >= 256)
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while(bytes >= 256)
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
bytes -= 256;
c += 256;
m += 256;
bytes -= 256;
c += 256;
m += 256;
#undef VEC4_ROT

@ -1,346 +1,344 @@
/*
 * VEC8_ROT(A, IMM): rotate every 32-bit lane of the 256-bit vector A left by
 * IMM bits, built as (A << IMM) | (A >> (32 - IMM)).
 *
 * Both arguments are expanded twice and IMM is pasted unparenthesized into
 * `32 - IMM`, so pass a side-effect-free vector expression and a plain
 * integer literal.
 */
#define VEC8_ROT(A, IMM) \
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
/* implements a vector quarter round by-the-book (naive!) */
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles
* (8 & 16) (not as good as previous) */
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/*
 * VEC8_LINE1..VEC8_LINE4: the four add/xor/rotate steps of one vector quarter
 * round, split apart so that callers can interleave steps of several groups.
 *
 * A, B, C, D are index tokens pasted onto `x_` to select the __m256i state
 * variables x_A, x_B, x_C, x_D. Every macro takes all four tokens for a
 * uniform signature but only touches three of them. `rot16` and `rot8` are not
 * parameters: they must be in scope where the macro is expanded. Each macro
 * expands to two statements with no enclosing braces and no trailing
 * semicolon, so it must not be used as the body of an unbraced if/for.
 */
/* Step 1: x_A += x_B; x_D = (x_D ^ x_A) byte-shuffled with rot16 (the shuffle
 * stands in for the rotate by 16). C is unused. */
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
/* Step 2: x_C += x_D; x_B = (x_B ^ x_C) rotated left by 12. A is unused. */
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
/* Step 3: x_A += x_B; x_D = (x_D ^ x_A) byte-shuffled with rot8 (the shuffle
 * stands in for the rotate by 8). C is unused. */
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
/* Step 4: x_C += x_D; x_B = (x_B ^ x_C) rotated left by 7. A is unused. */
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
/*
 * VEC8_LINE1..VEC8_LINE4: the four add/xor/rotate steps of one vector quarter
 * round, split apart so that callers can interleave steps of several groups.
 *
 * A, B, C, D are index tokens pasted onto `x_` to select the __m256i state
 * variables x_A, x_B, x_C, x_D. Every macro takes all four tokens for a
 * uniform signature but only touches three of them. `rot16` and `rot8` are not
 * parameters: they must be in scope where the macro is expanded. Each macro
 * expands to two statements with no enclosing braces and no trailing
 * semicolon, so it must not be used as the body of an unbraced if/for.
 */
/* Step 1: x_A += x_B; x_D = (x_D ^ x_A) byte-shuffled with rot16 (the shuffle
 * stands in for the rotate by 16). C is unused. */
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
/* Step 2: x_C += x_D; x_B = (x_B ^ x_C) rotated left by 12. A is unused. */
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
/* Step 3: x_A += x_B; x_D = (x_D ^ x_A) byte-shuffled with rot8 (the shuffle
 * stands in for the rotate by 8). C is unused. */
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
/* Step 4: x_C += x_D; x_B = (x_B ^ x_C) rotated left by 7. A is unused. */
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
/*
 * VEC8_ROUND_SEQ: run the four quarter-round steps over four index groups
 * (A1..D1, A2..D2, A3..D3, A4..D4), step-major: VEC8_LINE1 is issued for all
 * four groups, then VEC8_LINE2 for all four, then VEC8_LINE3, then VEC8_LINE4.
 *
 * The result only matches four independent quarter rounds when the groups name
 * disjoint x_ variables; the calls visible in this file pass disjoint index
 * sets. Inherits the VEC8_LINE* requirements: x_<n>, rot16 and rot8 must be in
 * scope, and the expansion is a bare statement sequence without braces.
 */
#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
/*
 * VEC8_ROUND_HALF: same sixteen VEC8_LINE* steps as VEC8_ROUND_SEQ in a
 * different issue order: groups 1 and 2 go through all four steps (interleaved
 * pairwise) first, then groups 3 and 4 go through all four steps.
 *
 * NOTE(review): equivalent to the other orderings only when the four groups
 * name disjoint x_ variables; no caller of this variant is visible here.
 * Inherits the VEC8_LINE* requirements: x_<n>, rot16 and rot8 must be in
 * scope, and the expansion is a bare statement sequence without braces.
 */
#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
/*
 * VEC8_ROUND_HALFANDHALF: same sixteen VEC8_LINE* steps as VEC8_ROUND_SEQ in a
 * third issue order: steps 1-2 for groups 1 and 2, steps 1-2 for groups 3 and
 * 4, then steps 3-4 for groups 1 and 2, and finally steps 3-4 for groups 3
 * and 4.
 *
 * NOTE(review): equivalent to the other orderings only when the four groups
 * name disjoint x_ variables; no caller of this variant is visible here.
 * Inherits the VEC8_LINE* requirements: x_<n>, rot16 and rot8 must be in
 * scope, and the expansion is a bare statement sequence without braces.
 */
#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
A4, B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
D4) \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
if (bytes >= 512) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast as (if not a bit faster than) the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while (bytes >= 512) {
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, D4)
if(bytes >= 512)
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast as (if not a bit faster than) the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while(bytes >= 512)
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = \
_mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
t_##B = \
_mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
t_##C = \
_mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
t_##D = \
_mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
t_##A2 = _mm256_xor_si256(t_##A2, \
_mm256_loadu_si256((__m256i*) (m + 256))); \
t_##B2 = _mm256_xor_si256(t_##B2, \
_mm256_loadu_si256((__m256i*) (m + 320))); \
t_##C2 = _mm256_xor_si256(t_##C2, \
_mm256_loadu_si256((__m256i*) (m + 384))); \
t_##D2 = _mm256_xor_si256(t_##D2, \
_mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), t_##A); \
_mm256_storeu_si256((__m256i*) (c + 64), t_##B); \
_mm256_storeu_si256((__m256i*) (c + 128), t_##C); \
_mm256_storeu_si256((__m256i*) (c + 192), t_##D); \
_mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
{ \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
t_##B = _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
t_##C = _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
t_##D = _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
t_##A2 = \
_mm256_xor_si256(t_##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
t_##B2 = \
_mm256_xor_si256(t_##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
t_##C2 = \
_mm256_xor_si256(t_##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
t_##D2 = \
_mm256_xor_si256(t_##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), t_##A); \
_mm256_storeu_si256((__m256i*)(c + 64), t_##B); \
_mm256_storeu_si256((__m256i*)(c + 128), t_##C); \
_mm256_storeu_si256((__m256i*)(c + 192), t_##D); \
_mm256_storeu_si256((__m256i*)(c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), t_##D2); \
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
bytes -= 512;
c += 512;
m += 512;
#undef VEC8_ROT

@ -4,19 +4,18 @@
#include <stdint.h>
typedef struct crypto_stream_chacha20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
typedef struct crypto_stream_chacha20_implementation
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint32_t ic, const unsigned char *k);
} crypto_stream_chacha20_implementation;

@ -74,8 +74,8 @@ randombytes_stir(void)
var window_ = 'object' == = typeof window ? window : self;
var crypto_ = typeof window_.crypto != = 'undefined' ? window_.crypto
: window_.msCrypto;
var crypto_ = typeof window_.crypto != =
'undefined' ? window_.crypto : window_.msCrypto;
var randomValuesStandard = function()
var buf = new Uint32Array(1);

#ifndef TLS
#ifdef _WIN32
#ifdef _WIN32
#ifdef _MSC_VER
#define TLS __declspec(thread)

File diff suppressed because it is too large Load Diff

@ -1,40 +1,52 @@
{ 25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626, -11754271, -6079156, 2047605 },
{ -12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384, 19500929, -15469378 },
{ -8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919, 11864899, -24514362, -4438546 }
{ 15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600, -14772189, 28944400, -1550024 },
{ 16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577, -11775962, 7689662, 11199574 },
{ 30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774, 10017326, -17749093, -9920357 }
{ 10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885, 14515107, -15438304, 10819380 },
{ 4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668, 12483688, -12668491, 5581306 },
{ 19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350, 13850243, -23678021, -15815942 }
{ 5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134, -23952439, -15175766 },
{ -30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025, 16520125, 30598449, 7715701 },
{ 28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660, 1370708, 29794553, -1409300 }
{ -22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211, -1361450, -13062696, 13821877 },
{ -6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028, -7212327, 18853322, -14220951 },
{ 4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358, -10431137, 2207753, -3209784 }
{ -25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364, -663000, -31111463, -16132436 },
{ 25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789, 15725684, 171356, 6466918 },
{ 23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339, -14088058, -30714912, 16193877 }
{ -33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398, 4729455, -18074513, 9256800 },
{ -25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405, 9761698, -19827198, 630305 },
{ -13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551, -15960994, -2449256, -14291300 }
{ -3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575, 15033784, 25105118, -7894876 },
{ -24326370, 15950226, -31801215, -14592823, -11662737, -5090925, 1573892, -2625887, 2198790, -15804619 },
{ -3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022, -16236442, -32461234, -12290683 }
{{25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
-11754271, -6079156, 2047605},
{-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384,
19500929, -15469378},
{-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
11864899, -24514362, -4438546}},
{{15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
-14772189, 28944400, -1550024},
{16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
-11775962, 7689662, 11199574},
{30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
10017326, -17749093, -9920357}},
{{10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
14515107, -15438304, 10819380},
{4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
12483688, -12668491, 5581306},
{19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
13850243, -23678021, -15815942}},
{{5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134,
-23952439, -15175766},
{-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
16520125, 30598449, 7715701},
{28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
1370708, 29794553, -1409300}},
{{-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
-1361450, -13062696, 13821877},
{-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
-7212327, 18853322, -14220951},
{4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
-10431137, 2207753, -3209784}},
{{-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
-663000, -31111463, -16132436},
{25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
15725684, 171356, 6466918},
{23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
-14088058, -30714912, 16193877}},
{{-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
4729455, -18074513, 9256800},
{-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
9761698, -19827198, 630305},
{-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
-15960994, -2449256, -14291300}},
{-3151181, -5046075, 9282714, 6866145, -31907062,
-863023, -18940575, 15033784, 25105118, -7894876},
{-24326370, 15950226, -31801215, -14592823, -11662737,
-5090925, 1573892, -2625887, 2198790, -15804619},
-3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
-16236442, -32461234, -12290683

@ -1,20 +1,18 @@
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
static const fe25519 d = {
-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555
static const fe25519 d = {-10913610, 13857413, -15372611, 6949391, 114729,
-8787816, -6275908, -3247719, -18696448, -12055116};
/* 2 * d =
* 16295367250680780974490674513165176452449235426866156013048779062215315747161
static const fe25519 d2 = {
-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199 };
static const fe25519 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
15978800, -12551817, -6495438, 29715968, 9444199};
/* sqrt(-1) */
static const fe25519 sqrtm1 = {
-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
static const fe25519 sqrtm1 = {-32595792, -7943725, 9377950, 3500415,
12389472, -272473, -25146209, -2005654,
326686, 11406482};
/* A = 486662 */
static const fe25519 curve25519_A = {
486662, 0, 0, 0, 0, 0, 0, 0, 0, 0
static const fe25519 curve25519_A = {486662, 0, 0, 0, 0, 0, 0, 0, 0, 0};

@ -5,70 +5,70 @@
fe25519_frombytes(fe25519 h, const unsigned char *s)
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t) 1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t) 1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t) 1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t) 1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t) 1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t) 1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t) 1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t) 1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t) 1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t) 1L << 26);
h[0] = (int32_t) h0;
h[1] = (int32_t) h1;
h[2] = (int32_t) h2;
h[3] = (int32_t) h3;
h[4] = (int32_t) h4;
h[5] = (int32_t) h5;
h[6] = (int32_t) h6;
h[7] = (int32_t) h7;
h[8] = (int32_t) h8;
h[9] = (int32_t) h9;
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t)1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t)1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t)1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t)1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t)1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t)1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t)1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t)1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t)1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t)1L << 26);
h[0] = (int32_t)h0;
h[1] = (int32_t)h1;
h[2] = (int32_t)h2;
h[3] = (int32_t)h3;
h[4] = (int32_t)h4;
h[5] = (int32_t)h5;
h[6] = (int32_t)h6;
h[7] = (int32_t)h7;
h[8] = (int32_t)h8;
h[9] = (int32_t)h9;
@ -99,76 +99,77 @@ fe25519_frombytes(fe25519 h, const unsigned char *s)
static void
fe25519_reduce(fe25519 h, const fe25519 f)
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t) 1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t) 1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t) 1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t) 1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t) 1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t) 1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t) 1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t) 1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t) 1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t) 1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
carry8, carry9;
q = (19 * h9 + ((uint32_t)1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t)1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t)1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t)1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t)1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t)1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t)1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t)1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t)1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t)1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t)1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
@ -182,39 +183,39 @@ fe25519_reduce(fe25519 h, const fe25519 f)
fe25519_tobytes(unsigned char *s, const fe25519 h)
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t) 1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t) 1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t) 1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t) 1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t) 1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t) 1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t) 1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t) 1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t)1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t)1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t)1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t)1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t)1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t)1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t)1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t)1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;

@ -17,4 +17,3 @@
#define REDMASK51 crypto_scalarmult_curve25519_sandy2x_REDMASK51
#endif /* ifndef consts_namespace_H */

@ -26,13 +26,14 @@ crypto_scalarmult_curve25519_sandy2x(unsigned char *q, const unsigned char *n,
const unsigned char *p)
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0; i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
t[i] = n[i];
t[0] &= 248;
t[31] &= 127;
@ -72,13 +73,14 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
const unsigned char *n)
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0;i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
t[i] = n[i];
t[0] &= 248;
t[31] &= 127;
@ -106,9 +108,8 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
struct crypto_scalarmult_curve25519_implementation
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult = ) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base = ) crypto_scalarmult_curve25519_sandy2x_base
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult =) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base =) crypto_scalarmult_curve25519_sandy2x_base};

@ -21,6 +21,7 @@ Bounds on each t[i] vary depending on context.
#define fe_frombytes crypto_scalarmult_curve25519_sandy2x_fe_frombytes
extern void fe_frombytes(fe, const unsigned char *);
extern void
fe_frombytes(fe, const unsigned char *);

@ -9,7 +9,8 @@
#define fe51_H
#ifdef __cplusplus
extern "C" {
extern "C"
#include <stdint.h>
@ -17,16 +18,19 @@ extern "C" {
#include "fe51_namespace.h"
typedef struct
typedef struct
uint64_t v[5];
extern void fe51_pack(unsigned char *, const fe51 *);
extern void fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void fe51_nsquare(fe51 *, const fe51 *, int);
extern void fe51_invert(fe51 *, const fe51 *);
} fe51;
extern void
fe51_pack(unsigned char *, const fe51 *);
extern void
fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void
fe51_nsquare(fe51 *, const fe51 *, int);
extern void
fe51_invert(fe51 *, const fe51 *);
#ifdef __cplusplus

@ -12,47 +12,47 @@
fe51_invert(fe51 *r, const fe51 *x)
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2,x);
/* 4 */ fe51_square(&t,&z2);
/* 8 */ fe51_square(&t,&t);
/* 9 */ fe51_mul(&z9,&t,x);
/* 11 */ fe51_mul(&z11,&z9,&z2);
/* 22 */ fe51_square(&t,&z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0,&t,&z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t,&z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0,&t,&z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t,&z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0,&t,&z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t,&z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t,&t,&z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t,&t,10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0,&t,&z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t,&z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0,&t,&z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t,&z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t,&t,&z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t,&t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t,&t,&z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t,&t,5);
/* 2^255 - 21 */ fe51_mul(r,&t,&z11);
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2, x);
/* 4 */ fe51_square(&t, &z2);
/* 8 */ fe51_square(&t, &t);
/* 9 */ fe51_mul(&z9, &t, x);
/* 11 */ fe51_mul(&z11, &z9, &z2);
/* 22 */ fe51_square(&t, &z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0, &t, &z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t, &z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0, &t, &z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t, &z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0, &t, &z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t, &z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t, &t, &z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t, &t, 10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0, &t, &z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t, &z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0, &t, &z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t, &z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t, &t, &z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t, &t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t, &t, &z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t, &t, 5);
/* 2^255 - 21 */ fe51_mul(r, &t, &z11);

@ -1,16 +1,15 @@
#ifndef fe51_namespace_H
#define fe51_namespace_H
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#endif /* ifndef fe51_namespace_H */

@ -10,9 +10,9 @@ static uint64_t
load_3(const unsigned char *in)
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
return result;
@ -20,10 +20,10 @@ static uint64_t
load_4(const unsigned char *in)
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result |= ((uint64_t) in[3]) << 24;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
result |= ((uint64_t)in[3]) << 24;
return result;
@ -51,17 +51,37 @@ fe_frombytes(fe h, const unsigned char *s)
uint64_t carry8;
uint64_t carry9;
carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
carry9 = h9 >> 25;
h0 += carry9 * 19;
h9 &= 0x1FFFFFF;
carry1 = h1 >> 25;
h2 += carry1;
h1 &= 0x1FFFFFF;
carry3 = h3 >> 25;
h4 += carry3;
h3 &= 0x1FFFFFF;
carry5 = h5 >> 25;
h6 += carry5;
h5 &= 0x1FFFFFF;
carry7 = h7 >> 25;
h8 += carry7;
h7 &= 0x1FFFFFF;
carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
carry0 = h0 >> 26;
h1 += carry0;
h0 &= 0x3FFFFFF;
carry2 = h2 >> 26;
h3 += carry2;
h2 &= 0x3FFFFFF;
carry4 = h4 >> 26;
h5 += carry4;
h4 &= 0x3FFFFFF;
carry6 = h6 >> 26;
h7 += carry6;
h6 &= 0x3FFFFFF;
carry8 = h8 >> 26;
h9 += carry8;
h8 &= 0x3FFFFFF;
h[0] = h0;
h[1] = h1;

@ -2,17 +2,18 @@
#define ladder_H
#ifdef __cplusplus
extern "C" {
extern "C"
#include "fe.h"
#include "ladder_namespace.h"
extern void ladder(fe *, const unsigned char *);
extern void
ladder(fe *, const unsigned char *);
#ifdef __cplusplus
#endif /* ifndef ladder_H */

@ -2,17 +2,18 @@
#define ladder_base_H
#ifdef __cplusplus
extern "C" {
extern "C"
#include "fe.h"
#include "ladder_base_namespace.h"
extern void ladder_base(fe *, const unsigned char *);
extern void
ladder_base(fe *, const unsigned char *);
#ifdef __cplusplus
#endif /* ifndef ladder_base_H */

@ -1,8 +1,7 @@
#ifndef ladder_base_namespace_H
#define ladder_base_namespace_H
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define _ladder_base _crypto_scalarmult_curve25519_sandy2x_ladder_base
#endif /* ifndef ladder_base_namespace_H */

@ -1,8 +1,7 @@
#ifndef ladder_namespace_H
#define ladder_namespace_H
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define _ladder _crypto_scalarmult_curve25519_sandy2x_ladder
#endif /* ifndef ladder_namespace_H */

@ -2,10 +2,10 @@
#ifndef scalarmult_poly1305_H
#define scalarmult_poly1305_H
typedef struct crypto_scalarmult_curve25519_implementation {
int (*mult)(unsigned char *q, const unsigned char *n,
const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
typedef struct crypto_scalarmult_curve25519_implementation
int (*mult)(unsigned char *q, const unsigned char *n, const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
} crypto_scalarmult_curve25519_implementation;

@ -1,18 +1,17 @@
#ifndef sign_ed25519_ref10_H
#define sign_ed25519_ref10_H
void _crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs,
int prehashed);
_crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs, int prehashed);
int _crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk, int prehashed);
_crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk, int prehashed);
int _crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk,
int prehashed);
_crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk, int prehashed);

@ -1,12 +1,22 @@
int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);
crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
crypto_kem_keypair_ref(unsigned char *pk, unsigned char *sk);
int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);
crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
crypto_kem_keypair_avx2(unsigned char *pk, unsigned char *sk);

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
#define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
for(i = 0; i < n; i++)
out[i] ^= in[i];
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#define __attribute__(a)
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# if _MSC_VER >= 1600
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#if _MSC_VER >= 1600
#if _MSC_VER >= 1700 && defined(_M_X64)
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0

@ -14,157 +14,189 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_box_SEEDBYTES crypto_box_curve25519xsalsa20poly1305_SEEDBYTES
size_t crypto_box_seedbytes(void);
#define crypto_box_PUBLICKEYBYTES crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
size_t crypto_box_publickeybytes(void);
#define crypto_box_SECRETKEYBYTES crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
size_t crypto_box_secretkeybytes(void);
#define crypto_box_PUBLICKEYBYTES \
#define crypto_box_SECRETKEYBYTES \
#define crypto_box_NONCEBYTES crypto_box_curve25519xsalsa20poly1305_NONCEBYTES
size_t crypto_box_noncebytes(void);
#define crypto_box_MACBYTES crypto_box_curve25519xsalsa20poly1305_MACBYTES
size_t crypto_box_macbytes(void);
#define crypto_box_MESSAGEBYTES_MAX crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
size_t crypto_box_messagebytes_max(void);
#define crypto_box_MESSAGEBYTES_MAX \
#define crypto_box_PRIMITIVE "curve25519xsalsa20poly1305"
const char *crypto_box_primitive(void);
int crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
int crypto_box_keypair(unsigned char *pk, unsigned char *sk);
int crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
size_t crypto_box_beforenmbytes(void);
int crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
int crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
int crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
/* -- Ephemeral SK interface -- */
const char *
crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
crypto_box_keypair(unsigned char *pk, unsigned char *sk);
crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac, unsigned long long clen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES \
crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
/* -- Ephemeral SK interface -- */
#define crypto_box_SEALBYTES (crypto_box_PUBLICKEYBYTES + crypto_box_MACBYTES)
size_t crypto_box_sealbytes(void);
int crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
int crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *pk,
const unsigned char *sk)
/* -- NaCl compatibility interface ; Requires padding -- */
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_ZEROBYTES crypto_box_curve25519xsalsa20poly1305_ZEROBYTES
size_t crypto_box_zerobytes(void);
#define crypto_box_BOXZEROBYTES crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
size_t crypto_box_boxzerobytes(void);
int crypto_box(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
#define crypto_box_BOXZEROBYTES \
crypto_box(unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk) __attribute__((warn_unused_result));
crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
#ifdef __cplusplus

@ -6,101 +6,114 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_box_curve25519xsalsa20poly1305_SEEDBYTES 32U
size_t crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES 32U
size_t crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES 32U
size_t crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES 32U
size_t crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_NONCEBYTES 24U
size_t crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
#define crypto_box_curve25519xsalsa20poly1305_MACBYTES 16U
size_t crypto_box_curve25519xsalsa20poly1305_macbytes(void);
/* Only for the libsodium API - The NaCl compatibility API would require BOXZEROBYTES extra bytes */
/* Only for the libsodium API - The NaCl compatibility API would require
* BOXZEROBYTES extra bytes */
#define crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX \
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_box_curve25519xsalsa20poly1305_MACBYTES)
size_t crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
int crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
int crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
int crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX \
- crypto_box_curve25519xsalsa20poly1305_MACBYTES)
crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES 16U
size_t crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
#define crypto_box_curve25519xsalsa20poly1305_ZEROBYTES \
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES + \
size_t crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
int crypto_box_curve25519xsalsa20poly1305(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_curve25519xsalsa20poly1305_open(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
int crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
int crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES \
+ crypto_box_curve25519xsalsa20poly1305_MACBYTES)
unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
unsigned char *m, const unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
#ifdef __cplusplus

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#define crypto_core_hchacha20_OUTPUTBYTES 32U
size_t crypto_core_hchacha20_outputbytes(void);
#define crypto_core_hchacha20_INPUTBYTES 16U
size_t crypto_core_hchacha20_inputbytes(void);
#define crypto_core_hchacha20_KEYBYTES 32U
size_t crypto_core_hchacha20_keybytes(void);
#define crypto_core_hchacha20_CONSTBYTES 16U
size_t crypto_core_hchacha20_constbytes(void);
int crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#define crypto_core_salsa20_OUTPUTBYTES 64U
size_t crypto_core_salsa20_outputbytes(void);
#define crypto_core_salsa20_INPUTBYTES 16U
size_t crypto_core_salsa20_inputbytes(void);
#define crypto_core_salsa20_KEYBYTES 32U
size_t crypto_core_salsa20_keybytes(void);
#define crypto_core_salsa20_CONSTBYTES 16U
size_t crypto_core_salsa20_constbytes(void);
int crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus

@ -7,66 +7,79 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_generichash_BYTES_MIN crypto_generichash_blake2b_BYTES_MIN
size_t crypto_generichash_bytes_min(void);
#define crypto_generichash_BYTES_MAX crypto_generichash_blake2b_BYTES_MAX
size_t crypto_generichash_bytes_max(void);
#define crypto_generichash_BYTES crypto_generichash_blake2b_BYTES
size_t crypto_generichash_bytes(void);
#define crypto_generichash_KEYBYTES_MIN crypto_generichash_blake2b_KEYBYTES_MIN
size_t crypto_generichash_keybytes_min(void);
#define crypto_generichash_KEYBYTES_MAX crypto_generichash_blake2b_KEYBYTES_MAX
size_t crypto_generichash_keybytes_max(void);
#define crypto_generichash_KEYBYTES crypto_generichash_blake2b_KEYBYTES
size_t crypto_generichash_keybytes(void);
#define crypto_generichash_PRIMITIVE "blake2b"
const char *crypto_generichash_primitive(void);
typedef crypto_generichash_blake2b_state crypto_generichash_state;
size_t crypto_generichash_statebytes(void);
int crypto_generichash(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
int crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
int crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in,
unsigned long long inlen);
int crypto_generichash_final(crypto_generichash_state *state,
unsigned char *out, const size_t outlen);
void crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
const char *
typedef crypto_generichash_blake2b_state crypto_generichash_state;
crypto_generichash(unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key,
size_t keylen);
crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in, unsigned long long inlen);
crypto_generichash_final(crypto_generichash_state *state, unsigned char *out,
const size_t outlen);
crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
#ifdef __cplusplus

@ -8,107 +8,120 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack(1)
#pragma pack(1)
# pragma pack(push, 1)
#pragma pack(push, 1)
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state {
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack()
#pragma pack()
# pragma pack(pop)
#pragma pack(pop)
#define crypto_generichash_blake2b_BYTES_MIN 16U
size_t crypto_generichash_blake2b_bytes_min(void);
#define crypto_generichash_blake2b_BYTES_MAX 64U
size_t crypto_generichash_blake2b_bytes_max(void);
#define crypto_generichash_blake2b_BYTES 32U
size_t crypto_generichash_blake2b_bytes(void);
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
size_t crypto_generichash_blake2b_keybytes_min(void);
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
size_t crypto_generichash_blake2b_keybytes_max(void);
#define crypto_generichash_blake2b_KEYBYTES 32U
size_t crypto_generichash_blake2b_keybytes(void);
#define crypto_generichash_blake2b_SALTBYTES 16U
size_t crypto_generichash_blake2b_saltbytes(void);
#define crypto_generichash_blake2b_BYTES_MIN 16U
#define crypto_generichash_blake2b_BYTES_MAX 64U
#define crypto_generichash_blake2b_BYTES 32U
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
#define crypto_generichash_blake2b_KEYBYTES 32U
#define crypto_generichash_blake2b_SALTBYTES 16U
#define crypto_generichash_blake2b_PERSONALBYTES 16U
size_t crypto_generichash_blake2b_personalbytes(void);
size_t crypto_generichash_blake2b_statebytes(void);
int crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key, size_t keylen);
int crypto_generichash_blake2b_salt_personal(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key,
size_t keylen,
const unsigned char *salt,
const unsigned char *personal);
int crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
int crypto_generichash_blake2b_init_salt_personal(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen,
const unsigned char *salt,
const unsigned char *personal);
int crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
int crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out,
const size_t outlen);
void crypto_generichash_blake2b_keygen(unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key, size_t keylen,
const unsigned char *salt, const unsigned char *personal);
crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
crypto_generichash_blake2b_state *state, const unsigned char *key,
const size_t keylen, const size_t outlen, const unsigned char *salt,
const unsigned char *personal);
crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out, const size_t outlen);
unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
#ifdef __cplusplus

@ -7,36 +7,41 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#define crypto_scalarmult_BYTES crypto_scalarmult_curve25519_BYTES
size_t crypto_scalarmult_bytes(void);
#define crypto_scalarmult_SCALARBYTES crypto_scalarmult_curve25519_SCALARBYTES
size_t crypto_scalarmult_scalarbytes(void);
#define crypto_scalarmult_PRIMITIVE "curve25519"
const char *crypto_scalarmult_primitive(void);
int crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
int crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
const char *
crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p) __attribute__((warn_unused_result));
#ifdef __cplusplus

@ -6,32 +6,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#define crypto_scalarmult_curve25519_BYTES 32U
size_t crypto_scalarmult_curve25519_bytes(void);
#define crypto_scalarmult_curve25519_SCALARBYTES 32U
size_t crypto_scalarmult_curve25519_scalarbytes(void);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
int crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
int crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus

@ -7,32 +7,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#define crypto_scalarmult_ed25519_BYTES 32U
size_t crypto_scalarmult_ed25519_bytes(void);
#define crypto_scalarmult_ed25519_SCALARBYTES 32U
size_t crypto_scalarmult_ed25519_scalarbytes(void);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
int crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
int crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
* NOTE: Do not use the result of this function directly.
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
* Or unless this is not an option, use the crypto_kx() API instead.
crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus

@ -14,87 +14,102 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
typedef crypto_sign_ed25519ph_state crypto_sign_state;
typedef crypto_sign_ed25519ph_state crypto_sign_state;
size_t crypto_sign_statebytes(void);
#define crypto_sign_BYTES crypto_sign_ed25519_BYTES
size_t crypto_sign_bytes(void);
#define crypto_sign_SEEDBYTES crypto_sign_ed25519_SEEDBYTES
size_t crypto_sign_seedbytes(void);
#define crypto_sign_PUBLICKEYBYTES crypto_sign_ed25519_PUBLICKEYBYTES
size_t crypto_sign_publickeybytes(void);
#define crypto_sign_SECRETKEYBYTES crypto_sign_ed25519_SECRETKEYBYTES
size_t crypto_sign_secretkeybytes(void);
#define crypto_sign_MESSAGEBYTES_MAX crypto_sign_ed25519_MESSAGEBYTES_MAX
size_t crypto_sign_messagebytes_max(void);
#define crypto_sign_PRIMITIVE "ed25519"
const char *crypto_sign_primitive(void);
int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
int crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
int crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
int crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
int crypto_sign_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
int crypto_sign_init(crypto_sign_state *state);
int crypto_sign_update(crypto_sign_state *state,
const unsigned char *m, unsigned long long mlen);
int crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
int crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
const char *
crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk) __attribute__((warn_unused_result));
crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
crypto_sign_verify_detached(const unsigned char *sig, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk)
crypto_sign_init(crypto_sign_state *state);
crypto_sign_update(crypto_sign_state *state, const unsigned char *m,
unsigned long long mlen);
crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
#ifdef __cplusplus

@ -6,106 +6,125 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
typedef struct crypto_sign_ed25519ph_state {
typedef struct crypto_sign_ed25519ph_state
crypto_hash_sha512_state hs;
} crypto_sign_ed25519ph_state;
} crypto_sign_ed25519ph_state;
size_t crypto_sign_ed25519ph_statebytes(void);
#define crypto_sign_ed25519_BYTES 64U
size_t crypto_sign_ed25519_bytes(void);
#define crypto_sign_ed25519_SEEDBYTES 32U
size_t crypto_sign_ed25519_seedbytes(void);
#define crypto_sign_ed25519_PUBLICKEYBYTES 32U
size_t crypto_sign_ed25519_publickeybytes(void);
#define crypto_sign_ed25519_SECRETKEYBYTES (32U + 32U)
size_t crypto_sign_ed25519_secretkeybytes(void);
#define crypto_sign_ed25519_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
size_t crypto_sign_ed25519_messagebytes_max(void);
int crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
int crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
int crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk);
int crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
int crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
int crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
int crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
__attribute__ ((warn_unused_result));
int crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
int crypto_sign_ed25519_sk_to_seed(unsigned char *seed,
const unsigned char *sk);
int crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
int crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
int crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m,
unsigned long long mlen);
int crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
int crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
#define crypto_sign_ed25519_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
crypto_sign_ed25519_sk_to_seed(unsigned char *seed, const unsigned char *sk);
crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m, unsigned long long mlen);
crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
#ifdef __cplusplus

@ -16,37 +16,42 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_sign_edwards25519sha512batch_BYTES 64U
#define crypto_sign_edwards25519sha512batch_PUBLICKEYBYTES 32U
#define crypto_sign_edwards25519sha512batch_SECRETKEYBYTES (32U + 32U)
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
int crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
__attribute__ ((deprecated));
int crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((deprecated));
int crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
__attribute__ ((deprecated));
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
#ifdef __cplusplus

@ -14,82 +14,103 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_stream_chacha20_KEYBYTES 32U
size_t crypto_stream_chacha20_keybytes(void);
#define crypto_stream_chacha20_NONCEBYTES 8U
size_t crypto_stream_chacha20_noncebytes(void);
#define crypto_stream_chacha20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
size_t crypto_stream_chacha20_messagebytes_max(void);
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed */
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed
int crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
void crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]);
unsigned char k[crypto_stream_chacha20_KEYBYTES]);
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
#define crypto_stream_chacha20_ietf_KEYBYTES 32U
size_t crypto_stream_chacha20_ietf_keybytes(void);
#define crypto_stream_chacha20_ietf_NONCEBYTES 12U
size_t crypto_stream_chacha20_ietf_noncebytes(void);
#define crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX \
size_t crypto_stream_chacha20_ietf_messagebytes_max(void);
int crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
void crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
#define crypto_stream_chacha20_IETF_KEYBYTES crypto_stream_chacha20_ietf_KEYBYTES
#define crypto_stream_chacha20_IETF_NONCEBYTES crypto_stream_chacha20_ietf_NONCEBYTES
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
#define crypto_stream_chacha20_IETF_KEYBYTES \
#define crypto_stream_chacha20_IETF_NONCEBYTES \
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX \
#ifdef __cplusplus

@ -14,41 +14,48 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_stream_salsa20_KEYBYTES 32U
size_t crypto_stream_salsa20_keybytes(void);
#define crypto_stream_salsa20_NONCEBYTES 8U
size_t crypto_stream_salsa20_noncebytes(void);
#define crypto_stream_salsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
size_t crypto_stream_salsa20_messagebytes_max(void);
int crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
#ifdef __cplusplus

@ -14,41 +14,49 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
extern "C"
#define crypto_stream_xsalsa20_KEYBYTES 32U
size_t crypto_stream_xsalsa20_keybytes(void);
#define crypto_stream_xsalsa20_NONCEBYTES 24U
size_t crypto_stream_xsalsa20_noncebytes(void);
#define crypto_stream_xsalsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
size_t crypto_stream_xsalsa20_messagebytes_max(void);
int crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
int crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
void crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
#ifdef __cplusplus

@ -7,48 +7,48 @@
#include <limits.h>
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#define __attribute__(a)
#if defined(_MSC_VER)
#define SODIUM_EXPORT __declspec(dllexport)
#define SODIUM_EXPORT __declspec(dllimport)
#if defined(__SUNPRO_C)
#ifndef __GNU_C__
#define SODIUM_EXPORT __attribute__(visibility(__global))
#define SODIUM_EXPORT __attribute__ __global
#elif defined(_MSG_VER)
#define SODIUM_EXPORT extern __declspec(dllexport)
# if defined(_MSC_VER)
# define SODIUM_EXPORT __declspec(dllexport)
# else
# define SODIUM_EXPORT __declspec(dllimport)
# endif
# else
# if defined(__SUNPRO_C)
# ifndef __GNU_C__
# define SODIUM_EXPORT __attribute__ (visibility(__global))
# else
# define SODIUM_EXPORT __attribute__ __global
# endif
# elif defined(_MSG_VER)
# define SODIUM_EXPORT extern __declspec(dllexport)
# else
# define SODIUM_EXPORT __attribute__ ((visibility ("default")))
# endif
# endif
# if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
# define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
# else
# endif
#define SODIUM_EXPORT __attribute__((visibility("default")))
#if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
#define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
#define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
for(i = 0; i < n; i++)
out[i] ^= in[i];
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#define __attribute__(a)
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# if _MSC_VER >= 1600
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#if _MSC_VER >= 1600
#if _MSC_VER >= 1700 && defined(_M_X64)
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0

@ -15,17 +15,19 @@ typedef uint64_t fe25519[5];
typedef int32_t fe25519[10];
void fe25519_invert(fe25519 out, const fe25519 z);
void fe25519_frombytes(fe25519 h, const unsigned char *s);
void fe25519_tobytes(unsigned char *s, const fe25519 h);
fe25519_invert(fe25519 out, const fe25519 z);
fe25519_frombytes(fe25519 h, const unsigned char *s);
fe25519_tobytes(unsigned char *s, const fe25519 h);
# include "ed25519_ref10_fe_51.h"
#include "ed25519_ref10_fe_51.h"
# include "ed25519_ref10_fe_25_5.h"
#include "ed25519_ref10_fe_25_5.h"
ge means group element.
@ -40,86 +42,109 @@ void fe25519_tobytes(unsigned char *s, const fe25519 h);
ge25519_precomp (Duif): (y+x,y-x,2dxy)
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
typedef struct
fe25519 X;
fe25519 Y;
fe25519 Z;
} ge25519_p2;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p3;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p1p1;
typedef struct {
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
typedef struct
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
} ge25519_precomp;
typedef struct {
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
typedef struct
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
} ge25519_cached;
void ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
void ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
int ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
int ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
void ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
void ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
void ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
void ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
void ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A,
const unsigned char *b);
ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A, const unsigned char *b);
void ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a,
const ge25519_p3 *p);
ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a, const ge25519_p3 *p);
int ge25519_is_canonical(const unsigned char *s);
ge25519_is_canonical(const unsigned char *s);
int ge25519_is_on_curve(const ge25519_p3 *p);
ge25519_is_on_curve(const ge25519_p3 *p);
int ge25519_is_on_main_subgroup(const ge25519_p3 *p);
ge25519_is_on_main_subgroup(const ge25519_p3 *p);
int ge25519_has_small_order(const unsigned char s[32]);
ge25519_has_small_order(const unsigned char s[32]);
void ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
The set of scalars is \Z/l
where l = 2^252 + 27742317777372353535851937790883648493.
void sc25519_reduce(unsigned char *s);
sc25519_reduce(unsigned char *s);
void sc25519_muladd(unsigned char *s, const unsigned char *a,
const unsigned char *b, const unsigned char *c);
sc25519_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b,
const unsigned char *c);
int sc25519_is_canonical(const unsigned char *s);
sc25519_is_canonical(const unsigned char *s);

@ -10,7 +10,7 @@
static inline void
fe25519_0(fe25519 h)
memset(&h[0], 0, 5 * sizeof h[0]);
memset(&h[0], 0, 5 * sizeof h[0]);
@ -20,8 +20,8 @@ fe25519_0(fe25519 h)
static inline void
fe25519_1(fe25519 h)
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
@ -32,17 +32,17 @@ fe25519_1(fe25519 h)
static inline void
fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
@ -52,37 +52,37 @@ fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
@ -92,10 +92,10 @@ fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
static inline void
fe25519_neg(fe25519 h, const fe25519 f)
fe25519 zero;
fe25519 zero;
fe25519_sub(h, zero, f);
fe25519_sub(h, zero, f);
@ -108,31 +108,31 @@ fe25519_neg(fe25519 h, const fe25519 f)
static void
fe25519_cmov(fe25519 f, const fe25519 g, unsigned int b)
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
@ -145,43 +145,43 @@ Preconditions: b in {0,1}.
static void
fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
@ -191,17 +191,17 @@ fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
static inline void
fe25519_copy(fe25519 h, const fe25519 f)
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
@ -212,11 +212,11 @@ fe25519_copy(fe25519 h, const fe25519 f)
static inline int
fe25519_isnegative(const fe25519 f)
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return s[0] & 1;
return s[0] & 1;
@ -227,11 +227,11 @@ fe25519_isnegative(const fe25519 f)
static inline int
fe25519_iszero(const fe25519 f)
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return sodium_is_zero(s, 32);
return sodium_is_zero(s, 32);
@ -242,87 +242,87 @@ fe25519_iszero(const fe25519 f)
static void
fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) g0);
r0 += ((uint128_t) f1_19) * ((uint128_t) g4);
r0 += ((uint128_t) f2_19) * ((uint128_t) g3);
r0 += ((uint128_t) f3_19) * ((uint128_t) g2);
r0 += ((uint128_t) f4_19) * ((uint128_t) g1);
r1 = ((uint128_t) f0 ) * ((uint128_t) g1);
r1 += ((uint128_t) f1 ) * ((uint128_t) g0);
r1 += ((uint128_t) f2_19) * ((uint128_t) g4);
r1 += ((uint128_t) f3_19) * ((uint128_t) g3);
r1 += ((uint128_t) f4_19) * ((uint128_t) g2);
r2 = ((uint128_t) f0 ) * ((uint128_t) g2);
r2 += ((uint128_t) f1 ) * ((uint128_t) g1);
r2 += ((uint128_t) f2 ) * ((uint128_t) g0);
r2 += ((uint128_t) f3_19) * ((uint128_t) g4);
r2 += ((uint128_t) f4_19) * ((uint128_t) g3);
r3 = ((uint128_t) f0 ) * ((uint128_t) g3);
r3 += ((uint128_t) f1 ) * ((uint128_t) g2);
r3 += ((uint128_t) f2 ) * ((uint128_t) g1);
r3 += ((uint128_t) f3 ) * ((uint128_t) g0);
r3 += ((uint128_t) f4_19) * ((uint128_t) g4);
r4 = ((uint128_t) f0 ) * ((uint128_t) g4);
r4 += ((uint128_t) f1 ) * ((uint128_t) g3);
r4 += ((uint128_t) f2 ) * ((uint128_t) g2);
r4 += ((uint128_t) f3 ) * ((uint128_t) g1);
r4 += ((uint128_t) f4 ) * ((uint128_t) g0);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)g0);
r0 += ((uint128_t)f1_19) * ((uint128_t)g4);
r0 += ((uint128_t)f2_19) * ((uint128_t)g3);
r0 += ((uint128_t)f3_19) * ((uint128_t)g2);
r0 += ((uint128_t)f4_19) * ((uint128_t)g1);
r1 = ((uint128_t)f0) * ((uint128_t)g1);
r1 += ((uint128_t)f1) * ((uint128_t)g0);
r1 += ((uint128_t)f2_19) * ((uint128_t)g4);
r1 += ((uint128_t)f3_19) * ((uint128_t)g3);
r1 += ((uint128_t)f4_19) * ((uint128_t)g2);
r2 = ((uint128_t)f0) * ((uint128_t)g2);
r2 += ((uint128_t)f1) * ((uint128_t)g1);
r2 += ((uint128_t)f2) * ((uint128_t)g0);
r2 += ((uint128_t)f3_19) * ((uint128_t)g4);
r2 += ((uint128_t)f4_19) * ((uint128_t)g3);
r3 = ((uint128_t)f0) * ((uint128_t)g3);
r3 += ((uint128_t)f1) * ((uint128_t)g2);
r3 += ((uint128_t)f2) * ((uint128_t)g1);
r3 += ((uint128_t)f3) * ((uint128_t)g0);
r3 += ((uint128_t)f4_19) * ((uint128_t)g4);
r4 = ((uint128_t)f0) * ((uint128_t)g4);
r4 += ((uint128_t)f1) * ((uint128_t)g3);
r4 += ((uint128_t)f2) * ((uint128_t)g2);
r4 += ((uint128_t)f3) * ((uint128_t)g1);
r4 += ((uint128_t)f4) * ((uint128_t)g0);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
@ -333,75 +333,75 @@ fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sq(fe25519 h, const fe25519 f)
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
@ -412,107 +412,107 @@ fe25519_sq(fe25519 h, const fe25519 f)
static void
fe25519_sq2(fe25519 h, const fe25519 f)
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
static void
fe25519_scalar_product(fe25519 h, const fe25519 f, uint32_t n)
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t) n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t) a) & mask;
a = f[1] * sn + ((uint64_t) (a >> 51));
h1 = ((uint64_t) a) & mask;
a = f[2] * sn + ((uint64_t) (a >> 51));
h2 = ((uint64_t) a) & mask;
a = f[3] * sn + ((uint64_t) (a >> 51));
h3 = ((uint64_t) a) & mask;
a = f[4] * sn + ((uint64_t) (a >> 51));
h4 = ((uint64_t) a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t)n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t)a) & mask;
a = f[1] * sn + ((uint64_t)(a >> 51));
h1 = ((uint64_t)a) & mask;
a = f[2] * sn + ((uint64_t)(a >> 51));
h2 = ((uint64_t)a) & mask;
a = f[3] * sn + ((uint64_t)(a >> 51));
h3 = ((uint64_t)a) & mask;
a = f[4] * sn + ((uint64_t)(a >> 51));
h4 = ((uint64_t)a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;

@ -1,11 +1,17 @@
#ifndef implementations_H
#define implementations_H
int _crypto_generichash_blake2b_pick_best_implementation(void);
int _crypto_onetimeauth_poly1305_pick_best_implementation(void);
int _crypto_pwhash_argon2_pick_best_implementation(void);
int _crypto_scalarmult_curve25519_pick_best_implementation(void);
int _crypto_stream_chacha20_pick_best_implementation(void);
int _crypto_stream_salsa20_pick_best_implementation(void);

@ -1,7 +1,9 @@
#ifndef mutex_H
#define mutex_H 1
extern int sodium_crit_enter(void);
extern int sodium_crit_leave(void);
extern int
extern int

@ -4,46 +4,53 @@
#include "common.h"
# include <intrin.h>
#include <intrin.h>
#if defined(HAVE_EMMINTRIN_H) && \
!(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) || \
defined(_M_X64) || defined(_M_AMD64))
#if defined(HAVE_EMMINTRIN_H) \
&& !(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) \
|| defined(_M_X64) || defined(_M_AMD64))
# include <emmintrin.h>
# include <stdint.h>
#include <emmintrin.h>
#include <stdint.h>
# ifndef _mm_set_epi64x
# define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
#ifndef _mm_set_epi64x
#define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
static inline __m128i
sodium__mm_set_epi64x(int64_t q1, int64_t q0)
union { int64_t as64; int32_t as32[2]; } x0, x1;
x0.as64 = q0; x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
union {
int64_t as64;
int32_t as32[2];
} x0, x1;
x0.as64 = q0;
x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
# endif
# ifndef _mm_set1_epi64x
# define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
#ifndef _mm_set1_epi64x
#define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
static inline __m128i
sodium__mm_set1_epi64x(int64_t q)
return _mm_set_epi64x(q, q);
return _mm_set_epi64x(q, q);
# endif
# ifndef _mm_cvtsi64_si128
# define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
#ifndef _mm_cvtsi64_si128
#define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
static inline __m128i
sodium__mm_cvtsi64_si128(int64_t q)
union { int64_t as64; int32_t as32[2]; } x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
union {
int64_t as64;
int32_t as32[2];
} x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
# endif

@ -4,19 +4,21 @@
#ifdef __native_client__
# include "export.h"
# include "randombytes.h"
#include "export.h"
#include "randombytes.h"
# ifdef __cplusplus
extern "C" {
# endif
#ifdef __cplusplus
extern "C"
extern struct randombytes_implementation randombytes_nativeclient_implementation;
extern struct randombytes_implementation
# ifdef __cplusplus
#ifdef __cplusplus
# endif

@ -5,45 +5,59 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
int sodium_runtime_has_neon(void);
int sodium_runtime_has_sse2(void);
int sodium_runtime_has_sse3(void);
int sodium_runtime_has_ssse3(void);
int sodium_runtime_has_sse41(void);
int sodium_runtime_has_avx(void);
int sodium_runtime_has_avx2(void);
int sodium_runtime_has_avx512f(void);
int sodium_runtime_has_pclmul(void);
int sodium_runtime_has_aesni(void);
int sodium_runtime_has_rdrand(void);
/* ------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------
int _sodium_runtime_get_cpu_features(void);
#ifdef __cplusplus

@ -7,161 +7,188 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
#ifndef SODIUM_C99
# if defined(__cplusplus) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
# define SODIUM_C99(X)
# else
# define SODIUM_C99(X) X
# endif
#if defined(__cplusplus) || !defined(__STDC_VERSION__) \
|| __STDC_VERSION__ < 199901L
#define SODIUM_C99(X)
#define SODIUM_C99(X) X
void sodium_memzero(void * const pnt, const size_t len);
void sodium_stackzero(const size_t len);
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
int sodium_memcmp(const void * const b1_, const void * const b2_, size_t len)
__attribute__ ((warn_unused_result));
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ == b2_
* It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
int sodium_compare(const unsigned char *b1_, const unsigned char *b2_,
size_t len)
__attribute__ ((warn_unused_result));
int sodium_is_zero(const unsigned char *n, const size_t nlen);
void sodium_increment(unsigned char *n, const size_t nlen);
void sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
char *sodium_bin2hex(char * const hex, const size_t hex_maxlen,
const unsigned char * const bin, const size_t bin_len);
int sodium_hex2bin(unsigned char * const bin, const size_t bin_maxlen,
const char * const hex, const size_t hex_len,
const char * const ignore, size_t * const bin_len,
const char ** const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
sodium_memzero(void *const pnt, const size_t len);
sodium_stackzero(const size_t len);
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
sodium_memcmp(const void *const b1_, const void *const b2_, size_t len)
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ ==
* b2_ It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
sodium_compare(const unsigned char *b1_, const unsigned char *b2_, size_t len)
sodium_is_zero(const unsigned char *n, const size_t nlen);
sodium_increment(unsigned char *n, const size_t nlen);
sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
char *
sodium_bin2hex(char *const hex, const size_t hex_maxlen,
const unsigned char *const bin, const size_t bin_len);
sodium_hex2bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const hex, const size_t hex_len,
const char *const ignore, size_t *const bin_len,
const char **const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
#define sodium_base64_VARIANT_ORIGINAL_NO_PADDING 3
#define sodium_base64_VARIANT_URLSAFE 5
#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7
#define sodium_base64_VARIANT_URLSAFE 5
#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7
* Computes the required length to encode BIN_LEN bytes as a base64 string
* using the given variant. The computed length includes a trailing \0.
#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
(((BIN_LEN) / 3U) * 4U + \
((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) | (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) & 1U) * \
(4U - (~((((VARIANT) & 2U) >> 1) - 1U) & (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) + 1U)
size_t sodium_base64_encoded_len(const size_t bin_len, const int variant);
char *sodium_bin2base64(char * const b64, const size_t b64_maxlen,
const unsigned char * const bin, const size_t bin_len,
const int variant);
int sodium_base642bin(unsigned char * const bin, const size_t bin_maxlen,
const char * const b64, const size_t b64_len,
const char * const ignore, size_t * const bin_len,
const char ** const b64_end, const int variant);
int sodium_mlock(void * const addr, const size_t len);
int sodium_munlock(void * const addr, const size_t len);
/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
* allocation functions.
* They return a pointer to a region filled with 0xd0 bytes, immediately
* followed by a guard page.
* As a result, accessing a single byte after the requested allocation size
* will intentionally trigger a segmentation fault.
* A canary and an additional guard page placed before the beginning of the
* region may also kill the process if a buffer underflow is detected.
* The memory layout is:
* [unprotected region size (read only)][guard page (no access)][unprotected pages (read/write)][guard page (no access)]
* With the layout of the unprotected pages being:
* [optional padding][16-bytes canary][user region]
* However:
* - These functions are significantly slower than standard functions
* - Each allocation requires 3 or 4 additional pages
* - The returned address will not be aligned if the allocation size is not
* a multiple of the required alignment. For this reason, these functions
* are designed to store data, such as secret keys and messages.
* sodium_malloc() can be used to allocate any libsodium data structure.
* The crypto_generichash_state structure is packed and its length is
* either 357 or 361 bytes. For this reason, when using sodium_malloc() to
* allocate a crypto_generichash_state structure, padding must be added in
* order to ensure proper alignment. crypto_generichash_statebytes()
* returns the rounded up structure size, and should be prefered to sizeof():
* state = sodium_malloc(crypto_generichash_statebytes());
void *sodium_malloc(const size_t size)
__attribute__ ((malloc));
void *sodium_allocarray(size_t count, size_t size)
__attribute__ ((malloc));
void sodium_free(void *ptr);
int sodium_mprotect_noaccess(void *ptr);
int sodium_mprotect_readonly(void *ptr);
int sodium_mprotect_readwrite(void *ptr);
int sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
int sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
size_t padded_buflen, size_t blocksize);
/* -------- */
int _sodium_alloc_init(void);
#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
(((BIN_LEN) / 3U) * 4U \
+ ((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) \
| (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) \
& 1U) \
* (4U \
- (~((((VARIANT)&2U) >> 1) - 1U) \
& (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) \
+ 1U)
sodium_base64_encoded_len(const size_t bin_len, const int variant);
char *
sodium_bin2base64(char *const b64, const size_t b64_maxlen,
const unsigned char *const bin, const size_t bin_len,
const int variant);
sodium_base642bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const b64, const size_t b64_len,
const char *const ignore, size_t *const bin_len,
const char **const b64_end, const int variant);
sodium_mlock(void *const addr, const size_t len);
sodium_munlock(void *const addr, const size_t len);
/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
* allocation functions.
* They return a pointer to a region filled with 0xd0 bytes, immediately
* followed by a guard page.
* As a result, accessing a single byte after the requested allocation size
* will intentionally trigger a segmentation fault.
* A canary and an additional guard page placed before the beginning of the
* region may also kill the process if a buffer underflow is detected.
* The memory layout is:
* [unprotected region size (read only)][guard page (no access)][unprotected
* pages (read/write)][guard page (no access)] With the layout of the
* unprotected pages being: [optional padding][16-bytes canary][user region]
* However:
* - These functions are significantly slower than standard functions
* - Each allocation requires 3 or 4 additional pages
* - The returned address will not be aligned if the allocation size is not
* a multiple of the required alignment. For this reason, these functions
* are designed to store data, such as secret keys and messages.
* sodium_malloc() can be used to allocate any libsodium data structure.
* The crypto_generichash_state structure is packed and its length is
* either 357 or 361 bytes. For this reason, when using sodium_malloc() to
* allocate a crypto_generichash_state structure, padding must be added in
* order to ensure proper alignment. crypto_generichash_statebytes()
* returns the rounded up structure size, and should be prefered to sizeof():
* state = sodium_malloc(crypto_generichash_statebytes());
void *
sodium_malloc(const size_t size) __attribute__((malloc));
void *
sodium_allocarray(size_t count, size_t size) __attribute__((malloc));
sodium_free(void *ptr);
sodium_mprotect_noaccess(void *ptr);
sodium_mprotect_readonly(void *ptr);
sodium_mprotect_readwrite(void *ptr);
sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
size_t padded_buflen, size_t blocksize);
/* -------- */
#ifdef __cplusplus

@ -4,424 +4,463 @@
typedef crypto_int32 int32;
static inline void minmax(int32 *x,int32 *y)
static inline void
minmax(int32 *x, int32 *y)
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: : "r"(x),"r"(y) : "%eax","%ebx","%edx");
__asm__("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg "
"%%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: "r"(x), "r"(y)
: "%eax", "%ebx", "%edx");
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void minmax02through1315(int32 *x)
static inline void
minmax02through1315(int32 *x)
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c,d);
__m256i h = _mm256_max_epi32(c,d);
a = _mm256_unpacklo_epi64(g,h);
b = _mm256_unpackhi_epi64(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i c = _mm256_unpacklo_epi64(a, b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a, b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c, d);
__m256i h = _mm256_max_epi32(c, d);
a = _mm256_unpacklo_epi64(g, h);
b = _mm256_unpackhi_epi64(g, h);
_mm256_storeu_si256((__m256i *)x, a);
_mm256_storeu_si256((__m256i *)(x + 8), b);
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void minmax02134657(int32 *x)
static inline void
minmax02134657(int32 *x)
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0x4e);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0x44);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_shuffle_epi32(a, 0x4e);
__m256i c = _mm256_cmpgt_epi32(a, b);
c = _mm256_shuffle_epi32(c, 0x44);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *)x, a);
static void multiminmax2plus2(
int32 *x,
int n)
static void
multiminmax2plus2(int32 *x, int n)
while (n >= 16) {
while(n >= 16)
n -= 16;
x += 16;
if (n >= 8) {
if(n >= 8)
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + 2);
minmax(x + 1,x + 3);
if(n >= 4)
minmax(x, x + 2);
minmax(x + 1, x + 3);
n -= 4;
x += 4;
if (n > 0) {
minmax(x,x + 2);
if (n > 1) minmax(x + 1,x + 3);
if(n > 0)
minmax(x, x + 2);
if(n > 1)
minmax(x + 1, x + 3);
static void multiminmax2plus6(
int32 *x,
int n)
static void
multiminmax2plus6(int32 *x, int n)
while (n >= 4) {
minmax(x,x + 6);
minmax(x + 1,x + 7);
while(n >= 4)
minmax(x, x + 6);
minmax(x + 1, x + 7);
n -= 4;
x += 4;
if (n > 0) {
minmax(x,x + 6);
if (n > 1) minmax(x + 1,x + 7);
if(n > 0)
minmax(x, x + 6);
if(n > 1)
minmax(x + 1, x + 7);
static void multiminmax2plus14(
int32 *x,
int n)
static void
multiminmax2plus14(int32 *x, int n)
while (n >= 8) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
minmax(x + 4,x + 18);
minmax(x + 5,x + 19);
while(n >= 8)
minmax(x, x + 14);
minmax(x + 1, x + 15);
minmax(x + 4, x + 18);
minmax(x + 5, x + 19);
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
if(n >= 4)
minmax(x, x + 14);
minmax(x + 1, x + 15);
n -= 4;
x += 4;
if (n > 0) {
minmax(x,x + 14);
if (n > 1) minmax(x + 1,x + 15);
if(n > 0)
minmax(x, x + 14);
if(n > 1)
minmax(x + 1, x + 15);
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax0145891213(int32 *x,int32 *y)
static inline void
minmax0145891213(int32 *x, int32 *y)
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567, a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567, b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213, b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213, b0189451213);
__m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
__m256i c01234567 = _mm256_blend_epi32(a01234567, c0189451213, 0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567, d0189451213, 0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213, a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213, b89101112131415);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
_mm256_storeu_si256((__m256i *)x, c01234567);
_mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
_mm256_storeu_si256((__m256i *)y, d01234567);
_mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
/* offset >= 30 */
static void multiminmax2plusmore(
int32 *x,
int n,
int offset)
static void
multiminmax2plusmore(int32 *x, int n, int offset)
while (n >= 16) {
minmax0145891213(x,x + offset);
while(n >= 16)
minmax0145891213(x, x + offset);
n -= 16;
x += 16;
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 5,x + 5 + offset);
if(n >= 8)
minmax(x, x + offset);
minmax(x + 1, x + 1 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 5, x + 5 + offset);
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
if(n >= 4)
minmax(x, x + offset);
minmax(x + 1, x + 1 + offset);
n -= 4;
x += 4;
if (n > 0) {
minmax(x,x + offset);
if (n > 1) minmax(x + 1,x + 1 + offset);
if(n > 0)
minmax(x, x + offset);
if(n > 1)
minmax(x + 1, x + 1 + offset);
/* sort x0,x1; ... sort x14, x15 */
static inline void minmax01through1415(int32 *x)
static inline void
minmax01through1415(int32 *x)
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g,h);
b = _mm256_unpackhi_epi32(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i c = _mm256_unpacklo_epi32(a, b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a, b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c, d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c, d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e, f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e, f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g, h);
b = _mm256_unpackhi_epi32(g, h);
_mm256_storeu_si256((__m256i *)x, a);
_mm256_storeu_si256((__m256i *)(x + 8), b);
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void minmax01234567(int32 *x)
static inline void
minmax01234567(int32 *x)
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0xb1);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0xa0);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_shuffle_epi32(a, 0xb1);
__m256i c = _mm256_cmpgt_epi32(a, b);
c = _mm256_shuffle_epi32(c, 0xa0);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *)x, a);
static void multiminmax1plus1(
int32 *x,
int n)
static void
multiminmax1plus1(int32 *x, int n)
while (n >= 16) {
while(n >= 16)
n -= 16;
x += 16;
if (n >= 8) {
if(n >= 8)
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + 1);
minmax(x + 2,x + 3);
if(n >= 4)
minmax(x, x + 1);
minmax(x + 2, x + 3);
n -= 4;
x += 4;
if (n >= 2) {
minmax(x,x + 1);
if(n >= 2)
minmax(x, x + 1);
n -= 2;
x += 2;
if (n > 0)
minmax(x,x + 1);
if(n > 0)
minmax(x, x + 1);
static void multiminmax1(
int32 *x,
int n,
int offset)
static void
multiminmax1(int32 *x, int n, int offset)
while (n >= 16) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
minmax(x + 8,x + 8 + offset);
minmax(x + 10,x + 10 + offset);
minmax(x + 12,x + 12 + offset);
minmax(x + 14,x + 14 + offset);
while(n >= 16)
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
minmax(x + 8, x + 8 + offset);
minmax(x + 10, x + 10 + offset);
minmax(x + 12, x + 12 + offset);
minmax(x + 14, x + 14 + offset);
n -= 16;
x += 16;
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
if(n >= 8)
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
if(n >= 4)
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
n -= 4;
x += 4;
if (n >= 2) {
minmax(x,x + offset);
if(n >= 2)
minmax(x, x + offset);
n -= 2;
x += 2;
if (n > 0)
minmax(x,x + offset);
if(n > 0)
minmax(x, x + offset);
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax02468101214(int32 *x,int32 *y)
static inline void
minmax02468101214(int32 *x, int32 *y)
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567, a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567, a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513, a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513, a210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567, b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567, b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513, b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513, b210311614715);
__m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
__m256i c02810461214 = _mm256_min_epi32(a02810461214, b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214, b02810461214);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214, a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214, a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214, b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214, b13911571315);
_mm256_storeu_si256((__m256i *)x, c01234567);
_mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
_mm256_storeu_si256((__m256i *)y, d01234567);
_mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
/* assumes offset >= 31 */
static void multiminmax1plusmore(
int32 *x,
int n,
int offset)
static void
multiminmax1plusmore(int32 *x, int n, int offset)
while (n >= 16) {
minmax02468101214(x,x + offset);
while(n >= 16)
minmax02468101214(x, x + offset);
n -= 16;
x += 16;
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
if(n >= 8)
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
n -= 8;
x += 8;
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
if(n >= 4)
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
n -= 4;
x += 4;
if (n >= 2) {
minmax(x,x + offset);
if(n >= 2)
minmax(x, x + offset);
n -= 2;
x += 2;
if (n > 0)
minmax(x,x + offset);
if(n > 0)
minmax(x, x + offset);
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void minmax8(int32 *x,int32 *y)
static inline void
minmax8(int32 *x, int32 *y)
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) y);
_mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
_mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)y);
_mm256_storeu_si256((__m256i *)x, _mm256_min_epi32(a, b));
_mm256_storeu_si256((__m256i *)y, _mm256_max_epi32(a, b));
/* assumes p >= 8; implies offset >= 8 */
static void multiminmax_atleast8(int p,
int32 *x,
int n,
int offset)
static void
multiminmax_atleast8(int p, int32 *x, int n, int offset)
int i;
while (n >= 2 * p) {
for (i = 0;i < p;i += 8)
minmax8(x + i,x + i + offset);
while(n >= 2 * p)
for(i = 0; i < p; i += 8)
minmax8(x + i, x + i + offset);
n -= 2 * p;
x += 2 * p;
for (i = 0;i + 8 <= n;i += 8) {
if (i & p) return;
minmax8(x + i,x + i + offset);
for(i = 0; i + 8 <= n; i += 8)
if(i & p)
minmax8(x + i, x + i + offset);
for (;i < n;++i) {
if (i & p) return;
minmax(x + i,x + i + offset);
for(; i < n; ++i)
if(i & p)
minmax(x + i, x + i + offset);
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void minmax4(int32 *x,int32 *y)
static inline void
minmax4(int32 *x, int32 *y)
__m128i a = _mm_loadu_si128((__m128i *) x);
__m128i b = _mm_loadu_si128((__m128i *) y);
_mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
_mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
__m128i a = _mm_loadu_si128((__m128i *)x);
__m128i b = _mm_loadu_si128((__m128i *)y);
_mm_storeu_si128((__m128i *)x, _mm_min_epi32(a, b));
_mm_storeu_si128((__m128i *)y, _mm_max_epi32(a, b));
static void multiminmax4(
int32 *x,
int n,
int offset)
static void
multiminmax4(int32 *x, int n, int offset)
int i;
while (n >= 8) {
minmax4(x,x + offset);
while(n >= 8)
minmax4(x, x + offset);
n -= 8;
x += 8;
if (n >= 4)
minmax4(x,x + offset);
if(n >= 4)
minmax4(x, x + offset);
for (i = 0;i < n;++i)
minmax(x + i,x + i + offset);
for(i = 0; i < n; ++i)
minmax(x + i, x + i + offset);
void int32_sort(int32 *x,int n)
int32_sort(int32 *x, int n)
int top,p,q;
int top, p, q;
if (n < 2) return;
if(n < 2)
top = 1;
while (top < n - top) top += top;
while(top < n - top)
top += top;
for (p = top;p >= 8;p >>= 1) {
multiminmax_atleast8(p,x,n - p,p);
for (q = top;q > p;q >>= 1)
multiminmax_atleast8(p,x + p,n - q,q - p);
if (p >= 4) {
multiminmax4(x,n - 4,4);
for (q = top;q > 4;q >>= 1)
multiminmax4(x + 4,n - q,q - 4);
if (p >= 2) {
multiminmax2plus2(x,n - 2);
for (q = top;q >= 32;q >>= 1)
multiminmax2plusmore(x + 2,n - q,q - 2);
if (q >= 16)
multiminmax2plus14(x + 2,n - 16);
if (q >= 8)
multiminmax2plus6(x + 2,n - 8);
if (q >= 4)
multiminmax2plus2(x + 2,n - 4);
multiminmax1plus1(x,n - 1);
for (q = top;q >= 32;q >>= 1)
multiminmax1plusmore(x + 1,n - q,q - 1);
if (q >= 16)
multiminmax1(x + 1,n - 16,15);
if (q >= 8)
multiminmax1(x + 1,n - 8,7);
if (q >= 4)
multiminmax1(x + 1,n - 4,3);
if (q >= 2)
multiminmax1plus1(x + 1,n - 2);
for(p = top; p >= 8; p >>= 1)
multiminmax_atleast8(p, x, n - p, p);
for(q = top; q > p; q >>= 1)
multiminmax_atleast8(p, x + p, n - q, q - p);
if(p >= 4)
multiminmax4(x, n - 4, 4);
for(q = top; q > 4; q >>= 1)
multiminmax4(x + 4, n - q, q - 4);
if(p >= 2)
multiminmax2plus2(x, n - 2);
for(q = top; q >= 32; q >>= 1)
multiminmax2plusmore(x + 2, n - q, q - 2);
if(q >= 16)
multiminmax2plus14(x + 2, n - 16);
if(q >= 8)
multiminmax2plus6(x + 2, n - 8);
if(q >= 4)
multiminmax2plus2(x + 2, n - 4);
multiminmax1plus1(x, n - 1);
for(q = top; q >= 32; q >>= 1)
multiminmax1plusmore(x + 1, n - q, q - 1);
if(q >= 16)
multiminmax1(x + 1, n - 16, 15);
if(q >= 8)
multiminmax1(x + 1, n - 8, 7);
if(q >= 4)
multiminmax1(x + 1, n - 4, 3);
if(q >= 2)
multiminmax1plus1(x + 1, n - 2);

@ -4,12 +4,15 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
extern void r3_mult(small *,const small *,const small *);
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
extern int r3_recip(small *,const small *);
extern int
r3_recip(small *, const small *);
#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
extern int r3_weightw_mask(const small *);
extern int
r3_weightw_mask(const small *);

@ -6,91 +6,102 @@
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
static int
smaller_mask(int x, int y)
return (x - y) >> 31;
static void vectormod3_product(small *z,int len,const small *x,const small c)
static void
vectormod3_product(small *z, int len, const small *x, const small c)
int i;
int minusmask = c;
int plusmask = -c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec);
_mm256_storeu_si256((__m256i *) z,xi);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while(len >= 32)
__m256i xi = _mm256_loadu_si256((__m256i *)x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec, xi) & minusvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 32;
z += 32;
len -= 32;
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
for(i = 0; i < len; ++i)
z[i] = mod3_product(x[i], c);
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
static void
vectormod3_minusproduct(small *z, int len, const small *x, const small *y,
const small c)
int i;
int minusmask = c;
int plusmask = -c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec, twovec, fourvec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while(len >= 32)
__m256i xi = _mm256_loadu_si256((__m256i *)x);
__m256i yi = _mm256_loadu_si256((__m256i *)y);
__m256i r;
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec);
xi = _mm256_sub_epi8(xi,yi);
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec, yi) & minusvec);
xi = _mm256_sub_epi8(xi, yi);
r = _mm256_add_epi8(xi,twovec);
r = _mm256_add_epi8(xi, twovec);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_srli_epi32(r, 2);
xi = _mm256_sub_epi8(xi, r);
r = _mm256_add_epi8(r, r);
xi = _mm256_sub_epi8(xi, r);
r = _mm256_sub_epi8(twovec,xi);
r = _mm256_sub_epi8(twovec, xi);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_add_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_add_epi8(xi,r);
r = _mm256_srli_epi32(r, 2);
xi = _mm256_add_epi8(xi, r);
r = _mm256_add_epi8(r, r);
xi = _mm256_add_epi8(xi, r);
_mm256_storeu_si256((__m256i *) z,xi);
_mm256_storeu_si256((__m256i *)z, xi);
x += 32;
y += 32;
z += 32;
len -= 32;
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
for(i = 0; i < len; ++i)
z[i] = mod3_minusproduct(x[i], y[i], c);
static void vectormod3_shift(small *z,int len)
static void
vectormod3_shift(small *z, int len)
int i;
while (len >= 33) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33));
_mm256_storeu_si256((__m256i *) (z + len - 32),zi);
while(len >= 33)
__m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 33));
_mm256_storeu_si256((__m256i *)(z + len - 32), zi);
len -= 32;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
for(i = len - 1; i > 0; --i)
z[i] = z[i - 1];
z[0] = 0;
@ -100,12 +111,13 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
int r3_recip(small *r,const small *s)
r3_recip(small *r, const small *s)
const int loops = 2*p + 1;
const int loops = 2 * p + 1;
int loop;
small f[768];
small g[769];
small f[768];
small g[769];
small u[1536];
small v[1537];
small c;
@ -114,23 +126,28 @@ int r3_recip(small *r,const small *s)
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
for(i = 2; i < p; ++i)
f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
for(i = 0; i < p; ++i)
g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
for(i = 0; i <= loops; ++i)
u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
for(i = 1; i <= loops; ++i)
v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -141,29 +158,35 @@ int r3_recip(small *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
* coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
if(loop >= loops)
c = mod3_quotient(g[p],f[p]);
c = mod3_quotient(g[p], f[p]);
vectormod3_minusproduct(g, 768, g, f, c);
vectormod3_shift(g, 769);
#ifdef SIMPLER
vectormod3_minusproduct(v, 1536, v, u, c);
vectormod3_shift(v, 1537);
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
if(loop < p)
vectormod3_minusproduct(v, loop + 1, v, u, c);
vectormod3_shift(v, loop + 2);
vectormod3_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
vectormod3_shift(v + loop - p, p + 2);
@ -171,24 +194,28 @@ int r3_recip(small *r,const small *s)
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
swapmask = smaller_mask(e, d) & mod3_nonzero_mask(g[p]);
swap(&e, &d, sizeof e, swapmask);
swap(f, g, (p + 1) * sizeof(small), swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(small),swapmask);
swap(u, v, 1536 * sizeof(small), swapmask);
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
if(loop < p)
swap(u, v, (loop + 1) * sizeof(small), swapmask);
swap(u + loop - p, v + loop - p, (p + 1) * sizeof(small), swapmask);
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
vectormod3_product(r, p, u + p, c);
for(i = p; i < 768; ++i)
r[i] = 0;
return smaller_mask(0, d);

@ -5,27 +5,35 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
extern void rq_encode(unsigned char *,const modq *);
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
extern void rq_decode(modq *,const unsigned char *);
extern void
rq_decode(modq *, const unsigned char *);
#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
extern void rq_roundencode(unsigned char *,const modq *);
extern void
rq_roundencode(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
extern void rq_round3(modq *,const modq *);
extern void
rq_round3(modq *, const modq *);
#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
extern void rq_mod3(small *,const modq *);
extern void
rq_mod3(small *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
int rq_recip3(modq *,const small *);
rq_recip3(modq *, const small *);

@ -12,47 +12,57 @@
// 32-bit hosts only
#ifndef __amd64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
#define _mm_extract_epi64(X, N) \
(__extension__({ \
__v2di __a = (__v2di)(X); \
__a[N]; \
static inline __m256i squeeze(__m256i x)
static inline __m256i
squeeze(__m256i x)
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
__m256i q = _mm256_mulhrs_epi16(x, v7);
q = _mm256_mullo_epi16(q, v4591_16);
return _mm256_sub_epi16(x, q);
static inline __m256i freeze(__m256i x)
static inline __m256i
freeze(__m256i x)
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
x4591 = _mm256_add_epi16(x, v4591_16);
mask = _mm256_srai_epi16(x, 15);
x = _mm256_blendv_epi8(x, x4591, mask);
x2296 = _mm256_sub_epi16(x, v2296_16);
mask = _mm256_srai_epi16(x2296, 15);
x4591 = _mm256_sub_epi16(x, v4591_16);
x = _mm256_blendv_epi8(x4591, x, mask);
return x;
void rq_mod3(small *g,const modq *f)
rq_mod3(small *g, const modq *f)
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
for(i = 0; i < 768; i += 16)
__m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
__m256i q;
x = _mm256_mullo_epi16(x,v3);
x = _mm256_mullo_epi16(x, v3);
x = squeeze(x);
x = freeze(x);
q = _mm256_mulhrs_epi16(x,v10923_16);
x = _mm256_sub_epi16(x,q);
q = _mm256_add_epi16(q,q);
x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0);
1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0);
q = _mm256_mulhrs_epi16(x, v10923_16);
x = _mm256_sub_epi16(x, q);
q = _mm256_add_epi16(q, q);
x = _mm256_sub_epi16(x, q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,
x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0 [(long long *)&g[i]] =
_mm_extract_epi64(_mm256_extracti128_si256(x, 0), 0);
1 [(long long *)&g[i]] =
_mm_extract_epi64(_mm256_extracti128_si256(x, 1), 0);

@ -10,93 +10,103 @@
#define v29234_16 _mm256_set1_epi16(29234)
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
static int
smaller_mask(int x, int y)
return (x - y) >> 31;
static inline __m256i product(__m256i x,__m256i y)
static inline __m256i
product(__m256i x, __m256i y)
__m256i lo, hi, r0, r1, t0, t1, t, s0, s1;
lo = _mm256_mullo_epi16(x,y);
hi = _mm256_mulhi_epi16(x,y);
r0 = _mm256_unpacklo_epi16(lo,hi);
r1 = _mm256_unpackhi_epi16(lo,hi);
t0 = _mm256_srai_epi32(r0,16);
t1 = _mm256_srai_epi32(r1,16);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v29234_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
s0 = _mm256_slli_epi32(s0,4);
s1 = _mm256_slli_epi32(s1,4);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
t0 = _mm256_srai_epi32(r0,8);
t1 = _mm256_srai_epi32(r1,8);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v1827_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
x = _mm256_packs_epi32(r0,r1);
lo = _mm256_mullo_epi16(x, y);
hi = _mm256_mulhi_epi16(x, y);
r0 = _mm256_unpacklo_epi16(lo, hi);
r1 = _mm256_unpackhi_epi16(lo, hi);
t0 = _mm256_srai_epi32(r0, 16);
t1 = _mm256_srai_epi32(r1, 16);
t = _mm256_packs_epi32(t0, t1);
t = _mm256_mulhrs_epi16(t, v29234_16);
lo = _mm256_mullo_epi16(t, v4591_16);
hi = _mm256_mulhi_epi16(t, v4591_16);
s0 = _mm256_unpacklo_epi16(lo, hi);
s1 = _mm256_unpackhi_epi16(lo, hi);
s0 = _mm256_slli_epi32(s0, 4);
s1 = _mm256_slli_epi32(s1, 4);
r0 = _mm256_sub_epi32(r0, s0);
r1 = _mm256_sub_epi32(r1, s1);
t0 = _mm256_srai_epi32(r0, 8);
t1 = _mm256_srai_epi32(r1, 8);
t = _mm256_packs_epi32(t0, t1);
t = _mm256_mulhrs_epi16(t, v1827_16);
lo = _mm256_mullo_epi16(t, v4591_16);
hi = _mm256_mulhi_epi16(t, v4591_16);
s0 = _mm256_unpacklo_epi16(lo, hi);
s1 = _mm256_unpackhi_epi16(lo, hi);
r0 = _mm256_sub_epi32(r0, s0);
r1 = _mm256_sub_epi32(r1, s1);
x = _mm256_packs_epi32(r0, r1);
return x;
static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z)
static inline __m256i
minusproduct(__m256i x, __m256i y, __m256i z)
__m256i t;
x = _mm256_sub_epi16(x,product(y,z));
t = _mm256_mulhrs_epi16(x,v7);
t = _mm256_mullo_epi16(t,v4591_16);
x = _mm256_sub_epi16(x,t);
x = _mm256_sub_epi16(x, product(y, z));
t = _mm256_mulhrs_epi16(x, v7);
t = _mm256_mullo_epi16(t, v4591_16);
x = _mm256_sub_epi16(x, t);
return x;
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
static void
vectormodq_product(modq *z, int len, const modq *x, const modq c)
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = product(xi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
while(len >= 16)
__m256i xi = _mm256_loadu_si256((__m256i *)x);
xi = product(xi, cvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 16;
z += 16;
len -= 16;
while (len > 0) {
*z = modq_product(*x,c);
while(len > 0)
*z = modq_product(*x, c);
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
static void
vectormodq_minusproduct(modq *z, int len, const modq *x, const modq *y,
const modq c)
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
xi = minusproduct(xi,yi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
while(len >= 16)
__m256i xi = _mm256_loadu_si256((__m256i *)x);
__m256i yi = _mm256_loadu_si256((__m256i *)y);
xi = minusproduct(xi, yi, cvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 16;
y += 16;
z += 16;
len -= 16;
while (len > 0) {
*z = modq_minusproduct(*x,*y,c);
while(len > 0)
*z = modq_minusproduct(*x, *y, c);
@ -104,15 +114,18 @@ static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,
static void vectormodq_shift(modq *z,int len)
static void
vectormodq_shift(modq *z, int len)
int i;
while (len >= 17) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17));
_mm256_storeu_si256((__m256i *) (z + len - 16),zi);
while(len >= 17)
__m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 17));
_mm256_storeu_si256((__m256i *)(z + len - 16), zi);
len -= 16;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
for(i = len - 1; i > 0; --i)
z[i] = z[i - 1];
z[0] = 0;
@ -122,9 +135,10 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
int rq_recip3(modq *r,const small *s)
rq_recip3(modq *r, const small *s)
const int loops = 2*p + 1;
const int loops = 2 * p + 1;
int loop;
modq f[768];
modq g[769];
@ -136,23 +150,28 @@ int rq_recip3(modq *r,const small *s)
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
for(i = 2; i < p; ++i)
f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
for(i = 0; i < p; ++i)
g[i] = 3 * s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
for(i = 0; i <= loops; ++i)
u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
for(i = 1; i <= loops; ++i)
v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -163,29 +182,35 @@ int rq_recip3(modq *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
* coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
if(loop >= loops)
c = modq_quotient(g[p],f[p]);
c = modq_quotient(g[p], f[p]);
vectormodq_minusproduct(g, 768, g, f, c);
vectormodq_shift(g, 769);
#ifdef SIMPLER
vectormodq_minusproduct(v, 1536, v, u, c);
vectormodq_shift(v, 1537);
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
if(loop < p)
vectormodq_minusproduct(v, loop + 1, v, u, c);
vectormodq_shift(v, loop + 2);
vectormodq_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
vectormodq_shift(v + loop - p, p + 2);
@ -193,25 +218,30 @@ int rq_recip3(modq *r,const small *s)
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,768 * sizeof(modq),swapmask);
swapmask = smaller_mask(e, d) & modq_nonzero_mask(g[p]);
swap(&e, &d, sizeof e, swapmask);
swap(f, g, 768 * sizeof(modq), swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(modq),swapmask);
swap(u, v, 1536 * sizeof(modq), swapmask);
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
if(loop < p)
swap(u, v, (loop + 1) * sizeof(modq), swapmask);
swap(u + loop - p, v + loop - p, (p + 1) * sizeof(modq), swapmask);
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
vectormodq_product(r, p, u + p, c);
for(i = 0; i < p; ++i)
r[i] = modq_freeze(r[i]);
for(i = p; i < 768; ++i)
r[i] = 0;
return smaller_mask(0, d);

@ -6,17 +6,19 @@
#define v3_16 _mm256_set1_epi16(3)
#define v10923_16 _mm256_set1_epi16(10923)
void rq_round3(modq *h,const modq *f)
rq_round3(modq *h, const modq *f)
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
for(i = 0; i < 768; i += 16)
__m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
__m256i x2;
x = _mm256_mulhrs_epi16(x,v10923_16);
x2 = _mm256_add_epi16(x,x);
x = _mm256_add_epi16(x,x2);
_mm256_storeu_si256((__m256i *) &h[i],x);
x = _mm256_mulhrs_epi16(x, v10923_16);
x2 = _mm256_add_epi16(x, x);
x = _mm256_add_epi16(x, x2);
_mm256_storeu_si256((__m256i *)&h[i], x);

@ -164,35 +164,40 @@ rq_decoderounded(modq *f, const unsigned char *c)
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
f2 = x
f2 =
* _mm256_set1_pd(
f2 = floor(f2);
x -= f2 * _mm256_set1_pd(2359296.0);
f1 = x
f1 =
* _mm256_set1_pd(
f1 = floor(f1);
x -= f1 * _mm256_set1_pd(1536.0);
f0 = x;
f2 -= _mm256_set1_pd(1531.0)
f2 -=
* floor(
* _mm256_set1_pd(
f1 -= _mm256_set1_pd(1531.0)
* _mm256_set1_pd(
f1 -=
* floor(
* _mm256_set1_pd(
f0 -= _mm256_set1_pd(1531.0)
* _mm256_set1_pd(
f0 -=
* floor(
* _mm256_set1_pd(
* _mm256_set1_pd(
f2 *= _mm256_set1_pd(3.0);
f2 -= _mm256_set1_pd(2295.0);

@ -2,30 +2,33 @@
#include <immintrin.h>
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
swap(void *x, void *y, int bytes, int mask)
char c = mask;
char c = mask;
__m256i maskvec = _mm256_set1_epi32(mask);
while (bytes >= 32) {
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec);
__m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec);
x = 32 + (char *) x;
y = 32 + (char *) y;
while(bytes >= 32)
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi, yi, maskvec);
__m256i yinew = _mm256_blendv_epi8(yi, xi, maskvec);
_mm256_storeu_si256(x, xinew);
_mm256_storeu_si256(y, yinew);
x = 32 + (char *)x;
y = 32 + (char *)y;
bytes -= 32;
while (bytes > 0) {
char xi = *(char *) x;
char yi = *(char *) y;
char t = c & (xi ^ yi);
while(bytes > 0)
char xi = *(char *)x;
char yi = *(char *)y;
char t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
*(char *) x = xi;
*(char *) y = yi;
*(char *)x = xi;
*(char *)y = yi;

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_avx_swap
extern void swap(void *,void *,int,int);
extern void
swap(void *, void *, int, int);

@ -1,7 +1,7 @@
#include <libntrup/ntru.h>
#include <stdbool.h>
#include <stdio.h> // printf
#include <stdio.h> // printf
#if __AVX2__
#include <cpuid.h>

@ -7,7 +7,7 @@
#define qshift 2295
#define p 761
#ifdef _MSC_VER
#define LOOPS 2*p+1
#define LOOPS 2 * p + 1
#define w 286

@ -4,9 +4,11 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
extern void r3_mult(small *,const small *,const small *);
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
extern int r3_recip(small *,const small *);
extern int
r3_recip(small *, const small *);

@ -2,30 +2,34 @@
#include "mod3.h"
#include "r3.h"
void r3_mult(small *h,const small *f,const small *g)
r3_mult(small *h, const small *f, const small *g)
small fg[p + p - 1];
small result;
int i, j;
for (i = 0;i < p;++i) {
for(i = 0; i < p; ++i)
result = 0;
for (j = 0;j <= i;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
for(j = 0; j <= i; ++j)
result = mod3_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
for (i = p;i < p + p - 1;++i) {
for(i = p; i < p + p - 1; ++i)
result = 0;
for (j = i - p + 1;j < p;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
for(j = i - p + 1; j < p; ++j)
result = mod3_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = mod3_sum(fg[i - p],fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]);
for(i = p + p - 2; i >= p; --i)
fg[i - p] = mod3_sum(fg[i - p], fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1], fg[i]);
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = fg[i];

@ -5,24 +5,31 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
extern void rq_encode(unsigned char *,const modq *);
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
extern void rq_decode(modq *,const unsigned char *);
extern void
rq_decode(modq *, const unsigned char *);
#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
extern void rq_encoderounded(unsigned char *,const modq *);
extern void
rq_encoderounded(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round
extern void rq_round3(modq *,const modq *);
extern void
rq_round3(modq *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
int rq_recip3(modq *,const small *);
rq_recip3(modq *, const small *);

@ -1,30 +1,34 @@
#include "params.h"
#include "rq.h"
void rq_mult(modq *h,const modq *f,const small *g)
rq_mult(modq *h, const modq *f, const small *g)
modq fg[p + p - 1];
modq result;
int i, j;
for (i = 0;i < p;++i) {
for(i = 0; i < p; ++i)
result = 0;
for (j = 0;j <= i;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
for(j = 0; j <= i; ++j)
result = modq_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
for (i = p;i < p + p - 1;++i) {
for(i = p; i < p + p - 1; ++i)
result = 0;
for (j = i - p + 1;j < p;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
for(j = i - p + 1; j < p; ++j)
result = modq_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = modq_sum(fg[i - p],fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
for(i = p + p - 2; i >= p; --i)
fg[i - p] = modq_sum(fg[i - p], fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1], fg[i]);
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = fg[i];

@ -1,10 +1,11 @@
#include "params.h"
#include "rq.h"
void rq_round3(modq *h,const modq *f)
rq_round3(modq *h, const modq *f)
int i;
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;

@ -4,34 +4,41 @@
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
small_encode(unsigned char *c, const small *f)
small c0;
int i;
for (i = 0;i < p/4;++i) {
for(i = 0; i < p / 4; ++i)
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
c0 = *f++ + 1;
c0 = *f++ + 1;
*c++ = c0;
void small_decode(small *f,const unsigned char *c)
small_decode(small *f, const unsigned char *c)
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
for(i = 0; i < p / 4; ++i)
c0 = *c++;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
c0 = *c++;
*f++ = ((small)(c0 & 3)) - 1;

@ -1,19 +1,21 @@
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
swap(void *x, void *y, int bytes, int mask)
int i;
char xi, yi, c, t;
c = mask;
for (i = 0;i < bytes;++i) {
xi = i[(char *) x];
yi = i[(char *) y];
t = c & (xi ^ yi);
for(i = 0; i < bytes; ++i)
xi = i[(char *)x];
yi = i[(char *)y];
t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
i[(char *) x] = xi;
i[(char *) y] = yi;
i[(char *)x] = xi;
i[(char *)y] = yi;

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_ref_swap
extern void swap(void *,void *,int,int);
extern void
swap(void *, void *, int, int);

@ -36,7 +36,7 @@ sodium_init(void)
return -1; /* LCOV_EXCL_LINE */
/* if we're here, we already started properly */
return initialized ? 0: -1;
return initialized ? 0 : -1;

@ -10,116 +10,116 @@ crypto_core_salsa(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c,
const int rounds)
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14,
int i;
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
int i;
j0 = x0 = 0x61707865;
j5 = x5 = 0x3320646e;
j10 = x10 = 0x79622d32;
j15 = x15 = 0x6b206574;
if (c != NULL) {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j0 = x0 = 0x61707865;
j5 = x5 = 0x3320646e;
j10 = x10 = 0x79622d32;
j15 = x15 = 0x6b206574;
if(c != NULL)
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
for (i = 0; i < rounds; i += 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
for(i = 0; i < rounds; i += 2)
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
crypto_core_salsa(out, in, k, c, 20);
return 0;
crypto_core_salsa(out, in, k, c, 20);
return 0;
return crypto_core_salsa20_OUTPUTBYTES;
return crypto_core_salsa20_OUTPUTBYTES;
return crypto_core_salsa20_INPUTBYTES;
return crypto_core_salsa20_INPUTBYTES;
return crypto_core_salsa20_KEYBYTES;
return crypto_core_salsa20_KEYBYTES;
return crypto_core_salsa20_CONSTBYTES;
return crypto_core_salsa20_CONSTBYTES;

@ -13,7 +13,6 @@ Public domain.
#include "../stream_salsa20.h"
#include "salsa20_ref.h"
static int
stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
const unsigned char *k)
@ -132,4 +131,3 @@ struct crypto_stream_salsa20_implementation
SODIUM_C99(.stream =) stream_ref,
SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic,

@ -4,13 +4,13 @@
#include <stdint.h>
typedef struct crypto_stream_salsa20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
typedef struct crypto_stream_salsa20_implementation
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
} crypto_stream_salsa20_implementation;

@ -1,195 +1,199 @@
if (bytes > 0) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *) (partialblock + (A * 4)) = in##A; \
*(uint32_t *) (partialblock + (B * 4)) = in##B; \
*(uint32_t *) (partialblock + (C * 4)) = in##C; \
*(uint32_t *) (partialblock + (D * 4)) = in##D; \
} while (0)
if(bytes > 0)
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *)(partialblock + (A * 4)) = in##A; \
*(uint32_t *)(partialblock + (B * 4)) = in##B; \
*(uint32_t *)(partialblock + (C * 4)) = in##C; \
*(uint32_t *)(partialblock + (D * 4)) = in##D; \
} while(0)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
for(i = 0; i < bytes; i++)
c[i] = m[i] ^ partialblock[i];
sodium_memzero(partialblock, sizeof partialblock);
sodium_memzero(partialblock, sizeof partialblock);

@ -1,207 +1,211 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *) (m + (A * 4)); \
in##B ^= *(uint32_t *) (m + (B * 4)); \
in##C ^= *(uint32_t *) (m + (C * 4)); \
in##D ^= *(uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
while(bytes >= 64)
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *)(m + (A * 4)); \
in##B ^= *(uint32_t *)(m + (B * 4)); \
in##C ^= *(uint32_t *)(m + (C * 4)); \
in##D ^= *(uint32_t *)(m + (D * 4)); \
*(uint32_t *)(c + (A * 4)) = in##A; \
*(uint32_t *)(c + (B * 4)) = in##B; \
*(uint32_t *)(c + (C * 4)) = in##C; \
*(uint32_t *)(c + (D * 4)) = in##D; \
} while(0)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
in8 = x[8];
in9 = x[13];
if (in8 == 0) {
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
in8 = x[8];
in9 = x[13];
if(in8 == 0)
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;

File diff suppressed because it is too large Load Diff

@ -1,476 +1,471 @@
if (bytes >= 512) {
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while (bytes >= 512) {
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
if(bytes >= 512)
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
/* the naive way seems as fast as (if not a bit faster than) the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while(bytes >= 512)
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t)in8) | (((uint64_t)in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for(i = 0; i < ROUNDS; i += 2)
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14,
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
/* store data: this macro first transposes the data in registers and then
 * stores it in memory. much faster with icc. */
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
y##B = \
_mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
y##C = \
_mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
y##D = \
_mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
y##A2 = \
_mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
y##B2 = \
_mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
y##C2 = \
_mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
y##D2 = \
_mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), y##A); \
_mm256_storeu_si256((__m256i*) (c + 64), y##B); \
_mm256_storeu_si256((__m256i*) (c + 128), y##C); \
_mm256_storeu_si256((__m256i*) (c + 192), y##D); \
_mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
y##B = _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
y##C = _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
y##D = _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
y##A2 = _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
y##B2 = _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
y##C2 = _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
y##D2 = _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), y##A); \
_mm256_storeu_si256((__m256i*)(c + 64), y##B); \
_mm256_storeu_si256((__m256i*)(c + 128), y##C); \
_mm256_storeu_si256((__m256i*)(c + 192), y##D); \
_mm256_storeu_si256((__m256i*)(c + 256), y##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), y##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), y##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), y##D2); \
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
bytes -= 512;
c += 512;
m += 512;

@ -1,6 +1,10 @@
#if defined(_WIN32) && defined(RC_INVOKED)
#define LLARP_VERSION 0, 5, 0, 0
@ -33,5 +37,5 @@ struct Version
static const char LLARP_NET_ID[];

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
#include "resource.h"
#include <constants/version.hpp>
#ifdef __GNUC__
#include <winresrc.h>
// English (United States) resources
@ -58,8 +61,8 @@ END
#ifdef _DEBUG
@ -76,20 +79,20 @@ BEGIN
VALUE "Comments", "libabyss JSON-RPC daemon demo"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET for Microsoft® Windows® NT™"
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "abyss-main.exe"
VALUE "ProductName", "LokiNET for Windows"
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
#include "resource.h"
#include <constants/version.hpp>
#ifdef __GNUC__ // make windows rc accept this
#include <winresrc.h>
// English (United States) resources
@ -58,8 +61,8 @@ END
#ifdef _DEBUG
@ -76,20 +79,20 @@ BEGIN
VALUE "Comments", "includes relay/exit functionality, such code is highly experimental on non-Linux targets"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET daemon for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET daemon for Microsoft® Windows® NT™"
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "llarpd.exe"
VALUE "ProductName", "LokiNET for Windows"
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
#include <win32/resource.h>
#include <constants/version.hpp>
#ifdef __GNUC__
#include <winresrc.h>
// English (United States) resources
@ -58,8 +61,8 @@ END
#ifdef _DEBUG
@ -76,27 +79,27 @@ BEGIN
VALUE "Comments", "LokiNET test suite"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET for Microsoft® Windows® NT™"
#ifdef __GNUC__
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
VALUE "FileVersion", "0.4.0-dev"
VALUE "FileVersion", "0.5.0-dev"
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "llarpd.exe"
VALUE "ProductName", "LokiNET for Windows"
#ifdef __GNUC__
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
VALUE "ProductVersion", "0.4.0-dev"
VALUE "ProductVersion", "0.5.0-dev"

@ -1,3 +1,8 @@
// WARNING: for the love of all that is good and holy
// please DO NOT convert this file to UTF-8, much less
// UTF-16 - the UNIX port of Roslyn does not understand UTF-16,
// and UTF-8 chews up the copyright symbols.
// -rick
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
@ -10,8 +15,8 @@ using System.Runtime.InteropServices;
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Loki Project")]
[assembly: AssemblyProduct("LokiNET Launcher")]
[assembly: AssemblyCopyright("Copyright ©2018-2019 Loki Project. All rights reserved. See LICENSE for more details.")]
[assembly: AssemblyTrademark("Loki, Loki Project, LokiNET are ™ & ©2018-2019 Loki Foundation")]
[assembly: AssemblyCopyright("Copyright ©2018-2019 Loki Project. All rights reserved. See LICENSE for more details.")]
[assembly: AssemblyTrademark("Loki, Loki Project, LokiNET are ™ & ©2018-2019 Loki Foundation")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
@ -32,10 +37,10 @@ using System.Runtime.InteropServices;
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("0.4.3")]
[assembly: AssemblyFileVersion("0.4.3")]
[assembly: AssemblyVersion("0.5.0")]
[assembly: AssemblyFileVersion("0.5.0")]
[assembly: AssemblyInformationalVersion("0.4.3-dev-{chash:8}")]
[assembly: AssemblyInformationalVersion("0.5.0-dev-{chash:8}")]
[assembly: AssemblyInformationalVersion("0.4.3 (RELEASE_CODENAME)")]
[assembly: AssemblyInformationalVersion("0.5.0 (RELEASE_CODENAME)")]

@ -28,77 +28,101 @@
/// </summary>
private void InitializeComponent()
this.btnOK = new System.Windows.Forms.Button();
this.btnBoot = new System.Windows.Forms.Button();
this.btnDumpLog = new System.Windows.Forms.Button();
this.btnVSettings = new System.Windows.Forms.Button();
// btnOK
this.btnOK.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)));
this.btnOK.DialogResult = System.Windows.Forms.DialogResult.Cancel;
this.btnOK.Location = new System.Drawing.Point(109, 121);
this.btnOK.Name = "btnOK";
this.btnOK.Size = new System.Drawing.Size(75, 23);
this.btnOK.TabIndex = 0;
this.btnOK.Text = "Close";
this.btnOK.UseVisualStyleBackColor = true;
this.btnOK.Click += new System.EventHandler(this.btnOK_Click);
// btnBoot
this.btnBoot.Location = new System.Drawing.Point(13, 13);
this.btnBoot.Name = "btnBoot";
this.btnBoot.Size = new System.Drawing.Size(270, 23);
this.btnBoot.TabIndex = 1;
this.btnBoot.Text = "Bootstrap Client from Web...";
this.btnBoot.UseVisualStyleBackColor = true;
this.btnBoot.Click += new System.EventHandler(this.btnBoot_Click);
// btnDumpLog
this.btnDumpLog.Location = new System.Drawing.Point(13, 43);
this.btnDumpLog.Name = "btnDumpLog";
this.btnDumpLog.Size = new System.Drawing.Size(270, 23);
this.btnDumpLog.TabIndex = 2;
this.btnDumpLog.Text = "Save Log...";
this.btnDumpLog.UseVisualStyleBackColor = true;
this.btnDumpLog.Click += new System.EventHandler(this.btnDumpLog_Click);
// btnVSettings
this.btnVSettings.Location = new System.Drawing.Point(13, 73);
this.btnVSettings.Name = "btnVSettings";
this.btnVSettings.Size = new System.Drawing.Size(270, 23);
this.btnVSettings.TabIndex = 3;
this.btnVSettings.Text = "Display Settings...";
this.btnVSettings.UseVisualStyleBackColor = true;
this.btnVSettings.Click += new System.EventHandler(this.btnVSettings_Click);
// UserSettingsForm
this.AcceptButton = this.btnOK;
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.CancelButton = this.btnOK;
this.ClientSize = new System.Drawing.Size(295, 156);
this.ControlBox = false;
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.MaximizeBox = false;
this.MinimizeBox = false;
this.Name = "UserSettingsForm";
this.ShowIcon = false;
this.ShowInTaskbar = false;
this.SizeGripStyle = System.Windows.Forms.SizeGripStyle.Hide;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Settings";
this.btnOK = new System.Windows.Forms.Button();
this.btnBoot = new System.Windows.Forms.Button();
this.btnDumpLog = new System.Windows.Forms.Button();
this.btnVSettings = new System.Windows.Forms.Button();
this.btnEditCfg = new System.Windows.Forms.Button();
this.btnNewCfg = new System.Windows.Forms.Button();
// btnOK
this.btnOK.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)));
this.btnOK.DialogResult = System.Windows.Forms.DialogResult.Cancel;
this.btnOK.Location = new System.Drawing.Point(109, 167);
this.btnOK.Name = "btnOK";
this.btnOK.Size = new System.Drawing.Size(75, 23);
this.btnOK.TabIndex = 0;
this.btnOK.Text = "Close";
this.btnOK.UseVisualStyleBackColor = true;
this.btnOK.Click += new System.EventHandler(this.btnOK_Click);
// btnBoot
this.btnBoot.Location = new System.Drawing.Point(13, 13);
this.btnBoot.Name = "btnBoot";
this.btnBoot.Size = new System.Drawing.Size(270, 23);
this.btnBoot.TabIndex = 1;
this.btnBoot.Text = "Bootstrap Client from Web...";
this.btnBoot.UseVisualStyleBackColor = true;
this.btnBoot.Click += new System.EventHandler(this.btnBoot_Click);
// btnDumpLog
this.btnDumpLog.Location = new System.Drawing.Point(13, 43);
this.btnDumpLog.Name = "btnDumpLog";
this.btnDumpLog.Size = new System.Drawing.Size(270, 23);
this.btnDumpLog.TabIndex = 2;
this.btnDumpLog.Text = "Save Log...";
this.btnDumpLog.UseVisualStyleBackColor = true;
this.btnDumpLog.Click += new System.EventHandler(this.btnDumpLog_Click);
// btnVSettings
this.btnVSettings.Location = new System.Drawing.Point(13, 73);
this.btnVSettings.Name = "btnVSettings";
this.btnVSettings.Size = new System.Drawing.Size(270, 23);
this.btnVSettings.TabIndex = 3;
this.btnVSettings.Text = "Display Settings...";
this.btnVSettings.UseVisualStyleBackColor = true;
this.btnVSettings.Click += new System.EventHandler(this.btnVSettings_Click);
// btnEditCfg
this.btnEditCfg.Location = new System.Drawing.Point(13, 102);
this.btnEditCfg.Name = "btnEditCfg";
this.btnEditCfg.Size = new System.Drawing.Size(270, 23);
this.btnEditCfg.TabIndex = 4;
this.btnEditCfg.Text = "Edit Configuration File...";
this.btnEditCfg.UseVisualStyleBackColor = true;
this.btnEditCfg.Click += new System.EventHandler(this.BtnEditCfg_Click);
// btnNewCfg
this.btnNewCfg.Location = new System.Drawing.Point(12, 131);
this.btnNewCfg.Name = "btnNewCfg";
this.btnNewCfg.Size = new System.Drawing.Size(270, 23);
this.btnNewCfg.TabIndex = 5;
this.btnNewCfg.Text = "New Configuration File...";
this.btnNewCfg.UseVisualStyleBackColor = true;
this.btnNewCfg.Click += new System.EventHandler(this.BtnNewCfg_Click);
// UserSettingsForm
this.AcceptButton = this.btnOK;
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.CancelButton = this.btnOK;
this.ClientSize = new System.Drawing.Size(295, 202);
this.ControlBox = false;
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.MaximizeBox = false;
this.MinimizeBox = false;
this.Name = "UserSettingsForm";
this.ShowIcon = false;
this.ShowInTaskbar = false;
this.SizeGripStyle = System.Windows.Forms.SizeGripStyle.Hide;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Settings";
@ -106,6 +130,8 @@
private System.Windows.Forms.Button btnOK;
private System.Windows.Forms.Button btnBoot;
private System.Windows.Forms.Button btnDumpLog;
private System.Windows.Forms.Button btnVSettings;
private System.Windows.Forms.Button btnVSettings;
private System.Windows.Forms.Button btnEditCfg;
private System.Windows.Forms.Button btnNewCfg;

@ -1,4 +1,5 @@
using System;
using System.Diagnostics;
using System.IO;
using System.Windows.Forms;
@ -57,6 +58,54 @@ namespace network.loki.lokinet.win32.ui
VisualSettings v = new VisualSettings();
// Click handler wired to btnEditCfg.
// Passes the path "<config_path>/lokinet.ini" to Process.Start.
private void BtnEditCfg_Click(object sender, EventArgs e)
try {
Process.Start(string.Format("{0}/lokinet.ini", config_path)); }
// NOTE(review): the two statements below look like the failure path of the
// try above; the catch clause itself is not visible in this view - confirm.
MessageBox.Show("No existing config found");
// Hands over to the "new configuration" handler with the same sender/args.
BtnNewCfg_Click(sender, e);
// Click handler wired to btnNewCfg; also called directly by BtnEditCfg_Click.
// Asks for confirmation when "<config_path>/lokinet.ini" already exists, then
// configures a lokinet process that is given the "-g" argument.
private void BtnNewCfg_Click(object sender, EventArgs e)
// Only prompt when a config file is already present.
if (File.Exists(string.Format("{0}/lokinet.ini", config_path)))
DialogResult resp = MessageBox.Show("WARNING: This will overwrite your existing config file, Continue?", "Lokinet", MessageBoxButtons.YesNo, MessageBoxIcon.Question);
// Yes: remove the existing file.
case DialogResult.Yes:
File.Delete(string.Format("{0}/lokinet.ini", config_path));
// NOTE(review): the body of the No branch is not visible in this view;
// presumably it leaves the handler without touching the file - confirm.
case DialogResult.No:
// The executable is looked up in the current working directory.
string lokinetExeString;
if (Program.platform == PlatformID.Win32NT)
lokinetExeString = String.Format("{0}\\lokinet.exe", Directory.GetCurrentDirectory());
// NOTE(review): presumably the non-Win32NT alternative; the else keyword is
// not visible in this view - confirm.
lokinetExeString = String.Format("{0}/lokinet", Directory.GetCurrentDirectory());
Process p = new Process();
p.StartInfo.FileName = lokinetExeString;
// NOTE(review): "-g" is presumably lokinet's generate-config switch - confirm.
p.StartInfo.Arguments = "-g";
p.StartInfo.CreateNoWindow = true;
p.StartInfo.UseShellExecute = false;
// Process.Exited is only raised when EnableRaisingEvents is true.
p.EnableRaisingEvents = true;
// msg displays the "Created new config file" message box once the process exits.
p.Exited += new EventHandler(msg);
// Process.Exited handler registered in BtnNewCfg_Click.
// Shows a message box naming "<config_path>/lokinet.ini" as the new config file.
// The visible code does not inspect the process exit code before reporting success.
private void msg(object sender, EventArgs e)
MessageBox.Show(string.Format("Created new config file at {0}/lokinet.ini", config_path), "Success", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);

@ -1,120 +1,120 @@
<?xml version="1.0" encoding="utf-8"?>
Microsoft ResX Schema
Version 2.0
The primary goal of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
... headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/">
<value>[base64 mime encoded serialized .NET Framework object]</value>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that supports
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/ is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
<xsd:schema id="root" xmlns="" xmlns:xsd="" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:element name="value" type="xsd:string" minOccurs="0" />
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
<xsd:element name="assembly">
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
<xsd:element name="data">
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
<xsd:element name="resheader">
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:attribute name="name" type="xsd:string" use="required" />
<resheader name="resmimetype">
<resheader name="version">
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
<?xml version="1.0" encoding="utf-8"?>
Microsoft ResX Schema
Version 2.0
The primary goal of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
... headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/">
<value>[base64 mime encoded serialized .NET Framework object]</value>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that supports
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/ is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
<xsd:schema id="root" xmlns="" xmlns:xsd="" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:element name="value" type="xsd:string" minOccurs="0" />
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
<xsd:element name="assembly">
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
<xsd:element name="data">
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
<xsd:element name="resheader">
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:attribute name="name" type="xsd:string" use="required" />
<resheader name="resmimetype">
<resheader name="version">
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>

@ -2,7 +2,7 @@
#define MyAppName "loki-network"
#define MyAppVersion "0.4.3"
#define MyAppVersion "0.5.0"
#define MyAppPublisher "Loki Project"
#define MyAppURL ""
#define MyAppExeName "lokinetui.exe"
@ -39,18 +39,18 @@ OutputDir={#DevPath}win32-setup
VersionInfoCompany=Loki Project
VersionInfoDescription=LokiNET for Microsoft® Windows® NT™
#ifndef RELEASE
VersionInfoProductTextVersion=0.4.3 ({#Codename})
VersionInfoProductTextVersion=0.5.0 ({#Codename})
