Merge pull request #787 from despair86/master

make build internally consistent, bump windows version to 0.5.0
Jeff committed via GitHub, 5 years ago
commit 325b697e90

@@ -14,6 +14,7 @@ option(USE_AVX2 "enable avx2 code" )
option(USE_NETNS "enable networking namespace support. Linux only" )
option(AMD_RYZEN_HACK "hack for AMD Ryzen FPU bug (support FMA3 and FMA4 in FPU, but does not show in CPUID)" )
option(NATIVE_BUILD "optimise for host system and FPU, may not be portable" )
option(EMBEDDED_CFG "optimise for older hardware or embedded systems")
if (NOT MSVC)
option(STATIC_LINK_RUNTIME "link statically against compiler runtime, standard library and pthreads")
endif()
@@ -31,6 +32,7 @@ option(WARNINGS_AS_ERRORS "treat all warnings as errors. turn off for developmen
include(cmake/target_link_libraries_system.cmake)
include(cmake/add_import_library.cmake)
include(cmake/add_log_tag.cmake)
include(cmake/libatomic.cmake)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
@@ -111,8 +113,6 @@ endif(WITH_SHELLHOOKS)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSEIL_DIR vendor/abseil-cpp)
add_subdirectory(vendor/gtest)
add_subdirectory(${ABSEIL_DIR})
include_directories(SYSTEM ${ABSEIL_DIR})
add_subdirectory(vendor/cxxopts)
add_subdirectory(vendor/nlohmann)
@@ -163,7 +163,21 @@ if(NATIVE_BUILD)
set(CRYPTO_FLAGS -march=native -mfpmath=sse -mtune=native)
endif()
add_compile_options(${OPTIMIZE_FLAGS} ${CRYPTO_FLAGS})
if(EMBEDDED_CFG)
message(WARNING "This configuration is optimised for older hardware and/or constrained node operation, may result in poor performance on desktop systems")
message(WARNING "For deployment on such systems, all external code (currently, libuv) must also be compiled for the target!")
set(CRYPTO_FLAGS -march=i486 -mtune=i486)
endif()
if (NOT MSVC OR NOT MSVC_VERSION)
add_compile_options(${OPTIMIZE_FLAGS} ${CRYPTO_FLAGS})
endif()
add_subdirectory(${ABSEIL_DIR})
add_subdirectory(vendor/gtest)
if (FS_LIB STREQUAL "cppbackport")
add_subdirectory(vendor)
endif()
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
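For reference, a hedged usage sketch of configuring these options (standard CMake cache syntax; EMBEDDED_CFG and STATIC_LINK_RUNTIME are the options defined above, the source path is a placeholder):

cmake -DEMBEDDED_CFG=ON -DSTATIC_LINK_RUNTIME=ON <path-to-loki-network>  # configure an i486-tuned embedded build
# per the warning above, external code (currently libuv) must also be built for the same target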

@@ -0,0 +1,49 @@
function(check_working_cxx_atomics64 varname)
set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
if (EMBEDDED_CFG)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -m32 -march=i486")
elseif(MSVC OR MSVC_VERSION)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -arch:IA32 -std:c++14")
else()
# CMAKE_CXX_STANDARD does not propagate to cmake compile tests
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++14")
endif()
check_cxx_source_compiles("
#include <atomic>
#include <cstdint>
std::atomic<uint64_t> x (0);
int main() {
uint64_t i = x.load(std::memory_order_relaxed);
return 0;
}
" ${varname})
set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
endfunction()
function(link_libatomic)
check_working_cxx_atomics64(HAVE_CXX_ATOMICS64_WITHOUT_LIB)
if(HAVE_CXX_ATOMICS64_WITHOUT_LIB)
message(STATUS "Have working 64bit atomics")
return()
endif()
if (NOT MSVC AND NOT MSVC_VERSION)
check_library_exists(atomic __atomic_load_8 "" HAVE_CXX_LIBATOMICS64)
if (HAVE_CXX_LIBATOMICS64)
message(STATUS "Have 64bit atomics via library")
list(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
check_working_cxx_atomics64(HAVE_CXX_ATOMICS64_WITH_LIB)
if (HAVE_CXX_ATOMICS64_WITH_LIB)
message(STATUS "Can link with libatomic")
link_libraries(-latomic)
return()
endif()
endif()
endif()
if (MSVC OR MSVC_VERSION)
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic! (What does MSVC do to inline atomics?)")
else()
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic!")
endif()
endfunction()
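The new cmake/libatomic.cmake module above is wired into the build elsewhere in this change; a minimal sketch of the intended call pattern, assembled from the other hunks in this commit:

# top-level CMakeLists.txt
include(cmake/libatomic.cmake)

# platform config: probe for native 64-bit std::atomic and fall back to
# linking -latomic only when the probe fails (fatal error otherwise)
if(EMBEDDED_CFG OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
  link_libatomic()
endif()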

@@ -24,53 +24,16 @@ endif()
include_directories(${LIBUV_INCLUDE_DIRS})
function(check_working_cxx_atomics64 varname)
set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++14")
check_cxx_source_compiles("
#include <atomic>
#include <cstdint>
std::atomic<uint64_t> x (0);
int main() {
uint64_t i = x.load(std::memory_order_relaxed);
return 0;
}
" ${varname})
set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
endfunction()
function(link_libatomic)
check_working_cxx_atomics64(HAVE_CXX_ATOMICS64_WITHOUT_LIB)
if(HAVE_CXX_ATOMICS64_WITHOUT_LIB)
message(STATUS "Have working 64bit atomics")
return()
endif()
check_library_exists(atomic __atomic_load_8 "" HAVE_CXX_LIBATOMICS64)
if (HAVE_CXX_LIBATOMICS64)
message(STATUS "Have 64bit atomics via library")
list(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
check_working_cxx_atomics64(HAVE_CXX_ATOMICS64_WITH_LIB)
if (HAVE_CXX_ATOMICS64_WITH_LIB)
message(STATUS "Can link with libatomic")
link_libraries(-latomic)
return()
endif()
endif()
message(FATAL_ERROR "Host compiler must support 64-bit std::atomic!")
endfunction()
if(EMBEDDED_CFG OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
link_libatomic()
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(FS_LIB stdc++fs)
get_filename_component(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-unix-linux.c ABSOLUTE)
link_libatomic()
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Android")
find_library(FS_LIB NAMES c++fs c++experimental stdc++fs)
if(FS_LIB STREQUAL FS_LIB-NOTFOUND)
add_subdirectory(vendor)
include_directories("${CMAKE_CURRENT_LIST_DIR}/../vendor/cppbackport-master/lib")
add_definitions(-DLOKINET_USE_CPPBACKPORT)
set(FS_LIB cppbackport)
@@ -86,7 +49,6 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "
elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
find_library(FS_LIB NAMES c++fs c++experimental stdc++fs)
if(FS_LIB STREQUAL FS_LIB-NOTFOUND)
add_subdirectory(vendor)
include_directories("${CMAKE_CURRENT_LIST_DIR}/../vendor/cppbackport-master/lib")
add_definitions(-DLOKINET_USE_CPPBACKPORT)
set(FS_LIB cppbackport)
@@ -96,8 +58,10 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
set(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-unix-sunos.c)
# Apple C++ screws up name decorations in stdc++fs, causing link to fail
# Samsung does not build c++experimental or c++fs in their Apple libc++ pkgsrc build
if (LIBUV_USE_STATIC)
link_libraries(-lkstat -lsendfile)
endif()
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_subdirectory(vendor)
include_directories("${CMAKE_CURRENT_LIST_DIR}/../vendor/cppbackport-master/lib")
add_definitions(-DLOKINET_USE_CPPBACKPORT)
set(FS_LIB cppbackport)
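The Android, Darwin, and SunOS branches repeat the same cppbackport fallback verbatim; a hedged sketch of how that shared shape could be factored (the helper name is hypothetical, not part of this diff; the body is copied from the repeated blocks above):

function(use_cppbackport_fs_fallback)  # hypothetical helper, shown for illustration only
  add_subdirectory(vendor)
  include_directories("${CMAKE_CURRENT_LIST_DIR}/../vendor/cppbackport-master/lib")
  add_definitions(-DLOKINET_USE_CPPBACKPORT)
  set(FS_LIB cppbackport PARENT_SCOPE)  # functions need PARENT_SCOPE to export the variable
endfunction()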

@@ -8,7 +8,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
if (MSVC OR MSVC_VERSION)
add_compile_options(/EHca /arch:AVX2 /MD)
add_definitions(-D_SILENCE_CXX17_OLD_ALLOCATOR_MEMBERS_DEPRECATION_WARNING)
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
add_compile_options(-Wno-nonportable-system-include-path)
endif()
@@ -25,11 +24,13 @@ if(NOT MSVC_VERSION)
add_definitions(-DWINVER=0x0500 -D_WIN32_WINNT=0x0500)
# Wait a minute, if we're not Microsoft C++, nor a Clang paired with Microsoft C++,
# then the only possible option has to be GNU or a GNU-linked Clang!
if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0 OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(FS_LIB stdc++fs)
endif()
set(FS_LIB stdc++fs)
endif()
if(EMBEDDED_CFG)
link_libatomic()
endif()
get_filename_component(LIBTUNTAP_IMPL ${TT_ROOT}/tuntap-windows.c ABSOLUTE)
get_filename_component(EV_SRC "llarp/ev/ev_win32.cpp" ABSOLUTE)
add_definitions(-DWIN32_LEAN_AND_MEAN -DWIN32 -DWINVER=0x0500)

@@ -26,9 +26,9 @@
<method_credential user="lokinet" group="lokinet"/>
</method_context>
<exec_method type="method" name="start" exec="/usr/bin/lokinet" timeout_seconds="60"/>
<exec_method type="method" name="start" exec="/usr/bin/lokinet %{config_file}" timeout_seconds="60"/>
<exec_method type="method" name="stop" exec="/usr/bin/kill -INT &lt;&lt;&lt; /path/to/lokinet.pid" timeout_seconds="60"/>
<exec_method type="method" name="stop" exec="/usr/bin/kill -INT &lt;&lt;&lt; `pgrep lokinet`" timeout_seconds="60"/>
<property_group name="startd" type="framework">
<propval name="duration" type="astring" value="child"/>
@@ -38,7 +38,7 @@
</property_group>
<property_group name="application" type="application">
<propval name="config_file" type="astring" value="/etc/lokinet.ini"/>
<propval name="config_file" type="astring" value="/etc/loki/lokinet.ini"/>
</property_group>
</instance>
@@ -50,11 +50,11 @@
<template>
<common_name>
<loctext xml:lang="C">
LokiNET
LokiNET: Anonymous Network layer thingydoo.
</loctext>
</common_name>
</template>
</service>
</service_bundle>
</service_bundle>

@@ -2,30 +2,30 @@
#ifndef blake2b_compress_avx2_H
#define blake2b_compress_avx2_H
#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
#define LOADU128(p) _mm_loadu_si128((__m128i *)(p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
#define LOAD(p) _mm256_load_si256((__m256i *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
#define LOAD(p) _mm256_load_si256((__m256i *)(p))
#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *)(p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
static inline uint64_t
LOADU64(const void *p)
{
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
}
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, \
4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, \
5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ADD(a, b) _mm256_add_epi64(a, b)
#define SUB(a, b) _mm256_sub_epi64(a, b)
@@ -40,98 +40,104 @@ LOADU64(const void *p)
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
#define BLAKE2B_G1_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while(0)
#define BLAKE2B_G2_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while (0)
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while (0)
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while(0)
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while(0)
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while(0)
#include "blake2b-load-avx2.h"
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
do { \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while (0)
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do { \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while (0)
#define DECLARE_MESSAGE_WORDS(m) \
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do { \
DECLARE_MESSAGE_WORDS(m) \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = \
XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while (0)
do \
{ \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while(0)
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do \
{ \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while(0)
#define DECLARE_MESSAGE_WORDS(m) \
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do \
{ \
DECLARE_MESSAGE_WORDS(m) \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while(0)
#endif

@@ -2,102 +2,99 @@
#ifndef blake2b_compress_sse41_H
#define blake2b_compress_sse41_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse41.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#endif

@@ -2,102 +2,99 @@
#ifndef blake2b_compress_ssse3_H
#define blake2b_compress_ssse3_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse2.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#endif

@@ -1,340 +1,388 @@
#ifndef blake2b_load_avx2_H
#define blake2b_load_avx2_H
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do { \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do { \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do { \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do { \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do { \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do \
{ \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#endif

@@ -16,149 +16,149 @@
#ifndef blake2b_load_sse2_H
#define blake2b_load_sse2_H
#define LOAD_MSG_0_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) \
b0 = _mm_set_epi64x(m12, m11); \
b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) \
b0 = _mm_set_epi64x(m0, m8); \
b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) \
b0 = _mm_set_epi64x(m3, m10); \
b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) \
b0 = _mm_set_epi64x(m6, m14); \
b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) \
b0 = _mm_set_epi64x(m1, m9); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) \
b0 = _mm_set_epi64x(m5, m2); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) \
b0 = _mm_set_epi64x(m10, m6); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) \
b0 = _mm_set_epi64x(m5, m9); \
b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) \
b0 = _mm_set_epi64x(m7, m0); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) \
b0 = _mm_set_epi64x(m11, m14); \
b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) \
b0 = _mm_set_epi64x(m12, m1); \
b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) \
b0 = _mm_set_epi64x(m6, m2); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) \
b0 = _mm_set_epi64x(m10, m12); \
b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) \
b0 = _mm_set_epi64x(m7, m4); \
b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) \
b0 = _mm_set_epi64x(m5, m13); \
b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) \
b0 = _mm_set_epi64x(m1, m12); \
b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) \
b0 = _mm_set_epi64x(m6, m0); \
b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) \
b0 = _mm_set_epi64x(m7, m13); \
b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) \
b0 = _mm_set_epi64x(m4, m0); \
b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) \
b0 = _mm_set_epi64x(m14, m6); \
b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) \
b0 = _mm_set_epi64x(m13, m12); \
b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) \
b0 = _mm_set_epi64x(m7, m2); \
b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) \
b0 = _mm_set_epi64x(m4, m2); \
b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_0_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) \
b0 = _mm_set_epi64x(m12, m11); \
b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) \
b0 = _mm_set_epi64x(m0, m8); \
b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) \
b0 = _mm_set_epi64x(m3, m10); \
b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) \
b0 = _mm_set_epi64x(m6, m14); \
b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) \
b0 = _mm_set_epi64x(m1, m9); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) \
b0 = _mm_set_epi64x(m5, m2); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) \
b0 = _mm_set_epi64x(m10, m6); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) \
b0 = _mm_set_epi64x(m5, m9); \
b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) \
b0 = _mm_set_epi64x(m7, m0); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) \
b0 = _mm_set_epi64x(m11, m14); \
b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) \
b0 = _mm_set_epi64x(m12, m1); \
b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) \
b0 = _mm_set_epi64x(m6, m2); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) \
b0 = _mm_set_epi64x(m10, m12); \
b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) \
b0 = _mm_set_epi64x(m7, m4); \
b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) \
b0 = _mm_set_epi64x(m5, m13); \
b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) \
b0 = _mm_set_epi64x(m1, m12); \
b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) \
b0 = _mm_set_epi64x(m6, m0); \
b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) \
b0 = _mm_set_epi64x(m7, m13); \
b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) \
b0 = _mm_set_epi64x(m4, m0); \
b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) \
b0 = _mm_set_epi64x(m14, m6); \
b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) \
b0 = _mm_set_epi64x(m13, m12); \
b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) \
b0 = _mm_set_epi64x(m7, m2); \
b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) \
b0 = _mm_set_epi64x(m4, m2); \
b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#endif

@@ -16,292 +16,340 @@
#ifndef blake2b_load_sse41_H
#define blake2b_load_sse41_H
#define LOAD_MSG_0_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_0_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_1_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_1_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_1_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_1_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
#define LOAD_MSG_2_1(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while (0)
#define LOAD_MSG_2_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while (0)
#define LOAD_MSG_2_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while (0)
#define LOAD_MSG_2_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while (0)
#define LOAD_MSG_3_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while (0)
#define LOAD_MSG_3_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_3_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_3_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_4_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while (0)
#define LOAD_MSG_4_2(b0, b1) \
do { \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_4_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while (0)
#define LOAD_MSG_4_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while (0)
#define LOAD_MSG_5_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_5_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while (0)
#define LOAD_MSG_5_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while (0)
#define LOAD_MSG_5_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while (0)
#define LOAD_MSG_6_1(b0, b1) \
do { \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while (0)
#define LOAD_MSG_6_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while (0)
#define LOAD_MSG_6_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while (0)
#define LOAD_MSG_6_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while (0)
#define LOAD_MSG_7_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while (0)
#define LOAD_MSG_7_2(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_7_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while (0)
#define LOAD_MSG_7_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while (0)
#define LOAD_MSG_8_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while (0)
#define LOAD_MSG_8_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while (0)
#define LOAD_MSG_8_3(b0, b1) \
do { \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while (0)
#define LOAD_MSG_8_4(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while (0)
#define LOAD_MSG_9_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while (0)
#define LOAD_MSG_9_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while (0)
#define LOAD_MSG_9_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while (0)
#define LOAD_MSG_9_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while (0)
#define LOAD_MSG_10_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_10_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_11_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_11_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_11_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_11_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif
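Editorial note: the LOAD_MSG_r_i macros above (shown in both the old and the clang-formatted layout) hard-code the BLAKE2b message schedule. With the input block held as m0 = (w0,w1) ... m7 = (w14,w15), each macro gathers the message words fed to a pair of G functions in round r. A minimal sketch of the schedule they encode, assuming the standard BLAKE2b sigma table (background, not code from this diff); since BLAKE2b's rounds 10 and 11 reuse sigma rows 0 and 1, LOAD_MSG_10_* and LOAD_MSG_11_* repeat LOAD_MSG_0_* and LOAD_MSG_1_* above.

#include <stdint.h>

/* First two rows of the standard BLAKE2b sigma permutation (sketch).
 * Row 0 is the identity, so LOAD_MSG_0_1 must deliver b0 = (w0, w2)
 * and b1 = (w4, w6) -- exactly what _mm_unpacklo_epi64(m0, m1) and
 * _mm_unpacklo_epi64(m2, m3) produce for m0 = (w0,w1) ... m3 = (w6,w7). */
static const uint8_t blake2b_sigma_sketch[2][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};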

@@ -24,15 +24,17 @@
#ifndef __amd64__
#ifdef __clang__
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#else
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __target__("sse2")))
#endif
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)
{
return (__m128i){ __a, 0 };
return (__m128i){__a, 0};
}
#endif
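Editorial note: this shim exists because some compilers only expose _mm_cvtsi64_si128 on 64-bit x86, while the 32-bit (non-__amd64__) build still needs it, e.g. for the ChaCha20 counter setup further down. A hedged usage sketch (the wrapper name is invented):

#include <emmintrin.h>
#include <stdint.h>

static __m128i counter_to_vec(uint64_t block_counter)
{
    /* lane 0 = block_counter, lane 1 = 0, on 32- and 64-bit targets alike */
    return _mm_cvtsi64_si128((long long) block_counter);
}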

@@ -1,86 +1,89 @@
if (bytes > 0) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
_mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
_mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
_mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
_mm_storeu_si128((__m128i*) (partialblock + 48), x_3);
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
if(bytes > 0)
{
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
{
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
_mm_storeu_si128((__m128i*)(partialblock + 0), x_0);
_mm_storeu_si128((__m128i*)(partialblock + 16), x_1);
_mm_storeu_si128((__m128i*)(partialblock + 32), x_2);
_mm_storeu_si128((__m128i*)(partialblock + 48), x_3);
for(i = 0; i < bytes; i++)
{
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
}
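For orientation, a scalar sketch of the ChaCha20 quarter round that the vector code above applies to whole rows at once (the standard definition, not code from this diff): the rotations by 16 and 8 become _mm_shuffle_epi8 with the rot16/rot8 constants, while the 12- and 7-bit rotations keep the shift/shift/xor form.

#include <stdint.h>

static uint32_t rotl32(uint32_t v, int n)
{
    return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter round over four 32-bit state words. */
static void chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c,
                                uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}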

@@ -1,98 +1,101 @@
while (bytes >= 64) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48)));
_mm_storeu_si128((__m128i*) (c + 0), x_0);
_mm_storeu_si128((__m128i*) (c + 16), x_1);
_mm_storeu_si128((__m128i*) (c + 32), x_2);
_mm_storeu_si128((__m128i*) (c + 48), x_3);
in12 = x[12];
in13 = x[13];
in12++;
if (in12 == 0) {
in13++;
}
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;
while(bytes >= 64)
{
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
{
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
_mm_storeu_si128((__m128i*)(c + 0), x_0);
_mm_storeu_si128((__m128i*)(c + 16), x_1);
_mm_storeu_si128((__m128i*)(c + 32), x_2);
_mm_storeu_si128((__m128i*)(c + 48), x_3);
in12 = x[12];
in13 = x[13];
in12++;
if(in12 == 0)
{
in13++;
}
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;
}
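The in12/in13 bookkeeping at the end of this loop is a 64-bit block counter split across two 32-bit state words; an equivalent sketch (the multi-block paths below build the combined value the same way as in1213):

uint64_t counter = (uint64_t) x[12] | ((uint64_t) x[13] << 32);
counter += 1;                      /* one 64-byte block consumed */
x[12] = (uint32_t) counter;
x[13] = (uint32_t) (counter >> 32);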

@@ -1,174 +1,177 @@
#define VEC4_ROT(A, IMM) \
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
if (bytes >= 256) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while (bytes >= 256) {
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
\
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
\
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
if(bytes >= 256)
{
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while(bytes >= 256)
{
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
{
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
\
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
\
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
bytes -= 256;
c += 256;
m += 256;
}
bytes -= 256;
c += 256;
m += 256;
}
}
#undef VEC4_ROT
#undef VEC4_QUARTERROUND
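Editorial note: ONEQUAD_TRANSPOSE in this hunk is a standard 4x4 transpose of 32-bit lanes. After the rounds, x_A holds word A of four parallel blocks; the unpack pairs regroup the data so each store writes four words belonging to a single block. A self-contained sketch of the idiom (function name invented):

#include <emmintrin.h>

/* out[j] receives lane j of each input row, i.e. the transpose. */
static void transpose_4x4_epi32(__m128i r0, __m128i r1, __m128i r2,
                                __m128i r3, __m128i out[4])
{
    __m128i t0 = _mm_unpacklo_epi32(r0, r1); /* r0[0] r1[0] r0[1] r1[1] */
    __m128i t1 = _mm_unpacklo_epi32(r2, r3); /* r2[0] r3[0] r2[1] r3[1] */
    __m128i t2 = _mm_unpackhi_epi32(r0, r1); /* r0[2] r1[2] r0[3] r1[3] */
    __m128i t3 = _mm_unpackhi_epi32(r2, r3); /* r2[2] r3[2] r2[3] r3[3] */
    out[0] = _mm_unpacklo_epi64(t0, t1);     /* r0[0] r1[0] r2[0] r3[0] */
    out[1] = _mm_unpackhi_epi64(t0, t1);     /* r0[1] r1[1] r2[1] r3[1] */
    out[2] = _mm_unpacklo_epi64(t2, t3);     /* r0[2] r1[2] r2[2] r3[2] */
    out[3] = _mm_unpackhi_epi64(t2, t3);     /* r0[3] r1[3] r2[3] r3[3] */
}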

@@ -1,346 +1,344 @@
#define VEC8_ROT(A, IMM) \
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
/* implements a vector quarter round by-the-book (naive!) */
#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles
* (8 & 16) (not as good as previous) */
#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
A4, B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
D4) \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
D4)
if (bytes >= 512) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while (bytes >= 512) {
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, D4)
if(bytes >= 512)
{
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while(bytes >= 512)
{
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
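/* (Editorial sketch: AVX2 _mm256_unpack{lo,hi}_epi32 interleave within
   each 128-bit half independently, so the two unpack passes above leave
   the eight per-block counters out of cross-lane order; the
   _mm256_permutevar8x32_epi32 calls -- whose mask selects source lanes
   0,1,4,5,2,3,6,7 -- restore the ordering the store path expects.) */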
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
{
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
}
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = \
_mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
t_##B = \
_mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
t_##C = \
_mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
t_##D = \
_mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
t_##A2 = _mm256_xor_si256(t_##A2, \
_mm256_loadu_si256((__m256i*) (m + 256))); \
t_##B2 = _mm256_xor_si256(t_##B2, \
_mm256_loadu_si256((__m256i*) (m + 320))); \
t_##C2 = _mm256_xor_si256(t_##C2, \
_mm256_loadu_si256((__m256i*) (m + 384))); \
t_##D2 = _mm256_xor_si256(t_##D2, \
_mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), t_##A); \
_mm256_storeu_si256((__m256i*) (c + 64), t_##B); \
_mm256_storeu_si256((__m256i*) (c + 128), t_##C); \
_mm256_storeu_si256((__m256i*) (c + 192), t_##D); \
_mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
t_##B = _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
t_##C = _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
t_##D = _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
t_##A2 = \
_mm256_xor_si256(t_##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
t_##B2 = \
_mm256_xor_si256(t_##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
t_##C2 = \
_mm256_xor_si256(t_##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
t_##D2 = \
_mm256_xor_si256(t_##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), t_##A); \
_mm256_storeu_si256((__m256i*)(c + 64), t_##B); \
_mm256_storeu_si256((__m256i*)(c + 128), t_##C); \
_mm256_storeu_si256((__m256i*)(c + 192), t_##D); \
_mm256_storeu_si256((__m256i*)(c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), t_##D2); \
}
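The 0x20/0x31 selectors in ONEOCTO are the AVX2 lane-recombination idiom: _mm256_permute2x128_si256(a, b, 0x20) concatenates the low 128-bit halves of its operands, and 0x31 the high halves, so each t register gathers the two halves that are XORed and stored at the same output offset. A sketch (function name invented):

#include <immintrin.h>

/* lo = (a.low128, b.low128), hi = (a.high128, b.high128) */
static void recombine_lanes(__m256i a, __m256i b, __m256i *lo, __m256i *hi)
{
    *lo = _mm256_permute2x128_si256(a, b, 0x20);
    *hi = _mm256_permute2x128_si256(a, b, 0x31);
}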
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
bytes -= 512;
c += 512;
m += 512;
}
}
#undef VEC8_ROT
#undef VEC8_QUARTERROUND

@@ -4,19 +4,18 @@
#include <stdint.h>
typedef struct crypto_stream_chacha20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
typedef struct crypto_stream_chacha20_implementation
{
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint32_t ic, const unsigned char *k);
} crypto_stream_chacha20_implementation;
#endif
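This struct is a function-pointer vtable: each backend (reference, SSSE3, AVX2) fills one in, and a dispatcher picks an instance at startup. A hypothetical selection sketch -- the names below are invented for illustration and are not the library's actual API:

extern const crypto_stream_chacha20_implementation chacha20_impl_ref;  /* assumed */
extern const crypto_stream_chacha20_implementation chacha20_impl_avx2; /* assumed */

static const crypto_stream_chacha20_implementation *
chacha20_pick_implementation(int have_avx2)
{
    /* have_avx2 would come from a CPUID-based feature probe */
    return have_avx2 ? &chacha20_impl_avx2 : &chacha20_impl_ref;
}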

@@ -74,8 +74,8 @@ randombytes_stir(void)
try
{
var window_ = 'object' === typeof window ? window : self;
var crypto_ = typeof window_.crypto !== 'undefined' ? window_.crypto
: window_.msCrypto;
var crypto_ = typeof window_.crypto !==
'undefined' ? window_.crypto : window_.msCrypto;
var randomValuesStandard = function()
{
var buf = new Uint32Array(1);

@@ -69,7 +69,7 @@ typedef NTSTATUS(FAR PASCAL *CNGAPI_DRBG)(BCRYPT_ALG_HANDLE, UCHAR *, ULONG,
#endif
#ifndef TLS
#ifdef _WIN32
#ifdef _WIN32
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else

File diff suppressed because it is too large

@@ -1,40 +1,52 @@
{
{ 25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626, -11754271, -6079156, 2047605 },
{ -12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384, 19500929, -15469378 },
{ -8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919, 11864899, -24514362, -4438546 }
},
{
{ 15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600, -14772189, 28944400, -1550024 },
{ 16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577, -11775962, 7689662, 11199574 },
{ 30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774, 10017326, -17749093, -9920357 }
},
{
{ 10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885, 14515107, -15438304, 10819380 },
{ 4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668, 12483688, -12668491, 5581306 },
{ 19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350, 13850243, -23678021, -15815942 }
},
{
{ 5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134, -23952439, -15175766 },
{ -30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025, 16520125, 30598449, 7715701 },
{ 28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660, 1370708, 29794553, -1409300 }
},
{
{ -22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211, -1361450, -13062696, 13821877 },
{ -6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028, -7212327, 18853322, -14220951 },
{ 4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358, -10431137, 2207753, -3209784 }
},
{
{ -25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364, -663000, -31111463, -16132436 },
{ 25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789, 15725684, 171356, 6466918 },
{ 23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339, -14088058, -30714912, 16193877 }
},
{
{ -33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398, 4729455, -18074513, 9256800 },
{ -25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405, 9761698, -19827198, 630305 },
{ -13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551, -15960994, -2449256, -14291300 }
},
{
{ -3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575, 15033784, 25105118, -7894876 },
{ -24326370, 15950226, -31801215, -14592823, -11662737, -5090925, 1573892, -2625887, 2198790, -15804619 },
{ -3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022, -16236442, -32461234, -12290683 }
{{25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
-11754271, -6079156, 2047605},
{-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384,
19500929, -15469378},
{-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
11864899, -24514362, -4438546}},
{{15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
-14772189, 28944400, -1550024},
{16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
-11775962, 7689662, 11199574},
{30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
10017326, -17749093, -9920357}},
{{10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
14515107, -15438304, 10819380},
{4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
12483688, -12668491, 5581306},
{19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
13850243, -23678021, -15815942}},
{{5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134,
-23952439, -15175766},
{-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
16520125, 30598449, 7715701},
{28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
1370708, 29794553, -1409300}},
{{-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
-1361450, -13062696, 13821877},
{-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
-7212327, 18853322, -14220951},
{4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
-10431137, 2207753, -3209784}},
{{-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
-663000, -31111463, -16132436},
{25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
15725684, 171356, 6466918},
{23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
-14088058, -30714912, 16193877}},
{{-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
4729455, -18074513, 9256800},
{-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
9761698, -19827198, 630305},
{-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
-15960994, -2449256, -14291300}},
{
{-3151181, -5046075, 9282714, 6866145, -31907062,
-863023, -18940575, 15033784, 25105118, -7894876},
{-24326370, 15950226, -31801215, -14592823, -11662737,
-5090925, 1573892, -2625887, 2198790, -15804619},
{
-3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
-16236442, -32461234, -12290683
}
}

@@ -1,20 +1,18 @@
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
static const fe25519 d = {
-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
};
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555
*/
static const fe25519 d = {-10913610, 13857413, -15372611, 6949391, 114729,
-8787816, -6275908, -3247719, -18696448, -12055116};
/* 2 * d =
* 16295367250680780974490674513165176452449235426866156013048779062215315747161
*/
static const fe25519 d2 = {
-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199 };
static const fe25519 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
15978800, -12551817, -6495438, 29715968, 9444199};
/* sqrt(-1) */
static const fe25519 sqrtm1 = {
-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
};
static const fe25519 sqrtm1 = {-32595792, -7943725, 9377950, 3500415,
12389472, -272473, -25146209, -2005654,
326686, 11406482};
/* A = 486662 */
static const fe25519 curve25519_A = {
486662, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static const fe25519 curve25519_A = {486662, 0, 0, 0, 0, 0, 0, 0, 0, 0};
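For orientation, these are the standard Ed25519/Curve25519 parameters (background facts, not restated from this diff): with $p = 2^{255}-19$,
$$d \equiv -\tfrac{121665}{121666} \pmod{p}, \qquad d2 \equiv 2d \pmod{p}, \qquad \mathrm{sqrtm1}^2 \equiv -1 \pmod{p},$$
and $A = 486662$ is the coefficient of the Montgomery form $y^2 = x^3 + Ax^2 + x$.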

@@ -5,70 +5,70 @@
void
fe25519_frombytes(fe25519 h, const unsigned char *s)
{
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t) 1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t) 1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t) 1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t) 1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t) 1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t) 1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t) 1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t) 1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t) 1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t) 1L << 26);
h[0] = (int32_t) h0;
h[1] = (int32_t) h1;
h[2] = (int32_t) h2;
h[3] = (int32_t) h3;
h[4] = (int32_t) h4;
h[5] = (int32_t) h5;
h[6] = (int32_t) h6;
h[7] = (int32_t) h7;
h[8] = (int32_t) h8;
h[9] = (int32_t) h9;
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t)1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t)1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t)1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t)1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t)1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t)1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t)1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t)1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t)1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t)1L << 26);
h[0] = (int32_t)h0;
h[1] = (int32_t)h1;
h[2] = (int32_t)h2;
h[3] = (int32_t)h3;
h[4] = (int32_t)h4;
h[5] = (int32_t)h5;
h[6] = (int32_t)h6;
h[7] = (int32_t)h7;
h[8] = (int32_t)h8;
h[9] = (int32_t)h9;
}
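Each carry step above is a rounding shift that pulls a limb back into its
25- or 26-bit budget; the h9 step feeds its carry into h0 scaled by 19 because
2^255 = 19 (mod p). A minimal standalone sketch of one 25-bit step, with
illustrative values not taken from the diff:

#include <stdint.h>
#include <stdio.h>

/* One signed carry step at a 25-bit limb boundary: afterwards *lo lies in
 * [-2^24, 2^24] and *hi has absorbed the carry at the next radix position. */
static void carry25(int64_t *lo, int64_t *hi)
{
    int64_t carry = (*lo + (int64_t)(1L << 24)) >> 25; /* rounding shift */
    *hi += carry;
    *lo -= carry * ((int64_t)1 << 25);
}

int main(void)
{
    int64_t h1 = 123456789, h2 = 42; /* illustrative limb values */
    carry25(&h1, &h2);
    printf("h1=%lld h2=%lld\n", (long long)h1, (long long)h2);
    /* prints h1=-10760939 h2=46; |h1| < 2^24 as required */
    return 0;
}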
/*
@ -99,76 +99,77 @@ fe25519_frombytes(fe25519 h, const unsigned char *s)
static void
fe25519_reduce(fe25519 h, const fe25519 f)
{
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t) 1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t) 1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t) 1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t) 1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t) 1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t) 1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t) 1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t) 1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t) 1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t) 1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
carry8, carry9;
q = (19 * h9 + ((uint32_t)1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t)1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t)1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t)1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t)1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t)1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t)1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t)1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t)1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t)1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t)1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
}
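Stated as arithmetic: within this routine's input bounds the cascade of shifts
yields q = 1 exactly when h >= 2^255 - 19 and q = 0 otherwise, and the final
carry pass implements

    h \bmod (2^{255}-19) \;=\; h + 19q - q\,2^{255} \;=\; h - q\,(2^{255}-19),
    \qquad q \in \{0, 1\}

i.e. adding 19q and letting the carry fall off bit 255 performs the
conditional subtraction without a branch.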
/*
@ -182,39 +183,39 @@ fe25519_reduce(fe25519 h, const fe25519 f)
void
fe25519_tobytes(unsigned char *s, const fe25519 h)
{
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t) 1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t) 1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t) 1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t) 1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t) 1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t) 1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t) 1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t) 1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t)1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t)1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t)1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t)1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t)1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t)1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t)1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t)1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;
}
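The mixed terms stitch adjacent limbs at the bytes where their bit ranges
meet: limb t[1] starts at overall bit 26, so byte 3 (bits 24..31) takes the
top two bits of t[0] plus the low bits of t[1], assuming the fully reduced
form produced by fe25519_reduce. A tiny check with illustrative limb values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t t0 = 0x03ffffff; /* all 26 bits of limb 0 set */
    uint32_t t1 = 0x1;        /* low bit of limb 1 = overall bit 26 */
    uint8_t  s3 = (uint8_t)((t0 >> 24) | (t1 * ((uint32_t)1 << 2)));
    printf("%02x\n", s3); /* 07: bits 24,25 from t0 plus bit 26 from t1 */
    return 0;
}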

@ -17,4 +17,3 @@
#define REDMASK51 crypto_scalarmult_curve25519_sandy2x_REDMASK51
#endif /* ifndef consts_namespace_H */

@ -26,13 +26,14 @@ crypto_scalarmult_curve25519_sandy2x(unsigned char *q, const unsigned char *n,
const unsigned char *p)
{
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0; i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
{
t[i] = n[i];
}
t[0] &= 248;
t[31] &= 127;
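The two masks applied to t here are the start of standard X25519 scalar
clamping (RFC 7748); the full clamp also sets bit 254. A sketch of the whole
operation as usually written, for reference rather than lifted from this diff:

#include <stdint.h>

static void x25519_clamp(uint8_t t[32])
{
    t[0] &= 248;  /* clear low 3 bits: scalar becomes a multiple of 8 */
    t[31] &= 127; /* clear bit 255 */
    t[31] |= 64;  /* set bit 254 */
}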
@ -72,13 +73,14 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
const unsigned char *n)
{
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0;i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
{
t[i] = n[i];
}
t[0] &= 248;
t[31] &= 127;
@ -106,9 +108,8 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
}
struct crypto_scalarmult_curve25519_implementation
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult = ) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base = ) crypto_scalarmult_curve25519_sandy2x_base
};
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult =) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base =) crypto_scalarmult_curve25519_sandy2x_base};
#endif

@ -21,6 +21,7 @@ Bounds on each t[i] vary depending on context.
#define fe_frombytes crypto_scalarmult_curve25519_sandy2x_fe_frombytes
extern void fe_frombytes(fe, const unsigned char *);
extern void
fe_frombytes(fe, const unsigned char *);
#endif

@ -9,7 +9,8 @@
#define fe51_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include <stdint.h>
@ -17,16 +18,19 @@ extern "C" {
#include "fe51_namespace.h"
typedef struct
{
typedef struct
{
uint64_t v[5];
}
fe51;
extern void fe51_pack(unsigned char *, const fe51 *);
extern void fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void fe51_nsquare(fe51 *, const fe51 *, int);
extern void fe51_invert(fe51 *, const fe51 *);
} fe51;
extern void
fe51_pack(unsigned char *, const fe51 *);
extern void
fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void
fe51_nsquare(fe51 *, const fe51 *, int);
extern void
fe51_invert(fe51 *, const fe51 *);
#ifdef __cplusplus
}
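For contrast with the 10-limb type above, fe51 is the radix-2^51
representation used by the sandy2x assembly: five 64-bit limbs with

    h \equiv \sum_{i=0}^{4} v_i \, 2^{51 i} \pmod{2^{255}-19}

each v_i holding 51 bits plus headroom for delayed carries (a standard
convention for this code, assumed rather than stated in the diff).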

@ -12,47 +12,47 @@
void
fe51_invert(fe51 *r, const fe51 *x)
{
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2,x);
/* 4 */ fe51_square(&t,&z2);
/* 8 */ fe51_square(&t,&t);
/* 9 */ fe51_mul(&z9,&t,x);
/* 11 */ fe51_mul(&z11,&z9,&z2);
/* 22 */ fe51_square(&t,&z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0,&t,&z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t,&z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0,&t,&z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t,&z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0,&t,&z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t,&z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t,&t,&z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t,&t,10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0,&t,&z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t,&z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0,&t,&z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t,&z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t,&t,&z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t,&t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t,&t,&z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t,&t,5);
/* 2^255 - 21 */ fe51_mul(r,&t,&z11);
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2, x);
/* 4 */ fe51_square(&t, &z2);
/* 8 */ fe51_square(&t, &t);
/* 9 */ fe51_mul(&z9, &t, x);
/* 11 */ fe51_mul(&z11, &z9, &z2);
/* 22 */ fe51_square(&t, &z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0, &t, &z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t, &z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0, &t, &z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t, &z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0, &t, &z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t, &z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t, &t, &z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t, &t, 10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0, &t, &z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t, &z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0, &t, &z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t, &z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t, &t, &z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t, &t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t, &t, &z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t, &t, 5);
/* 2^255 - 21 */ fe51_mul(r, &t, &z11);
}
#endif
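The running comments trace an addition chain for the exponent 2^255 - 21:
since p = 2^255 - 19 is prime, Fermat's little theorem gives

    x^{-1} \equiv x^{p-2} = x^{2^{255}-21} \pmod{p}

and the nsquare counts (5, 10, 20, 10, 50, 100, 50, 5) plus the four single
squarings total 254 squarings and 11 multiplications.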

@ -1,16 +1,15 @@
#ifndef fe51_namespace_H
#define fe51_namespace_H
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#endif /* ifndef fe51_namespace_H */

@ -10,9 +10,9 @@ static uint64_t
load_3(const unsigned char *in)
{
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
return result;
}
@ -20,10 +20,10 @@ static uint64_t
load_4(const unsigned char *in)
{
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result |= ((uint64_t) in[3]) << 24;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
result |= ((uint64_t)in[3]) << 24;
return result;
}
@ -51,17 +51,37 @@ fe_frombytes(fe h, const unsigned char *s)
uint64_t carry8;
uint64_t carry9;
carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
carry9 = h9 >> 25;
h0 += carry9 * 19;
h9 &= 0x1FFFFFF;
carry1 = h1 >> 25;
h2 += carry1;
h1 &= 0x1FFFFFF;
carry3 = h3 >> 25;
h4 += carry3;
h3 &= 0x1FFFFFF;
carry5 = h5 >> 25;
h6 += carry5;
h5 &= 0x1FFFFFF;
carry7 = h7 >> 25;
h8 += carry7;
h7 &= 0x1FFFFFF;
carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
carry0 = h0 >> 26;
h1 += carry0;
h0 &= 0x3FFFFFF;
carry2 = h2 >> 26;
h3 += carry2;
h2 &= 0x3FFFFFF;
carry4 = h4 >> 26;
h5 += carry4;
h4 &= 0x3FFFFFF;
carry6 = h6 >> 26;
h7 += carry6;
h6 &= 0x3FFFFFF;
carry8 = h8 >> 26;
h9 += carry8;
h8 &= 0x3FFFFFF;
h[0] = h0;
h[1] = h1;

@ -2,17 +2,18 @@
#define ladder_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include "fe.h"
#include "ladder_namespace.h"
extern void ladder(fe *, const unsigned char *);
extern void
ladder(fe *, const unsigned char *);
#ifdef __cplusplus
}
#endif
#endif /* ifndef ladder_H */

@ -2,17 +2,18 @@
#define ladder_base_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include "fe.h"
#include "ladder_base_namespace.h"
extern void ladder_base(fe *, const unsigned char *);
extern void
ladder_base(fe *, const unsigned char *);
#ifdef __cplusplus
}
#endif
#endif /* ifndef ladder_base_H */

@ -1,8 +1,7 @@
#ifndef ladder_base_namespace_H
#define ladder_base_namespace_H
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define _ladder_base _crypto_scalarmult_curve25519_sandy2x_ladder_base
#endif /* ifndef ladder_base_namespace_H */

@ -1,8 +1,7 @@
#ifndef ladder_namespace_H
#define ladder_namespace_H
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define _ladder _crypto_scalarmult_curve25519_sandy2x_ladder
#endif /* ifndef ladder_namespace_H */

@ -2,10 +2,10 @@
#ifndef scalarmult_poly1305_H
#define scalarmult_poly1305_H
typedef struct crypto_scalarmult_curve25519_implementation {
int (*mult)(unsigned char *q, const unsigned char *n,
const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
typedef struct crypto_scalarmult_curve25519_implementation
{
int (*mult)(unsigned char *q, const unsigned char *n, const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
} crypto_scalarmult_curve25519_implementation;
#endif

@ -1,18 +1,17 @@
#ifndef sign_ed25519_ref10_H
#define sign_ed25519_ref10_H
void _crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs,
int prehashed);
void
_crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs, int prehashed);
int _crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk, int prehashed);
int
_crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk, int prehashed);
int _crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk,
int prehashed);
int
_crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk, int prehashed);
#endif

@ -1,12 +1,22 @@
int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int
crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);
int
crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int
crypto_kem_keypair_ref(unsigned char *pk, unsigned char *sk);
int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);
int
crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int
crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int
crypto_kem_keypair_avx2(unsigned char *pk, unsigned char *sk);

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
#ifdef HAVE_TI_MODE
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#endif
#endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
{
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
}
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
{
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
}
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
{
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
}
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
{
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
}
#define LOAD64_LE(SRC) load64_le(SRC)
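Two of these helpers merit a usage note: COMPILER_ASSERT turns a false
condition into a negative array size, failing the build rather than the run,
and the rotates are well defined for shift counts 1..31 (or 1..63). A small
self-contained sketch, with illustrative values:

#include <stdint.h>
#include <stdio.h>

#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])

static inline uint32_t rotl32(const uint32_t x, const int b)
{
    return (x << b) | (x >> (32 - b)); /* valid for 1 <= b <= 31 */
}

int main(void)
{
    COMPILER_ASSERT(sizeof(uint32_t) == 4); /* compiles only if true */
    printf("%08x\n", rotl32(0x80000001u, 1)); /* prints 00000003 */
    return 0;
}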
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
#endif
}
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
#endif
}
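On little-endian hosts these collapse to memcpy; the portable branch builds
the value byte by byte. A self-contained round-trip sketch of that branch
(helpers renamed so they do not collide with the header's):

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_load64_le(const uint8_t src[8])
{
    uint64_t w = 0;
    for (int i = 7; i >= 0; i--) {
        w = (w << 8) | src[i]; /* same result as the unrolled form above */
    }
    return w;
}

static void demo_store64_le(uint8_t dst[8], uint64_t w)
{
    for (int i = 0; i < 8; i++) {
        dst[i] = (uint8_t)w;
        w >>= 8;
    }
}

int main(void)
{
    uint8_t buf[8];
    demo_store64_le(buf, 0x0123456789abcdefULL);
    /* the round trip is host-endianness independent */
    printf("%d\n", demo_load64_le(buf) == 0x0123456789abcdefULL); /* 1 */
    return 0;
}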
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
#endif
}
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
#endif
}
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
{
#ifdef NATIVE_BIG_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
#endif
}
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
{
#ifdef NATIVE_BIG_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
#endif
}
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
{
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
}
for(i = 0; i < n; i++)
{
out[i] ^= in[i];
}
}
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# define HAVE_EMMINTRIN_H 1
# define HAVE_PMMINTRIN_H 1
# define HAVE_TMMINTRIN_H 1
# define HAVE_SMMINTRIN_H 1
# define HAVE_AVXINTRIN_H 1
# if _MSC_VER >= 1600
# define HAVE_WMMINTRIN_H 1
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#define HAVE_MMINTRIN_H 1
#define HAVE_EMMINTRIN_H 1
#define HAVE_PMMINTRIN_H 1
#define HAVE_TMMINTRIN_H 1
#define HAVE_SMMINTRIN_H 1
#define HAVE_AVXINTRIN_H 1
#if _MSC_VER >= 1600
#define HAVE_WMMINTRIN_H 1
#endif
#if _MSC_VER >= 1700 && defined(_M_X64)
#define HAVE_AVX2INTRIN_H 1
#endif
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
#endif
#ifdef HAVE_LIBCTGRIND
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
#else
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0
#endif
#endif

@ -14,157 +14,189 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_box_SEEDBYTES crypto_box_curve25519xsalsa20poly1305_SEEDBYTES
SODIUM_EXPORT
size_t crypto_box_seedbytes(void);
#define crypto_box_PUBLICKEYBYTES crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
SODIUM_EXPORT
size_t crypto_box_publickeybytes(void);
#define crypto_box_SECRETKEYBYTES crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
SODIUM_EXPORT
size_t crypto_box_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_seedbytes(void);
#define crypto_box_PUBLICKEYBYTES \
crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
SODIUM_EXPORT
size_t
crypto_box_publickeybytes(void);
#define crypto_box_SECRETKEYBYTES \
crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
SODIUM_EXPORT
size_t
crypto_box_secretkeybytes(void);
#define crypto_box_NONCEBYTES crypto_box_curve25519xsalsa20poly1305_NONCEBYTES
SODIUM_EXPORT
size_t crypto_box_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_box_noncebytes(void);
#define crypto_box_MACBYTES crypto_box_curve25519xsalsa20poly1305_MACBYTES
SODIUM_EXPORT
size_t crypto_box_macbytes(void);
SODIUM_EXPORT
size_t
crypto_box_macbytes(void);
#define crypto_box_MESSAGEBYTES_MAX crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t crypto_box_messagebytes_max(void);
#define crypto_box_MESSAGEBYTES_MAX \
crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t
crypto_box_messagebytes_max(void);
#define crypto_box_PRIMITIVE "curve25519xsalsa20poly1305"
SODIUM_EXPORT
const char *crypto_box_primitive(void);
SODIUM_EXPORT
int crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_box_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
SODIUM_EXPORT
size_t crypto_box_beforenmbytes(void);
SODIUM_EXPORT
int crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
/* -- Ephemeral SK interface -- */
SODIUM_EXPORT
const char *
crypto_box_primitive(void);
SODIUM_EXPORT
int
crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_box_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac, unsigned long long clen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES \
crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
SODIUM_EXPORT
size_t
crypto_box_beforenmbytes(void);
SODIUM_EXPORT
int
crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
/* -- Ephemeral SK interface -- */
#define crypto_box_SEALBYTES (crypto_box_PUBLICKEYBYTES + crypto_box_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_sealbytes(void);
SODIUM_EXPORT
size_t
crypto_box_sealbytes(void);
SODIUM_EXPORT
int crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
SODIUM_EXPORT
int
crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
SODIUM_EXPORT
int crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_ZEROBYTES crypto_box_curve25519xsalsa20poly1305_ZEROBYTES
SODIUM_EXPORT
size_t crypto_box_zerobytes(void);
#define crypto_box_BOXZEROBYTES crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
SODIUM_EXPORT
size_t crypto_box_boxzerobytes(void);
SODIUM_EXPORT
int crypto_box(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
size_t
crypto_box_zerobytes(void);
#define crypto_box_BOXZEROBYTES \
crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
SODIUM_EXPORT
size_t
crypto_box_boxzerobytes(void);
SODIUM_EXPORT
int
crypto_box(unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk) __attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}
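A round-trip sketch of the easy interface declared above, assuming the usual
libsodium runtime (sodium_init() and randombytes_buf() come from elsewhere in
the library):

#include <sodium.h>
#include <stdio.h>

int main(void)
{
    if (sodium_init() < 0)
        return 1;

    unsigned char alice_pk[crypto_box_PUBLICKEYBYTES];
    unsigned char alice_sk[crypto_box_SECRETKEYBYTES];
    unsigned char bob_pk[crypto_box_PUBLICKEYBYTES];
    unsigned char bob_sk[crypto_box_SECRETKEYBYTES];
    crypto_box_keypair(alice_pk, alice_sk);
    crypto_box_keypair(bob_pk, bob_sk);

    const unsigned char msg[] = "hello";
    unsigned char nonce[crypto_box_NONCEBYTES];
    unsigned char c[crypto_box_MACBYTES + sizeof msg];
    unsigned char out[sizeof msg];
    randombytes_buf(nonce, sizeof nonce);

    /* Alice -> Bob: seal with Bob's public key and Alice's secret key. */
    crypto_box_easy(c, msg, sizeof msg, nonce, bob_pk, alice_sk);

    /* Bob opens with Alice's public key and his own secret key. */
    if (crypto_box_open_easy(out, c, sizeof c, nonce, alice_pk, bob_sk) != 0)
        return 1; /* forged or corrupted ciphertext */
    printf("%s\n", out);
    return 0;
}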

@ -6,101 +6,114 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_box_curve25519xsalsa20poly1305_SEEDBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_NONCEBYTES 24U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
#define crypto_box_curve25519xsalsa20poly1305_MACBYTES 16U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_macbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_macbytes(void);
/* Only for the libsodium API - The NaCl compatibility API would require BOXZEROBYTES extra bytes */
/* Only for the libsodium API - The NaCl compatibility API would require
* BOXZEROBYTES extra bytes */
#define crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX \
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX \
- crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES 16U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
#define crypto_box_curve25519xsalsa20poly1305_ZEROBYTES \
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES + \
crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_open(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES \
+ crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305(
unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_open(
unsigned char *m, const unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_core_hchacha20_OUTPUTBYTES 32U
SODIUM_EXPORT
size_t crypto_core_hchacha20_outputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_outputbytes(void);
#define crypto_core_hchacha20_INPUTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_hchacha20_inputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_inputbytes(void);
#define crypto_core_hchacha20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_core_hchacha20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_keybytes(void);
#define crypto_core_hchacha20_CONSTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_hchacha20_constbytes(void);
SODIUM_EXPORT
int crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_constbytes(void);
SODIUM_EXPORT
int
crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus
}

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_core_salsa20_OUTPUTBYTES 64U
SODIUM_EXPORT
size_t crypto_core_salsa20_outputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_outputbytes(void);
#define crypto_core_salsa20_INPUTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_salsa20_inputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_inputbytes(void);
#define crypto_core_salsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_core_salsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_keybytes(void);
#define crypto_core_salsa20_CONSTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_salsa20_constbytes(void);
SODIUM_EXPORT
int crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
SODIUM_EXPORT
size_t
crypto_core_salsa20_constbytes(void);
SODIUM_EXPORT
int
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus
}

@ -7,66 +7,79 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_generichash_BYTES_MIN crypto_generichash_blake2b_BYTES_MIN
SODIUM_EXPORT
size_t crypto_generichash_bytes_min(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes_min(void);
#define crypto_generichash_BYTES_MAX crypto_generichash_blake2b_BYTES_MAX
SODIUM_EXPORT
size_t crypto_generichash_bytes_max(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes_max(void);
#define crypto_generichash_BYTES crypto_generichash_blake2b_BYTES
SODIUM_EXPORT
size_t crypto_generichash_bytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes(void);
#define crypto_generichash_KEYBYTES_MIN crypto_generichash_blake2b_KEYBYTES_MIN
SODIUM_EXPORT
size_t crypto_generichash_keybytes_min(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes_min(void);
#define crypto_generichash_KEYBYTES_MAX crypto_generichash_blake2b_KEYBYTES_MAX
SODIUM_EXPORT
size_t crypto_generichash_keybytes_max(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes_max(void);
#define crypto_generichash_KEYBYTES crypto_generichash_blake2b_KEYBYTES
SODIUM_EXPORT
size_t crypto_generichash_keybytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes(void);
#define crypto_generichash_PRIMITIVE "blake2b"
SODIUM_EXPORT
const char *crypto_generichash_primitive(void);
typedef crypto_generichash_blake2b_state crypto_generichash_state;
SODIUM_EXPORT
size_t crypto_generichash_statebytes(void);
SODIUM_EXPORT
int crypto_generichash(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
SODIUM_EXPORT
int crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int crypto_generichash_final(crypto_generichash_state *state,
unsigned char *out, const size_t outlen);
SODIUM_EXPORT
void crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
SODIUM_EXPORT
const char *
crypto_generichash_primitive(void);
typedef crypto_generichash_blake2b_state crypto_generichash_state;
SODIUM_EXPORT
size_t
crypto_generichash_statebytes(void);
SODIUM_EXPORT
int
crypto_generichash(unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key,
size_t keylen);
SODIUM_EXPORT
int
crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
SODIUM_EXPORT
int
crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in, unsigned long long inlen);
SODIUM_EXPORT
int
crypto_generichash_final(crypto_generichash_state *state, unsigned char *out,
const size_t outlen);
SODIUM_EXPORT
void
crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
#ifdef __cplusplus
}
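A streaming sketch against the init/update/final declarations above (keyless
BLAKE2b over two chunks; assumes sodium_init() from the library entry points):

#include <sodium.h>
#include <stdio.h>

int main(void)
{
    if (sodium_init() < 0)
        return 1;

    unsigned char hash[crypto_generichash_BYTES];
    crypto_generichash_state st;

    crypto_generichash_init(&st, NULL, 0, sizeof hash);
    crypto_generichash_update(&st, (const unsigned char *)"hello ", 6);
    crypto_generichash_update(&st, (const unsigned char *)"world", 5);
    crypto_generichash_final(&st, hash, sizeof hash);

    for (size_t i = 0; i < sizeof hash; i++)
        printf("%02x", hash[i]);
    printf("\n");
    return 0;
}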

@ -8,107 +8,120 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack(1)
#pragma pack(1)
#else
# pragma pack(push, 1)
#pragma pack(push, 1)
#endif
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state {
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state
{
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack()
#pragma pack()
#else
# pragma pack(pop)
#pragma pack(pop)
#endif
#define crypto_generichash_blake2b_BYTES_MIN 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes_min(void);
#define crypto_generichash_blake2b_BYTES_MAX 64U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes_max(void);
#define crypto_generichash_blake2b_BYTES 32U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes(void);
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes_min(void);
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes_max(void);
#define crypto_generichash_blake2b_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes(void);
#define crypto_generichash_blake2b_SALTBYTES 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_saltbytes(void);
#define crypto_generichash_blake2b_BYTES_MIN 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes_min(void);
#define crypto_generichash_blake2b_BYTES_MAX 64U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes_max(void);
#define crypto_generichash_blake2b_BYTES 32U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes(void);
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes_min(void);
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes_max(void);
#define crypto_generichash_blake2b_KEYBYTES 32U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes(void);
#define crypto_generichash_blake2b_SALTBYTES 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_saltbytes(void);
#define crypto_generichash_blake2b_PERSONALBYTES 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_personalbytes(void);
SODIUM_EXPORT
size_t crypto_generichash_blake2b_statebytes(void);
SODIUM_EXPORT
int crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int crypto_generichash_blake2b_salt_personal(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key,
size_t keylen,
const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
SODIUM_EXPORT
int crypto_generichash_blake2b_init_salt_personal(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen,
const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out,
const size_t outlen);
SODIUM_EXPORT
void crypto_generichash_blake2b_keygen(unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_personalbytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_statebytes(void);
SODIUM_EXPORT
int
crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_salt_personal(
unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key, size_t keylen,
const unsigned char *salt, const unsigned char *personal);
SODIUM_EXPORT
int
crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_init_salt_personal(
crypto_generichash_blake2b_state *state, const unsigned char *key,
const size_t keylen, const size_t outlen, const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int
crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out, const size_t outlen);
SODIUM_EXPORT
void
crypto_generichash_blake2b_keygen(
unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
#ifdef __cplusplus
}

@ -7,36 +7,41 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_BYTES crypto_scalarmult_curve25519_BYTES
SODIUM_EXPORT
size_t crypto_scalarmult_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_bytes(void);
#define crypto_scalarmult_SCALARBYTES crypto_scalarmult_curve25519_SCALARBYTES
SODIUM_EXPORT
size_t crypto_scalarmult_scalarbytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_scalarbytes(void);
#define crypto_scalarmult_PRIMITIVE "curve25519"
SODIUM_EXPORT
const char *crypto_scalarmult_primitive(void);
SODIUM_EXPORT
int crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
const char *
crypto_scalarmult_primitive(void);
SODIUM_EXPORT
int
crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p) __attribute__((warn_unused_result));
#ifdef __cplusplus
}
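The NOTE above in concrete form: hash the raw output together with both
public keys before use. A sketch of the client-side derivation using the
crypto_generichash streaming API declared earlier (names are illustrative;
crypto_kx() remains the preferred high-level route):

#include <sodium.h>

/* Derive a session key as H(q || client_pk || server_pk). Returns 0 on
 * success, -1 if crypto_scalarmult rejects the input. */
static int derive_session_key(
    unsigned char key[crypto_generichash_BYTES],
    const unsigned char client_sk[crypto_scalarmult_SCALARBYTES],
    const unsigned char client_pk[crypto_scalarmult_BYTES],
    const unsigned char server_pk[crypto_scalarmult_BYTES])
{
    unsigned char q[crypto_scalarmult_BYTES];
    crypto_generichash_state st;

    if (crypto_scalarmult(q, client_sk, server_pk) != 0)
        return -1;
    crypto_generichash_init(&st, NULL, 0, crypto_generichash_BYTES);
    crypto_generichash_update(&st, q, sizeof q);
    crypto_generichash_update(&st, client_pk, crypto_scalarmult_BYTES);
    crypto_generichash_update(&st, server_pk, crypto_scalarmult_BYTES);
    crypto_generichash_final(&st, key, crypto_generichash_BYTES);
    return 0;
}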

@ -6,32 +6,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_curve25519_BYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_curve25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_curve25519_bytes(void);
#define crypto_scalarmult_curve25519_SCALARBYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_curve25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
SODIUM_EXPORT
size_t
crypto_scalarmult_curve25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus
}

@ -7,32 +7,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_ed25519_BYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_ed25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_ed25519_bytes(void);
#define crypto_scalarmult_ed25519_SCALARBYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_ed25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
SODIUM_EXPORT
size_t
crypto_scalarmult_ed25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
* Or, if that is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus
}

@ -14,87 +14,102 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
typedef crypto_sign_ed25519ph_state crypto_sign_state;
typedef crypto_sign_ed25519ph_state crypto_sign_state;
SODIUM_EXPORT
size_t crypto_sign_statebytes(void);
SODIUM_EXPORT
size_t
crypto_sign_statebytes(void);
#define crypto_sign_BYTES crypto_sign_ed25519_BYTES
SODIUM_EXPORT
size_t crypto_sign_bytes(void);
SODIUM_EXPORT
size_t
crypto_sign_bytes(void);
#define crypto_sign_SEEDBYTES crypto_sign_ed25519_SEEDBYTES
SODIUM_EXPORT
size_t crypto_sign_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_sign_seedbytes(void);
#define crypto_sign_PUBLICKEYBYTES crypto_sign_ed25519_PUBLICKEYBYTES
SODIUM_EXPORT
size_t crypto_sign_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_publickeybytes(void);
#define crypto_sign_SECRETKEYBYTES crypto_sign_ed25519_SECRETKEYBYTES
SODIUM_EXPORT
size_t crypto_sign_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_secretkeybytes(void);
#define crypto_sign_MESSAGEBYTES_MAX crypto_sign_ed25519_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t crypto_sign_messagebytes_max(void);
SODIUM_EXPORT
size_t
crypto_sign_messagebytes_max(void);
#define crypto_sign_PRIMITIVE "ed25519"
SODIUM_EXPORT
const char *crypto_sign_primitive(void);
SODIUM_EXPORT
int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_init(crypto_sign_state *state);
SODIUM_EXPORT
int crypto_sign_update(crypto_sign_state *state,
const unsigned char *m, unsigned long long mlen);
SODIUM_EXPORT
int crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
const char *
crypto_sign_primitive(void);
SODIUM_EXPORT
int
crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk) __attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_verify_detached(const unsigned char *sig, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_init(crypto_sign_state *state);
SODIUM_EXPORT
int
crypto_sign_update(crypto_sign_state *state, const unsigned char *m,
unsigned long long mlen);
SODIUM_EXPORT
int
crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}
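A short usage sketch for the one-shot detached API declared above; the crypto_sign_init()/update()/final_create() trio covers the same flow for multi-part messages. Assumes sodium_init() has already succeeded; names are illustrative:

#include <sodium.h>

int
sign_and_verify_demo(void)
{
    unsigned char       pk[crypto_sign_PUBLICKEYBYTES];
    unsigned char       sk[crypto_sign_SECRETKEYBYTES];
    unsigned char       sig[crypto_sign_BYTES];
    const unsigned char msg[] = "attack at dawn";

    crypto_sign_keypair(pk, sk);
    crypto_sign_detached(sig, NULL, msg, sizeof msg - 1, sk);
    /* warn_unused_result above is deliberate: verification can fail */
    return crypto_sign_verify_detached(sig, msg, sizeof msg - 1, pk);
}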

@ -6,106 +6,125 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
typedef struct crypto_sign_ed25519ph_state {
typedef struct crypto_sign_ed25519ph_state
{
crypto_hash_sha512_state hs;
} crypto_sign_ed25519ph_state;
} crypto_sign_ed25519ph_state;
SODIUM_EXPORT
size_t crypto_sign_ed25519ph_statebytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519ph_statebytes(void);
#define crypto_sign_ed25519_BYTES 64U
SODIUM_EXPORT
size_t crypto_sign_ed25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_bytes(void);
#define crypto_sign_ed25519_SEEDBYTES 32U
SODIUM_EXPORT
size_t crypto_sign_ed25519_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_seedbytes(void);
#define crypto_sign_ed25519_PUBLICKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_sign_ed25519_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_publickeybytes(void);
#define crypto_sign_ed25519_SECRETKEYBYTES (32U + 32U)
SODIUM_EXPORT
size_t crypto_sign_ed25519_secretkeybytes(void);
#define crypto_sign_ed25519_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
SODIUM_EXPORT
size_t crypto_sign_ed25519_messagebytes_max(void);
SODIUM_EXPORT
int crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_seed(unsigned char *seed,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
SODIUM_EXPORT
int crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m,
unsigned long long mlen);
SODIUM_EXPORT
int crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
size_t
crypto_sign_ed25519_secretkeybytes(void);
#define crypto_sign_ed25519_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
SODIUM_EXPORT
size_t
crypto_sign_ed25519_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_seed(unsigned char *seed, const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m, unsigned long long mlen);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}
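The pk_to_curve25519()/sk_to_curve25519() pair above lets a single Ed25519 identity double as an X25519 key for encryption. A hedged sketch with illustrative names:

#include <sodium.h>

int
ed25519_identity_to_x25519(
    unsigned char x_pk[crypto_scalarmult_curve25519_BYTES],
    unsigned char x_sk[crypto_scalarmult_curve25519_SCALARBYTES],
    const unsigned char ed_pk[crypto_sign_ed25519_PUBLICKEYBYTES],
    const unsigned char ed_sk[crypto_sign_ed25519_SECRETKEYBYTES])
{
    if (crypto_sign_ed25519_pk_to_curve25519(x_pk, ed_pk) != 0) {
        return -1; /* small-order or non-canonical point rejected */
    }
    return crypto_sign_ed25519_sk_to_curve25519(x_sk, ed_sk);
}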

@ -16,37 +16,42 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_sign_edwards25519sha512batch_BYTES 64U
#define crypto_sign_edwards25519sha512batch_PUBLICKEYBYTES 32U
#define crypto_sign_edwards25519sha512batch_SECRETKEYBYTES (32U + 32U)
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
__attribute__ ((deprecated));
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((deprecated));
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
__attribute__ ((deprecated));
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
__attribute__((deprecated));
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
__attribute__((deprecated));
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
__attribute__((deprecated));
#ifdef __cplusplus
}

@ -14,82 +14,103 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_chacha20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_chacha20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_keybytes(void);
#define crypto_stream_chacha20_NONCEBYTES 8U
SODIUM_EXPORT
size_t crypto_stream_chacha20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_noncebytes(void);
#define crypto_stream_chacha20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_chacha20_messagebytes_max(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_messagebytes_max(void);
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed */
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed
*/
SODIUM_EXPORT
int crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]);
SODIUM_EXPORT
void
crypto_stream_chacha20_keygen(
unsigned char k[crypto_stream_chacha20_KEYBYTES]);
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
#define crypto_stream_chacha20_ietf_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_keybytes(void);
#define crypto_stream_chacha20_ietf_NONCEBYTES 12U
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_noncebytes(void);
#define crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX \
SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
#define crypto_stream_chacha20_IETF_KEYBYTES crypto_stream_chacha20_ietf_KEYBYTES
#define crypto_stream_chacha20_IETF_NONCEBYTES crypto_stream_chacha20_ietf_NONCEBYTES
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_chacha20_ietf_keygen(
unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
#define crypto_stream_chacha20_IETF_KEYBYTES \
crypto_stream_chacha20_ietf_KEYBYTES
#define crypto_stream_chacha20_IETF_NONCEBYTES \
crypto_stream_chacha20_ietf_NONCEBYTES
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX \
crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
#ifdef __cplusplus
}
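A sketch of how the xor/xor_ic pair above fits together: _xor encrypts (and, being an involution, also decrypts), while _xor_ic sets the initial 64-byte block counter so the keystream can be resumed mid-stream without re-XORing from the start. Names are illustrative, and a bare stream cipher authenticates nothing, so real protocols should prefer an AEAD construction:

#include <sodium.h>

void
chacha20_demo(unsigned char *buf, unsigned long long len,
              const unsigned char n[crypto_stream_chacha20_NONCEBYTES],
              const unsigned char k[crypto_stream_chacha20_KEYBYTES])
{
    /* in-place encryption: c and m may alias */
    crypto_stream_chacha20_xor(buf, buf, len, n, k);
    if (len > 64) {
        /* decrypt everything after the first 64-byte block by seeking
         * the keystream to block 1 instead of starting from ic = 0 */
        crypto_stream_chacha20_xor_ic(buf + 64, buf + 64, len - 64,
                                      n, 1, k);
    }
}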

@ -14,41 +14,48 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_salsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_salsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_keybytes(void);
#define crypto_stream_salsa20_NONCEBYTES 8U
SODIUM_EXPORT
size_t crypto_stream_salsa20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_noncebytes(void);
#define crypto_stream_salsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_salsa20_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
#ifdef __cplusplus
}

@ -14,41 +14,49 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_xsalsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_keybytes(void);
#define crypto_stream_xsalsa20_NONCEBYTES 24U
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_noncebytes(void);
#define crypto_stream_xsalsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_xsalsa20_keygen(
unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
#ifdef __cplusplus
}
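Unlike the 8-byte Salsa20/ChaCha20 nonces above, XSalsa20's 24-byte nonce is large enough to draw at random for every message. A minimal sketch (the nonce must still be sent along with the ciphertext):

#include <sodium.h>

void
xsalsa20_encrypt(unsigned char *c, const unsigned char *m,
                 unsigned long long mlen,
                 unsigned char n[crypto_stream_xsalsa20_NONCEBYTES],
                 const unsigned char k[crypto_stream_xsalsa20_KEYBYTES])
{
    randombytes_buf(n, crypto_stream_xsalsa20_NONCEBYTES);
    crypto_stream_xsalsa20_xor(c, m, mlen, n, k);
}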

@ -7,48 +7,48 @@
#include <limits.h>
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifdef SODIUM_STATIC
# define SODIUM_EXPORT
# define SODIUM_EXPORT_WEAK
#define SODIUM_EXPORT
#define SODIUM_EXPORT_WEAK
#else
#if defined(_MSC_VER)
#ifdef SODIUM_DLL_EXPORT
#define SODIUM_EXPORT __declspec(dllexport)
#else
#define SODIUM_EXPORT __declspec(dllimport)
#endif
#else
#if defined(__SUNPRO_C)
#ifndef __GNU_C__
#define SODIUM_EXPORT __attribute__(visibility(__global))
#else
#define SODIUM_EXPORT __attribute__ __global
#endif
#elif defined(_MSG_VER)
#define SODIUM_EXPORT extern __declspec(dllexport)
#else
# if defined(_MSC_VER)
# ifdef SODIUM_DLL_EXPORT
# define SODIUM_EXPORT __declspec(dllexport)
# else
# define SODIUM_EXPORT __declspec(dllimport)
# endif
# else
# if defined(__SUNPRO_C)
# ifndef __GNU_C__
# define SODIUM_EXPORT __attribute__ (visibility(__global))
# else
# define SODIUM_EXPORT __attribute__ __global
# endif
# elif defined(_MSG_VER)
# define SODIUM_EXPORT extern __declspec(dllexport)
# else
# define SODIUM_EXPORT __attribute__ ((visibility ("default")))
# endif
# endif
# if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
# define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
# else
# define SODIUM_EXPORT_WEAK SODIUM_EXPORT
# endif
#define SODIUM_EXPORT __attribute__((visibility("default")))
#endif
#endif
#if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
#define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
#else
#define SODIUM_EXPORT_WEAK SODIUM_EXPORT
#endif
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
#ifdef HAVE_TI_MODE
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#endif
#endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
{
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
}
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
{
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
}
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
{
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
}
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
{
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
}
#define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
#endif
}
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
#endif
}
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
#endif
}
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
#endif
}
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
{
#ifdef NATIVE_BIG_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
#endif
}
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
{
#ifdef NATIVE_BIG_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
#endif
}
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
{
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
}
for(i = 0; i < n; i++)
{
out[i] ^= in[i];
}
}
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# define HAVE_EMMINTRIN_H 1
# define HAVE_PMMINTRIN_H 1
# define HAVE_TMMINTRIN_H 1
# define HAVE_SMMINTRIN_H 1
# define HAVE_AVXINTRIN_H 1
# if _MSC_VER >= 1600
# define HAVE_WMMINTRIN_H 1
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#define HAVE_MMINTRIN_H 1
#define HAVE_EMMINTRIN_H 1
#define HAVE_PMMINTRIN_H 1
#define HAVE_TMMINTRIN_H 1
#define HAVE_SMMINTRIN_H 1
#define HAVE_AVXINTRIN_H 1
#if _MSC_VER >= 1600
#define HAVE_WMMINTRIN_H 1
#endif
#if _MSC_VER >= 1700 && defined(_M_X64)
#define HAVE_AVX2INTRIN_H 1
#endif
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
#endif
#ifdef HAVE_LIBCTGRIND
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
#else
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0
#endif
#endif
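The fallback branches above reassemble integers byte by byte, so the result is identical on any host byte order; the memcpy() path is merely a fast path for machines whose native order matches. A standalone round-trip check (load32_le() is re-implemented here because common.h is a private header):

#include <assert.h>
#include <stdint.h>

static uint32_t
demo_load32_le(const uint8_t src[4])
{
    return (uint32_t) src[0] | ((uint32_t) src[1] << 8) |
           ((uint32_t) src[2] << 16) | ((uint32_t) src[3] << 24);
}

int
main(void)
{
    /* little-endian encoding of 0x12345678 */
    const uint8_t bytes[4] = { 0x78, 0x56, 0x34, 0x12 };

    assert(demo_load32_le(bytes) == 0x12345678UL);
    return 0;
}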

@ -15,17 +15,19 @@ typedef uint64_t fe25519[5];
typedef int32_t fe25519[10];
#endif
void fe25519_invert(fe25519 out, const fe25519 z);
void fe25519_frombytes(fe25519 h, const unsigned char *s);
void fe25519_tobytes(unsigned char *s, const fe25519 h);
void
fe25519_invert(fe25519 out, const fe25519 z);
void
fe25519_frombytes(fe25519 h, const unsigned char *s);
void
fe25519_tobytes(unsigned char *s, const fe25519 h);
#ifdef HAVE_TI_MODE
# include "ed25519_ref10_fe_51.h"
#include "ed25519_ref10_fe_51.h"
#else
# include "ed25519_ref10_fe_25_5.h"
#include "ed25519_ref10_fe_25_5.h"
#endif
/*
ge means group element.
@ -40,86 +42,109 @@ void fe25519_tobytes(unsigned char *s, const fe25519 h);
ge25519_precomp (Duif): (y+x,y-x,2dxy)
*/
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
} ge25519_p2;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p3;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p1p1;
typedef struct {
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
typedef struct
{
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
} ge25519_precomp;
typedef struct {
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
typedef struct
{
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
} ge25519_cached;
void ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
void
ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
void ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
void
ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
int ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
int
ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
int ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
int
ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
void ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
void
ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
void ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
void
ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
void ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
void
ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
void ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void
ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void
ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
void
ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
void ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A,
const unsigned char *b);
void
ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A, const unsigned char *b);
void ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a,
const ge25519_p3 *p);
void
ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a, const ge25519_p3 *p);
int ge25519_is_canonical(const unsigned char *s);
int
ge25519_is_canonical(const unsigned char *s);
int ge25519_is_on_curve(const ge25519_p3 *p);
int
ge25519_is_on_curve(const ge25519_p3 *p);
int ge25519_is_on_main_subgroup(const ge25519_p3 *p);
int
ge25519_is_on_main_subgroup(const ge25519_p3 *p);
int ge25519_has_small_order(const unsigned char s[32]);
int
ge25519_has_small_order(const unsigned char s[32]);
void ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
void
ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
/*
The set of scalars is \Z/l
where l = 2^252 + 27742317777372353535851937790883648493.
*/
void sc25519_reduce(unsigned char *s);
void
sc25519_reduce(unsigned char *s);
void sc25519_muladd(unsigned char *s, const unsigned char *a,
const unsigned char *b, const unsigned char *c);
void
sc25519_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b,
const unsigned char *c);
int sc25519_is_canonical(const unsigned char *s);
int
sc25519_is_canonical(const unsigned char *s);
#endif
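For orientation, this is roughly how the signing code layered on top of these internals derives a public key: clamp a 32-byte scalar, multiply the base point, serialise. These functions are internal and not installed, so the sketch is illustrative only:

#include "ed25519_ref10.h"

void
demo_public_from_scalar(unsigned char pk[32], unsigned char a[32])
{
    ge25519_p3 A;

    a[0] &= 248;  /* clamp: clear the low 3 bits */
    a[31] &= 127; /* clear the top bit           */
    a[31] |= 64;  /* set the second-highest bit  */
    ge25519_scalarmult_base(&A, a); /* A = a*B */
    ge25519_p3_tobytes(pk, &A);
}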

@ -10,7 +10,7 @@
static inline void
fe25519_0(fe25519 h)
{
memset(&h[0], 0, 5 * sizeof h[0]);
memset(&h[0], 0, 5 * sizeof h[0]);
}
/*
@ -20,8 +20,8 @@ fe25519_0(fe25519 h)
static inline void
fe25519_1(fe25519 h)
{
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
}
/*
@ -32,17 +32,17 @@ fe25519_1(fe25519 h)
static inline void
fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
{
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}
/*
@ -52,37 +52,37 @@ fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}
/*
@ -92,10 +92,10 @@ fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
static inline void
fe25519_neg(fe25519 h, const fe25519 f)
{
fe25519 zero;
fe25519 zero;
fe25519_0(zero);
fe25519_sub(h, zero, f);
fe25519_0(zero);
fe25519_sub(h, zero, f);
}
/*
@ -108,31 +108,31 @@ fe25519_neg(fe25519 h, const fe25519 f)
static void
fe25519_cmov(fe25519 f, const fe25519 g, unsigned int b)
{
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
}
/*
@ -145,43 +145,43 @@ Preconditions: b in {0,1}.
static void
fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
{
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
}
/*
@ -191,17 +191,17 @@ fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
static inline void
fe25519_copy(fe25519 h, const fe25519 f)
{
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
}
/*
@ -212,11 +212,11 @@ fe25519_copy(fe25519 h, const fe25519 f)
static inline int
fe25519_isnegative(const fe25519 f)
{
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return s[0] & 1;
return s[0] & 1;
}
/*
@ -227,11 +227,11 @@ fe25519_isnegative(const fe25519 f)
static inline int
fe25519_iszero(const fe25519 f)
{
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return sodium_is_zero(s, 32);
return sodium_is_zero(s, 32);
}
/*
@ -242,87 +242,87 @@ fe25519_iszero(const fe25519 f)
static void
fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) g0);
r0 += ((uint128_t) f1_19) * ((uint128_t) g4);
r0 += ((uint128_t) f2_19) * ((uint128_t) g3);
r0 += ((uint128_t) f3_19) * ((uint128_t) g2);
r0 += ((uint128_t) f4_19) * ((uint128_t) g1);
r1 = ((uint128_t) f0 ) * ((uint128_t) g1);
r1 += ((uint128_t) f1 ) * ((uint128_t) g0);
r1 += ((uint128_t) f2_19) * ((uint128_t) g4);
r1 += ((uint128_t) f3_19) * ((uint128_t) g3);
r1 += ((uint128_t) f4_19) * ((uint128_t) g2);
r2 = ((uint128_t) f0 ) * ((uint128_t) g2);
r2 += ((uint128_t) f1 ) * ((uint128_t) g1);
r2 += ((uint128_t) f2 ) * ((uint128_t) g0);
r2 += ((uint128_t) f3_19) * ((uint128_t) g4);
r2 += ((uint128_t) f4_19) * ((uint128_t) g3);
r3 = ((uint128_t) f0 ) * ((uint128_t) g3);
r3 += ((uint128_t) f1 ) * ((uint128_t) g2);
r3 += ((uint128_t) f2 ) * ((uint128_t) g1);
r3 += ((uint128_t) f3 ) * ((uint128_t) g0);
r3 += ((uint128_t) f4_19) * ((uint128_t) g4);
r4 = ((uint128_t) f0 ) * ((uint128_t) g4);
r4 += ((uint128_t) f1 ) * ((uint128_t) g3);
r4 += ((uint128_t) f2 ) * ((uint128_t) g2);
r4 += ((uint128_t) f3 ) * ((uint128_t) g1);
r4 += ((uint128_t) f4 ) * ((uint128_t) g0);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)g0);
r0 += ((uint128_t)f1_19) * ((uint128_t)g4);
r0 += ((uint128_t)f2_19) * ((uint128_t)g3);
r0 += ((uint128_t)f3_19) * ((uint128_t)g2);
r0 += ((uint128_t)f4_19) * ((uint128_t)g1);
r1 = ((uint128_t)f0) * ((uint128_t)g1);
r1 += ((uint128_t)f1) * ((uint128_t)g0);
r1 += ((uint128_t)f2_19) * ((uint128_t)g4);
r1 += ((uint128_t)f3_19) * ((uint128_t)g3);
r1 += ((uint128_t)f4_19) * ((uint128_t)g2);
r2 = ((uint128_t)f0) * ((uint128_t)g2);
r2 += ((uint128_t)f1) * ((uint128_t)g1);
r2 += ((uint128_t)f2) * ((uint128_t)g0);
r2 += ((uint128_t)f3_19) * ((uint128_t)g4);
r2 += ((uint128_t)f4_19) * ((uint128_t)g3);
r3 = ((uint128_t)f0) * ((uint128_t)g3);
r3 += ((uint128_t)f1) * ((uint128_t)g2);
r3 += ((uint128_t)f2) * ((uint128_t)g1);
r3 += ((uint128_t)f3) * ((uint128_t)g0);
r3 += ((uint128_t)f4_19) * ((uint128_t)g4);
r4 = ((uint128_t)f0) * ((uint128_t)g4);
r4 += ((uint128_t)f1) * ((uint128_t)g3);
r4 += ((uint128_t)f2) * ((uint128_t)g2);
r4 += ((uint128_t)f3) * ((uint128_t)g1);
r4 += ((uint128_t)f4) * ((uint128_t)g0);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
/*
@ -333,75 +333,75 @@ fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sq(fe25519 h, const fe25519 f)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
/*
@ -412,107 +412,107 @@ fe25519_sq(fe25519 h, const fe25519 f)
static void
fe25519_sq2(fe25519 h, const fe25519 f)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
static void
fe25519_scalar_product(fe25519 h, const fe25519 f, uint32_t n)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t) n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t) a) & mask;
a = f[1] * sn + ((uint64_t) (a >> 51));
h1 = ((uint64_t) a) & mask;
a = f[2] * sn + ((uint64_t) (a >> 51));
h2 = ((uint64_t) a) & mask;
a = f[3] * sn + ((uint64_t) (a >> 51));
h3 = ((uint64_t) a) & mask;
a = f[4] * sn + ((uint64_t) (a >> 51));
h4 = ((uint64_t) a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t)n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t)a) & mask;
a = f[1] * sn + ((uint64_t)(a >> 51));
h1 = ((uint64_t)a) & mask;
a = f[2] * sn + ((uint64_t)(a >> 51));
h2 = ((uint64_t)a) & mask;
a = f[3] * sn + ((uint64_t)(a >> 51));
h3 = ((uint64_t)a) & mask;
a = f[4] * sn + ((uint64_t)(a >> 51));
h4 = ((uint64_t)a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}
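Every routine above shares one reduction idea: limb i carries weight 2^(51*i), and since p = 2^255 - 19, a carry out of the top limb (weight 2^255) folds back into the bottom limb with weight 19. A distilled sketch of that carry pass (the real code follows it with a short second pass, since the fold can push h[0] slightly past 2^51 again):

#include <stdint.h>

static void
demo_carry_pass(uint64_t h[5])
{
    const uint64_t mask = 0x7ffffffffffffULL; /* 2^51 - 1 */
    uint64_t       carry;
    int            i;

    for (i = 0; i < 4; i++) {
        carry = h[i] >> 51;
        h[i] &= mask;
        h[i + 1] += carry;
    }
    carry = h[4] >> 51;
    h[4] &= mask;
    h[0] += 19ULL * carry; /* 2^255 == 19 (mod p) */
}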

@ -1,11 +1,17 @@
#ifndef implementations_H
#define implementations_H
int _crypto_generichash_blake2b_pick_best_implementation(void);
int _crypto_onetimeauth_poly1305_pick_best_implementation(void);
int _crypto_pwhash_argon2_pick_best_implementation(void);
int _crypto_scalarmult_curve25519_pick_best_implementation(void);
int _crypto_stream_chacha20_pick_best_implementation(void);
int _crypto_stream_salsa20_pick_best_implementation(void);
int
_crypto_generichash_blake2b_pick_best_implementation(void);
int
_crypto_onetimeauth_poly1305_pick_best_implementation(void);
int
_crypto_pwhash_argon2_pick_best_implementation(void);
int
_crypto_scalarmult_curve25519_pick_best_implementation(void);
int
_crypto_stream_chacha20_pick_best_implementation(void);
int
_crypto_stream_salsa20_pick_best_implementation(void);
#endif

@ -1,7 +1,9 @@
#ifndef mutex_H
#define mutex_H 1
extern int sodium_crit_enter(void);
extern int sodium_crit_leave(void);
extern int
sodium_crit_enter(void);
extern int
sodium_crit_leave(void);
#endif

@ -4,46 +4,53 @@
#include "common.h"
#ifdef HAVE_INTRIN_H
# include <intrin.h>
#include <intrin.h>
#endif
#if defined(HAVE_EMMINTRIN_H) && \
!(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) || \
defined(_M_X64) || defined(_M_AMD64))
#if defined(HAVE_EMMINTRIN_H) \
&& !(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) \
|| defined(_M_X64) || defined(_M_AMD64))
# include <emmintrin.h>
# include <stdint.h>
#include <emmintrin.h>
#include <stdint.h>
# ifndef _mm_set_epi64x
# define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
#ifndef _mm_set_epi64x
#define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
static inline __m128i
sodium__mm_set_epi64x(int64_t q1, int64_t q0)
{
union { int64_t as64; int32_t as32[2]; } x0, x1;
x0.as64 = q0; x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
union {
int64_t as64;
int32_t as32[2];
} x0, x1;
x0.as64 = q0;
x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
}
# endif
#endif
# ifndef _mm_set1_epi64x
# define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
#ifndef _mm_set1_epi64x
#define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
static inline __m128i
sodium__mm_set1_epi64x(int64_t q)
{
return _mm_set_epi64x(q, q);
return _mm_set_epi64x(q, q);
}
# endif
#endif
# ifndef _mm_cvtsi64_si128
# define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
#ifndef _mm_cvtsi64_si128
#define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
static inline __m128i
sodium__mm_cvtsi64_si128(int64_t q)
{
union { int64_t as64; int32_t as32[2]; } x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
union {
int64_t as64;
int32_t as32[2];
} x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
}
# endif
#endif
#endif

@ -4,19 +4,21 @@
#ifdef __native_client__
# include "export.h"
# include "randombytes.h"
#include "export.h"
#include "randombytes.h"
# ifdef __cplusplus
extern "C" {
# endif
#ifdef __cplusplus
extern "C"
{
#endif
SODIUM_EXPORT
extern struct randombytes_implementation randombytes_nativeclient_implementation;
SODIUM_EXPORT
extern struct randombytes_implementation
randombytes_nativeclient_implementation;
# ifdef __cplusplus
#ifdef __cplusplus
}
# endif
#endif
#endif

@ -5,45 +5,59 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
SODIUM_EXPORT_WEAK
int sodium_runtime_has_neon(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_neon(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse2(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse2(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse3(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse3(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_ssse3(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_ssse3(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse41(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse41(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx2(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx2(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx512f(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx512f(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_pclmul(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_pclmul(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_aesni(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_aesni(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_rdrand(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_rdrand(void);
/* ------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------
*/
int _sodium_runtime_get_cpu_features(void);
int
_sodium_runtime_get_cpu_features(void);
#ifdef __cplusplus
}
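A sketch of querying these detectors from application code; sodium_init() runs _sodium_runtime_get_cpu_features() internally, so call it first:

#include <sodium.h>
#include <stdio.h>

int
main(void)
{
    if (sodium_init() < 0) {
        return 1;
    }
    printf("AVX2:   %d\n", sodium_runtime_has_avx2());
    printf("AES-NI: %d\n", sodium_runtime_has_aesni());
    return 0;
}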

@ -7,161 +7,188 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#ifndef SODIUM_C99
# if defined(__cplusplus) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
# define SODIUM_C99(X)
# else
# define SODIUM_C99(X) X
# endif
#if defined(__cplusplus) || !defined(__STDC_VERSION__) \
|| __STDC_VERSION__ < 199901L
#define SODIUM_C99(X)
#else
#define SODIUM_C99(X) X
#endif
#endif
SODIUM_EXPORT
void sodium_memzero(void * const pnt, const size_t len);
SODIUM_EXPORT
void sodium_stackzero(const size_t len);
/*
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
*/
SODIUM_EXPORT
int sodium_memcmp(const void * const b1_, const void * const b2_, size_t len)
__attribute__ ((warn_unused_result));
/*
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ == b2_.
* It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
*/
SODIUM_EXPORT
int sodium_compare(const unsigned char *b1_, const unsigned char *b2_,
size_t len)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int sodium_is_zero(const unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void sodium_increment(unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
SODIUM_EXPORT
char *sodium_bin2hex(char * const hex, const size_t hex_maxlen,
const unsigned char * const bin, const size_t bin_len);
SODIUM_EXPORT
int sodium_hex2bin(unsigned char * const bin, const size_t bin_maxlen,
const char * const hex, const size_t hex_len,
const char * const ignore, size_t * const bin_len,
const char ** const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
SODIUM_EXPORT
void
sodium_memzero(void *const pnt, const size_t len);
SODIUM_EXPORT
void
sodium_stackzero(const size_t len);
/*
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
*/
SODIUM_EXPORT
int
sodium_memcmp(const void *const b1_, const void *const b2_, size_t len)
__attribute__((warn_unused_result));
/*
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ ==
* b2_. It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
*/
SODIUM_EXPORT
int
sodium_compare(const unsigned char *b1_, const unsigned char *b2_, size_t len)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
sodium_is_zero(const unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void
sodium_increment(unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void
sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
SODIUM_EXPORT
char *
sodium_bin2hex(char *const hex, const size_t hex_maxlen,
const unsigned char *const bin, const size_t bin_len);
SODIUM_EXPORT
int
sodium_hex2bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const hex, const size_t hex_len,
const char *const ignore, size_t *const bin_len,
const char **const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
#define sodium_base64_VARIANT_ORIGINAL_NO_PADDING 3
#define sodium_base64_VARIANT_URLSAFE 5
#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7
/*
* Computes the required length to encode BIN_LEN bytes as a base64 string
* using the given variant. The computed length includes a trailing \0.
*/
#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
(((BIN_LEN) / 3U) * 4U \
+ ((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) \
| (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) \
& 1U) \
* (4U \
- (~((((VARIANT)&2U) >> 1) - 1U) \
& (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) \
+ 1U)
SODIUM_EXPORT
size_t
sodium_base64_encoded_len(const size_t bin_len, const int variant);
SODIUM_EXPORT
char *
sodium_bin2base64(char *const b64, const size_t b64_maxlen,
const unsigned char *const bin, const size_t bin_len,
const int variant);
SODIUM_EXPORT
int
sodium_base642bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const b64, const size_t b64_len,
const char *const ignore, size_t *const bin_len,
const char **const b64_end, const int variant);
SODIUM_EXPORT
int
sodium_mlock(void *const addr, const size_t len);
SODIUM_EXPORT
int
sodium_munlock(void *const addr, const size_t len);
/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
* allocation functions.
*
* They return a pointer to a region filled with 0xd0 bytes, immediately
* followed by a guard page.
* As a result, accessing a single byte after the requested allocation size
* will intentionally trigger a segmentation fault.
*
* A canary and an additional guard page placed before the beginning of the
* region may also kill the process if a buffer underflow is detected.
*
* The memory layout is:
* [unprotected region size (read only)][guard page (no access)][unprotected
* pages (read/write)][guard page (no access)] With the layout of the
* unprotected pages being: [optional padding][16-bytes canary][user region]
*
* However:
* - These functions are significantly slower than standard functions
* - Each allocation requires 3 or 4 additional pages
* - The returned address will not be aligned if the allocation size is not
* a multiple of the required alignment. For this reason, these functions
* are designed to store data, such as secret keys and messages.
*
* sodium_malloc() can be used to allocate any libsodium data structure.
*
* The crypto_generichash_state structure is packed and its length is
* either 357 or 361 bytes. For this reason, when using sodium_malloc() to
* allocate a crypto_generichash_state structure, padding must be added in
* order to ensure proper alignment. crypto_generichash_statebytes()
 * returns the rounded up structure size, and should be preferred to sizeof():
* state = sodium_malloc(crypto_generichash_statebytes());
*/
SODIUM_EXPORT
void *
sodium_malloc(const size_t size) __attribute__((malloc));
SODIUM_EXPORT
void *
sodium_allocarray(size_t count, size_t size) __attribute__((malloc));
SODIUM_EXPORT
void
sodium_free(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_noaccess(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_readonly(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_readwrite(void *ptr);
SODIUM_EXPORT
int
sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
SODIUM_EXPORT
int
sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
size_t padded_buflen, size_t blocksize);
/* -------- */
int
_sodium_alloc_init(void);
#ifdef __cplusplus
}
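For orientation, a short usage sketch of the helpers declared above (illustrative only; assumes a normal libsodium build and link):

#include <sodium.h>
#include <stdio.h>

int
main(void)
{
  if (sodium_init() < 0)
    return 1;

  /* Guarded allocation: touching one byte past `key` hits a guard page. */
  unsigned char *key = sodium_malloc(32);
  if (key == NULL)
    return 1;
  randombytes_buf(key, 32);

  /* Constant-time equality: 0 when equal, -1 when different. */
  unsigned char other[32] = {0};
  if (sodium_memcmp(key, other, 32) != 0)
    puts("keys differ");

  sodium_memzero(other, sizeof other);
  sodium_free(key);
  return 0;
}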

@ -4,424 +4,463 @@
typedef crypto_int32 int32;
static inline void
minmax(int32 *x, int32 *y)
{
  __asm__("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg "
          "%%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
          :
          : "r"(x), "r"(y)
          : "%eax", "%ebx", "%edx");
}
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void
minmax02through1315(int32 *x)
{
  __m256i a = _mm256_loadu_si256((__m256i *)x);
  __m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
  __m256i c = _mm256_unpacklo_epi64(a, b); /* a01b01a45b45 */
  __m256i d = _mm256_unpackhi_epi64(a, b); /* a23b23a67b67 */
  __m256i g = _mm256_min_epi32(c, d);
  __m256i h = _mm256_max_epi32(c, d);
  a = _mm256_unpacklo_epi64(g, h);
  b = _mm256_unpackhi_epi64(g, h);
  _mm256_storeu_si256((__m256i *)x, a);
  _mm256_storeu_si256((__m256i *)(x + 8), b);
}
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void
minmax02134657(int32 *x)
{
  __m256i a = _mm256_loadu_si256((__m256i *)x);
  __m256i b = _mm256_shuffle_epi32(a, 0x4e);
  __m256i c = _mm256_cmpgt_epi32(a, b);
  c = _mm256_shuffle_epi32(c, 0x44);
  __m256i abc = c & (a ^ b);
  a ^= abc;
  _mm256_storeu_si256((__m256i *)x, a);
}
static void
multiminmax2plus2(int32 *x, int n)
{
  while(n >= 16)
  {
    minmax02through1315(x);
    n -= 16;
    x += 16;
  }
  if(n >= 8)
  {
    minmax02134657(x);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + 2);
    minmax(x + 1, x + 3);
    n -= 4;
    x += 4;
  }
  if(n > 0)
  {
    minmax(x, x + 2);
    if(n > 1)
      minmax(x + 1, x + 3);
  }
}
static void
multiminmax2plus6(int32 *x, int n)
{
  while(n >= 4)
  {
    minmax(x, x + 6);
    minmax(x + 1, x + 7);
    n -= 4;
    x += 4;
  }
  if(n > 0)
  {
    minmax(x, x + 6);
    if(n > 1)
      minmax(x + 1, x + 7);
  }
}
static void
multiminmax2plus14(int32 *x, int n)
{
  while(n >= 8)
  {
    minmax(x, x + 14);
    minmax(x + 1, x + 15);
    minmax(x + 4, x + 18);
    minmax(x + 5, x + 19);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + 14);
    minmax(x + 1, x + 15);
    n -= 4;
    x += 4;
  }
  if(n > 0)
  {
    minmax(x, x + 14);
    if(n > 1)
      minmax(x + 1, x + 15);
  }
}
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void
minmax0145891213(int32 *x, int32 *y)
{
  __m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
  __m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
  __m256i a0189451213 = _mm256_unpacklo_epi64(a01234567, a89101112131415);
  __m256i b0189451213 = _mm256_unpacklo_epi64(b01234567, b89101112131415);
  __m256i c0189451213 = _mm256_min_epi32(a0189451213, b0189451213);
  __m256i d0189451213 = _mm256_max_epi32(a0189451213, b0189451213);
  __m256i c01234567 = _mm256_blend_epi32(a01234567, c0189451213, 0x33);
  __m256i d01234567 = _mm256_blend_epi32(b01234567, d0189451213, 0x33);
  __m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213, a89101112131415);
  __m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213, b89101112131415);
  _mm256_storeu_si256((__m256i *)x, c01234567);
  _mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
  _mm256_storeu_si256((__m256i *)y, d01234567);
  _mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
}
/* offset >= 30 */
static void
multiminmax2plusmore(int32 *x, int n, int offset)
{
  while(n >= 16)
  {
    minmax0145891213(x, x + offset);
    n -= 16;
    x += 16;
  }
  if(n >= 8)
  {
    minmax(x, x + offset);
    minmax(x + 1, x + 1 + offset);
    minmax(x + 4, x + 4 + offset);
    minmax(x + 5, x + 5 + offset);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + offset);
    minmax(x + 1, x + 1 + offset);
    n -= 4;
    x += 4;
  }
  if(n > 0)
  {
    minmax(x, x + offset);
    if(n > 1)
      minmax(x + 1, x + 1 + offset);
  }
}
/* sort x0,x1; ... sort x14, x15 */
static inline void
minmax01through1415(int32 *x)
{
  __m256i a = _mm256_loadu_si256((__m256i *)x);
  __m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
  __m256i c = _mm256_unpacklo_epi32(a, b); /* ab0ab1ab4ab5 */
  __m256i d = _mm256_unpackhi_epi32(a, b); /* ab2ab3ab6ab7 */
  __m256i e = _mm256_unpacklo_epi32(c, d); /* a02b02a46b46 */
  __m256i f = _mm256_unpackhi_epi32(c, d); /* a13b13a57b57 */
  __m256i g = _mm256_min_epi32(e, f);      /* a02b02a46b46 */
  __m256i h = _mm256_max_epi32(e, f);      /* a13b13a57b57 */
  a = _mm256_unpacklo_epi32(g, h);
  b = _mm256_unpackhi_epi32(g, h);
  _mm256_storeu_si256((__m256i *)x, a);
  _mm256_storeu_si256((__m256i *)(x + 8), b);
}
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void
minmax01234567(int32 *x)
{
  __m256i a = _mm256_loadu_si256((__m256i *)x);
  __m256i b = _mm256_shuffle_epi32(a, 0xb1);
  __m256i c = _mm256_cmpgt_epi32(a, b);
  c = _mm256_shuffle_epi32(c, 0xa0);
  __m256i abc = c & (a ^ b);
  a ^= abc;
  _mm256_storeu_si256((__m256i *)x, a);
}
static void
multiminmax1plus1(int32 *x, int n)
{
  while(n >= 16)
  {
    minmax01through1415(x);
    n -= 16;
    x += 16;
  }
  if(n >= 8)
  {
    minmax01234567(x);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + 1);
    minmax(x + 2, x + 3);
    n -= 4;
    x += 4;
  }
  if(n >= 2)
  {
    minmax(x, x + 1);
    n -= 2;
    x += 2;
  }
  if(n > 0)
    minmax(x, x + 1);
}
static void
multiminmax1(int32 *x, int n, int offset)
{
  while(n >= 16)
  {
    minmax(x, x + offset);
    minmax(x + 2, x + 2 + offset);
    minmax(x + 4, x + 4 + offset);
    minmax(x + 6, x + 6 + offset);
    minmax(x + 8, x + 8 + offset);
    minmax(x + 10, x + 10 + offset);
    minmax(x + 12, x + 12 + offset);
    minmax(x + 14, x + 14 + offset);
    n -= 16;
    x += 16;
  }
  if(n >= 8)
  {
    minmax(x, x + offset);
    minmax(x + 2, x + 2 + offset);
    minmax(x + 4, x + 4 + offset);
    minmax(x + 6, x + 6 + offset);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + offset);
    minmax(x + 2, x + 2 + offset);
    n -= 4;
    x += 4;
  }
  if(n >= 2)
  {
    minmax(x, x + offset);
    n -= 2;
    x += 2;
  }
  if(n > 0)
    minmax(x, x + offset);
}
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void
minmax02468101214(int32 *x, int32 *y)
{
  __m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
  __m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
  __m256i a0819412513 = _mm256_unpacklo_epi32(a01234567, a89101112131415);
  __m256i a210311614715 = _mm256_unpackhi_epi32(a01234567, a89101112131415);
  __m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513, a210311614715);
  __m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513, a210311614715);
  __m256i b0819412513 = _mm256_unpacklo_epi32(b01234567, b89101112131415);
  __m256i b210311614715 = _mm256_unpackhi_epi32(b01234567, b89101112131415);
  __m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513, b210311614715);
  __m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513, b210311614715);
  __m256i c02810461214 = _mm256_min_epi32(a02810461214, b02810461214);
  __m256i d02810461214 = _mm256_max_epi32(a02810461214, b02810461214);
  __m256i c01234567 = _mm256_unpacklo_epi32(c02810461214, a13911571315);
  __m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214, a13911571315);
  __m256i d01234567 = _mm256_unpacklo_epi32(d02810461214, b13911571315);
  __m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214, b13911571315);
  _mm256_storeu_si256((__m256i *)x, c01234567);
  _mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
  _mm256_storeu_si256((__m256i *)y, d01234567);
  _mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
}
/* assumes offset >= 31 */
static void
multiminmax1plusmore(int32 *x, int n, int offset)
{
  while(n >= 16)
  {
    minmax02468101214(x, x + offset);
    n -= 16;
    x += 16;
  }
  if(n >= 8)
  {
    minmax(x, x + offset);
    minmax(x + 2, x + 2 + offset);
    minmax(x + 4, x + 4 + offset);
    minmax(x + 6, x + 6 + offset);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
  {
    minmax(x, x + offset);
    minmax(x + 2, x + 2 + offset);
    n -= 4;
    x += 4;
  }
  if(n >= 2)
  {
    minmax(x, x + offset);
    n -= 2;
    x += 2;
  }
  if(n > 0)
    minmax(x, x + offset);
}
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void
minmax8(int32 *x, int32 *y)
{
  __m256i a = _mm256_loadu_si256((__m256i *)x);
  __m256i b = _mm256_loadu_si256((__m256i *)y);
  _mm256_storeu_si256((__m256i *)x, _mm256_min_epi32(a, b));
  _mm256_storeu_si256((__m256i *)y, _mm256_max_epi32(a, b));
}
/* assumes p >= 8; implies offset >= 8 */
static void
multiminmax_atleast8(int p, int32 *x, int n, int offset)
{
  int i;
  while(n >= 2 * p)
  {
    for(i = 0; i < p; i += 8)
      minmax8(x + i, x + i + offset);
    n -= 2 * p;
    x += 2 * p;
  }
  for(i = 0; i + 8 <= n; i += 8)
  {
    if(i & p)
      return;
    minmax8(x + i, x + i + offset);
  }
  for(; i < n; ++i)
  {
    if(i & p)
      return;
    minmax(x + i, x + i + offset);
  }
}
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void
minmax4(int32 *x, int32 *y)
{
  __m128i a = _mm_loadu_si128((__m128i *)x);
  __m128i b = _mm_loadu_si128((__m128i *)y);
  _mm_storeu_si128((__m128i *)x, _mm_min_epi32(a, b));
  _mm_storeu_si128((__m128i *)y, _mm_max_epi32(a, b));
}
static void
multiminmax4(int32 *x, int n, int offset)
{
  int i;
  while(n >= 8)
  {
    minmax4(x, x + offset);
    n -= 8;
    x += 8;
  }
  if(n >= 4)
    minmax4(x, x + offset);
  else
    for(i = 0; i < n; ++i)
      minmax(x + i, x + i + offset);
}
void
int32_sort(int32 *x, int n)
{
  int top, p, q;
  if(n < 2)
    return;
  top = 1;
  while(top < n - top)
    top += top;
  for(p = top; p >= 8; p >>= 1)
  {
    multiminmax_atleast8(p, x, n - p, p);
    for(q = top; q > p; q >>= 1)
      multiminmax_atleast8(p, x + p, n - q, q - p);
  }
  if(p >= 4)
  {
    multiminmax4(x, n - 4, 4);
    for(q = top; q > 4; q >>= 1)
      multiminmax4(x + 4, n - q, q - 4);
  }
  if(p >= 2)
  {
    multiminmax2plus2(x, n - 2);
    for(q = top; q >= 32; q >>= 1)
      multiminmax2plusmore(x + 2, n - q, q - 2);
    if(q >= 16)
      multiminmax2plus14(x + 2, n - 16);
    if(q >= 8)
      multiminmax2plus6(x + 2, n - 8);
    if(q >= 4)
      multiminmax2plus2(x + 2, n - 4);
  }
  multiminmax1plus1(x, n - 1);
  for(q = top; q >= 32; q >>= 1)
    multiminmax1plusmore(x + 1, n - q, q - 1);
  if(q >= 16)
    multiminmax1(x + 1, n - 16, 15);
  if(q >= 8)
    multiminmax1(x + 1, n - 8, 7);
  if(q >= 4)
    multiminmax1(x + 1, n - 4, 3);
  if(q >= 2)
    multiminmax1plus1(x + 1, n - 2);
}
#endif
#endif
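The merge network sorts in place with a data-independent sequence of compare-exchanges, which is why it is safe to use on secret-dependent data. A hypothetical driver (the crypto_int32 typedef is an assumption standing in for the tree's own header):

#include <stdio.h>

typedef int crypto_int32; /* assumption: a 32-bit int, as in the tree */
typedef crypto_int32 int32;
void int32_sort(int32 *x, int n);

int
main(void)
{
  int32 a[8] = {5, -1, 7, 3, 3, -9, 0, 2};
  int32_sort(a, 8);
  for (int i = 0; i < 8; ++i)
    printf("%d ", a[i]); /* -9 -1 0 2 3 3 5 7 */
  printf("\n");
  return 0;
}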

@ -4,12 +4,15 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
extern int
r3_recip(small *, const small *);
#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
extern int
r3_weightw_mask(const small *);
#endif

@ -6,91 +6,102 @@
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int
smaller_mask(int x, int y)
{
return (x - y) >> 31;
}
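smaller_mask() relies on an arithmetic right shift of a negative int, which the compilers this code targets define; the resulting 0 / -1 value then drives branch-free selection. An illustrative sketch of that idiom (not part of the patch):

/* mask must be 0 or -1; mask == -1 picks b. */
static int
select_int(int a, int b, int mask)
{
  return (a & ~mask) | (b & mask);
}
/* e.g. select_int(10, 20, smaller_mask(1, 2)) == 20, since 1 < 2. */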
static void
vectormod3_product(small *z, int len, const small *x, const small c)
{
  int i;
  int minusmask = c;
  int plusmask  = -c;
  __m256i minusvec, plusvec, zerovec;
  minusmask >>= 31;
  plusmask >>= 31;
  minusvec = _mm256_set1_epi32(minusmask);
  plusvec  = _mm256_set1_epi32(plusmask);
  zerovec  = _mm256_set1_epi32(0);
  while(len >= 32)
  {
    __m256i xi = _mm256_loadu_si256((__m256i *)x);
    xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec, xi) & minusvec);
    _mm256_storeu_si256((__m256i *)z, xi);
    x += 32;
    z += 32;
    len -= 32;
  }
  for(i = 0; i < len; ++i)
    z[i] = mod3_product(x[i], c);
}
static void
vectormod3_minusproduct(small *z, int len, const small *x, const small *y,
                        const small c)
{
  int i;
  int minusmask = c;
  int plusmask  = -c;
  __m256i minusvec, plusvec, zerovec, twovec, fourvec;
  minusmask >>= 31;
  plusmask >>= 31;
  minusvec = _mm256_set1_epi32(minusmask);
  plusvec  = _mm256_set1_epi32(plusmask);
  zerovec  = _mm256_set1_epi32(0);
  twovec   = _mm256_set1_epi32(0x02020202);
  fourvec  = _mm256_set1_epi32(0x04040404);
  while(len >= 32)
  {
    __m256i xi = _mm256_loadu_si256((__m256i *)x);
    __m256i yi = _mm256_loadu_si256((__m256i *)y);
    __m256i r;
    yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec, yi) & minusvec);
    xi = _mm256_sub_epi8(xi, yi);
    r = _mm256_add_epi8(xi, twovec);
    r &= fourvec;
    r = _mm256_srli_epi32(r, 2);
    xi = _mm256_sub_epi8(xi, r);
    r = _mm256_add_epi8(r, r);
    xi = _mm256_sub_epi8(xi, r);
    r = _mm256_sub_epi8(twovec, xi);
    r &= fourvec;
    r = _mm256_srli_epi32(r, 2);
    xi = _mm256_add_epi8(xi, r);
    r = _mm256_add_epi8(r, r);
    xi = _mm256_add_epi8(xi, r);
    _mm256_storeu_si256((__m256i *)z, xi);
    x += 32;
    y += 32;
    z += 32;
    len -= 32;
  }
  for(i = 0; i < len; ++i)
    z[i] = mod3_minusproduct(x[i], y[i], c);
}
static void
vectormod3_shift(small *z, int len)
{
  int i;
  while(len >= 33)
  {
    __m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 33));
    _mm256_storeu_si256((__m256i *)(z + len - 32), zi);
    len -= 32;
  }
  for(i = len - 1; i > 0; --i)
    z[i] = z[i - 1];
z[0] = 0;
}
@ -100,12 +111,13 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int
r3_recip(small *r, const small *s)
{
  const int loops = 2 * p + 1;
  int loop;
  small f[768];
  small g[769];
  small u[1536];
  small v[1537];
  small c;
@ -114,23 +126,28 @@ int r3_recip(small *r,const small *s)
int e = p;
int swapmask;
  for(i = 2; i < p; ++i)
    f[i] = 0;
  f[0] = -1;
  f[1] = -1;
  f[p] = 1;
  /* generalization: can initialize f to any polynomial m */
  /* requirements: m has degree exactly p, nonzero constant coefficient */
  for(i = 0; i < p; ++i)
    g[i] = s[i];
  g[p] = 0;
  for(i = 0; i <= loops; ++i)
    u[i] = 0;
  v[0] = 1;
  for(i = 1; i <= loops; ++i)
    v[i] = 0;
  loop = 0;
  for(;;)
  {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -141,29 +158,35 @@ int r3_recip(small *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
     * coefficients) */
    /* v has degree <=loop (so it fits in loop+1 coefficients) */
    /* v[i]==0 for i < p-e */
    /* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
    if(loop >= loops)
      break;
    c = mod3_quotient(g[p], f[p]);
    vectormod3_minusproduct(g, 768, g, f, c);
    vectormod3_shift(g, 769);
#ifdef SIMPLER
    vectormod3_minusproduct(v, 1536, v, u, c);
    vectormod3_shift(v, 1537);
#else
    if(loop < p)
    {
      vectormod3_minusproduct(v, loop + 1, v, u, c);
      vectormod3_shift(v, loop + 2);
    }
    else
    {
      vectormod3_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
                              c);
      vectormod3_shift(v + loop - p, p + 2);
    }
#endif
@ -171,24 +194,28 @@ int r3_recip(small *r,const small *s)
++loop;
    swapmask = smaller_mask(e, d) & mod3_nonzero_mask(g[p]);
    swap(&e, &d, sizeof e, swapmask);
    swap(f, g, (p + 1) * sizeof(small), swapmask);
#ifdef SIMPLER
    swap(u, v, 1536 * sizeof(small), swapmask);
#else
    if(loop < p)
    {
      swap(u, v, (loop + 1) * sizeof(small), swapmask);
    }
    else
    {
      swap(u + loop - p, v + loop - p, (p + 1) * sizeof(small), swapmask);
    }
#endif
  }
  c = mod3_reciprocal(f[p]);
  vectormod3_product(r, p, u + p, c);
  for(i = p; i < 768; ++i)
    r[i] = 0;
  return smaller_mask(0, d);
}
#endif
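A sanity-check sketch (hypothetical, not in the patch) that a reciprocal computed this way really is an inverse in (Z/3)[x]/(x^p - x - 1): multiply back with r3_mult and compare against the constant polynomial 1. The typedef is an assumption standing in for the tree's small.h.

typedef signed char small; /* assumption: matches small.h */
void r3_mult(small *, const small *, const small *);

static int
is_unit(const small *r, const small *s)
{
  small check[768]; /* p = 761; buffers padded to 768 as above */
  r3_mult(check, r, s);
  if (check[0] != 1)
    return 0;
  for (int i = 1; i < 761; ++i)
    if (check[i] != 0)
      return 0;
  return 1;
}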

@ -5,27 +5,35 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
extern void
rq_decode(modq *, const unsigned char *);
#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
extern void
rq_roundencode(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
extern void
rq_round3(modq *, const modq *);
#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
extern void
rq_mod3(small *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
int
rq_recip3(modq *, const small *);
#endif

@ -12,47 +12,57 @@
// 32-bit hosts only
#ifndef __amd64__
#define _mm_extract_epi64(X, N) \
  (__extension__({ \
    __v2di __a = (__v2di)(X); \
    __a[N]; \
  }))
#endif
static inline __m256i
squeeze(__m256i x)
{
  __m256i q = _mm256_mulhrs_epi16(x, v7);
  q = _mm256_mullo_epi16(q, v4591_16);
  return _mm256_sub_epi16(x, q);
}
static inline __m256i
freeze(__m256i x)
{
  __m256i mask, x2296, x4591;
  x4591 = _mm256_add_epi16(x, v4591_16);
  mask = _mm256_srai_epi16(x, 15);
  x = _mm256_blendv_epi8(x, x4591, mask);
  x2296 = _mm256_sub_epi16(x, v2296_16);
  mask = _mm256_srai_epi16(x2296, 15);
  x4591 = _mm256_sub_epi16(x, v4591_16);
  x = _mm256_blendv_epi8(x4591, x, mask);
return x;
}
void
rq_mod3(small *g, const modq *f)
{
  int i;
  for(i = 0; i < 768; i += 16)
  {
    __m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
    __m256i q;
    x = _mm256_mullo_epi16(x, v3);
    x = squeeze(x);
    x = freeze(x);
    q = _mm256_mulhrs_epi16(x, v10923_16);
    x = _mm256_sub_epi16(x, q);
    q = _mm256_add_epi16(q, q);
    x = _mm256_sub_epi16(x, q); /* g0 g1 ... g15 */
    x = _mm256_packs_epi16(x,
                           x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
    0 [(long long *)&g[i]] =
        _mm_extract_epi64(_mm256_extracti128_si256(x, 0), 0);
    1 [(long long *)&g[i]] =
        _mm_extract_epi64(_mm256_extracti128_si256(x, 1), 0);
}
}
#endif
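A scalar model of squeeze() may help here (illustrative; assumes v7 expands to the 16-bit constant 7 and v4591_16 to 4591, as elsewhere in these sources). _mm256_mulhrs_epi16 computes round(a*b / 2^15), so multiplying by 7 ~ 2^15/4591 yields a cheap approximate quotient; subtracting q*4591 leaves a narrow range that freeze() then folds into the canonical interval.

#include <stdint.h>

static int16_t
scalar_squeeze(int16_t x)
{
  /* (7*x + 2^14) >> 15 models the rounding behaviour of mulhrs */
  int16_t q = (int16_t)((7 * (int32_t)x + 16384) >> 15);
  return (int16_t)(x - q * 4591);
}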

@ -10,93 +10,103 @@
#define v29234_16 _mm256_set1_epi16(29234)
/* caller must ensure that x-y does not overflow */
static int
smaller_mask(int x, int y)
{
return (x - y) >> 31;
}
static inline __m256i
product(__m256i x, __m256i y)
{
  __m256i lo, hi, r0, r1, t0, t1, t, s0, s1;
  lo = _mm256_mullo_epi16(x, y);
  hi = _mm256_mulhi_epi16(x, y);
  r0 = _mm256_unpacklo_epi16(lo, hi);
  r1 = _mm256_unpackhi_epi16(lo, hi);
  t0 = _mm256_srai_epi32(r0, 16);
  t1 = _mm256_srai_epi32(r1, 16);
  t = _mm256_packs_epi32(t0, t1);
  t = _mm256_mulhrs_epi16(t, v29234_16);
  lo = _mm256_mullo_epi16(t, v4591_16);
  hi = _mm256_mulhi_epi16(t, v4591_16);
  s0 = _mm256_unpacklo_epi16(lo, hi);
  s1 = _mm256_unpackhi_epi16(lo, hi);
  s0 = _mm256_slli_epi32(s0, 4);
  s1 = _mm256_slli_epi32(s1, 4);
  r0 = _mm256_sub_epi32(r0, s0);
  r1 = _mm256_sub_epi32(r1, s1);
  t0 = _mm256_srai_epi32(r0, 8);
  t1 = _mm256_srai_epi32(r1, 8);
  t = _mm256_packs_epi32(t0, t1);
  t = _mm256_mulhrs_epi16(t, v1827_16);
  lo = _mm256_mullo_epi16(t, v4591_16);
  hi = _mm256_mulhi_epi16(t, v4591_16);
  s0 = _mm256_unpacklo_epi16(lo, hi);
  s1 = _mm256_unpackhi_epi16(lo, hi);
  r0 = _mm256_sub_epi32(r0, s0);
  r1 = _mm256_sub_epi32(r1, s1);
  x = _mm256_packs_epi32(r0, r1);
return x;
}
static inline __m256i
minusproduct(__m256i x, __m256i y, __m256i z)
{
  __m256i t;
  x = _mm256_sub_epi16(x, product(y, z));
  t = _mm256_mulhrs_epi16(x, v7);
  t = _mm256_mullo_epi16(t, v4591_16);
  x = _mm256_sub_epi16(x, t);
return x;
}
static void
vectormodq_product(modq *z, int len, const modq *x, const modq c)
{
  __m256i cvec = _mm256_set1_epi16(c);
  while(len >= 16)
  {
    __m256i xi = _mm256_loadu_si256((__m256i *)x);
    xi = product(xi, cvec);
    _mm256_storeu_si256((__m256i *)z, xi);
    x += 16;
    z += 16;
    len -= 16;
  }
  while(len > 0)
  {
    *z = modq_product(*x, c);
++x;
++z;
--len;
}
}
static void
vectormodq_minusproduct(modq *z, int len, const modq *x, const modq *y,
                        const modq c)
{
  __m256i cvec = _mm256_set1_epi16(c);
  while(len >= 16)
  {
    __m256i xi = _mm256_loadu_si256((__m256i *)x);
    __m256i yi = _mm256_loadu_si256((__m256i *)y);
    xi = minusproduct(xi, yi, cvec);
    _mm256_storeu_si256((__m256i *)z, xi);
    x += 16;
    y += 16;
    z += 16;
    len -= 16;
  }
  while(len > 0)
  {
    *z = modq_minusproduct(*x, *y, c);
++x;
++y;
++z;
@ -104,15 +114,18 @@ static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,
}
}
static void
vectormodq_shift(modq *z, int len)
{
  int i;
  while(len >= 17)
  {
    __m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 17));
    _mm256_storeu_si256((__m256i *)(z + len - 16), zi);
    len -= 16;
  }
  for(i = len - 1; i > 0; --i)
    z[i] = z[i - 1];
z[0] = 0;
}
@ -122,9 +135,10 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int
rq_recip3(modq *r, const small *s)
{
  const int loops = 2 * p + 1;
int loop;
modq f[768];
modq g[769];
@ -136,23 +150,28 @@ int rq_recip3(modq *r,const small *s)
int e = p;
int swapmask;
  for(i = 2; i < p; ++i)
    f[i] = 0;
  f[0] = -1;
  f[1] = -1;
  f[p] = 1;
  /* generalization: can initialize f to any polynomial m */
  /* requirements: m has degree exactly p, nonzero constant coefficient */
  for(i = 0; i < p; ++i)
    g[i] = 3 * s[i];
  g[p] = 0;
  for(i = 0; i <= loops; ++i)
    u[i] = 0;
  v[0] = 1;
  for(i = 1; i <= loops; ++i)
    v[i] = 0;
  loop = 0;
  for(;;)
  {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -163,29 +182,35 @@ int rq_recip3(modq *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
     * coefficients) */
    /* v has degree <=loop (so it fits in loop+1 coefficients) */
    /* v[i]==0 for i < p-e */
    /* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
    if(loop >= loops)
      break;
    c = modq_quotient(g[p], f[p]);
    vectormodq_minusproduct(g, 768, g, f, c);
    vectormodq_shift(g, 769);
#ifdef SIMPLER
    vectormodq_minusproduct(v, 1536, v, u, c);
    vectormodq_shift(v, 1537);
#else
    if(loop < p)
    {
      vectormodq_minusproduct(v, loop + 1, v, u, c);
      vectormodq_shift(v, loop + 2);
    }
    else
    {
      vectormodq_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
                              c);
      vectormodq_shift(v + loop - p, p + 2);
    }
#endif
@ -193,25 +218,30 @@ int rq_recip3(modq *r,const small *s)
++loop;
    swapmask = smaller_mask(e, d) & modq_nonzero_mask(g[p]);
    swap(&e, &d, sizeof e, swapmask);
    swap(f, g, 768 * sizeof(modq), swapmask);
#ifdef SIMPLER
    swap(u, v, 1536 * sizeof(modq), swapmask);
#else
    if(loop < p)
    {
      swap(u, v, (loop + 1) * sizeof(modq), swapmask);
    }
    else
    {
      swap(u + loop - p, v + loop - p, (p + 1) * sizeof(modq), swapmask);
    }
#endif
  }
  c = modq_reciprocal(f[p]);
  vectormodq_product(r, p, u + p, c);
  for(i = 0; i < p; ++i)
    r[i] = modq_freeze(r[i]);
  for(i = p; i < 768; ++i)
    r[i] = 0;
  return smaller_mask(0, d);
}
#endif

@ -6,17 +6,19 @@
#define v3_16 _mm256_set1_epi16(3)
#define v10923_16 _mm256_set1_epi16(10923)
void
rq_round3(modq *h, const modq *f)
{
  int i;
  for(i = 0; i < 768; i += 16)
  {
    __m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
    __m256i x2;
    x = _mm256_mulhrs_epi16(x, v10923_16);
    x2 = _mm256_add_epi16(x, x);
    x = _mm256_add_epi16(x, x2);
    _mm256_storeu_si256((__m256i *)&h[i], x);
}
}
#endif

@ -164,35 +164,40 @@ rq_decoderounded(modq *f, const unsigned char *c)
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
  f2 =
      x
      * _mm256_set1_pd(
          0.00000042385525173611114052197733521876177320564238470979034900665283203125);
  f2 = floor(f2);
  x -= f2 * _mm256_set1_pd(2359296.0);
  f1 =
      x
      * _mm256_set1_pd(
          0.00065104166666666673894681149903362893383018672466278076171875);
  f1 = floor(f1);
  x -= f1 * _mm256_set1_pd(1536.0);
  f0 = x;
  f2 -=
      _mm256_set1_pd(1531.0)
      * floor(
          f2
          * _mm256_set1_pd(
              0.0006531678641410842804659875326933615724556148052215576171875));
  f1 -=
      _mm256_set1_pd(1531.0)
      * floor(
          f1
          * _mm256_set1_pd(
              0.0006531678641410842804659875326933615724556148052215576171875));
  f0 -=
      _mm256_set1_pd(1531.0)
      * floor(
          f0
          * _mm256_set1_pd(
              0.0006531678641410842804659875326933615724556148052215576171875));
f2 *= _mm256_set1_pd(3.0);
f2 -= _mm256_set1_pd(2295.0);
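A scalar model of this decoder (illustrative, not the patch's code): each packed value x holds three base-1536 digits, each below 1531 after reduction, and a digit d maps to the rounded coefficient 3*(d mod 1531) - 2295. The excerpt above shows that recentering applied to f2; the other digits presumably follow in the elided lines.

#include <stdint.h>

static void
decode3(uint32_t x, int16_t f[3])
{
  uint32_t d[3];
  d[2] = x / 2359296; /* 1536 * 1536 */
  x -= d[2] * 2359296;
  d[1] = x / 1536;
  d[0] = x - d[1] * 1536;
  for (int i = 0; i < 3; ++i)
    f[i] = (int16_t)(3 * (d[i] % 1531) - 2295);
}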

@ -2,30 +2,33 @@
#include <immintrin.h>
#include "swap.h"
void
swap(void *x, void *y, int bytes, int mask)
{
  char c = mask;
  __m256i maskvec = _mm256_set1_epi32(mask);
  while(bytes >= 32)
  {
    __m256i xi = _mm256_loadu_si256(x);
    __m256i yi = _mm256_loadu_si256(y);
    __m256i xinew = _mm256_blendv_epi8(xi, yi, maskvec);
    __m256i yinew = _mm256_blendv_epi8(yi, xi, maskvec);
    _mm256_storeu_si256(x, xinew);
    _mm256_storeu_si256(y, yinew);
    x = 32 + (char *)x;
    y = 32 + (char *)y;
    bytes -= 32;
  }
  while(bytes > 0)
  {
    char xi = *(char *)x;
    char yi = *(char *)y;
    char t = c & (xi ^ yi);
    xi ^= t;
    yi ^= t;
    *(char *)x = xi;
    *(char *)y = yi;
++x;
++y;
--bytes;

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_avx_swap
extern void
swap(void *, void *, int, int);
#endif

@ -1,7 +1,7 @@
#include <libntrup/ntru.h>
#include <stdbool.h>
#include <stdio.h> // printf
#if __AVX2__
#include <cpuid.h>
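This translation unit gates its AVX2 path on a run-time cpuid probe; a minimal check of that kind looks like the following (hypothetical helper, not the patch's exact code; __get_cpuid_count and bit_AVX2 come from recent GCC/Clang cpuid.h):

#include <cpuid.h>
#include <stdbool.h>

/* Returns true when CPUID leaf 7, sub-leaf 0 reports AVX2 (EBX bit 5). */
static bool
cpu_has_avx2(void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false;
  return (ebx & bit_AVX2) != 0;
}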

@ -7,7 +7,7 @@
#define qshift 2295
#define p 761
#ifdef _MSC_VER
#define LOOPS 2 * p + 1
#endif
#define w 286

@ -4,9 +4,11 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
extern int
r3_recip(small *, const small *);
#endif

@ -2,30 +2,34 @@
#include "mod3.h"
#include "r3.h"
void
r3_mult(small *h, const small *f, const small *g)
{
  small fg[p + p - 1];
  small result;
  int i, j;
  for(i = 0; i < p; ++i)
  {
    result = 0;
    for(j = 0; j <= i; ++j)
      result = mod3_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
  for(i = p; i < p + p - 1; ++i)
  {
    result = 0;
    for(j = i - p + 1; j < p; ++j)
      result = mod3_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
  for(i = p + p - 2; i >= p; --i)
  {
    fg[i - p] = mod3_sum(fg[i - p], fg[i]);
    fg[i - p + 1] = mod3_sum(fg[i - p + 1], fg[i]);
  }
  for(i = 0; i < p; ++i)
    h[i] = fg[i];
}
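The final folding loop applies x^p == x + 1 (mod x^p - x - 1), adding each coefficient of degree i >= p into positions i-p and i-p+1. A toy-scale model with a hypothetical P = 3 instead of 761, ignoring the mod-3 coefficient arithmetic:

#include <stdio.h>

#define P 3

int
main(void)
{
  /* (x^2 + 1)(x + 1) = x^3 + x^2 + x + 1; fold x^3 into x^0 and x^1 */
  int fg[2 * P - 1] = {1, 1, 1, 1, 0}; /* 1 + x + x^2 + x^3 */
  for (int i = 2 * P - 2; i >= P; --i)
  {
    fg[i - P] += fg[i];
    fg[i - P + 1] += fg[i];
  }
  printf("%d %d %d\n", fg[0], fg[1], fg[2]); /* 2 2 1 : 2 + 2x + x^2 */
  return 0;
}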

@ -5,24 +5,31 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
extern void
rq_decode(modq *, const unsigned char *);
#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
extern void
rq_encoderounded(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round
extern void
rq_round3(modq *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
int
rq_recip3(modq *, const small *);
#endif

@ -1,30 +1,34 @@
#include "params.h"
#include "rq.h"
void
rq_mult(modq *h, const modq *f, const small *g)
{
  modq fg[p + p - 1];
  modq result;
  int i, j;
  for(i = 0; i < p; ++i)
  {
    result = 0;
    for(j = 0; j <= i; ++j)
      result = modq_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
  for(i = p; i < p + p - 1; ++i)
  {
    result = 0;
    for(j = i - p + 1; j < p; ++j)
      result = modq_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
  for(i = p + p - 2; i >= p; --i)
  {
    fg[i - p] = modq_sum(fg[i - p], fg[i]);
    fg[i - p + 1] = modq_sum(fg[i - p + 1], fg[i]);
  }
  for(i = 0; i < p; ++i)
    h[i] = fg[i];
}

@ -1,10 +1,11 @@
#include "params.h"
#include "rq.h"
void
rq_round3(modq *h, const modq *f)
{
  int i;
  for(i = 0; i < p; ++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
}
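Since 21846 equals 65538/3 (roughly 2^16/3), the expression computes 3*round((f[i]+2295)/3) - 2295: the nearest multiple of 3 after recentering to [0, 4590]. A brute-force check over the whole range (illustrative):

#include <assert.h>
#include <stdlib.h>

int
main(void)
{
  for (int f = -2295; f <= 2295; ++f)
  {
    int r = ((21846 * (f + 2295) + 32768) >> 16) * 3 - 2295;
    assert(r % 3 == 0);      /* result is a multiple of 3 (2295 = 3*765) */
    assert(abs(r - f) <= 1); /* and the nearest one */
  }
  return 0;
}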

@ -4,34 +4,41 @@
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void
small_encode(unsigned char *c, const small *f)
{
  small c0;
  int i;
  for(i = 0; i < p / 4; ++i)
  {
    c0 = *f++ + 1;
    c0 += (*f++ + 1) << 2;
    c0 += (*f++ + 1) << 4;
    c0 += (*f++ + 1) << 6;
    *c++ = c0;
  }
  c0 = *f++ + 1;
  *c++ = c0;
}
void
small_decode(small *f, const unsigned char *c)
{
  unsigned char c0;
  int i;
  for(i = 0; i < p / 4; ++i)
  {
    c0 = *c++;
    *f++ = ((small)(c0 & 3)) - 1;
    c0 >>= 2;
    *f++ = ((small)(c0 & 3)) - 1;
    c0 >>= 2;
    *f++ = ((small)(c0 & 3)) - 1;
    c0 >>= 2;
    *f++ = ((small)(c0 & 3)) - 1;
  }
  c0 = *c++;
  *f++ = ((small)(c0 & 3)) - 1;
}
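Each coefficient in {-1, 0, 1} is biased to {0, 1, 2} and packed four to a byte (two bits each), with one trailing coefficient because p = 761 leaves a remainder of 1 mod 4. A round-trip sketch (hypothetical driver; the typedef is an assumption matching the tree's small.h):

#include <assert.h>

#define p 761
typedef signed char small; /* assumption: a signed byte, as in the tree */

void small_encode(unsigned char *c, const small *f);
void small_decode(small *f, const unsigned char *c);

int
main(void)
{
  small f[p], g[p];
  unsigned char c[p / 4 + 1]; /* 190 packed bytes + 1 trailing byte */
  for (int i = 0; i < p; ++i)
    f[i] = (small)((i % 3) - 1); /* -1, 0, 1, -1, ... */
  small_encode(c, f);
  small_decode(g, c);
  for (int i = 0; i < p; ++i)
    assert(f[i] == g[i]);
  return 0;
}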

@ -1,19 +1,21 @@
#include "swap.h"
void
swap(void *x, void *y, int bytes, int mask)
{
  int i;
  char xi, yi, c, t;
  c = mask;
  for(i = 0; i < bytes; ++i)
  {
    xi = i[(char *)x];
    yi = i[(char *)y];
    t = c & (xi ^ yi);
    xi ^= t;
    yi ^= t;
    i[(char *)x] = xi;
    i[(char *)y] = yi;
}
}
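The mask must be 0 (keep) or -1 (exchange); combined with a branch-free comparison it yields a data-independent conditional swap whose memory access pattern is identical either way. A usage sketch (illustrative; assumes the subtraction does not overflow):

void swap(void *x, void *y, int bytes, int mask);

static void
sort2(int *a, int *b) /* leaves *a <= *b */
{
  int m = (*a - *b) >> 31;     /* -1 iff *a < *b */
  swap(a, b, sizeof(int), ~m); /* exchange when *a >= *b */
}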

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_ref_swap
extern void
swap(void *, void *, int, int);
#endif

@ -36,7 +36,7 @@ sodium_init(void)
return -1; /* LCOV_EXCL_LINE */
}
/* if we're here, we already started properly */
return initialized ? 0 : -1;
}
_sodium_runtime_get_cpu_features();
_crypto_generichash_blake2b_pick_best_implementation();

@ -10,116 +10,116 @@ crypto_core_salsa(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c,
const int rounds)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
int i;
j0 = x0 = 0x61707865;
j5 = x5 = 0x3320646e;
j10 = x10 = 0x79622d32;
j15 = x15 = 0x6b206574;
if(c != NULL)
{
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
for (i = 0; i < rounds; i += 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
for(i = 0; i < rounds; i += 2)
{
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
}
int
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
{
crypto_core_salsa(out, in, k, c, 20);
return 0;
crypto_core_salsa(out, in, k, c, 20);
return 0;
}
size_t
crypto_core_salsa20_outputbytes(void)
{
return crypto_core_salsa20_OUTPUTBYTES;
return crypto_core_salsa20_OUTPUTBYTES;
}
size_t
crypto_core_salsa20_inputbytes(void)
{
return crypto_core_salsa20_INPUTBYTES;
return crypto_core_salsa20_INPUTBYTES;
}
size_t
crypto_core_salsa20_keybytes(void)
{
return crypto_core_salsa20_KEYBYTES;
return crypto_core_salsa20_KEYBYTES;
}
size_t
crypto_core_salsa20_constbytes(void)
{
return crypto_core_salsa20_CONSTBYTES;
return crypto_core_salsa20_CONSTBYTES;
}
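
For orientation, crypto_core_salsa20 produces a single 64-byte keystream block from a 32-byte key and a 16-byte input (nonce plus block counter); passing c == NULL selects the standard "expand 32-byte k" constants, exactly as the code above shows. A minimal caller, assuming the program links against libsodium:

#include <sodium.h>
#include <stdio.h>

int
main(void)
{
  unsigned char k[crypto_core_salsa20_KEYBYTES]    = {0}; /* 32 bytes */
  unsigned char in[crypto_core_salsa20_INPUTBYTES] = {0}; /* 16 bytes */
  unsigned char out[crypto_core_salsa20_OUTPUTBYTES];     /* 64 bytes */

  if(sodium_init() < 0)
    return 1;
  crypto_core_salsa20(out, in, k, NULL); /* NULL c: default constants */
  printf("first keystream byte: %02x\n", out[0]);
  return 0;
}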

@ -13,7 +13,6 @@ Public domain.
#include "../stream_salsa20.h"
#include "salsa20_ref.h"
static int
stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
const unsigned char *k)
@ -132,4 +131,3 @@ struct crypto_stream_salsa20_implementation
SODIUM_C99(.stream =) stream_ref,
SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic,
};
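
The SODIUM_C99 wrapper lets this one initializer compile both with C99 designators and, when the macro expands to nothing, as plain positional initialization on older compilers. The idea, sketched (the exact guard in libsodium's headers may differ):

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define SODIUM_C99(X) X /* keep the ".stream =" designators */
#else
#define SODIUM_C99(X)   /* drop them: fall back to positional init */
#endif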

@ -4,13 +4,13 @@
#include <stdint.h>
typedef struct crypto_stream_salsa20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
typedef struct crypto_stream_salsa20_implementation
{
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
} crypto_stream_salsa20_implementation;
#endif
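
This struct is the dispatch table through which libsodium selects a Salsa20 backend at runtime: the reference implementation is the default, and a CPU-feature probe (sodium_runtime_has_avx2() and friends) can swap in the SSE2/AVX2 code paths. A reduced sketch of the pattern; the function and variable names here are illustrative, not the library's:

#include <stddef.h>

typedef struct stream_impl
{
  int (*stream)(unsigned char *c, unsigned long long clen,
                const unsigned char *n, const unsigned char *k);
} stream_impl;

static int
stream_ref_sketch(unsigned char *c, unsigned long long clen,
                  const unsigned char *n, const unsigned char *k)
{
  (void)n;
  (void)k;
  while(clen--)
    *c++ = 0; /* stand-in for real keystream generation */
  return 0;
}

static stream_impl current_impl = {stream_ref_sketch};

static int
have_avx2(void)
{
  return 0; /* stand-in for sodium_runtime_has_avx2() */
}

int
main(void)
{
  unsigned char buf[8];
  if(have_avx2())
  {
    /* current_impl = avx2_impl; -- selected once, at init time */
  }
  return current_impl.stream(buf, sizeof buf, NULL, NULL);
}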

@ -1,195 +1,199 @@
if (bytes > 0) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *) (partialblock + (A * 4)) = in##A; \
*(uint32_t *) (partialblock + (B * 4)) = in##B; \
*(uint32_t *) (partialblock + (C * 4)) = in##C; \
*(uint32_t *) (partialblock + (D * 4)) = in##D; \
} while (0)
if(bytes > 0)
{
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
{
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *)(partialblock + (A * 4)) = in##A; \
*(uint32_t *)(partialblock + (B * 4)) = in##B; \
*(uint32_t *)(partialblock + (C * 4)) = in##C; \
*(uint32_t *)(partialblock + (D * 4)) = in##D; \
} while(0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
for(i = 0; i < bytes; i++)
{
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
sodium_memzero(partialblock, sizeof partialblock);
}
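
The block above is the short-tail path: when fewer than 64 bytes remain, a full keystream block is still generated into a stack buffer, only `bytes` of it are XORed into the output, and the buffer is wiped with sodium_memzero so keystream material does not linger on the stack. The same control flow in scalar form; salsa20_block() is a hypothetical stand-in for the SIMD block generation:

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in: the real code runs the double-rounds shown above. */
static void
salsa20_block(uint8_t block[64], const uint32_t x[16])
{
  memcpy(block, x, 64);
}

static void
xor_tail(uint8_t *c, const uint8_t *m, unsigned bytes, const uint32_t x[16])
{
  if(bytes > 0) /* 0 < bytes < 64 */
  {
    uint8_t partialblock[64];
    unsigned i;
    salsa20_block(partialblock, x);
    for(i = 0; i < bytes; i++)
      c[i] = m[i] ^ partialblock[i];
    /* the real code uses sodium_memzero(), which resists dead-store
     * elimination; plain memset here is illustrative only */
    memset(partialblock, 0, sizeof partialblock);
  }
}

int
main(void)
{
  uint32_t x[16] = {0};
  uint8_t m[13]  = "hello, tail!";
  uint8_t c[13];
  xor_tail(c, m, sizeof m, x);
  return c[0] == m[0] ? 0 : 1; /* x is all-zero, so keystream is zero here */
}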

@ -1,207 +1,211 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *) (m + (A * 4)); \
in##B ^= *(uint32_t *) (m + (B * 4)); \
in##C ^= *(uint32_t *) (m + (C * 4)); \
in##D ^= *(uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
while(bytes >= 64)
{
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
{
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *)(m + (A * 4)); \
in##B ^= *(uint32_t *)(m + (B * 4)); \
in##C ^= *(uint32_t *)(m + (C * 4)); \
in##D ^= *(uint32_t *)(m + (D * 4)); \
*(uint32_t *)(c + (A * 4)) = in##A; \
*(uint32_t *)(c + (B * 4)) = in##B; \
*(uint32_t *)(c + (C * 4)) = in##C; \
*(uint32_t *)(c + (D * 4)) = in##D; \
} while(0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
in8 = x[8];
in9 = x[13];
in8++;
if (in8 == 0) {
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
in8 = x[8];
in9 = x[13];
in8++;
if(in8 == 0)
{
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
}
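
Note the counter bookkeeping at the end of each full block: the 64-bit block counter is split across state words x[8] (low half) and x[13] (high half) because of Salsa20's diagonal state layout, so the carry must be propagated by hand, exactly as the in8/in9 sequence above does:

#include <stdint.h>

/* Scalar equivalent of the in8/in9 sequence above. */
static void
increment_block_counter(uint32_t x[16])
{
  x[8]++; /* low 32 bits of the block counter */
  if(x[8] == 0)
    x[13]++; /* carry into the high 32 bits */
}

int
main(void)
{
  uint32_t x[16] = {0};
  x[8] = 0xFFFFFFFFu;
  increment_block_counter(x);
  return (x[8] == 0 && x[13] == 1) ? 0 : 1; /* carry propagated */
}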

File diff suppressed because it is too large

@ -1,476 +1,471 @@
if (bytes >= 512) {
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while (bytes >= 512) {
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
if(bytes >= 512)
{
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while(bytes >= 512)
{
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t)in8) | (((uint64_t)in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for(i = 0; i < ROUNDS; i += 2)
{
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14,
r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
/* store data ; this macro first transpose data in-registers, and then store
* them in memory. much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
y##B = \
_mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
y##C = \
_mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
y##D = \
_mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
y##A2 = \
_mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
y##B2 = \
_mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
y##C2 = \
_mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
y##D2 = \
_mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), y##A); \
_mm256_storeu_si256((__m256i*) (c + 64), y##B); \
_mm256_storeu_si256((__m256i*) (c + 128), y##C); \
_mm256_storeu_si256((__m256i*) (c + 192), y##D); \
_mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
y##B = _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
y##C = _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
y##D = _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
y##A2 = _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
y##B2 = _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
y##C2 = _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
y##D2 = _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), y##A); \
_mm256_storeu_si256((__m256i*)(c + 64), y##B); \
_mm256_storeu_si256((__m256i*)(c + 128), y##C); \
_mm256_storeu_si256((__m256i*)(c + 192), y##D); \
_mm256_storeu_si256((__m256i*)(c + 256), y##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), y##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), y##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
bytes -= 512;
c += 512;
m += 512;
}
}
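
The addv8/addv9 plus unpack/permute sequence at the top of the 512-byte loop builds eight consecutive block counters, one per 64-byte block, then separates them into 32-bit halves so z8 holds the eight low words and z9 the eight high words. What that setup computes, written out in scalar form:

#include <stdint.h>

/* Scalar equivalent of the counter-vector setup above: eight consecutive
 * 64-bit counters starting at in89, split into per-lane low/high words. */
static void
expand_counters(uint64_t in89, uint32_t lo[8], uint32_t hi[8])
{
  int i;
  for(i = 0; i < 8; ++i)
  {
    uint64_t ctr = in89 + (uint64_t)i;
    lo[i] = (uint32_t)(ctr & 0xFFFFFFFFu); /* lane i of z8 */
    hi[i] = (uint32_t)(ctr >> 32);         /* lane i of z9 */
  }
}

int
main(void)
{
  uint32_t lo[8], hi[8];
  expand_counters(0xFFFFFFFEu, lo, hi); /* straddles the 32-bit boundary */
  return (lo[2] == 0 && hi[2] == 1) ? 0 : 1;
}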

@ -1,6 +1,10 @@
#ifndef LLARP_VERSION_HPP
#define LLARP_VERSION_HPP
#if defined(_WIN32) && defined(RC_INVOKED)
#define LLARP_VERSION 0, 5, 0, 0
#else
#ifndef LLARP_VERSION_MAJ
#define LLARP_VERSION_MAJ "0"
#endif
@ -33,5 +37,5 @@ struct Version
{
static const char LLARP_NET_ID[];
};
#endif
#endif
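
The new RC_INVOKED branch exists because the Windows resource compiler consumes FILEVERSION and PRODUCTVERSION as four comma-separated integers, while the C++ side keeps the components as strings; defining LLARP_VERSION from the same header keeps the .rc files below in sync with the bump to 0.5.0. A generic sketch of deriving both forms from one set of components (macro names here are illustrative, not lokinet's):

#include <stdio.h>

#define VMAJ 0
#define VMIN 5
#define VPAT 0

#define STR_(x) #x
#define STR(x) STR_(x)

#define MY_VERSION_RC  VMAJ, VMIN, VPAT, 0                   /* -> 0,5,0,0 */
#define MY_VERSION_STR STR(VMAJ) "." STR(VMIN) "." STR(VPAT) /* -> "0.5.0" */

int
main(void)
{
  puts(MY_VERSION_STR);
  return 0;
}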

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
//
#include "resource.h"
#include <constants/version.hpp>
#ifdef __GNUC__
#include <winresrc.h>
#endif
/////////////////////////////////////////////////////////////////////////////
// English (United States) resources
@ -58,8 +61,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,4,0,0
PRODUCTVERSION 0,4,0,0
FILEVERSION LLARP_VERSION
PRODUCTVERSION LLARP_VERSION
FILEFLAGSMASK 0x17L
#ifdef _DEBUG
FILEFLAGS 0x3L
@ -76,20 +79,20 @@ BEGIN
BEGIN
VALUE "Comments", "libabyss JSON-RPC daemon demo"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET for Microsoft® Windows® NT™"
#ifdef LLARP_RELEASE_MOTTO
VALUE "FileVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#endif
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "abyss-main.exe"
VALUE "ProductName", "LokiNET for Windows"
#ifdef LLARP_RELEASE_MOTTO
VALUE "ProductVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#endif
END
END

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
//
#include "resource.h"
#include <constants/version.hpp>
#ifdef __GNUC__ // make windows rc accept this
#include <winresrc.h>
#endif
/////////////////////////////////////////////////////////////////////////////
// English (United States) resources
@ -58,8 +61,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,4,0,0
PRODUCTVERSION 0,4,0,0
FILEVERSION LLARP_VERSION
PRODUCTVERSION LLARP_VERSION
FILEFLAGSMASK 0x17L
#ifdef _DEBUG
FILEFLAGS 0x3L
@ -76,20 +79,20 @@ BEGIN
BEGIN
VALUE "Comments", "includes relay/exit functionality, such code is highly experimental on non-Linux targets"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET daemon for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET daemon for Microsoft® Windows® NT™"
#ifdef LLARP_RELEASE_MOTTO
VALUE "FileVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#endif
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "llarpd.exe"
VALUE "ProductName", "LokiNET for Windows"
#ifdef LLARP_RELEASE_MOTTO
VALUE "ProductVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#endif
END
END

@ -7,7 +7,10 @@
// Microsoft Visual C++ generated resource script.
//
#include <win32/resource.h>
#include <constants/version.hpp>
#ifdef __GNUC__
#include <winresrc.h>
#endif
/////////////////////////////////////////////////////////////////////////////
// English (United States) resources
@ -58,8 +61,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,4,0,0
PRODUCTVERSION 0,4,0,0
FILEVERSION LLARP_VERSION
PRODUCTVERSION LLARP_VERSION
FILEFLAGSMASK 0x17L
#ifdef _DEBUG
FILEFLAGS 0x3L
@ -76,27 +79,27 @@ BEGIN
BEGIN
VALUE "Comments", "LokiNET test suite"
VALUE "CompanyName", "Loki Foundation"
VALUE "FileDescription", "LokiNET for Microsoft<EFBFBD> Windows<77> NT<4E>"
VALUE "FileDescription", "LokiNET for Microsoft® Windows® NT™"
#ifdef LLARP_RELEASE_MOTTO
VALUE "FileVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
#ifdef __GNUC__
VALUE "FileVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "FileVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#else
VALUE "FileVersion", "0.4.0-dev"
VALUE "FileVersion", "0.5.0-dev"
#endif
#endif
VALUE "InternalName", "llarpd"
VALUE "LegalCopyright", "Copyright <EFBFBD>2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "LegalCopyright", "Copyright ©2018-2019 Jeff Becker, Rick V for the Loki Foundation. All rights reserved. This software is provided under the terms of the zlib-libpng licence; see the file LICENSE for details."
VALUE "OriginalFilename", "llarpd.exe"
VALUE "ProductName", "LokiNET for Windows"
#ifdef LLARP_RELEASE_MOTTO
VALUE "ProductVersion", VERSION_STRING(0.4.0, RELEASE_MOTTO, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0, RELEASE_MOTTO, GIT_REV)
#else
#ifdef __GNUC__
VALUE "ProductVersion", VERSION_STRING(0.4.0-dev-, GIT_REV)
VALUE "ProductVersion", VERSION_STRING(0.5.0-dev-, GIT_REV)
#else
VALUE "ProductVersion", "0.4.0-dev"
VALUE "ProductVersion", "0.5.0-dev"
#endif
#endif
END

@ -1,3 +1,8 @@
// WARNING: for the love of all that is good and holy
// please DO NOT convert this file to UTF-8, much less
// UTF-16 - the UNIX port of Roslyn does not understand UTF-16,
// and UTF-8 chews up the copyright symbols.
// -rick
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
@ -10,8 +15,8 @@ using System.Runtime.InteropServices;
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Loki Project")]
[assembly: AssemblyProduct("LokiNET Launcher")]
[assembly: AssemblyCopyright("Copyright ©2018-2019 Loki Project. All rights reserved. See LICENSE for more details.")]
[assembly: AssemblyTrademark("Loki, Loki Project, LokiNET are ™ & ©2018-2019 Loki Foundation")]
[assembly: AssemblyCopyright("Copyright ©2018-2019 Loki Project. All rights reserved. See LICENSE for more details.")]
[assembly: AssemblyTrademark("Loki, Loki Project, LokiNET are ™ & ©2018-2019 Loki Foundation")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
@ -32,10 +37,10 @@ using System.Runtime.InteropServices;
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("0.4.3")]
[assembly: AssemblyFileVersion("0.4.3")]
[assembly: AssemblyVersion("0.5.0")]
[assembly: AssemblyFileVersion("0.5.0")]
#if DEBUG
[assembly: AssemblyInformationalVersion("0.4.3-dev-{chash:8}")]
[assembly: AssemblyInformationalVersion("0.5.0-dev-{chash:8}")]
#else
[assembly: AssemblyInformationalVersion("0.4.3 (RELEASE_CODENAME)")]
[assembly: AssemblyInformationalVersion("0.5.0 (RELEASE_CODENAME)")]
#endif

@ -28,77 +28,101 @@
/// </summary>
private void InitializeComponent()
{
this.btnOK = new System.Windows.Forms.Button();
this.btnBoot = new System.Windows.Forms.Button();
this.btnDumpLog = new System.Windows.Forms.Button();
this.btnVSettings = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// btnOK
//
this.btnOK.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)));
this.btnOK.DialogResult = System.Windows.Forms.DialogResult.Cancel;
this.btnOK.Location = new System.Drawing.Point(109, 121);
this.btnOK.Name = "btnOK";
this.btnOK.Size = new System.Drawing.Size(75, 23);
this.btnOK.TabIndex = 0;
this.btnOK.Text = "Close";
this.btnOK.UseVisualStyleBackColor = true;
this.btnOK.Click += new System.EventHandler(this.btnOK_Click);
//
// btnBoot
//
this.btnBoot.Location = new System.Drawing.Point(13, 13);
this.btnBoot.Name = "btnBoot";
this.btnBoot.Size = new System.Drawing.Size(270, 23);
this.btnBoot.TabIndex = 1;
this.btnBoot.Text = "Bootstrap Client from Web...";
this.btnBoot.UseVisualStyleBackColor = true;
this.btnBoot.Click += new System.EventHandler(this.btnBoot_Click);
//
// btnDumpLog
//
this.btnDumpLog.Location = new System.Drawing.Point(13, 43);
this.btnDumpLog.Name = "btnDumpLog";
this.btnDumpLog.Size = new System.Drawing.Size(270, 23);
this.btnDumpLog.TabIndex = 2;
this.btnDumpLog.Text = "Save Log...";
this.btnDumpLog.UseVisualStyleBackColor = true;
this.btnDumpLog.Click += new System.EventHandler(this.btnDumpLog_Click);
//
// btnVSettings
//
this.btnVSettings.Location = new System.Drawing.Point(13, 73);
this.btnVSettings.Name = "btnVSettings";
this.btnVSettings.Size = new System.Drawing.Size(270, 23);
this.btnVSettings.TabIndex = 3;
this.btnVSettings.Text = "Display Settings...";
this.btnVSettings.UseVisualStyleBackColor = true;
this.btnVSettings.Click += new System.EventHandler(this.btnVSettings_Click);
//
// UserSettingsForm
//
this.AcceptButton = this.btnOK;
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.CancelButton = this.btnOK;
this.ClientSize = new System.Drawing.Size(295, 156);
this.ControlBox = false;
this.Controls.Add(this.btnVSettings);
this.Controls.Add(this.btnDumpLog);
this.Controls.Add(this.btnBoot);
this.Controls.Add(this.btnOK);
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.MaximizeBox = false;
this.MinimizeBox = false;
this.Name = "UserSettingsForm";
this.ShowIcon = false;
this.ShowInTaskbar = false;
this.SizeGripStyle = System.Windows.Forms.SizeGripStyle.Hide;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Settings";
this.ResumeLayout(false);
this.btnOK = new System.Windows.Forms.Button();
this.btnBoot = new System.Windows.Forms.Button();
this.btnDumpLog = new System.Windows.Forms.Button();
this.btnVSettings = new System.Windows.Forms.Button();
this.btnEditCfg = new System.Windows.Forms.Button();
this.btnNewCfg = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// btnOK
//
this.btnOK.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)));
this.btnOK.DialogResult = System.Windows.Forms.DialogResult.Cancel;
this.btnOK.Location = new System.Drawing.Point(109, 167);
this.btnOK.Name = "btnOK";
this.btnOK.Size = new System.Drawing.Size(75, 23);
this.btnOK.TabIndex = 0;
this.btnOK.Text = "Close";
this.btnOK.UseVisualStyleBackColor = true;
this.btnOK.Click += new System.EventHandler(this.btnOK_Click);
//
// btnBoot
//
this.btnBoot.Location = new System.Drawing.Point(13, 13);
this.btnBoot.Name = "btnBoot";
this.btnBoot.Size = new System.Drawing.Size(270, 23);
this.btnBoot.TabIndex = 1;
this.btnBoot.Text = "Bootstrap Client from Web...";
this.btnBoot.UseVisualStyleBackColor = true;
this.btnBoot.Click += new System.EventHandler(this.btnBoot_Click);
//
// btnDumpLog
//
this.btnDumpLog.Location = new System.Drawing.Point(13, 43);
this.btnDumpLog.Name = "btnDumpLog";
this.btnDumpLog.Size = new System.Drawing.Size(270, 23);
this.btnDumpLog.TabIndex = 2;
this.btnDumpLog.Text = "Save Log...";
this.btnDumpLog.UseVisualStyleBackColor = true;
this.btnDumpLog.Click += new System.EventHandler(this.btnDumpLog_Click);
//
// btnVSettings
//
this.btnVSettings.Location = new System.Drawing.Point(13, 73);
this.btnVSettings.Name = "btnVSettings";
this.btnVSettings.Size = new System.Drawing.Size(270, 23);
this.btnVSettings.TabIndex = 3;
this.btnVSettings.Text = "Display Settings...";
this.btnVSettings.UseVisualStyleBackColor = true;
this.btnVSettings.Click += new System.EventHandler(this.btnVSettings_Click);
//
// btnEditCfg
//
this.btnEditCfg.Location = new System.Drawing.Point(13, 102);
this.btnEditCfg.Name = "btnEditCfg";
this.btnEditCfg.Size = new System.Drawing.Size(270, 23);
this.btnEditCfg.TabIndex = 4;
this.btnEditCfg.Text = "Edit Configuration File...";
this.btnEditCfg.UseVisualStyleBackColor = true;
this.btnEditCfg.Click += new System.EventHandler(this.BtnEditCfg_Click);
//
// btnNewCfg
//
this.btnNewCfg.Location = new System.Drawing.Point(12, 131);
this.btnNewCfg.Name = "btnNewCfg";
this.btnNewCfg.Size = new System.Drawing.Size(270, 23);
this.btnNewCfg.TabIndex = 5;
this.btnNewCfg.Text = "New Configuration File...";
this.btnNewCfg.UseVisualStyleBackColor = true;
this.btnNewCfg.Click += new System.EventHandler(this.BtnNewCfg_Click);
//
// UserSettingsForm
//
this.AcceptButton = this.btnOK;
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.CancelButton = this.btnOK;
this.ClientSize = new System.Drawing.Size(295, 202);
this.ControlBox = false;
this.Controls.Add(this.btnNewCfg);
this.Controls.Add(this.btnEditCfg);
this.Controls.Add(this.btnVSettings);
this.Controls.Add(this.btnDumpLog);
this.Controls.Add(this.btnBoot);
this.Controls.Add(this.btnOK);
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.MaximizeBox = false;
this.MinimizeBox = false;
this.Name = "UserSettingsForm";
this.ShowIcon = false;
this.ShowInTaskbar = false;
this.SizeGripStyle = System.Windows.Forms.SizeGripStyle.Hide;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Settings";
this.ResumeLayout(false);
}
#endregion
@ -106,6 +130,8 @@
private System.Windows.Forms.Button btnOK;
private System.Windows.Forms.Button btnBoot;
private System.Windows.Forms.Button btnDumpLog;
private System.Windows.Forms.Button btnVSettings;
private System.Windows.Forms.Button btnEditCfg;
private System.Windows.Forms.Button btnNewCfg;
}
}

@ -1,4 +1,5 @@
using System;
using System.Diagnostics;
using System.IO;
using System.Windows.Forms;
@ -57,6 +58,54 @@ namespace network.loki.lokinet.win32.ui
VisualSettings v = new VisualSettings();
v.ShowDialog();
v.Dispose();
}
}
private void BtnEditCfg_Click(object sender, EventArgs e)
{
// Open the existing config in the user's default editor;
// fall back to generating a new one if none exists.
try
{
Process.Start(string.Format("{0}/lokinet.ini", config_path));
}
catch
{
MessageBox.Show("No existing config found");
BtnNewCfg_Click(sender, e);
}
}
private void BtnNewCfg_Click(object sender, EventArgs e)
{
if (File.Exists(string.Format("{0}/lokinet.ini", config_path)))
{
DialogResult resp = MessageBox.Show("WARNING: This will overwrite your existing config file. Continue?", "Lokinet", MessageBoxButtons.YesNo, MessageBoxIcon.Question);
switch(resp)
{
case DialogResult.Yes:
File.Delete(string.Format("{0}/lokinet.ini", config_path));
break;
case DialogResult.No:
return;
}
}
// Pick the platform-appropriate binary name, then invoke it with -g
// to generate a fresh default config file.
string lokinetExeString;
if (Program.platform == PlatformID.Win32NT)
lokinetExeString = string.Format("{0}\\lokinet.exe", Directory.GetCurrentDirectory());
else
lokinetExeString = string.Format("{0}/lokinet", Directory.GetCurrentDirectory());
Process p = new Process();
p.StartInfo.FileName = lokinetExeString;
p.StartInfo.Arguments = "-g";
p.StartInfo.CreateNoWindow = true;
p.StartInfo.UseShellExecute = false;
p.EnableRaisingEvents = true;
p.Exited += new EventHandler(msg);
p.Start();
}
private void msg(object sender, EventArgs e)
{
MessageBox.Show(string.Format("Created new config file at {0}/lokinet.ini", config_path), "Success", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
}
}
}
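One caveat in BtnNewCfg_Click above: Process.Exited is raised on a thread-pool thread by default, so the MessageBox shown by msg() runs off the UI thread. A minimal sketch of the usual WinForms remedy, assuming the code stays inside the form class (as it does here), is to set SynchronizingObject before wiring the handler:

// Sketch only, not part of this commit.
Process p = new Process();
p.StartInfo.FileName = lokinetExeString;
p.StartInfo.Arguments = "-g";
p.StartInfo.CreateNoWindow = true;
p.StartInfo.UseShellExecute = false;
p.SynchronizingObject = this;      // the Form implements ISynchronizeInvoke
p.EnableRaisingEvents = true;
p.Exited += new EventHandler(msg); // now marshalled onto the UI thread
p.Start();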

@ -1,120 +1,120 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goals of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that support
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goals of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that support
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
</root>
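The long schema comment in the .resx above describes how entries are typed and deserialized; in practice such a file is read through System.Resources.ResXResourceReader (shipped in System.Windows.Forms.dll). A minimal reading sketch, with "UserSettingsForm.resx" as a placeholder file name:

using System.Collections;
using System.Resources;

class ResxDump
{
    static void Main()
    {
        // Enumerate every name/value pair in the resource file.
        using (var reader = new ResXResourceReader("UserSettingsForm.resx"))
        {
            foreach (DictionaryEntry entry in reader)
                System.Console.WriteLine("{0} = {1}", entry.Key, entry.Value);
        }
    }
}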

@ -2,7 +2,7 @@
; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES!
#define MyAppName "loki-network"
#define MyAppVersion "0.4.3"
#define MyAppVersion "0.5.0"
#define MyAppPublisher "Loki Project"
#define MyAppURL "https://loki.network"
#define MyAppExeName "lokinetui.exe"
@ -39,18 +39,18 @@ OutputDir={#DevPath}win32-setup
OutputBaseFilename=lokinet-win32
Compression=lzma
SolidCompression=yes
VersionInfoVersion=0.4.3
VersionInfoVersion=0.5.0
VersionInfoCompany=Loki Project
VersionInfoDescription=LokiNET for Microsoft® Windows® NT™
#ifndef RELEASE
VersionInfoTextVersion=0.4.3-dev-{#VCSRev}
VersionInfoProductTextVersion=0.4.3-dev-{#VCSRev}
VersionInfoTextVersion=0.5.0-dev-{#VCSRev}
VersionInfoProductTextVersion=0.5.0-dev-{#VCSRev}
#else
VersionInfoTextVersion=0.4.3
VersionInfoProductTextVersion=0.4.3 ({#Codename})
VersionInfoTextVersion=0.5.0
VersionInfoProductTextVersion=0.5.0 ({#Codename})
#endif
VersionInfoProductName=LokiNET
VersionInfoProductVersion=0.4.3
VersionInfoProductVersion=0.5.0
InternalCompressLevel=ultra64
MinVersion=0,5.0
ArchitecturesInstallIn64BitMode=x64
