RandomX: added SSE4.1-optimized Blake2b

+0.15% on `rx/0`
+0.3% on `rx/wow`
This commit is contained in:
SChernykh 2020-09-10 14:28:40 +02:00
parent 0f09883429
commit 4a9db89527
7 changed files with 623 additions and 9 deletions

View File

@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
add_definitions(/DHAVE_ROTR)
endif()
@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR)
if (HAVE_ROTR)

View File

@ -63,6 +63,7 @@ public:
FLAG_PDPE1GB,
FLAG_SSE2,
FLAG_SSSE3,
FLAG_SSE41,
FLAG_XOP,
FLAG_POPCNT,
FLAG_CAT_L3,

View File

@ -57,7 +57,7 @@
namespace xmrig {
static const std::array<const char *, ICpuInfo::FLAG_MAX> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "xop", "popcnt", "cat_l3" };
static const std::array<const char *, ICpuInfo::FLAG_MAX> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3" };
static const std::array<const char *, ICpuInfo::MSR_MOD_MAX> msrNames = { "none", "ryzen", "intel", "custom" };
@ -141,6 +141,7 @@ static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES,
static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); }
static inline bool has_sse2() { return has_feature(PROCESSOR_INFO, EDX_Reg, 1 << 26); }
static inline bool has_ssse3() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 9); }
static inline bool has_sse41() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 19); }
static inline bool has_xop() { return has_feature(0x80000001, ECX_Reg, 1 << 11); }
static inline bool has_popcnt() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 23); }
static inline bool has_cat_l3() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 15) && has_feature(0x10, EBX_Reg, 1 << 1); }
@ -177,6 +178,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
m_flags.set(FLAG_PDPE1GB, has_pdpe1gb());
m_flags.set(FLAG_SSE2, has_sse2());
m_flags.set(FLAG_SSSE3, has_ssse3());
m_flags.set(FLAG_SSE41, has_sse41());
m_flags.set(FLAG_XOP, has_xop());
m_flags.set(FLAG_POPCNT, has_popcnt());
m_flags.set(FLAG_CAT_L3, has_cat_l3());

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,119 @@
/*
BLAKE2 reference source code package - optimized C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#ifndef BLAKE2B_ROUND_H
#define BLAKE2B_ROUND_H
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
#define LIKELY(x) __builtin_expect((x),1)
/* Microarchitecture-specific macros */
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24); \
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63); \
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse41.h"
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
#endif

View File

@ -39,6 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/blake2/blake2.h"
#include "crypto/randomx/blake2/blake2-impl.h"
#if defined(_M_X64) || defined(__x86_64__)
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <smmintrin.h>
#include "blake2b-round.h"
#endif
static const uint64_t blake2b_IV[8] = {
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
@ -179,7 +190,63 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t
return 0;
}
static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) {
#if defined(_M_X64) || defined(__x86_64__)
static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
const __m128i m0 = LOADU(block + 00);
const __m128i m1 = LOADU(block + 16);
const __m128i m2 = LOADU(block + 32);
const __m128i m3 = LOADU(block + 48);
const __m128i m4 = LOADU(block + 64);
const __m128i m5 = LOADU(block + 80);
const __m128i m6 = LOADU(block + 96);
const __m128i m7 = LOADU(block + 112);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
}
#undef ROUND
#endif
static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) {
uint64_t m[16];
uint64_t v[16];
unsigned int i, r;
@ -237,6 +304,20 @@ static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) {
#undef ROUND
}
#if defined(_M_X64) || defined(__x86_64__)
uint32_t rx_blake2b_use_sse41 = 0;
#define rx_blake2b_compress(S, block) \
if (rx_blake2b_use_sse41) \
rx_blake2b_compress_sse41(S, block); \
else \
rx_blake2b_compress_integer(S, block);
#else
#define rx_blake2b_compress(S, block) rx_blake2b_compress_integer(S, block);
#endif
int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
const uint8_t *pin = (const uint8_t *)in;
@ -260,14 +341,14 @@ int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
size_t fill = BLAKE2B_BLOCKBYTES - left;
memcpy(&S->buf[left], pin, fill);
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
rx_blake2b_compress(S, S->buf);
rx_blake2b_compress(S, S->buf);
S->buflen = 0;
inlen -= fill;
pin += fill;
/* Avoid buffer copies when possible */
while (inlen > BLAKE2B_BLOCKBYTES) {
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
rx_blake2b_compress(S, pin);
rx_blake2b_compress(S, pin);
inlen -= BLAKE2B_BLOCKBYTES;
pin += BLAKE2B_BLOCKBYTES;
}
@ -294,7 +375,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) {
blake2b_increment_counter(S, S->buflen);
blake2b_set_lastblock(S);
memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
rx_blake2b_compress(S, S->buf);
rx_blake2b_compress(S, S->buf);
for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
store64(buffer + sizeof(S->h[i]) * i, S->h[i]);

View File

@ -31,6 +31,11 @@
#include "crypto/rx/RxVm.h"
#if defined(_M_X64) || defined(__x86_64__)
extern "C" uint32_t rx_blake2b_use_sse41;
#endif
randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly, uint32_t node)
{
int flags = 0;
@ -55,6 +60,10 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
flags |= RANDOMX_FLAG_AMD;
}
# if defined(_M_X64) || defined(__x86_64__)
rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
# endif
return randomx_create_vm(static_cast<randomx_flags>(flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad, node);
}