RandomX: added SSE4.1-optimized Blake2b
+0.15% on `rx/0` +0.3% on `rx/wow`
This commit is contained in:
parent
0f09883429
commit
4a9db89527
@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions")
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
|
||||
|
||||
add_definitions(/DHAVE_ROTR)
|
||||
endif()
|
||||
@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
|
||||
|
||||
check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR)
|
||||
if (HAVE_ROTR)
|
||||
|
@ -63,6 +63,7 @@ public:
|
||||
FLAG_PDPE1GB,
|
||||
FLAG_SSE2,
|
||||
FLAG_SSSE3,
|
||||
FLAG_SSE41,
|
||||
FLAG_XOP,
|
||||
FLAG_POPCNT,
|
||||
FLAG_CAT_L3,
|
||||
|
@ -57,7 +57,7 @@
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
static const std::array<const char *, ICpuInfo::FLAG_MAX> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "xop", "popcnt", "cat_l3" };
|
||||
static const std::array<const char *, ICpuInfo::FLAG_MAX> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3" };
|
||||
static const std::array<const char *, ICpuInfo::MSR_MOD_MAX> msrNames = { "none", "ryzen", "intel", "custom" };
|
||||
|
||||
|
||||
@ -141,6 +141,7 @@ static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES,
|
||||
static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); }
|
||||
static inline bool has_sse2() { return has_feature(PROCESSOR_INFO, EDX_Reg, 1 << 26); }
|
||||
static inline bool has_ssse3() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 9); }
|
||||
static inline bool has_sse41() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 19); }
|
||||
static inline bool has_xop() { return has_feature(0x80000001, ECX_Reg, 1 << 11); }
|
||||
static inline bool has_popcnt() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 23); }
|
||||
static inline bool has_cat_l3() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 15) && has_feature(0x10, EBX_Reg, 1 << 1); }
|
||||
@ -177,6 +178,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
||||
m_flags.set(FLAG_PDPE1GB, has_pdpe1gb());
|
||||
m_flags.set(FLAG_SSE2, has_sse2());
|
||||
m_flags.set(FLAG_SSSE3, has_ssse3());
|
||||
m_flags.set(FLAG_SSE41, has_sse41());
|
||||
m_flags.set(FLAG_XOP, has_xop());
|
||||
m_flags.set(FLAG_POPCNT, has_popcnt());
|
||||
m_flags.set(FLAG_CAT_L3, has_cat_l3());
|
||||
|
402
src/crypto/randomx/blake2/blake2b-load-sse41.h
Normal file
402
src/crypto/randomx/blake2/blake2b-load-sse41.h
Normal file
File diff suppressed because it is too large
Load Diff
119
src/crypto/randomx/blake2/blake2b-round.h
Normal file
119
src/crypto/randomx/blake2/blake2b-round.h
Normal file
@ -0,0 +1,119 @@
|
||||
/*
|
||||
BLAKE2 reference source code package - optimized C implementations
|
||||
|
||||
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
|
||||
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
|
||||
your option. The terms of these licenses can be found at:
|
||||
|
||||
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
|
||||
- OpenSSL license : https://www.openssl.org/source/license.html
|
||||
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
More information about the BLAKE2 hash function can be found at
|
||||
https://blake2.net.
|
||||
*/
|
||||
#ifndef BLAKE2B_ROUND_H
|
||||
#define BLAKE2B_ROUND_H
|
||||
|
||||
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
|
||||
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
|
||||
|
||||
#define TOF(reg) _mm_castsi128_ps((reg))
|
||||
#define TOI(reg) _mm_castps_si128((reg))
|
||||
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
|
||||
|
||||
/* Microarchitecture-specific macros */
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
|
||||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
|
||||
|
||||
|
||||
|
||||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -32); \
|
||||
row4h = _mm_roti_epi64(row4h, -32); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -24); \
|
||||
row2h = _mm_roti_epi64(row2h, -24); \
|
||||
|
||||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
|
||||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
|
||||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
|
||||
\
|
||||
row4l = _mm_xor_si128(row4l, row1l); \
|
||||
row4h = _mm_xor_si128(row4h, row1h); \
|
||||
\
|
||||
row4l = _mm_roti_epi64(row4l, -16); \
|
||||
row4h = _mm_roti_epi64(row4h, -16); \
|
||||
\
|
||||
row3l = _mm_add_epi64(row3l, row4l); \
|
||||
row3h = _mm_add_epi64(row3h, row4h); \
|
||||
\
|
||||
row2l = _mm_xor_si128(row2l, row3l); \
|
||||
row2h = _mm_xor_si128(row2h, row3h); \
|
||||
\
|
||||
row2l = _mm_roti_epi64(row2l, -63); \
|
||||
row2h = _mm_roti_epi64(row2h, -63); \
|
||||
|
||||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
|
||||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
|
||||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
|
||||
row2l = t0; \
|
||||
row2h = t1; \
|
||||
\
|
||||
t0 = row3l; \
|
||||
row3l = row3h; \
|
||||
row3h = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
|
||||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
|
||||
row4l = t1; \
|
||||
row4h = t0;
|
||||
|
||||
#include "blake2b-load-sse41.h"
|
||||
|
||||
#define ROUND(r) \
|
||||
LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
|
||||
#endif
|
@ -39,6 +39,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/blake2/blake2.h"
|
||||
#include "crypto/randomx/blake2/blake2-impl.h"
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include "blake2b-round.h"
|
||||
|
||||
#endif
|
||||
|
||||
static const uint64_t blake2b_IV[8] = {
|
||||
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
|
||||
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
|
||||
@ -179,7 +190,63 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) {
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
|
||||
{
|
||||
__m128i row1l, row1h;
|
||||
__m128i row2l, row2h;
|
||||
__m128i row3l, row3h;
|
||||
__m128i row4l, row4h;
|
||||
__m128i b0, b1;
|
||||
__m128i t0, t1;
|
||||
|
||||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||
|
||||
const __m128i m0 = LOADU(block + 00);
|
||||
const __m128i m1 = LOADU(block + 16);
|
||||
const __m128i m2 = LOADU(block + 32);
|
||||
const __m128i m3 = LOADU(block + 48);
|
||||
const __m128i m4 = LOADU(block + 64);
|
||||
const __m128i m5 = LOADU(block + 80);
|
||||
const __m128i m6 = LOADU(block + 96);
|
||||
const __m128i m7 = LOADU(block + 112);
|
||||
|
||||
row1l = LOADU(&S->h[0]);
|
||||
row1h = LOADU(&S->h[2]);
|
||||
row2l = LOADU(&S->h[4]);
|
||||
row2h = LOADU(&S->h[6]);
|
||||
row3l = LOADU(&blake2b_IV[0]);
|
||||
row3h = LOADU(&blake2b_IV[2]);
|
||||
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
|
||||
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
|
||||
|
||||
ROUND(0);
|
||||
ROUND(1);
|
||||
ROUND(2);
|
||||
ROUND(3);
|
||||
ROUND(4);
|
||||
ROUND(5);
|
||||
ROUND(6);
|
||||
ROUND(7);
|
||||
ROUND(8);
|
||||
ROUND(9);
|
||||
ROUND(10);
|
||||
ROUND(11);
|
||||
|
||||
row1l = _mm_xor_si128(row3l, row1l);
|
||||
row1h = _mm_xor_si128(row3h, row1h);
|
||||
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
|
||||
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
|
||||
row2l = _mm_xor_si128(row4l, row2l);
|
||||
row2h = _mm_xor_si128(row4h, row2h);
|
||||
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
|
||||
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
|
||||
}
|
||||
#undef ROUND
|
||||
#endif
|
||||
|
||||
static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) {
|
||||
uint64_t m[16];
|
||||
uint64_t v[16];
|
||||
unsigned int i, r;
|
||||
@ -237,6 +304,20 @@ static void rx_blake2b_compress(blake2b_state *S, const uint8_t *block) {
|
||||
#undef ROUND
|
||||
}
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
|
||||
uint32_t rx_blake2b_use_sse41 = 0;
|
||||
|
||||
#define rx_blake2b_compress(S, block) \
|
||||
if (rx_blake2b_use_sse41) \
|
||||
rx_blake2b_compress_sse41(S, block); \
|
||||
else \
|
||||
rx_blake2b_compress_integer(S, block);
|
||||
|
||||
#else
|
||||
#define rx_blake2b_compress(S, block) rx_blake2b_compress_integer(S, block);
|
||||
#endif
|
||||
|
||||
int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
|
||||
const uint8_t *pin = (const uint8_t *)in;
|
||||
|
||||
|
@ -31,6 +31,11 @@
|
||||
#include "crypto/rx/RxVm.h"
|
||||
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
extern "C" uint32_t rx_blake2b_use_sse41;
|
||||
#endif
|
||||
|
||||
|
||||
randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly, uint32_t node)
|
||||
{
|
||||
int flags = 0;
|
||||
@ -55,6 +60,10 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
|
||||
flags |= RANDOMX_FLAG_AMD;
|
||||
}
|
||||
|
||||
# if defined(_M_X64) || defined(__x86_64__)
|
||||
rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
|
||||
# endif
|
||||
|
||||
return randomx_create_vm(static_cast<randomx_flags>(flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad, node);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user