Dataset initialization with AVX2 (WIP)
This commit is contained in:
parent
6b21a51a2f
commit
515a85e66c
@ -214,13 +214,6 @@ void xmrig::Workers<T>::start(const std::vector<T> &data, bool sleep)
|
||||
|
||||
for (auto worker : m_workers) {
|
||||
worker->start(Workers<T>::onReady);
|
||||
|
||||
// This sleep is important for optimal caching!
|
||||
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
|
||||
// Sub-optimal caching can result in up to 0.5% hashrate penalty
|
||||
if (sleep) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(20));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -53,6 +53,7 @@ public:
|
||||
|
||||
enum Flag : uint32_t {
|
||||
FLAG_AES,
|
||||
FLAG_AVX,
|
||||
FLAG_AVX2,
|
||||
FLAG_AVX512F,
|
||||
FLAG_BMI2,
|
||||
@ -80,9 +81,11 @@ public:
|
||||
virtual Assembly::Id assembly() const = 0;
|
||||
virtual bool has(Flag feature) const = 0;
|
||||
virtual bool hasAES() const = 0;
|
||||
virtual bool hasAVX() const = 0;
|
||||
virtual bool hasAVX2() const = 0;
|
||||
virtual bool hasBMI2() const = 0;
|
||||
virtual bool hasOneGbPages() const = 0;
|
||||
virtual bool hasXOP() const = 0;
|
||||
virtual bool hasCatL3() const = 0;
|
||||
virtual bool isVM() const = 0;
|
||||
virtual const char *backend() const = 0;
|
||||
|
@ -52,8 +52,8 @@
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
constexpr size_t kCpuFlagsSize = 13;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
constexpr size_t kCpuFlagsSize = 14;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
|
||||
|
||||
|
||||
@ -134,11 +134,12 @@ static inline uint64_t xgetbv()
|
||||
#endif
|
||||
}
|
||||
|
||||
// True when every bit of mask 0x06 is set in the value returned by xgetbv().
// NOTE(review): presumably these are the XCR0 SSE/AVX state bits enabled by the OS - confirm against xgetbv().
static inline bool has_xcr_avx2()
{
    constexpr uint64_t kRequiredStateBits = 0x06;
    return (xgetbv() & kRequiredStateBits) == kRequiredStateBits;
}
|
||||
// True when every bit of mask 0x06 is set in the value returned by xgetbv().
// Shared by the AVX and AVX2 checks, which need the same OS-enabled state.
// NOTE(review): presumably XCR0 bits 1-2 (SSE and AVX state) - confirm against xgetbv().
static inline bool has_xcr_avx()
{
    constexpr uint64_t kAvxStateBits = 0x06;
    return (xgetbv() & kAvxStateBits) == kAvxStateBits;
}
|
||||
// True when every bit of mask 0xE6 is set in the value returned by xgetbv().
// NOTE(review): presumably the XCR0 SSE/AVX bits plus the three AVX-512 state bits - confirm against xgetbv().
static inline bool has_xcr_avx512()
{
    constexpr uint64_t kAvx512StateBits = 0xE6;
    return (xgetbv() & kAvx512StateBits) == kAvx512StateBits;
}
|
||||
// Asks has_feature() for bit 27 of ECX_Reg on the PROCESSOR_INFO level (the OSXSAVE bit, per the function name).
static inline bool has_osxsave()
{
    constexpr auto kOsxsaveBit = 1 << 27;
    return has_feature(PROCESSOR_INFO, ECX_Reg, kOsxsaveBit);
}
|
||||
// Asks has_feature() for bit 25 of ECX_Reg on the PROCESSOR_INFO level (the AES-NI bit, per the function name).
static inline bool has_aes_ni()
{
    constexpr auto kAesNiBit = 1 << 25;
    return has_feature(PROCESSOR_INFO, ECX_Reg, kAesNiBit);
}
|
||||
// AVX2 is reported only when all three checks pass, evaluated in this order:
// feature bit 5 of EBX_Reg on EXTENDED_FEATURES, then has_osxsave(), then has_xcr_avx2().
static inline bool has_avx2()
{
    if (!has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx2();
}
|
||||
// AVX is reported only when all three checks pass, evaluated in this order:
// feature bit 28 of ECX_Reg on PROCESSOR_INFO, then has_osxsave(), then has_xcr_avx().
static inline bool has_avx()
{
    if (!has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx();
}
|
||||
// AVX2 is reported only when all three checks pass, evaluated in this order:
// feature bit 5 of EBX_Reg on EXTENDED_FEATURES, then has_osxsave(), then has_xcr_avx().
static inline bool has_avx2()
{
    if (!has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx();
}
|
||||
// AVX-512F is reported only when all three checks pass, evaluated in this order:
// feature bit 16 of EBX_Reg on EXTENDED_FEATURES, then has_osxsave(), then has_xcr_avx512().
static inline bool has_avx512f()
{
    if (!has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx512();
}
|
||||
// Asks has_feature() for bit 8 of EBX_Reg on the EXTENDED_FEATURES level (the BMI2 bit, per the function name).
static inline bool has_bmi2()
{
    constexpr auto kBmi2Bit = 1 << 8;
    return has_feature(EXTENDED_FEATURES, EBX_Reg, kBmi2Bit);
}
|
||||
// Asks has_feature() for bit 26 of EDX_Reg on the PROCESSOR_EXT_INFO level (the 1 GB pages bit, per the function name).
static inline bool has_pdpe1gb()
{
    constexpr auto kPdpe1gbBit = 1 << 26;
    return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, kPdpe1gbBit);
}
|
||||
@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
||||
cpu_brand_string(m_brand);
|
||||
|
||||
m_flags.set(FLAG_AES, has_aes_ni());
|
||||
m_flags.set(FLAG_AVX, has_avx());
|
||||
m_flags.set(FLAG_AVX2, has_avx2());
|
||||
m_flags.set(FLAG_AVX512F, has_avx512f());
|
||||
m_flags.set(FLAG_BMI2, has_bmi2());
|
||||
|
@ -48,9 +48,11 @@ protected:
|
||||
// Returns the assembly id stored in m_assembly.
inline Assembly::Id assembly() const override { return m_assembly; }
|
||||
// Generic feature query: tests the bit for `flag` in m_flags. The named hasXxx()/isVM() accessors below all go through this.
inline bool has(Flag flag) const override { return m_flags.test(flag); }
|
||||
// True when FLAG_AES is set.
inline bool hasAES() const override { return has(FLAG_AES); }
|
||||
// True when FLAG_AVX is set.
inline bool hasAVX() const override { return has(FLAG_AVX); }
|
||||
// True when FLAG_AVX2 is set.
inline bool hasAVX2() const override { return has(FLAG_AVX2); }
|
||||
// True when FLAG_BMI2 is set.
inline bool hasBMI2() const override { return has(FLAG_BMI2); }
|
||||
// True when FLAG_PDPE1GB is set (note the flag name differs from the accessor name).
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
|
||||
// True when FLAG_XOP is set.
inline bool hasXOP() const override { return has(FLAG_XOP); }
|
||||
// True when FLAG_CAT_L3 is set.
inline bool hasCatL3() const override { return has(FLAG_CAT_L3); }
|
||||
// True when FLAG_VM is set.
inline bool isVM() const override { return has(FLAG_VM); }
|
||||
// Returns the brand string held in m_brand; the pointer refers to storage owned by this object.
inline const char *brand() const override { return m_brand; }
|
||||
|
28
src/crypto/randomx/asm/program_sshash_avx2_constants.inc
Normal file
28
src/crypto/randomx/asm/program_sshash_avx2_constants.inc
Normal file
@ -0,0 +1,28 @@
|
||||
;# Constant data for the AVX2 dataset initialization code.
;# All multi-byte values are written out byte by byte in little-endian order.
;# r0_avx2_increments: four 64-bit values 2, 3, 4, 5 (added to the broadcast block index by the prologue code).
r0_avx2_increments:
|
||||
db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
|
||||
;# mul_hi_avx2_data: one 64-bit value with only bit 32 set (0x0000000100000000).
mul_hi_avx2_data:
|
||||
db 0,0,0,0,1,0,0,0
|
||||
;# r0_avx2_mul: 64-bit multiplier used to derive register r0 from the item number.
r0_avx2_mul:
|
||||
;#/ 6364136223846793005
|
||||
db 45, 127, 149, 76, 45, 244, 81, 88
|
||||
;# r1_avx2_add .. r7_avx2_add: one 64-bit constant per register r1..r7.
;# The code that uses them XORs each constant with r0 (despite the "_add" suffix in the label names).
r1_avx2_add:
|
||||
;#/ 9298411001130361340
|
||||
db 252, 161, 245, 89, 138, 151, 10, 129
|
||||
r2_avx2_add:
|
||||
;#/ 12065312585734608966
|
||||
db 70, 216, 194, 56, 223, 153, 112, 167
|
||||
r3_avx2_add:
|
||||
;#/ 9306329213124626780
|
||||
db 92, 73, 34, 191, 28, 185, 38, 129
|
||||
r4_avx2_add:
|
||||
;#/ 5281919268842080866
|
||||
db 98, 138, 159, 23, 151, 37, 77, 73
|
||||
r5_avx2_add:
|
||||
;#/ 10536153434571861004
|
||||
db 12, 236, 170, 206, 185, 239, 55, 146
|
||||
r6_avx2_add:
|
||||
;#/ 3398623926847679864
|
||||
db 120, 45, 230, 108, 116, 86, 42, 47
|
||||
r7_avx2_add:
|
||||
;#/ 9549104520008361294
|
||||
db 78, 229, 44, 182, 247, 59, 133, 132
|
31
src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
Normal file
31
src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
Normal file
@ -0,0 +1,31 @@
|
||||
;# Epilogue of the AVX2 dataset init routine: unwinds the stack frame in the reverse
;# order of the prologue and returns to the caller.
;# Release the 32-byte scratch area reserved by the prologue (sub rsp, 32).
add rsp, 32
|
||||
;# Remove the pushed "max. block index" from the stack; the popped value is not used afterwards here.
pop r9
|
||||
|
||||
;# Restore xmm0-xmm15 from the 256-byte save area (16 bytes per register).
movdqu xmm0, xmmword ptr [rsp]
|
||||
movdqu xmm1, xmmword ptr [rsp + 16]
|
||||
movdqu xmm2, xmmword ptr [rsp + 32]
|
||||
movdqu xmm3, xmmword ptr [rsp + 48]
|
||||
movdqu xmm4, xmmword ptr [rsp + 64]
|
||||
movdqu xmm5, xmmword ptr [rsp + 80]
|
||||
movdqu xmm6, xmmword ptr [rsp + 96]
|
||||
movdqu xmm7, xmmword ptr [rsp + 112]
|
||||
movdqu xmm8, xmmword ptr [rsp + 128]
|
||||
movdqu xmm9, xmmword ptr [rsp + 144]
|
||||
movdqu xmm10, xmmword ptr [rsp + 160]
|
||||
movdqu xmm11, xmmword ptr [rsp + 176]
|
||||
movdqu xmm12, xmmword ptr [rsp + 192]
|
||||
movdqu xmm13, xmmword ptr [rsp + 208]
|
||||
movdqu xmm14, xmmword ptr [rsp + 224]
|
||||
movdqu xmm15, xmmword ptr [rsp + 240]
|
||||
;# Clear the upper halves of all ymm registers before returning to non-AVX code.
vzeroupper
|
||||
;# Release the xmm save area.
add rsp, 256
|
||||
|
||||
;# Restore the general-purpose registers in the reverse order of the prologue's pushes.
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
37
src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
Normal file
37
src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
Normal file
@ -0,0 +1,37 @@
|
||||
;# Head of the main loop; one iteration handles 5 consecutive items.
;# Register roles, as set up by the prologue that includes this file:
;#   rsi = dataset write pointer, rdi = cache memory base, rbp = current block index.
;# The 5 prefetches below cover the 5 * 64 = 320 bytes written at the end of the iteration.
;# prefetch RandomX dataset lines
prefetchnta byte ptr [rsi]
|
||||
prefetchnta byte ptr [rsi+64]
|
||||
prefetchnta byte ptr [rsi+128]
|
||||
prefetchnta byte ptr [rsi+192]
|
||||
prefetchnta byte ptr [rsi+256]
|
||||
|
||||
;# Each cache line address is computed as rdi + ((index & RANDOMX_CACHE_MASK) << 6)
;# for index = rbp, rbp+1, ..., rbp+4.
;# The address for index rbp stays in rbx; the other four are stored in the
;# scratch slots [rsp], [rsp+8], [rsp+16] and [rsp+24].
;# prefetch RandomX cache lines
mov rbx, rbp
|
||||
and rbx, RANDOMX_CACHE_MASK
|
||||
shl rbx, 6
|
||||
add rbx, rdi
|
||||
prefetchnta byte ptr [rbx]
|
||||
;# index rbp+1 -> [rsp]
lea rax, [rbp+1]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp], rax
|
||||
;# index rbp+2 -> [rsp+8]
lea rax, [rbp+2]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+8], rax
|
||||
;# index rbp+3 -> [rsp+16]
lea rax, [rbp+3]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+16], rax
|
||||
;# index rbp+4 -> [rsp+24]
lea rax, [rbp+4]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+24], rax
|
38
src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
Normal file
38
src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
Normal file
@ -0,0 +1,38 @@
|
||||
;# Tail of the main loop: writes the 5 finished items (5 * 64 bytes) to [rsi] and advances the loop.
;# The first item lives in r8-r15 and is stored directly to [rsi+0 .. rsi+63].
;# The other four items live in ymm0-ymm7: ymmN holds register N for all four items, one per 64-bit element.
;# The scalar stores are interleaved with the first step of a 4x4 transpose that
;# turns this register-major layout into item-major layout:
;#   vpunpcklqdq pairs elements 0 and 2 of two registers, vpunpckhqdq pairs elements 1 and 3.
mov qword ptr [rsi+0], r8
|
||||
vpunpcklqdq ymm8, ymm0, ymm1
|
||||
mov qword ptr [rsi+8], r9
|
||||
vpunpcklqdq ymm9, ymm2, ymm3
|
||||
mov qword ptr [rsi+16], r10
|
||||
vpunpcklqdq ymm10, ymm4, ymm5
|
||||
mov qword ptr [rsi+24], r11
|
||||
vpunpcklqdq ymm11, ymm6, ymm7
|
||||
mov qword ptr [rsi+32], r12
|
||||
vpunpckhqdq ymm12, ymm0, ymm1
|
||||
mov qword ptr [rsi+40], r13
|
||||
vpunpckhqdq ymm13, ymm2, ymm3
|
||||
mov qword ptr [rsi+48], r14
|
||||
vpunpckhqdq ymm14, ymm4, ymm5
|
||||
mov qword ptr [rsi+56], r15
|
||||
vpunpckhqdq ymm15, ymm6, ymm7
|
||||
|
||||
;# Second transpose step: vperm2i128 with immediate 32 joins the low 128-bit halves
;# of its two sources, immediate 49 joins the high halves.
;# element 0 of ymm0-ymm7 -> [rsi+64 .. rsi+127]
vperm2i128 ymm0, ymm8, ymm9, 32
|
||||
vperm2i128 ymm1, ymm10, ymm11, 32
|
||||
vmovdqu ymmword ptr [rsi+64], ymm0
|
||||
vmovdqu ymmword ptr [rsi+96], ymm1
|
||||
;# element 1 of ymm0-ymm7 -> [rsi+128 .. rsi+191]
vperm2i128 ymm2, ymm12, ymm13, 32
|
||||
vperm2i128 ymm3, ymm14, ymm15, 32
|
||||
vmovdqu ymmword ptr [rsi+128], ymm2
|
||||
vmovdqu ymmword ptr [rsi+160], ymm3
|
||||
;# element 2 of ymm0-ymm7 -> [rsi+192 .. rsi+255]
vperm2i128 ymm4, ymm8, ymm9, 49
|
||||
vperm2i128 ymm5, ymm10, ymm11, 49
|
||||
vmovdqu ymmword ptr [rsi+192], ymm4
|
||||
vmovdqu ymmword ptr [rsi+224], ymm5
|
||||
;# element 3 of ymm0-ymm7 -> [rsi+256 .. rsi+319]
vperm2i128 ymm6, ymm12, ymm13, 49
|
||||
vperm2i128 ymm7, ymm14, ymm15, 49
|
||||
vmovdqu ymmword ptr [rsi+256], ymm6
|
||||
vmovdqu ymmword ptr [rsi+288], ymm7
|
||||
|
||||
;# Advance by 5 items / 320 bytes and compare with the max. block index that the
;# prologue pushed (it sits at [rsp+32], just above the 32-byte scratch area).
add rbp, 5
|
||||
add rsi, 320
|
||||
cmp rbp, qword ptr [rsp+32]
|
||||
;# Raw encoding of "jb rel32" with a zero displacement.
;# NOTE(review): presumably the displacement is patched by the JIT compiler to jump back to the loop head - confirm.
db 15, 130, 0, 0, 0, 0 ;# jb rel32
|
@ -0,0 +1,27 @@
|
||||
;# Register save sequence for the AVX2 dataset init routine.
;# Pushes 8 general-purpose registers (64 bytes), then stores xmm0-xmm15 in a
;# 256-byte stack area. The matching epilogue restores everything in reverse order.
push rbx
|
||||
push rbp
|
||||
push rdi
|
||||
push rsi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
;# save all XMM registers just to be safe for all calling conventions
|
||||
;# 16 registers * 16 bytes = 256 bytes; unaligned stores are used, so rsp alignment does not matter here.
sub rsp, 256
|
||||
movdqu xmmword ptr [rsp], xmm0
|
||||
movdqu xmmword ptr [rsp + 16], xmm1
|
||||
movdqu xmmword ptr [rsp + 32], xmm2
|
||||
movdqu xmmword ptr [rsp + 48], xmm3
|
||||
movdqu xmmword ptr [rsp + 64], xmm4
|
||||
movdqu xmmword ptr [rsp + 80], xmm5
|
||||
movdqu xmmword ptr [rsp + 96], xmm6
|
||||
movdqu xmmword ptr [rsp + 112], xmm7
|
||||
movdqu xmmword ptr [rsp + 128], xmm8
|
||||
movdqu xmmword ptr [rsp + 144], xmm9
|
||||
movdqu xmmword ptr [rsp + 160], xmm10
|
||||
movdqu xmmword ptr [rsp + 176], xmm11
|
||||
movdqu xmmword ptr [rsp + 192], xmm12
|
||||
movdqu xmmword ptr [rsp + 208], xmm13
|
||||
movdqu xmmword ptr [rsp + 224], xmm14
|
||||
movdqu xmmword ptr [rsp + 240], xmm15
|
50
src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
Normal file
50
src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
Normal file
@ -0,0 +1,50 @@
|
||||
;# Loads one 64-byte cache line for each of the four AVX2 items and XORs it into ymm0-ymm7.
;# The four line addresses are read from the caller's scratch slots, which sit at
;# [rsp+40 .. rsp+64] once this code has reserved its own 40 bytes.
;# rbx and ymm14 are used as temporaries, so they are saved first and restored at the end
;# (the prologue keeps a constant in ymm14 and the loop head keeps a cache address in rbx).
sub rsp, 40
|
||||
mov [rsp], rbx
|
||||
vmovdqu ymmword ptr [rsp+8], ymm14
|
||||
|
||||
;# rax, rbx, rcx, rdx = cache line addresses for the four items
mov rax, [rsp+40]
|
||||
mov rbx, [rsp+48]
|
||||
mov rcx, [rsp+56]
|
||||
mov rdx, [rsp+64]
|
||||
|
||||
;# First 32 bytes of each line (r0-r3). In the comments below rN[k] means register N of item k.
vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1]
|
||||
vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2]
|
||||
vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3]
|
||||
vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4]
|
||||
|
||||
;# Transpose from item-major to register-major layout, then XOR into r0 and r2.
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2]
|
||||
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4]
|
||||
vpxor ymm0, ymm0, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4]
|
||||
vpxor ymm2, ymm2, ymm14
|
||||
|
||||
;# Same for r1 and r3, using the high 64-bit element of each 128-bit half.
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2]
|
||||
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4]
|
||||
vpxor ymm1, ymm1, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4]
|
||||
vpxor ymm3, ymm3, ymm14
|
||||
|
||||
;# Second 32 bytes of each line (r4-r7), processed the same way.
vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1]
|
||||
vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2]
|
||||
vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3]
|
||||
vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4]
|
||||
|
||||
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2]
|
||||
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4]
|
||||
vpxor ymm4, ymm4, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4]
|
||||
vpxor ymm6, ymm6, ymm14
|
||||
|
||||
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2]
|
||||
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4]
|
||||
vpxor ymm5, ymm5, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4]
|
||||
vpxor ymm7, ymm7, ymm14
|
||||
|
||||
;# Restore the two saved registers and release the local 40 bytes.
mov rbx, [rsp]
|
||||
vmovdqu ymm14, ymmword ptr [rsp+8]
|
||||
add rsp, 40
|
29
src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
Normal file
29
src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
Normal file
@ -0,0 +1,29 @@
|
||||
;# Turns the four 64-bit values in ymm0 into cache line addresses and prefetches them.
;# ymm0 is spilled to the scratch slots [rsp .. rsp+31]; each slot is then replaced by
;#   rdi + ((value & RANDOMX_CACHE_MASK) << 6)
;# so that the load code can later read the final addresses from the same slots.
;# NOTE(review): the JIT presumably moves the selected address register into ymm0 before this code runs - confirm.
vmovdqu ymmword ptr [rsp], ymm0
|
||||
|
||||
;# slot 0
mov rax, [rsp]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# slot 1
mov rax, [rsp+8]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+8], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# slot 2
mov rax, [rsp+16]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+16], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# slot 3
mov rax, [rsp+24]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+24], rax
|
||||
prefetchnta byte ptr [rax]
|
File diff suppressed because it is too large
Load Diff
@ -96,6 +96,7 @@ namespace randomx {
|
||||
|
||||
bool BranchesWithin32B = false;
|
||||
bool hasAVX;
|
||||
bool hasAVX2;
|
||||
bool hasXOP;
|
||||
|
||||
uint8_t* allocatedCode = nullptr;
|
||||
@ -107,9 +108,10 @@ namespace randomx {
|
||||
static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos);
|
||||
static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos);
|
||||
static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos);
|
||||
static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos);
|
||||
// Combines scale, index and base into one SIB-style value:
// scale is shifted left by 6, index by 3, and base occupies the low bits.
// No masking is applied, so callers must pass values that fit their fields.
static uint32_t genSIB(int scale, int index, int base)
{
    const int scaleField = scale << 6;
    const int indexField = index << 3;
    return scaleField | indexField | base;
}
|
||||
|
||||
void generateSuperscalarCode(Instruction &);
|
||||
template<bool AVX2>
|
||||
void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos);
|
||||
|
||||
static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) {
|
||||
code[codePos] = val;
|
||||
|
@ -52,6 +52,11 @@
|
||||
.global DECL(randomx_program_loop_store)
|
||||
.global DECL(randomx_program_loop_end)
|
||||
.global DECL(randomx_dataset_init)
|
||||
.global DECL(randomx_dataset_init_avx2_prologue)
|
||||
.global DECL(randomx_dataset_init_avx2_loop_end)
|
||||
.global DECL(randomx_dataset_init_avx2_epilogue)
|
||||
.global DECL(randomx_dataset_init_avx2_ssh_load)
|
||||
.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
|
||||
.global DECL(randomx_program_epilogue)
|
||||
.global DECL(randomx_sshash_load)
|
||||
.global DECL(randomx_sshash_prefetch)
|
||||
@ -192,6 +197,98 @@ call_offset:
|
||||
pop rbx
|
||||
ret
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_dataset_init_avx2_prologue):
|
||||
#include "asm/program_sshash_avx2_save_registers.inc"
|
||||
|
||||
#if defined(WINABI)
|
||||
mov rdi, qword ptr [rcx] ;# cache->memory
|
||||
mov rsi, rdx ;# dataset
|
||||
mov rbp, r8 ;# block index
|
||||
push r9 ;# max. block index
|
||||
#else
|
||||
mov rdi, qword ptr [rdi] ;# cache->memory
|
||||
;# dataset in rsi
|
||||
mov rbp, rdx ;# block index
|
||||
push rcx ;# max. block index
|
||||
#endif
|
||||
sub rsp, 32
|
||||
|
||||
jmp randomx_dataset_init_avx2_prologue_loop_begin
|
||||
#include "asm/program_sshash_avx2_constants.inc"
|
||||
|
||||
.balign 64
|
||||
randomx_dataset_init_avx2_prologue_loop_begin:
|
||||
#include "asm/program_sshash_avx2_loop_begin.inc"
|
||||
|
||||
;# init integer registers (lane 0)
|
||||
lea r8, [rbp+1]
|
||||
imul r8, qword ptr [r0_avx2_mul+rip]
|
||||
mov r9, qword ptr [r1_avx2_add+rip]
|
||||
xor r9, r8
|
||||
mov r10, qword ptr [r2_avx2_add+rip]
|
||||
xor r10, r8
|
||||
mov r11, qword ptr [r3_avx2_add+rip]
|
||||
xor r11, r8
|
||||
mov r12, qword ptr [r4_avx2_add+rip]
|
||||
xor r12, r8
|
||||
mov r13, qword ptr [r5_avx2_add+rip]
|
||||
xor r13, r8
|
||||
mov r14, qword ptr [r6_avx2_add+rip]
|
||||
xor r14, r8
|
||||
mov r15, qword ptr [r7_avx2_add+rip]
|
||||
xor r15, r8
|
||||
|
||||
;# init AVX registers (lanes 1-4)
|
||||
vpxor ymm0, ymm0, ymm0
|
||||
movq xmm0, rbp
|
||||
vpbroadcastq ymm0, xmm0
|
||||
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
|
||||
|
||||
;# ymm0 *= r0_avx2_mul
|
||||
vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
|
||||
vpsrlq ymm8, ymm0, 32
|
||||
vpsrlq ymm9, ymm1, 32
|
||||
vpmuludq ymm10, ymm0, ymm1
|
||||
vpmuludq ymm11, ymm9, ymm0
|
||||
vpmuludq ymm0, ymm8, ymm1
|
||||
vpsllq ymm11, ymm11, 32
|
||||
vpsllq ymm0, ymm0, 32
|
||||
vpaddq ymm10, ymm10, ymm11
|
||||
vpaddq ymm0, ymm10, ymm0
|
||||
|
||||
vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
|
||||
vpxor ymm1, ymm0, ymm1
|
||||
vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
|
||||
vpxor ymm2, ymm0, ymm2
|
||||
vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
|
||||
vpxor ymm3, ymm0, ymm3
|
||||
vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
|
||||
vpxor ymm4, ymm0, ymm4
|
||||
vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
|
||||
vpxor ymm5, ymm0, ymm5
|
||||
vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
|
||||
vpxor ymm6, ymm0, ymm6
|
||||
vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
|
||||
vpxor ymm7, ymm0, ymm7
|
||||
|
||||
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
|
||||
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
|
||||
|
||||
;# generated SuperscalarHash code goes here
|
||||
|
||||
DECL(randomx_dataset_init_avx2_loop_end):
|
||||
#include "asm/program_sshash_avx2_loop_end.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_epilogue):
|
||||
#include "asm/program_sshash_avx2_epilogue.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_ssh_load):
|
||||
#include "asm/program_sshash_avx2_ssh_load.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_ssh_prefetch):
|
||||
#include "asm/program_sshash_avx2_ssh_prefetch.inc"
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_program_epilogue):
|
||||
#include "asm/program_epilogue_store.inc"
|
||||
|
@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen
|
||||
PUBLIC randomx_program_read_dataset_sshash_init
|
||||
PUBLIC randomx_program_read_dataset_sshash_fin
|
||||
PUBLIC randomx_dataset_init
|
||||
PUBLIC randomx_dataset_init_avx2_prologue
|
||||
PUBLIC randomx_dataset_init_avx2_loop_end
|
||||
PUBLIC randomx_dataset_init_avx2_epilogue
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_load
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_prefetch
|
||||
PUBLIC randomx_program_loop_store
|
||||
PUBLIC randomx_program_loop_end
|
||||
PUBLIC randomx_program_epilogue
|
||||
@ -183,6 +188,95 @@ init_block_loop:
|
||||
randomx_dataset_init ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_dataset_init_avx2_prologue PROC
|
||||
include asm/program_sshash_avx2_save_registers.inc
|
||||
|
||||
mov rdi, qword ptr [rcx] ;# cache->memory
|
||||
mov rsi, rdx ;# dataset
|
||||
mov rbp, r8 ;# block index
|
||||
push r9 ;# max. block index
|
||||
sub rsp, 32
|
||||
|
||||
jmp loop_begin
|
||||
include asm/program_sshash_avx2_constants.inc
|
||||
|
||||
ALIGN 64
|
||||
loop_begin:
|
||||
include asm/program_sshash_avx2_loop_begin.inc
|
||||
|
||||
;# init integer registers (lane 0)
|
||||
lea r8, [rbp+1]
|
||||
imul r8, qword ptr [r0_avx2_mul]
|
||||
mov r9, qword ptr [r1_avx2_add]
|
||||
xor r9, r8
|
||||
mov r10, qword ptr [r2_avx2_add]
|
||||
xor r10, r8
|
||||
mov r11, qword ptr [r3_avx2_add]
|
||||
xor r11, r8
|
||||
mov r12, qword ptr [r4_avx2_add]
|
||||
xor r12, r8
|
||||
mov r13, qword ptr [r5_avx2_add]
|
||||
xor r13, r8
|
||||
mov r14, qword ptr [r6_avx2_add]
|
||||
xor r14, r8
|
||||
mov r15, qword ptr [r7_avx2_add]
|
||||
xor r15, r8
|
||||
|
||||
;# init AVX registers (lanes 1-4)
|
||||
vpxor ymm0, ymm0, ymm0
|
||||
movq xmm0, rbp
|
||||
vpbroadcastq ymm0, xmm0
|
||||
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments]
|
||||
|
||||
;# ymm0 *= r0_avx2_mul
|
||||
vbroadcastsd ymm1, qword ptr [r0_avx2_mul]
|
||||
vpsrlq ymm8, ymm0, 32
|
||||
vpsrlq ymm9, ymm1, 32
|
||||
vpmuludq ymm10, ymm0, ymm1
|
||||
vpmuludq ymm11, ymm9, ymm0
|
||||
vpmuludq ymm0, ymm8, ymm1
|
||||
vpsllq ymm11, ymm11, 32
|
||||
vpsllq ymm0, ymm0, 32
|
||||
vpaddq ymm10, ymm10, ymm11
|
||||
vpaddq ymm0, ymm10, ymm0
|
||||
|
||||
vbroadcastsd ymm1, qword ptr [r1_avx2_add]
|
||||
vpxor ymm1, ymm0, ymm1
|
||||
vbroadcastsd ymm2, qword ptr [r2_avx2_add]
|
||||
vpxor ymm2, ymm0, ymm2
|
||||
vbroadcastsd ymm3, qword ptr [r3_avx2_add]
|
||||
vpxor ymm3, ymm0, ymm3
|
||||
vbroadcastsd ymm4, qword ptr [r4_avx2_add]
|
||||
vpxor ymm4, ymm0, ymm4
|
||||
vbroadcastsd ymm5, qword ptr [r5_avx2_add]
|
||||
vpxor ymm5, ymm0, ymm5
|
||||
vbroadcastsd ymm6, qword ptr [r6_avx2_add]
|
||||
vpxor ymm6, ymm0, ymm6
|
||||
vbroadcastsd ymm7, qword ptr [r7_avx2_add]
|
||||
vpxor ymm7, ymm0, ymm7
|
||||
|
||||
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32)
|
||||
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
|
||||
randomx_dataset_init_avx2_prologue ENDP
|
||||
|
||||
;# generated SuperscalarHash code goes here
|
||||
|
||||
randomx_dataset_init_avx2_loop_end PROC
|
||||
include asm/program_sshash_avx2_loop_end.inc
|
||||
randomx_dataset_init_avx2_loop_end ENDP
|
||||
|
||||
randomx_dataset_init_avx2_epilogue PROC
|
||||
include asm/program_sshash_avx2_epilogue.inc
|
||||
randomx_dataset_init_avx2_epilogue ENDP
|
||||
|
||||
randomx_dataset_init_avx2_ssh_load PROC
|
||||
include asm/program_sshash_avx2_ssh_load.inc
|
||||
randomx_dataset_init_avx2_ssh_load ENDP
|
||||
|
||||
randomx_dataset_init_avx2_ssh_prefetch PROC
|
||||
include asm/program_sshash_avx2_ssh_prefetch.inc
|
||||
randomx_dataset_init_avx2_ssh_prefetch ENDP
|
||||
|
||||
randomx_program_epilogue PROC
|
||||
include asm/program_epilogue_store.inc
|
||||
include asm/program_epilogue_win64.inc
|
||||
|
@ -44,6 +44,11 @@ extern "C" {
|
||||
void randomx_program_loop_store();
|
||||
void randomx_program_loop_end();
|
||||
void randomx_dataset_init();
|
||||
void randomx_dataset_init_avx2_prologue();
|
||||
void randomx_dataset_init_avx2_loop_end();
|
||||
void randomx_dataset_init_avx2_epilogue();
|
||||
void randomx_dataset_init_avx2_ssh_load();
|
||||
void randomx_dataset_init_avx2_ssh_prefetch();
|
||||
void randomx_program_epilogue();
|
||||
void randomx_sshash_load();
|
||||
void randomx_sshash_prefetch();
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
|
||||
#include "crypto/rx/RxDataset.h"
|
||||
#include "backend/cpu/Cpu.h"
|
||||
#include "base/io/log/Log.h"
|
||||
#include "base/io/log/Tags.h"
|
||||
#include "base/kernel/Platform.h"
|
||||
@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache,
|
||||
{
|
||||
Platform::setThreadPriority(priority);
|
||||
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount);
|
||||
// With AVX2 and an item count that is not a multiple of 5, the range is initialized in two calls:
//   1) the leading items, with the count rounded down to a multiple of 5;
//   2) one group of exactly 5 items ending at startItem + itemCount, which overlaps
//      the first call by 1 to 4 items so that the tail is covered.
// The AVX2 loop (program_sshash_avx2_loop_end.inc) advances the block index by 5 per iteration,
// which is presumably why every call must cover a multiple of 5 items.
// NOTE(review): if itemCount < 5 the second call starts before startItem
// (startItem + itemCount - 5) - confirm callers never pass such a small range.
if (Cpu::info()->hasAVX2() && (itemCount % 5)) {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
|
||||
randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5);
|
||||
}
|
||||
else {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user