From 763691fa4b7777a1798c9954fe0e3268f78e7d2d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 9 Dec 2019 20:29:05 +0100 Subject: [PATCH] More optimizations for Ryzen --- .../randomx/asm/program_read_dataset_ryzen.inc | 15 +++++++-------- src/crypto/randomx/jit_compiler_x86.cpp | 10 +++++++++- src/crypto/randomx/jit_compiler_x86.hpp | 1 + src/crypto/randomx/jit_compiler_x86_static.asm | 1 + src/crypto/randomx/randomx.h | 4 ++-- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/crypto/randomx/asm/program_read_dataset_ryzen.inc b/src/crypto/randomx/asm/program_read_dataset_ryzen.inc index 37e2104b..6bb87c8f 100644 --- a/src/crypto/randomx/asm/program_read_dataset_ryzen.inc +++ b/src/crypto/randomx/asm/program_read_dataset_ryzen.inc @@ -7,13 +7,12 @@ and edx, RANDOMX_DATASET_BASE_MASK prefetchnta byte ptr [rdi+rdx] ror rbp, 32 ;# swap "ma" and "mx" - add rcx, rdi ;# dataset cache line xor r8, rax - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] + xor r9, qword ptr [rdi+rcx+8] + xor r10, qword ptr [rdi+rcx+16] + xor r11, qword ptr [rdi+rcx+24] + xor r12, qword ptr [rdi+rcx+32] + xor r13, qword ptr [rdi+rcx+40] + xor r14, qword ptr [rdi+rcx+48] + xor r15, qword ptr [rdi+rcx+56] \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 2528c2cf..84cfe39c 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -169,6 +169,7 @@ namespace randomx { static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e }; static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC }; + static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25,
0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC }; /* Decoded: and eax, 0x6000; or eax, 0x9FC0; cmp eax, dword ptr [rsp-4]; je +9 (skips the following mov and ldmxcsr when equal); mov dword ptr [rsp-4], eax; ldmxcsr dword ptr [rsp-4]. NOTE(review): the stmxcsr added by this patch stores to [rsp-20], so rsp would have to be 16 bytes lower when this sequence runs for the cmp to read that slot - confirm against the loop code, which is not shown here. */ static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; @@ -300,6 +301,8 @@ namespace randomx { } void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) { + vm_flags = flags; /* saved so the RANDOMX_FLAG_RYZEN test later in this patch can choose which MXCSR byte sequence to emit */ + generateProgramPrologue(prog, pcfg); uint8_t* p; @@ -1010,7 +1013,12 @@ namespace randomx { emit(ROL_RAX, p, pos); emitByte(rotate, p, pos); } - emit(AND_OR_MOV_LDMXCSR, p, pos); + if (vm_flags & RANDOMX_FLAG_RYZEN) { /* Ryzen flag set: emit the variant that compares before storing and loading MXCSR */ + emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos); + } + else { + emit(AND_OR_MOV_LDMXCSR, p, pos); + } codePos = pos; } diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 0d515b0e..9354e5db 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -70,6 +70,7 @@ namespace randomx { uint8_t* allocatedCode; uint8_t* code; int32_t codePos; + uint32_t vm_flags; /* set from the flags argument at the start of generateProgram() */ static bool BranchesWithin32B; diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index 7dd1232d..90395c52 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -81,6 +81,7 @@ randomx_program_prologue_first_load PROC and eax, RANDOMX_SCRATCHPAD_MASK ror rdx, 32 and edx, RANDOMX_SCRATCHPAD_MASK + stmxcsr dword ptr [rsp-20] jmp randomx_program_loop_begin randomx_program_prologue_first_load ENDP diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 92314229..1ed5aa53 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -119,9 +119,9 @@ struct RandomX_ConfigurationBase rx_vec_i128 fillAes4Rx4_Key[8]; uint8_t codeShhPrefetchTweaked[20]; - uint8_t codeReadDatasetTweaked[72]; + uint8_t
codeReadDatasetTweaked[256]; uint32_t codeReadDatasetTweakedSize; - uint8_t codeReadDatasetRyzenTweaked[72]; + uint8_t codeReadDatasetRyzenTweaked[256]; /* NOTE(review): both read-dataset buffers grow from 72 to 256 bytes in this patch; presumably headroom for longer generated code - confirm */ uint32_t codeReadDatasetRyzenTweakedSize; uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codePrefetchScratchpadTweaked[32];