From d0df8245990ea688b83942906a3e57051ef88758 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 8 Dec 2019 16:14:02 +0100 Subject: [PATCH] Optimized dataset read for Ryzen CPUs Removed register dependency in dataset read, +0.8% speedup on average. --- src/backend/cpu/CpuWorker.cpp | 2 +- .../asm/program_read_dataset_ryzen.inc | 19 ++++++++++++++++++ src/crypto/randomx/jit_compiler_a64.cpp | 2 +- src/crypto/randomx/jit_compiler_a64.hpp | 2 +- src/crypto/randomx/jit_compiler_fallback.hpp | 2 +- src/crypto/randomx/jit_compiler_x86.cpp | 20 ++++++++++++++----- src/crypto/randomx/jit_compiler_x86.hpp | 2 +- src/crypto/randomx/jit_compiler_x86_static.S | 4 ++++ .../randomx/jit_compiler_x86_static.asm | 5 +++++ .../randomx/jit_compiler_x86_static.hpp | 1 + src/crypto/randomx/randomx.cpp | 19 +++++++++++++----- src/crypto/randomx/randomx.h | 6 +++++- src/crypto/randomx/virtual_machine.hpp | 4 ++++ src/crypto/randomx/vm_compiled.cpp | 2 +- src/crypto/rx/RxVm.cpp | 6 +++++- src/crypto/rx/RxVm.h | 3 ++- src/net/JobResults.cpp | 2 +- 17 files changed, 81 insertions(+), 20 deletions(-) create mode 100644 src/crypto/randomx/asm/program_read_dataset_ryzen.inc diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 2e223a31..5d58106d 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -96,7 +96,7 @@ void xmrig::CpuWorker::allocateRandomX_VM() } if (!m_vm) { - m_vm = new RxVm(dataset, m_memory->scratchpad(), !m_hwAES); + m_vm = new RxVm(dataset, m_memory->scratchpad(), !m_hwAES, m_assembly); } } #endif diff --git a/src/crypto/randomx/asm/program_read_dataset_ryzen.inc b/src/crypto/randomx/asm/program_read_dataset_ryzen.inc new file mode 100644 index 00000000..37e2104b --- /dev/null +++ b/src/crypto/randomx/asm/program_read_dataset_ryzen.inc @@ -0,0 +1,19 @@ + mov rcx, rbp ;# ecx = ma + shr rcx, 32 + and ecx, RANDOMX_DATASET_BASE_MASK + xor rbp, rax ;# modify "mx" + mov rax, qword ptr [rdi+rcx] + mov edx, ebp ;# edx = mx + and 
edx, RANDOMX_DATASET_BASE_MASK + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + add rcx, rdi ;# dataset cache line + xor r8, rax + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index bf790c2b..d291de4d 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -118,7 +118,7 @@ static void clear_code_cache(char* p1, char* p2) # endif } -void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) +void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config, uint32_t) { uint32_t codePos = MainLoopBegin + 4; diff --git a/src/crypto/randomx/jit_compiler_a64.hpp b/src/crypto/randomx/jit_compiler_a64.hpp index e524feb8..05afdc70 100644 --- a/src/crypto/randomx/jit_compiler_a64.hpp +++ b/src/crypto/randomx/jit_compiler_a64.hpp @@ -49,7 +49,7 @@ namespace randomx { JitCompilerA64(); ~JitCompilerA64(); - void generateProgram(Program&, ProgramConfiguration&); + void generateProgram(Program&, ProgramConfiguration&, uint32_t); void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); template<size_t N> diff --git a/src/crypto/randomx/jit_compiler_fallback.hpp b/src/crypto/randomx/jit_compiler_fallback.hpp index bc363858..063ae521 100644 --- a/src/crypto/randomx/jit_compiler_fallback.hpp +++ b/src/crypto/randomx/jit_compiler_fallback.hpp @@ -44,7 +44,7 @@ namespace randomx { JitCompilerFallback() { throw std::runtime_error("JIT compilation is not supported on this platform"); } - void generateProgram(Program&, ProgramConfiguration&) { + void generateProgram(Program&, ProgramConfiguration&, uint32_t) { } void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { diff --git
a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index bfde7d00..082b9eb3 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -89,7 +89,6 @@ namespace randomx { const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; - const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; @@ -105,7 +104,6 @@ namespace randomx { const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad; const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; - const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; @@ -301,10 +299,22 @@ namespace randomx { freePagedMemory(allocatedCode, CodeSize); } - void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) { generateProgramPrologue(prog, pcfg); - memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); - codePos += readDatasetSize; + + uint8_t* p; + uint32_t n; + if (flags & RANDOMX_FLAG_RYZEN) { + p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked; + n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize; + } + else { + p = 
RandomX_CurrentConfig.codeReadDatasetTweaked; + n = RandomX_CurrentConfig.codeReadDatasetTweakedSize; + } + memcpy(code + codePos, p, n); + codePos += n; + generateProgramEpilogue(prog, pcfg); } diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index f1864018..0d515b0e 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -49,7 +49,7 @@ namespace randomx { public: JitCompilerX86(); ~JitCompilerX86(); - void generateProgram(Program&, ProgramConfiguration&); + void generateProgram(Program&, ProgramConfiguration&, uint32_t); void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); template<size_t N> void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &); diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S index c20cd743..8e1f9ef6 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.S +++ b/src/crypto/randomx/jit_compiler_x86_static.S @@ -45,6 +45,7 @@ .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_read_dataset_ryzen) .global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_loop_store) @@ -110,6 +111,9 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" +DECL(randomx_program_read_dataset_ryzen): + #include "asm/program_read_dataset_ryzen.inc" + DECL(randomx_program_read_dataset_sshash_init): #include "asm/program_read_dataset_sshash_init.inc" diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index 73fa503a..7dd1232d 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin PUBLIC
randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_read_dataset_ryzen PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init @@ -103,6 +104,10 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP +randomx_program_read_dataset_ryzen PROC + include asm/program_read_dataset_ryzen.inc +randomx_program_read_dataset_ryzen ENDP + randomx_program_read_dataset_sshash_init PROC include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp index 0a62c986..151c1c58 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.hpp +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -37,6 +37,7 @@ extern "C" { void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); + void randomx_program_read_dataset_ryzen(); void randomx_program_read_dataset_sshash_init(); void randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 44d881eb..08f4f241 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -157,8 +157,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset; - const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen; memcpy(codeReadDatasetTweaked, a, b - a); + codeReadDatasetTweakedSize = b - a; + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + memcpy(codeReadDatasetRyzenTweaked, a, b - a); + 
codeReadDatasetRyzenTweakedSize = b - a; } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init; @@ -191,10 +198,11 @@ void RandomX_ConfigurationBase::Apply() #if defined(_M_X64) || defined(__x86_64__) *(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1; - const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE; - *(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask; - *(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask; - *(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask; + // Not needed right now because all variants use default dataset base size + //const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE; + //*(uint32_t*)(codeReadDatasetTweaked + 9) = DatasetBaseMask; + //*(uint32_t*)(codeReadDatasetTweaked + 24) = DatasetBaseMask; + //*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask; *(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated; @@ -435,6 +443,7 @@ extern "C" { } vm->setScratchpad(scratchpad); + vm->setFlags(flags); } catch (std::exception &ex) { delete vm; diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 95bfdbf4..84ae7dfc 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -49,6 +49,7 @@ enum randomx_flags { RANDOMX_FLAG_FULL_MEM = 4, RANDOMX_FLAG_JIT = 8, RANDOMX_FLAG_1GB_PAGES = 16, + RANDOMX_FLAG_RYZEN = 64, }; @@ -118,7 +119,10 @@ struct RandomX_ConfigurationBase rx_vec_i128 fillAes4Rx4_Key[8]; uint8_t codeShhPrefetchTweaked[20]; - uint8_t codeReadDatasetTweaked[64]; + uint8_t codeReadDatasetTweaked[72]; + uint32_t codeReadDatasetTweakedSize; + uint8_t codeReadDatasetRyzenTweaked[72]; + uint32_t codeReadDatasetRyzenTweakedSize; uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codePrefetchScratchpadTweaked[32]; diff --git 
a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index d3718d04..3fdd86df 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -46,6 +46,9 @@ public: virtual void run(void* seed) = 0; void resetRoundingMode(); + void setFlags(uint32_t flags) { vm_flags = flags; } + uint32_t getFlags() const { return vm_flags; } + randomx::RegisterFile *getRegisterFile() { return &reg; } @@ -71,6 +74,7 @@ protected: randomx_dataset* datasetPtr; }; uint64_t datasetOffset; + uint32_t vm_flags; }; namespace randomx { diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp index d2ee59e8..1ab76cd7 100644 --- a/src/crypto/randomx/vm_compiled.cpp +++ b/src/crypto/randomx/vm_compiled.cpp @@ -43,7 +43,7 @@ namespace randomx { void CompiledVm::run(void* seed) { VmBase::generateProgram(seed); randomx_vm::initialize(); - compiler.generateProgram(program, config); + compiler.generateProgram(program, config, getFlags()); mem.memory = datasetPtr->memory + datasetOffset; execute(); } diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index e8d615e8..8cdad371 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -31,7 +31,7 @@ #include "crypto/rx/RxVm.h" -xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes) +xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly) { if (!softAes) { m_flags |= RANDOMX_FLAG_HARD_AES; @@ -45,6 +45,10 @@ xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes) m_flags |= RANDOMX_FLAG_JIT; } + if (assembly == Assembly::RYZEN) { + m_flags |= RANDOMX_FLAG_RYZEN; + } + m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ?
dataset->cache()->get() : nullptr, dataset->get(), scratchpad); } diff --git a/src/crypto/rx/RxVm.h b/src/crypto/rx/RxVm.h index 30a31c2e..7cddf93b 100644 --- a/src/crypto/rx/RxVm.h +++ b/src/crypto/rx/RxVm.h @@ -29,6 +29,7 @@ #include "base/tools/Object.h" +#include "backend/cpu/Cpu.h" #include <cstdint> @@ -49,7 +50,7 @@ class RxVm public: XMRIG_DISABLE_COPY_MOVE_DEFAULT(RxVm); - RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes); + RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly); ~RxVm(); inline randomx_vm *get() const { return m_vm; } diff --git a/src/net/JobResults.cpp b/src/net/JobResults.cpp index e8b4adce..26f16952 100644 --- a/src/net/JobResults.cpp +++ b/src/net/JobResults.cpp @@ -117,7 +117,7 @@ static void getResults(JobBundle &bundle, std::vector<JobResult> &results, uint3 return; } - auto vm = new RxVm(dataset, memory->scratchpad(), !hwAES); + auto vm = new RxVm(dataset, memory->scratchpad(), !hwAES, Assembly::NONE); for (uint32_t nonce : bundle.nonces) { *bundle.job.nonce() = nonce;