From 84d7eb05f36d6d91efe991b815e88d915b0ec8fd Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 1 Dec 2019 08:46:35 +0100 Subject: [PATCH] RandomX fixes Intel JCC erratum fix and various other improvements, see more here: https://www.phoronix.com/scan.php?page=article&item=intel-jcc-microcode&num=1 --- src/App.cpp | 3 + src/backend/common/Workers.cpp | 5 + src/backend/cpu/CpuWorker.cpp | 32 +++- src/crypto/randomx/aes_hash.cpp | 81 ++++++++++ src/crypto/randomx/aes_hash.hpp | 3 + src/crypto/randomx/intrin_portable.h | 7 + src/crypto/randomx/jit_compiler_x86.cpp | 193 ++++++++++++++++++++---- src/crypto/randomx/jit_compiler_x86.hpp | 7 +- src/crypto/randomx/randomx.cpp | 18 +++ src/crypto/randomx/randomx.h | 3 + src/crypto/randomx/virtual_machine.cpp | 6 + src/crypto/randomx/virtual_machine.hpp | 2 + 12 files changed, 320 insertions(+), 40 deletions(-) diff --git a/src/App.cpp b/src/App.cpp index 04b05451..1908482c 100644 --- a/src/App.cpp +++ b/src/App.cpp @@ -33,6 +33,7 @@ #include "base/io/Console.h" #include "base/io/log/Log.h" #include "base/kernel/Signals.h" +#include "base/kernel/Platform.h" #include "core/config/Config.h" #include "core/Controller.h" #include "core/Miner.h" @@ -89,6 +90,8 @@ int xmrig::App::exec() m_controller->start(); + Platform::setThreadPriority(5); + rc = uv_run(uv_default_loop(), UV_RUN_DEFAULT); uv_loop_close(uv_default_loop()); diff --git a/src/backend/common/Workers.cpp b/src/backend/common/Workers.cpp index ec47f965..1efa6cbe 100644 --- a/src/backend/common/Workers.cpp +++ b/src/backend/common/Workers.cpp @@ -109,6 +109,11 @@ void xmrig::Workers::start(const std::vector &data) for (Thread *worker : m_workers) { worker->start(Workers::onReady); + + // This sleep is important for optimal caching! + // Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads + // Sub-optimal caching can result in up to 0.5% hashrate penalty + std::this_thread::sleep_for(std::chrono::milliseconds(20)); } } diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index f64882ba..7584b686 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -185,8 +185,20 @@ void xmrig::CpuWorker::start() consumeJob(); } + uint64_t storeStatsMask = 7; + +# ifdef XMRIG_ALGO_RANDOMX + bool first = true; + uint64_t tempHash[8] = {}; + + // RandomX is faster, we don't need to store stats so often + if (m_job.currentJob().algorithm().family() == Algorithm::RANDOM_X) { + storeStatsMask = 63; + } +# endif + while (!Nonce::isOutdated(Nonce::CPU, m_job.sequence())) { - if ((m_count & 0x7) == 0) { + if ((m_count & storeStatsMask) == 0) { storeStats(); } @@ -196,26 +208,34 @@ void xmrig::CpuWorker::start() break; } + uint32_t current_job_nonces[N]; + for (size_t i = 0; i < N; ++i) { + current_job_nonces[i] = *m_job.nonce(i); + } + # ifdef XMRIG_ALGO_RANDOMX if (job.algorithm().family() == Algorithm::RANDOM_X) { - randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); + if (first) { + first = false; + randomx_calculate_hash_first(m_vm->get(), tempHash, m_job.blob(), job.size()); + } + m_job.nextRound(kReserveCount, 1); + randomx_calculate_hash_next(m_vm->get(), tempHash, m_job.blob(), job.size(), m_hash); } else # endif { fn(job.algorithm())(m_job.blob(), job.size(), m_hash, m_ctx, job.height()); + m_job.nextRound(kReserveCount, 1); } for (size_t i = 0; i < N; ++i) { if (*reinterpret_cast(m_hash + (i * 32) + 24) < job.target()) { - JobResults::submit(job, *m_job.nonce(i), m_hash + (i * 32)); + JobResults::submit(job, current_job_nonces[i], m_hash + (i * 32)); } } - m_job.nextRound(kReserveCount, 1); m_count += N; - - std::this_thread::yield(); } consumeJob(); diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index fe149dfe..4a400d0a 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -212,3 +212,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; + const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; + + // initial state + rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0); + rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1); + rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2); + rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3); + + const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0); + const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1); + const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2); + const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3); + + rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0); + rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1); + rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2); + rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3); + + constexpr int PREFETCH_DISTANCE = 4096; + const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE; + scratchpadEnd -= PREFETCH_DISTANCE; + + for (int i = 0; i < 2; ++i) { + //process 64 bytes at a time in 4 lanes + while (scratchpadPtr < scratchpadEnd) { + hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); + hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); + hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); + hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); + + fill_state0 = aesdec(fill_state0, key0); + fill_state1 = aesenc(fill_state1, key1); + fill_state2 = aesdec(fill_state2, key2); + fill_state3 = aesenc(fill_state3, key3); + + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); + + rx_prefetch_t0(prefetchPtr); + + scratchpadPtr += 64; + prefetchPtr += 64; + } + prefetchPtr = (const char*) scratchpad; + scratchpadEnd += PREFETCH_DISTANCE; + } + + rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3); + + //two extra rounds to achieve full diffusion + rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1); + + hash_state0 = aesenc(hash_state0, xkey0); + hash_state1 = aesdec(hash_state1, xkey0); + hash_state2 = aesenc(hash_state2, xkey0); + hash_state3 = aesdec(hash_state3, xkey0); + + hash_state0 = aesenc(hash_state0, xkey1); + hash_state1 = aesdec(hash_state1, xkey1); + hash_state2 = aesenc(hash_state2, xkey1); + hash_state3 = aesdec(hash_state3, xkey1); + + //output hash + rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); +} + +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/crypto/randomx/aes_hash.hpp b/src/crypto/randomx/aes_hash.hpp index b4d0e940..9f75f73a 100644 --- a/src/crypto/randomx/aes_hash.hpp +++ b/src/crypto/randomx/aes_hash.hpp @@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/crypto/randomx/intrin_portable.h b/src/crypto/randomx/intrin_portable.h index 346c433a..1dcd3ad3 100644 --- a/src/crypto/randomx/intrin_portable.h +++ b/src/crypto/randomx/intrin_portable.h @@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128; #define rx_aligned_alloc(a, b) _mm_malloc(a,b) #define rx_aligned_free(a) _mm_free(a) #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) +#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0) #define rx_load_vec_f128 _mm_load_pd #define rx_store_vec_f128 _mm_store_pd @@ -201,6 +202,7 @@ typedef union{ #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) /* Splat 64-bit long long to 2 64-bit long longs */ FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) @@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) { asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); } +inline void rx_prefetch_t0(const void* ptr) { + asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); +} + FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { return vld1q_f64((const float64_t*)pd); } @@ -532,6 +538,7 @@ typedef union { #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { rx_vec_f128 x; diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 2a342535..bfde7d00 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -29,6 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "crypto/randomx/jit_compiler_x86.hpp" #include "crypto/randomx/jit_compiler_x86_static.hpp" #include "crypto/randomx/superscalar.hpp" @@ -36,6 +37,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" +#ifdef _MSC_VER +# include +#else +# include +#endif + namespace randomx { /* @@ -108,7 +115,7 @@ namespace randomx { const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; - const int32_t epilogueOffset = CodeSize - epilogueSize; + const int32_t epilogueOffset = (CodeSize - epilogueSize) & ~63; constexpr int32_t superScalarHashOffset = 32768; static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; @@ -183,6 +190,7 @@ namespace randomx { static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; + static const uint8_t JZ_SHORT = 0x74; static const uint8_t RET = 0xc3; static const uint8_t LEA_32[] = { 0x41, 0x8d }; static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 }; @@ -197,20 +205,100 @@ namespace randomx { static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; -// static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; + static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; + + static const uint8_t JMP_ALIGN_PREFIX[14][16] = { + {}, + {0x2E}, + {0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + }; + + bool JitCompilerX86::BranchesWithin32B = false; size_t JitCompilerX86::getCodeSize() { return codePos < prologueSize ? 0 : codePos - prologueSize; } + static inline void cpuid(uint32_t level, int32_t output[4]) + { + memset(output, 0, sizeof(int32_t) * 4); + +# ifdef _MSC_VER + __cpuid(output, static_cast(level)); +# else + __cpuid_count(level, 0, output[0], output[1], output[2], output[3]); +# endif + } + + // CPU-specific tweaks + void JitCompilerX86::applyTweaks() { + int32_t info[4]; + cpuid(0, info); + + int32_t manufacturer[4]; + manufacturer[0] = info[1]; + manufacturer[1] = info[3]; + manufacturer[2] = info[2]; + manufacturer[3] = 0; + + if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { + struct + { + unsigned int stepping : 4; + unsigned int model : 4; + unsigned int family : 4; + unsigned int processor_type : 2; + unsigned int reserved1 : 2; + unsigned int ext_model : 4; + unsigned int ext_family : 8; + unsigned int reserved2 : 4; + } processor_info; + + cpuid(1, info); + memcpy(&processor_info, info, sizeof(processor_info)); + + // Intel JCC erratum mitigation + if (processor_info.family == 6) { + const uint32_t model = processor_info.model | (processor_info.ext_model << 4); + const uint32_t stepping = processor_info.stepping; + + // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + BranchesWithin32B = + ((model == 0x4E) && (stepping == 0x3)) || + ((model == 0x55) && (stepping == 0x4)) || + ((model == 0x5E) && (stepping == 0x3)) || + ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || + ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || + ((model == 0xA6) && (stepping == 0x0)) || + ((model == 0xAE) && (stepping == 0xA)); + } + } + } + + static std::atomic codeOffset; + JitCompilerX86::JitCompilerX86() { - code = (uint8_t*)allocExecutableMemory(CodeSize); + applyTweaks(); + allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2); + // Shift code base address to improve caching - all threads will use different L2/L3 cache sets + code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize); memcpy(code, codePrologue, prologueSize); memcpy(code + epilogueOffset, codeEpilogue, epilogueSize); } JitCompilerX86::~JitCompilerX86() { - freePagedMemory(code, CodeSize); + freePagedMemory(allocatedCode, CodeSize); } void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { @@ -307,6 +395,22 @@ namespace randomx { emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos); memcpy(code + codePos, codeLoopStore, loopStoreSize); codePos += loopStoreSize; + + if (BranchesWithin32B) { + const uint32_t branch_begin = static_cast(codePos); + const uint32_t branch_end = static_cast(branch_begin + 9); + + // If the jump crosses or touches 32-byte boundary, align it + if ((branch_begin ^ branch_end) >= 32) { + uint32_t alignment_size = 32 - (branch_begin & 31); + if (alignment_size > 8) { + emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos); + alignment_size = 8; + } + emit(NOPX[alignment_size - 1], alignment_size, code, codePos); + } + } + emit(SUB_EBX, code, codePos); emit(JNZ, code, codePos); emit32(prologueSize - codePos - 4, code, codePos); @@ -408,12 +512,13 @@ namespace randomx { } } - void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos, bool rax) { - emit(LEA_32, code, codePos); - emitByte(0x80 + instr.src + (rax ? 0 : 8), code, codePos); - if (instr.src == RegisterNeedsSib) { - emitByte(0x24, code, codePos); - } + template + FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos) { + const uint32_t src = *((uint32_t*)&instr) & 0xFF0000; + + *(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + src; + codePos += (src == (RegisterNeedsSib << 16)) ? 4 : 3; + emit32(instr.getImm32(), code, codePos); if (rax) emitByte(AND_EAX_I, code, codePos); @@ -422,12 +527,14 @@ namespace randomx { emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos); } - void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) { - emit(LEA_32, code, codePos); - emitByte(0x80 + instr.dst, code, codePos); - if (instr.dst == RegisterNeedsSib) { - emitByte(0x24, code, codePos); - } + template void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos); + template void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos); + + FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) { + const uint32_t dst = static_cast(instr.dst) << 16; + *(uint32_t*)(code + codePos) = 0x24808d41 + dst; + codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3; + emit32(instr.getImm32(), code, codePos); emitByte(AND_EAX_I, code, codePos); if (instr.getModCond() < StoreL3Condition) { @@ -438,7 +545,7 @@ namespace randomx { } } - void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) { + FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) { emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos); } @@ -483,7 +590,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit32(template_IADD_M[instr.dst], p, pos); } else { @@ -523,7 +630,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_SUB_RM, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x06, p, pos); @@ -561,7 +668,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_IMUL_RM, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x06, p, pos); @@ -596,7 +703,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos, false); + genAddressReg(instr, p, pos); emit(REX_MOV_RR64, p, pos); emitByte(0xc0 + instr.dst, p, pos); emit(REX_MUL_MEM, p, pos); @@ -635,7 +742,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos, false); + genAddressReg(instr, p, pos); emit(REX_MOV_RR64, p, pos); emitByte(0xc0 + instr.dst, p, pos); emit(REX_IMUL_MEM, p, pos); @@ -704,7 +811,7 @@ namespace randomx { int pos = codePos; if (instr.src != instr.dst) { - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_XOR_RM, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x06, p, pos); @@ -801,7 +908,7 @@ namespace randomx { int pos = codePos; const uint32_t dst = instr.dst % RegisterCountFlt; - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_ADDPD, p, pos); emitByte(0xc4 + 8 * dst, p, pos); @@ -826,7 +933,7 @@ namespace randomx { int pos = codePos; const uint32_t dst = instr.dst % RegisterCountFlt; - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_SUBPD, p, pos); emitByte(0xc4 + 8 * dst, p, pos); @@ -862,7 +969,7 @@ namespace randomx { int pos = codePos; const uint32_t dst = instr.dst % RegisterCountFlt; - genAddressReg(instr, p, pos); + genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_ANDPS_XMM12, p, pos); emit(REX_DIVPD, p, pos); @@ -902,19 +1009,39 @@ namespace randomx { uint8_t* const p = code; int pos = codePos; - int reg = instr.dst; + const int reg = instr.dst; + int32_t jmp_offset = registerUsage[reg] - (pos + 16); + + if (BranchesWithin32B) { + const uint32_t branch_begin = static_cast(pos + 7); + const uint32_t branch_end = static_cast(branch_begin + ((jmp_offset >= -128) ? 9 : 13)); + + // If the jump crosses or touches 32-byte boundary, align it + if ((branch_begin ^ branch_end) >= 32) { + const uint32_t alignment_size = 32 - (branch_begin & 31); + jmp_offset -= alignment_size; + emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos); + } + } + emit(REX_ADD_I, p, pos); emitByte(0xc0 + reg, p, pos); - int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; - uint32_t imm = instr.getImm32() | (1UL << shift); - if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0) - imm &= ~(1UL << (shift - 1)); + const int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; + const uint32_t imm = (instr.getImm32() | (1UL << shift)) & ~(1UL << (shift - 1)); emit32(imm, p, pos); emit(REX_TEST, p, pos); emitByte(0xc0 + reg, p, pos); emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos); - emit(JZ, p, pos); - emit32(registerUsage[reg] - (pos + 4), p, pos); + + if (jmp_offset >= -128) { + emitByte(JZ_SHORT, p, pos); + emitByte(jmp_offset, p, pos); + } + else { + emit(JZ, p, pos); + emit32(jmp_offset - 4, p, pos); + } + //mark all registers as used uint64_t* r = (uint64_t*) registerUsage; uint64_t k = pos; diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 30b16f58..f1864018 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -67,12 +67,17 @@ namespace randomx { static InstructionGeneratorX86 engine[256]; int registerUsage[RegistersCount]; + uint8_t* allocatedCode; uint8_t* code; int32_t codePos; + static bool BranchesWithin32B; + + static void applyTweaks(); void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramEpilogue(Program&, ProgramConfiguration&); - static void genAddressReg(const Instruction&, uint8_t* code, int& codePos, bool rax = true); + template + static void genAddressReg(const Instruction&, uint8_t* code, int& codePos); static void genAddressRegDst(const Instruction&, uint8_t* code, int& codePos); static void genAddressImm(const Instruction&, uint8_t* code, int& codePos); static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 4f56dc09..f9ce93f8 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -473,4 +473,22 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) { + rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + machine->initScratchpad(tempHash); + } + + void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) { + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { + machine->run(&tempHash); + rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(&tempHash); + + // Finish current hash and fill the scratchpad for the next hash at the same time + rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0); + machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); + } + } diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 6e074dba..ea3bb099 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -338,6 +338,9 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); */ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); +RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize); +RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output); + #if defined(__cplusplus) } #endif diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index 2913c7e5..ecd187e2 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -114,6 +114,12 @@ namespace randomx { rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); } + template + void VmBase::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) { + hashAndFillAes1Rx4(scratchpad, ScratchpadSize, ®.a, fill_state); + rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + } + template void VmBase::initScratchpad(void* seed) { fillAes1Rx4(seed, ScratchpadSize, scratchpad); diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index c85af009..d3718d04 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -39,6 +39,7 @@ public: virtual ~randomx_vm() = 0; virtual void setScratchpad(uint8_t *scratchpad) = 0; virtual void getFinalResult(void* out, size_t outSize) = 0; + virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -82,6 +83,7 @@ namespace randomx { void setScratchpad(uint8_t *scratchpad) override; void initScratchpad(void* seed) override; void getFinalResult(void* out, size_t outSize) override; + void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override; protected: void generateProgram(void* seed);