RandomX fixes
Intel JCC erratum fix and various other improvements, see more here: https://www.phoronix.com/scan.php?page=article&item=intel-jcc-microcode&num=1
This commit is contained in:
@ -33,6 +33,7 @@
|
||||
#include "base/io/Console.h"
|
||||
#include "base/io/log/Log.h"
|
||||
#include "base/kernel/Signals.h"
|
||||
#include "base/kernel/Platform.h"
|
||||
#include "core/config/Config.h"
|
||||
#include "core/Controller.h"
|
||||
#include "core/Miner.h"
|
||||
@ -89,6 +90,8 @@ int xmrig::App::exec()
|
||||
|
||||
m_controller->start();
|
||||
|
||||
Platform::setThreadPriority(5);
|
||||
|
||||
rc = uv_run(uv_default_loop(), UV_RUN_DEFAULT);
|
||||
uv_loop_close(uv_default_loop());
|
||||
|
||||
|
@ -109,6 +109,11 @@ void xmrig::Workers<T>::start(const std::vector<T> &data)
|
||||
|
||||
for (Thread<T> *worker : m_workers) {
|
||||
worker->start(Workers<T>::onReady);
|
||||
|
||||
// This sleep is important for optimal caching!
|
||||
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
|
||||
// Sub-optimal caching can result in up to 0.5% hashrate penalty
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(20));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -185,8 +185,20 @@ void xmrig::CpuWorker<N>::start()
|
||||
consumeJob();
|
||||
}
|
||||
|
||||
uint64_t storeStatsMask = 7;
|
||||
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
bool first = true;
|
||||
uint64_t tempHash[8] = {};
|
||||
|
||||
// RandomX is faster, we don't need to store stats so often
|
||||
if (m_job.currentJob().algorithm().family() == Algorithm::RANDOM_X) {
|
||||
storeStatsMask = 63;
|
||||
}
|
||||
# endif
|
||||
|
||||
while (!Nonce::isOutdated(Nonce::CPU, m_job.sequence())) {
|
||||
if ((m_count & 0x7) == 0) {
|
||||
if ((m_count & storeStatsMask) == 0) {
|
||||
storeStats();
|
||||
}
|
||||
|
||||
@ -196,26 +208,34 @@ void xmrig::CpuWorker<N>::start()
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t current_job_nonces[N];
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
current_job_nonces[i] = *m_job.nonce(i);
|
||||
}
|
||||
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
if (job.algorithm().family() == Algorithm::RANDOM_X) {
|
||||
randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash);
|
||||
if (first) {
|
||||
first = false;
|
||||
randomx_calculate_hash_first(m_vm->get(), tempHash, m_job.blob(), job.size());
|
||||
}
|
||||
m_job.nextRound(kReserveCount, 1);
|
||||
randomx_calculate_hash_next(m_vm->get(), tempHash, m_job.blob(), job.size(), m_hash);
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
fn(job.algorithm())(m_job.blob(), job.size(), m_hash, m_ctx, job.height());
|
||||
m_job.nextRound(kReserveCount, 1);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
if (*reinterpret_cast<uint64_t*>(m_hash + (i * 32) + 24) < job.target()) {
|
||||
JobResults::submit(job, *m_job.nonce(i), m_hash + (i * 32));
|
||||
JobResults::submit(job, current_job_nonces[i], m_hash + (i * 32));
|
||||
}
|
||||
}
|
||||
|
||||
m_job.nextRound(kReserveCount, 1);
|
||||
m_count += N;
|
||||
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
consumeJob();
|
||||
|
@ -212,3 +212,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
|
||||
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
|
||||
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
template<bool softAes>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||
|
||||
// initial state
|
||||
rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
|
||||
rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
|
||||
rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
|
||||
rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
|
||||
|
||||
const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
|
||||
const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
|
||||
const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
|
||||
const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
|
||||
|
||||
rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0);
|
||||
rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1);
|
||||
rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
|
||||
rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);
|
||||
|
||||
constexpr int PREFETCH_DISTANCE = 4096;
|
||||
const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
|
||||
scratchpadEnd -= PREFETCH_DISTANCE;
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
//process 64 bytes at a time in 4 lanes
|
||||
while (scratchpadPtr < scratchpadEnd) {
|
||||
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0));
|
||||
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1));
|
||||
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2));
|
||||
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3));
|
||||
|
||||
fill_state0 = aesdec<softAes>(fill_state0, key0);
|
||||
fill_state1 = aesenc<softAes>(fill_state1, key1);
|
||||
fill_state2 = aesdec<softAes>(fill_state2, key2);
|
||||
fill_state3 = aesenc<softAes>(fill_state3, key3);
|
||||
|
||||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0);
|
||||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1);
|
||||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
|
||||
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3);
|
||||
|
||||
rx_prefetch_t0(prefetchPtr);
|
||||
|
||||
scratchpadPtr += 64;
|
||||
prefetchPtr += 64;
|
||||
}
|
||||
prefetchPtr = (const char*) scratchpad;
|
||||
scratchpadEnd += PREFETCH_DISTANCE;
|
||||
}
|
||||
|
||||
rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0);
|
||||
rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1);
|
||||
rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2);
|
||||
rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3);
|
||||
|
||||
//two extra rounds to achieve full diffusion
|
||||
rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
|
||||
rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
|
||||
|
||||
hash_state0 = aesenc<softAes>(hash_state0, xkey0);
|
||||
hash_state1 = aesdec<softAes>(hash_state1, xkey0);
|
||||
hash_state2 = aesenc<softAes>(hash_state2, xkey0);
|
||||
hash_state3 = aesdec<softAes>(hash_state3, xkey0);
|
||||
|
||||
hash_state0 = aesenc<softAes>(hash_state0, xkey1);
|
||||
hash_state1 = aesdec<softAes>(hash_state1, xkey1);
|
||||
hash_state2 = aesenc<softAes>(hash_state2, xkey1);
|
||||
hash_state3 = aesdec<softAes>(hash_state3, xkey1);
|
||||
|
||||
//output hash
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0);
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1);
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
|
||||
}
|
||||
|
||||
template void hashAndFillAes1Rx4<false>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
template void hashAndFillAes1Rx4<true>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
|
@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
template<bool softAes>
|
||||
void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
template<bool softAes>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
|
@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128;
|
||||
#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
|
||||
#define rx_aligned_free(a) _mm_free(a)
|
||||
#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
|
||||
#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0)
|
||||
|
||||
#define rx_load_vec_f128 _mm_load_pd
|
||||
#define rx_store_vec_f128 _mm_store_pd
|
||||
@ -201,6 +202,7 @@ typedef union{
|
||||
#define rx_aligned_alloc(a, b) malloc(a)
|
||||
#define rx_aligned_free(a) free(a)
|
||||
#define rx_prefetch_nta(x)
|
||||
#define rx_prefetch_t0(x)
|
||||
|
||||
/* Splat 64-bit long long to 2 64-bit long longs */
|
||||
FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
|
||||
@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) {
|
||||
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
|
||||
}
|
||||
|
||||
inline void rx_prefetch_t0(const void* ptr) {
|
||||
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
|
||||
}
|
||||
|
||||
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
|
||||
return vld1q_f64((const float64_t*)pd);
|
||||
}
|
||||
@ -532,6 +538,7 @@ typedef union {
|
||||
#define rx_aligned_alloc(a, b) malloc(a)
|
||||
#define rx_aligned_free(a) free(a)
|
||||
#define rx_prefetch_nta(x)
|
||||
#define rx_prefetch_t0(x)
|
||||
|
||||
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
|
||||
rx_vec_f128 x;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -67,12 +67,17 @@ namespace randomx {
|
||||
|
||||
static InstructionGeneratorX86 engine[256];
|
||||
int registerUsage[RegistersCount];
|
||||
uint8_t* allocatedCode;
|
||||
uint8_t* code;
|
||||
int32_t codePos;
|
||||
|
||||
static bool BranchesWithin32B;
|
||||
|
||||
static void applyTweaks();
|
||||
void generateProgramPrologue(Program&, ProgramConfiguration&);
|
||||
void generateProgramEpilogue(Program&, ProgramConfiguration&);
|
||||
static void genAddressReg(const Instruction&, uint8_t* code, int& codePos, bool rax = true);
|
||||
template<bool rax>
|
||||
static void genAddressReg(const Instruction&, uint8_t* code, int& codePos);
|
||||
static void genAddressRegDst(const Instruction&, uint8_t* code, int& codePos);
|
||||
static void genAddressImm(const Instruction&, uint8_t* code, int& codePos);
|
||||
static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos);
|
||||
|
@ -473,4 +473,22 @@ extern "C" {
|
||||
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
|
||||
}
|
||||
|
||||
void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
|
||||
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
|
||||
machine->initScratchpad(tempHash);
|
||||
}
|
||||
|
||||
void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
|
||||
machine->resetRoundingMode();
|
||||
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
|
||||
machine->run(&tempHash);
|
||||
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
|
||||
}
|
||||
machine->run(&tempHash);
|
||||
|
||||
// Finish current hash and fill the scratchpad for the next hash at the same time
|
||||
rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0);
|
||||
machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -338,6 +338,9 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
|
||||
*/
|
||||
RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);
|
||||
|
||||
RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize);
|
||||
RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -114,6 +114,12 @@ namespace randomx {
|
||||
rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0);
|
||||
}
|
||||
|
||||
template<bool softAes>
|
||||
void VmBase<softAes>::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) {
|
||||
hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, ®.a, fill_state);
|
||||
rx_blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0);
|
||||
}
|
||||
|
||||
template<bool softAes>
|
||||
void VmBase<softAes>::initScratchpad(void* seed) {
|
||||
fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad);
|
||||
|
@ -39,6 +39,7 @@ public:
|
||||
virtual ~randomx_vm() = 0;
|
||||
virtual void setScratchpad(uint8_t *scratchpad) = 0;
|
||||
virtual void getFinalResult(void* out, size_t outSize) = 0;
|
||||
virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0;
|
||||
virtual void setDataset(randomx_dataset* dataset) { }
|
||||
virtual void setCache(randomx_cache* cache) { }
|
||||
virtual void initScratchpad(void* seed) = 0;
|
||||
@ -82,6 +83,7 @@ namespace randomx {
|
||||
void setScratchpad(uint8_t *scratchpad) override;
|
||||
void initScratchpad(void* seed) override;
|
||||
void getFinalResult(void* out, size_t outSize) override;
|
||||
void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override;
|
||||
|
||||
protected:
|
||||
void generateProgram(void* seed);
|
||||
|
Reference in New Issue
Block a user