Compare commits

...

27 Commits

Author SHA1 Message Date
XMRig
ec13337228 v6.3.5 2020-10-03 11:48:34 +07:00
XMRig
cfe2a098ce Merge branch 'dev' 2020-10-03 11:47:07 +07:00
xmrig
a89c2c8dd1 Update CHANGELOG.md 2020-10-02 22:39:26 +07:00
XMRig
1b4a124bc5 Fix x86 build. 2020-10-01 17:46:05 +07:00
XMRig
4bb8be8a29 Merge branch 'ph4r05-pr/001-with-sse' into dev 2020-10-01 11:00:52 +07:00
XMRig
d45bb24a32 Renamed WITH_SSE to WITH_SSE4_1 and make it work on all platforms. 2020-10-01 11:00:08 +07:00
Dusan Klinec
5a7bcb2d03 fies #1844, adds WITH_SSE cmake option
now it is possible to disable sse for Blake2, which is not supported on ARMs
2020-09-30 20:09:54 +02:00
xmrig
f1ec8a18f6 Merge pull request #1859 from SChernykh/dev
RandomX: optimized soft AES code
2020-09-30 09:01:45 +07:00
SChernykh
7b4f768114 RandomX: optimized soft AES code
Unrolled loop was 5-10% slower depending on CPU.
2020-09-29 21:22:11 +02:00
xmrig
dfab81e9fa Merge pull request #1858 from SChernykh/dev
RandomX: removed duplicate constants in Blake2b
2020-09-27 16:51:03 +07:00
SChernykh
3025c265e8 RandomX: removed duplicate constatns in Blake2b 2020-09-27 11:50:08 +02:00
xmrig
ee603ab9e2 Merge pull request #1857 from SChernykh/dev
RandomX: isolate SSE4.1 code to fix crashes on old CPUs
2020-09-27 16:47:56 +07:00
SChernykh
84f8a0dc54 RandomX: isolate SSE4.1 code to fix crashes on old CPUs 2020-09-27 11:46:32 +02:00
xmrig
481deff163 Merge pull request #1856 from SChernykh/dev
Fixed SSE4.1 for old CPUs
2020-09-27 14:01:34 +07:00
SChernykh
0e9ed351a1 Fixed SSE4.1 for old CPUs
Enable SSE4.1 only where it's needed.
2020-09-27 08:55:57 +02:00
xmrig
8952f6892d Merge pull request #1852 from cohcho/fix_string
String: distinguish nullptr/empty str
2020-09-27 07:56:33 +07:00
xmrig
d51fe01273 Merge pull request #1849 from cohcho/soft_aes_optimization1
soft_aes: fix previous optimization
2020-09-27 07:56:03 +07:00
cohcho
f7d6348948 String: distinguish nullptr/empty str 2020-09-26 16:41:15 +00:00
xmrig
3a01ebe277 Merge pull request #1850 from cohcho/filter_invalid_algos
Miner: filter invalid algos
2020-09-26 15:15:23 +07:00
cohcho
189cc78d44 Miner: filter invalid algos 2020-09-25 17:52:13 +00:00
cohcho
9be3b69109 soft_aes: fix previous optimization
the best order of hash/fill/prefetch depends on hw/soft AES
only hw AES is faster after previous optimization
2020-09-25 15:26:19 +00:00
xmrig
7b38af703e Merge pull request #1846 from SChernykh/dev
KawPow: fixed OpenCL memory leak
2020-09-25 15:55:36 +07:00
SChernykh
bef9031b03 KawPow: fixed OpenCL memory leak 2020-09-25 10:53:24 +02:00
xmrig
e4929d7c06 Merge pull request #1845 from SChernykh/dev
Fix for ARM compilation
2020-09-23 16:48:08 +07:00
SChernykh
1e26e58660 Fix for ARM compilation 2020-09-23 11:44:08 +02:00
XMRig
8fe0577d60 v6.3.5-dev 2020-09-23 08:06:28 +07:00
XMRig
64f42feba9 Merge branch 'master' into dev 2020-09-23 08:05:58 +07:00
19 changed files with 203 additions and 116 deletions

View File

@ -1,3 +1,10 @@
# v6.3.5
- [#1845](https://github.com/xmrig/xmrig/pull/1845) [#1861](https://github.com/xmrig/xmrig/pull/1861) Fixed ARM build and added CMake option `WITH_SSE4_1`.
- [#1846](https://github.com/xmrig/xmrig/pull/1846) KawPow: fixed OpenCL memory leak.
- [#1849](https://github.com/xmrig/xmrig/pull/1849) [#1859](https://github.com/xmrig/xmrig/pull/1859) RandomX: optimized soft AES code.
- [#1850](https://github.com/xmrig/xmrig/pull/1850) [#1852](https://github.com/xmrig/xmrig/pull/1852) General code improvements.
- [#1853](https://github.com/xmrig/xmrig/issues/1853) [#1856](https://github.com/xmrig/xmrig/pull/1856) [#1857](https://github.com/xmrig/xmrig/pull/1857) Fixed crash on old CPUs.
# v6.3.4
- [#1823](https://github.com/xmrig/xmrig/pull/1823) RandomX: added new option `scratchpad_prefetch_mode`.
- [#1827](https://github.com/xmrig/xmrig/pull/1827) [#1831](https://github.com/xmrig/xmrig/pull/1831) Improved nonce iteration performance.

View File

@ -24,6 +24,7 @@ option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
option(WITH_PROFILING "Enable profiling for developers" OFF)
option(WITH_SSE4_1 "Enable SSE 4.1 for Blake2" ON)
option(BUILD_STATIC "Build static binary" OFF)
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)

View File

@ -2,9 +2,10 @@ if (NOT CMAKE_SYSTEM_PROCESSOR)
message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
add_definitions(/DRAPIDJSON_SSE2)
else()
set(WITH_SSE4_1 OFF)
endif()
if (NOT ARM_TARGET)
@ -41,3 +42,7 @@ if (ARM_TARGET AND ARM_TARGET GREATER 6)
add_definitions(/DXMRIG_ARMv7)
endif()
endif()
if (WITH_SSE4_1)
add_definitions(/DXMRIG_FEATURE_SSE4_1)
endif()

View File

@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
add_definitions(/DHAVE_ROTR)
endif()
@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR)
if (HAVE_ROTR)

View File

@ -64,6 +64,14 @@ if (WITH_RANDOMX)
set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
endif()
if (WITH_SSE4_1)
list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/blake2b_sse41.c)
if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1)
endif()
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang)
set_source_files_properties(src/crypto/randomx/jit_compiler_x86.cpp PROPERTIES COMPILE_FLAGS -Wno-unused-const-variable)
endif()

View File

@ -22,6 +22,7 @@ This feature add external dependency to libhwloc (1.10.0+) (except MSVC builds).
* **`-DWITH_EMBEDDED_CONFIG=ON`** Enable [embedded](https://github.com/xmrig/xmrig/issues/957) config support.
* **`-DWITH_OPENCL=OFF`** Disable OpenCL backend.
* **`-DWITH_CUDA=OFF`** Disable CUDA backend.
* **`-DWITH_SSE4_1=OFF`** Disable SSE 4.1 for Blake2 (useful for arm builds).
## Debug options

View File

@ -69,8 +69,6 @@ OclKawPowRunner::~OclKawPowRunner()
delete m_calculateDagKernel;
OclLib::release(m_searchKernel);
OclLib::release(m_controlQueue);
OclLib::release(m_stop);
@ -120,8 +118,7 @@ void OclKawPowRunner::run(uint32_t nonce, uint32_t *hashOutput)
void OclKawPowRunner::set(const Job &job, uint8_t *blob)
{
m_blockHeight = static_cast<uint32_t>(job.height());
m_searchProgram = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
m_searchKernel = OclLib::createKernel(m_searchProgram, "progpow_search");
m_searchKernel = OclKawPow::get(*this, m_blockHeight, m_workGroupSize);
const uint32_t epoch = m_blockHeight / KPHash::EPOCH_LENGTH;

View File

@ -69,7 +69,6 @@ private:
KawPow_CalculateDAGKernel* m_calculateDagKernel = nullptr;
cl_program m_searchProgram = nullptr;
cl_kernel m_searchKernel = nullptr;
size_t m_workGroupSize = 256;

View File

@ -54,8 +54,9 @@ namespace xmrig {
class KawPowCacheEntry
{
public:
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program) :
inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel) :
program(program),
kernel(kernel),
m_algo(algo),
m_index(index),
m_period(period),
@ -65,9 +66,10 @@ public:
inline bool isExpired(uint64_t period) const { return m_period + 1 < period; }
inline bool match(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) const { return m_algo == algo && m_period == period && m_worksize == worksize && m_index == index; }
inline bool match(const IOclRunner &runner, uint64_t period, uint32_t worksize) const { return match(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline void release() { OclLib::release(program); }
inline void release() { OclLib::release(kernel); OclLib::release(program); }
cl_program program;
cl_kernel kernel;
private:
Algorithm m_algo;
@ -82,16 +84,16 @@ class KawPowCache
public:
KawPowCache() = default;
inline cl_program search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline cl_kernel search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); }
inline cl_program search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
inline cl_kernel search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index)
{
std::lock_guard<std::mutex> lock(m_mutex);
for (const auto &entry : m_data) {
if (entry.match(algo, period, worksize, index)) {
return entry.program;
return entry.kernel;
}
}
@ -99,9 +101,10 @@ public:
}
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program)
void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel)
{
if (search(algo, period, worksize, index)) {
OclLib::release(kernel);
OclLib::release(program);
return;
}
@ -109,7 +112,7 @@ public:
std::lock_guard<std::mutex> lock(m_mutex);
gc(period);
m_data.emplace_back(algo, period, worksize, index, program);
m_data.emplace_back(algo, period, worksize, index, program, kernel);
}
@ -159,15 +162,15 @@ static KawPowCache cache;
class KawPowBuilder
{
public:
cl_program build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
cl_kernel build(const IOclRunner &runner, uint64_t period, uint32_t worksize)
{
std::lock_guard<std::mutex> lock(m_mutex);
const uint64_t ts = Chrono::steadyMSecs();
cl_program program = cache.search(runner, period, worksize);
if (program) {
return program;
cl_kernel kernel = cache.search(runner, period, worksize);
if (kernel) {
return kernel;
}
cl_int ret;
@ -175,7 +178,7 @@ public:
cl_device_id device = runner.data().device.id();
const char *s = source.c_str();
program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
cl_program program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret);
if (ret != CL_SUCCESS) {
return nullptr;
}
@ -199,11 +202,17 @@ public:
return nullptr;
}
kernel = OclLib::createKernel(program, "progpow_search", &ret);
if (ret != CL_SUCCESS) {
OclLib::release(program);
return nullptr;
}
LOG_INFO("%s " YELLOW("KawPow") " program for period " WHITE_BOLD("%" PRIu64) " compiled " BLACK_BOLD("(%" PRIu64 "ms)"), Tags::opencl(), period, Chrono::steadyMSecs() - ts);
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program);
cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program, kernel);
return program;
return kernel;
}
@ -382,7 +391,7 @@ public:
static KawPowBuilder builder;
cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
cl_kernel OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize)
{
const uint64_t period = height / KPHash::PERIOD_LENGTH;
@ -396,9 +405,9 @@ cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t wo
[](uv_work_t *req, int) { delete static_cast<KawPowBaton*>(req->data); }
);
cl_program program = cache.search(runner, period, worksize);
if (program) {
return program;
cl_kernel kernel = cache.search(runner, period, worksize);
if (kernel) {
return kernel;
}
return builder.build(runner, period, worksize);

View File

@ -30,7 +30,7 @@
#include <cstdint>
using cl_program = struct _cl_program *;
using cl_kernel = struct _cl_kernel *;
namespace xmrig {
@ -42,7 +42,7 @@ class IOclRunner;
class OclKawPow
{
public:
static cl_program get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
static cl_kernel get(const IOclRunner &runner, uint64_t height, uint32_t worksize);
static void clear();
};

View File

@ -96,7 +96,7 @@ public:
inline bool isCN() const { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO; }
inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; }
inline bool isValid() const { return m_id != INVALID; }
inline bool isValid() const { return m_id != INVALID && family() != UNKNOWN; }
inline const char *name() const { return name(false); }
inline const char *shortName() const { return name(true); }
inline Family family() const { return family(m_id); }

View File

@ -33,7 +33,7 @@
xmrig::String::String(const char *str) :
m_size(str == nullptr ? 0 : strlen(str))
{
if (m_size == 0) {
if (str == nullptr) {
return;
}

View File

@ -121,7 +121,7 @@ public:
for (int i = 0; i < Algorithm::MAX; ++i) {
const Algorithm algo(static_cast<Algorithm::Id>(i));
if (isEnabled(algo)) {
if (algo.isValid() && isEnabled(algo)) {
algorithms.push_back(algo);
}
}

View File

@ -260,17 +260,32 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3);
HASH_STATE(0);
HASH_STATE(1);
switch(softAes) {
case 0:
HASH_STATE(0);
HASH_STATE(1);
FILL_STATE(0);
FILL_STATE(1);
FILL_STATE(0);
FILL_STATE(1);
rx_prefetch_t0(prefetchPtr);
rx_prefetch_t0(prefetchPtr + 64);
rx_prefetch_t0(prefetchPtr);
rx_prefetch_t0(prefetchPtr + 64);
scratchpadPtr += 128;
prefetchPtr += 128;
scratchpadPtr += 128;
prefetchPtr += 128;
break;
default:
HASH_STATE(0);
FILL_STATE(0);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
break;
}
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;

View File

@ -39,40 +39,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/blake2/blake2.h"
#include "crypto/randomx/blake2/blake2-impl.h"
#if defined(_M_X64) || defined(__x86_64__)
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <smmintrin.h>
#include "blake2b-round.h"
#endif
static const uint64_t blake2b_IV[8] = {
const uint64_t blake2b_IV[8] = {
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
#if defined(_M_X64) || defined(__x86_64__)
static const uint8_t blake2b_sigma_sse41[12][16] = {
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
};
#endif
static const uint8_t blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
@ -207,46 +179,6 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t
return 0;
}
#if defined(_M_X64) || defined(__x86_64__)
static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
const uint64_t* m = (const uint64_t*)(block);
for (uint32_t r = 0; r < 12; ++r) {
ROUND(r);
}
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
}
#undef ROUND
#endif
static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) {
uint64_t m[16];
uint64_t v[16];
@ -305,9 +237,10 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block)
#undef ROUND
}
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_SSE4_1)
uint32_t rx_blake2b_use_sse41 = 0;
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block);
#define rx_blake2b_compress(S, block) \
if (rx_blake2b_use_sse41) \

View File

@ -0,0 +1,108 @@
/*
* Copyright (c) 2018-2019, tevador <tevador@gmail.com>
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from Argon2 reference source code package used under CC0 Licence
* https://github.com/P-H-C/phc-winner-argon2
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/
#if defined(_M_X64) || defined(__x86_64__)
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "crypto/randomx/blake2/blake2.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <smmintrin.h>
#include "blake2b-round.h"
extern const uint64_t blake2b_IV[8];
static const uint8_t blake2b_sigma_sse41[12][16] = {
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
};
void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
row2h = LOADU(&S->h[6]);
row3l = LOADU(&blake2b_IV[0]);
row3h = LOADU(&blake2b_IV[2]);
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
const uint64_t* m = (const uint64_t*)(block);
for (uint32_t r = 0; r < 12; ++r) {
ROUND(r);
}
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
}
#endif

View File

@ -337,12 +337,16 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(FDIV_M, FMUL_R);
INST_HANDLE(FSQRT_R, FDIV_M);
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->jccErratum()) {
INST_HANDLE2(CBRANCH, CBRANCH<true>, FSQRT_R);
}
else {
INST_HANDLE2(CBRANCH, CBRANCH<false>, FSQRT_R);
}
#else
INST_HANDLE(CBRANCH, FSQRT_R);
#endif
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) {

View File

@ -31,7 +31,7 @@
#include "crypto/rx/RxVm.h"
#if defined(_M_X64) || defined(__x86_64__)
#if defined(XMRIG_FEATURE_SSE4_1)
extern "C" uint32_t rx_blake2b_use_sse41;
#endif
@ -60,7 +60,7 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
flags |= RANDOMX_FLAG_AMD;
}
# if defined(_M_X64) || defined(__x86_64__)
# if defined(XMRIG_FEATURE_SSE4_1)
rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
# endif

View File

@ -28,7 +28,7 @@
#define APP_ID "xmrig"
#define APP_NAME "XMRig"
#define APP_DESC "XMRig miner"
#define APP_VERSION "6.3.4"
#define APP_VERSION "6.3.5"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"
@ -36,7 +36,7 @@
#define APP_VER_MAJOR 6
#define APP_VER_MINOR 3
#define APP_VER_PATCH 4
#define APP_VER_PATCH 5
#ifdef _MSC_VER
# if (_MSC_VER >= 1920)