From 11748fad78e31c513a58eb033b49f034efecba93 Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 5 Oct 2018 15:02:52 +0300 Subject: [PATCH] Add ASM code. --- CMakeLists.txt | 11 +- algo/cryptonight/cryptonight.c | 68 ++- algo/cryptonight/cryptonight_av1.c | 54 +++ cmake/asm.cmake | 33 ++ cpu.c | 10 + cpu.h | 7 +- .../asm/cnv2_double_main_loop_sandybridge.inc | 410 ++++++++++++++++++ crypto/asm/cnv2_main_loop.S | 37 ++ crypto/asm/cnv2_main_loop.asm | 25 ++ crypto/asm/cnv2_main_loop_ivybridge.inc | 186 ++++++++ crypto/asm/cnv2_main_loop_ryzen.inc | 179 ++++++++ crypto/asm/win64/cnv2_main_loop.S | 21 + options.c | 30 +- options.h | 17 +- 14 files changed, 1062 insertions(+), 26 deletions(-) create mode 100644 cmake/asm.cmake create mode 100644 crypto/asm/cnv2_double_main_loop_sandybridge.inc create mode 100644 crypto/asm/cnv2_main_loop.S create mode 100644 crypto/asm/cnv2_main_loop.asm create mode 100644 crypto/asm/cnv2_main_loop_ivybridge.inc create mode 100644 crypto/asm/cnv2_main_loop_ryzen.inc create mode 100644 crypto/asm/win64/cnv2_main_loop.S diff --git a/CMakeLists.txt b/CMakeLists.txt index e7417c76..51b8b167 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ project(xmrig C) option(WITH_LIBCPUID "Use Libcpuid" ON) option(WITH_AEON "CryptoNight-Lite support" ON) +option(WITH_ASM "Enable ASM PoW implementations" ON) set(HEADERS algo/cryptonight/cryptonight.h @@ -125,6 +126,8 @@ else() set(SOURCES_CPUID cpu_stub.c) endif() +include(cmake/asm.cmake) + if (WITH_AEON) set(SOURCES_AEON algo/cryptonight-lite/cryptonight_lite_av1.c @@ -139,10 +142,10 @@ else() endif() if (CMAKE_SIZEOF_VOID_P EQUAL 8) - add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) + add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES}) + target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) else() - add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig32 jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) + add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES}) + target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) endif() diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index 728e5822..62dbdc50 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -33,6 +33,7 @@ # include "xmrig.h" #endif +#include "cpu.h" #include "crypto/c_blake256.h" #include "crypto/c_groestl.h" #include "crypto/c_jh.h" @@ -68,6 +69,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, #endif +#ifndef XMRIG_NO_ASM +void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +#endif + + static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) { cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant); @@ -116,12 +124,46 @@ static bool self_test() { } +size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly) +{ + const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; + +# ifndef XMRIG_NO_ASM + if (assembly == ASM_AUTO) { + assembly = cpu_info.assembly; + } + + if (assembly == ASM_NONE) { + return index; + } + + const size_t offset = VARIANT_MAX * 4 * 2; + + if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) { + if (av == AV_SINGLE) { + return offset + assembly - 2; + } + + if (av == AV_DOUBLE) { + return offset + 2; + } + } +# endif + + return index; +} + + cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant) { assert(av > AV_AUTO && av < AV_MAX); assert(variant > VARIANT_AUTO && variant < VARIANT_MAX); +# ifndef XMRIG_NO_ASM + static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = { +# else static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { +# endif cryptonight_av1_v0, cryptonight_av2_v0, cryptonight_av3_v0, @@ -147,13 +189,31 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V NULL, NULL, NULL, - NULL + NULL, +# else + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, +# endif +# ifndef XMRIG_NO_ASM + cryptonight_single_hash_asm_intel, + cryptonight_single_hash_asm_ryzen, + cryptonight_double_hash_asm # endif }; - const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; - # ifndef NDEBUG + const size_t index = fn_index(algorithm, av, variant, opt_assembly); + cn_hash_fun func = func_table[index]; assert(index < sizeof(func_table) / sizeof(func_table[0])); @@ -161,7 +221,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V return func; # else - return func_table[index]; + return func_table[fn_index(algorithm, av, variant, opt_assembly)]; # endif } diff --git a/algo/cryptonight/cryptonight_av1.c b/algo/cryptonight/cryptonight_av1.c index 9ef83b07..c71635ea 100644 --- a/algo/cryptonight/cryptonight_av1.c +++ b/algo/cryptonight/cryptonight_av1.c @@ -191,3 +191,57 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res keccakf(h0, 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } + + +#ifndef XMRIG_NO_ASM +extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx); +extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx); +extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1); + + +void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + cnv2_mainloop_ivybridge_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + cnv2_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); + + cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); + + keccakf((uint64_t*) ctx[0]->state, 24); + keccakf((uint64_t*) ctx[1]->state, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} +#endif diff --git a/cmake/asm.cmake b/cmake/asm.cmake new file mode 100644 index 00000000..4420342c --- /dev/null +++ b/cmake/asm.cmake @@ -0,0 +1,33 @@ +if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(XMRIG_ASM_LIBRARY "xmrig-asm") + + if (CMAKE_C_COMPILER_ID MATCHES MSVC) + enable_language(ASM_MASM) + + if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141) + set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm") + else() + set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm") + endif() + + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) + else() + enable_language(ASM) + + if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) + set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S") + else() + set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S") + endif() + + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) + endif() + + add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE}) + set(XMRIG_ASM_SOURCES "") + set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C) +else() + set(XMRIG_ASM_SOURCES "") + set(XMRIG_ASM_LIBRARY "") + add_definitions(/DXMRIG_NO_ASM) +endif() diff --git a/cpu.c b/cpu.c index 2f6ef8b6..0d28559a 100644 --- a/cpu.c +++ b/cpu.c @@ -31,6 +31,7 @@ #endif #include "cpu.h" +#include "options.h" #ifndef BUILD_TEST @@ -63,6 +64,15 @@ void cpu_init_common() { if (data.flags[CPU_FEATURE_AES]) { cpu_info.flags |= CPU_FLAG_AES; + +# ifndef XMRIG_NO_ASM + if (data.vendor == VENDOR_AMD) { + cpu_info.assembly = ASM_RYZEN; + } + else if (data.vendor == VENDOR_INTEL) { + cpu_info.assembly = ASM_INTEL; + } +# endif } if (data.flags[CPU_FEATURE_BMI2]) { diff --git a/cpu.h b/cpu.h index 419192bf..e9314bbe 100644 --- a/cpu.h +++ b/cpu.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef __CPU_H__ -#define __CPU_H__ +#ifndef XMRIG_CPU_H +#define XMRIG_CPU_H #include @@ -34,6 +34,7 @@ struct cpu_info { int l2_cache; int l3_cache; char brand[64]; + int assembly; }; extern struct cpu_info cpu_info; @@ -50,4 +51,4 @@ void cpu_init(); int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage); int affine_to_cpu_mask(int id, unsigned long mask); -#endif /* __CPU_H__ */ +#endif /* XMRIG_CPU_H */ diff --git a/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/crypto/asm/cnv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..e8251bc7 --- /dev/null +++ b/crypto/asm/cnv2_double_main_loop_sandybridge.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 16 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/crypto/asm/cnv2_main_loop.S b/crypto/asm/cnv2_main_loop.S new file mode 100644 index 00000000..4dbcbbda --- /dev/null +++ b/crypto/asm/cnv2_main_loop.S @@ -0,0 +1,37 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +ALIGN 16 +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cnv2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cnv2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/crypto/asm/cnv2_main_loop.asm b/crypto/asm/cnv2_main_loop.asm new file mode 100644 index 00000000..d9522267 --- /dev/null +++ b/crypto/asm/cnv2_main_loop.asm @@ -0,0 +1,25 @@ +_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv2_mainloop_ivybridge_asm +PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_double_mainloop_sandybridge_asm + +ALIGN 64 +cnv2_mainloop_ivybridge_asm PROC + INCLUDE cnv2_main_loop_ivybridge.inc + ret 0 +cnv2_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cnv2_mainloop_ryzen_asm PROC + INCLUDE cnv2_main_loop_ryzen.inc + ret 0 +cnv2_mainloop_ryzen_asm ENDP + +ALIGN 64 +cnv2_double_mainloop_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_sandybridge.inc + ret 0 +cnv2_double_mainloop_sandybridge_asm ENDP + +_TEXT_CNV2_MAINLOOP ENDS +END diff --git a/crypto/asm/cnv2_main_loop_ivybridge.inc b/crypto/asm/cnv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..8c2c2d3b --- /dev/null +++ b/crypto/asm/cnv2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 16 +main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 +sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/crypto/asm/cnv2_main_loop_ryzen.inc b/crypto/asm/cnv2_main_loop_ryzen.inc new file mode 100644 index 00000000..d386aa2d --- /dev/null +++ b/crypto/asm/cnv2_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/crypto/asm/win64/cnv2_main_loop.S b/crypto/asm/win64/cnv2_main_loop.S new file mode 100644 index 00000000..78eb1185 --- /dev/null +++ b/crypto/asm/win64/cnv2_main_loop.S @@ -0,0 +1,21 @@ +#define ALIGN .align +.intel_syntax noprefix +.section .text +.global cnv2_mainloop_ivybridge_asm +.global cnv2_mainloop_ryzen_asm +.global cnv2_double_mainloop_sandybridge_asm + +ALIGN 16 +cnv2_mainloop_ivybridge_asm: + #include "../cnv2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 16 +cnv2_mainloop_ryzen_asm: + #include "../cnv2_main_loop_ryzen.inc" + ret 0 + +ALIGN 16 +cnv2_double_mainloop_sandybridge_asm: + #include "../cnv2_double_main_loop_sandybridge.inc" + ret 0 diff --git a/options.c b/options.c index 41921784..f276b3f1 100644 --- a/options.c +++ b/options.c @@ -54,9 +54,10 @@ char *opt_userpass = NULL; char *opt_user = NULL; char *opt_pass = NULL; -enum Algo opt_algo = ALGO_CRYPTONIGHT; -enum Variant opt_variant = VARIANT_AUTO; -enum AlgoVariant opt_av = AV_AUTO; +enum Algo opt_algo = ALGO_CRYPTONIGHT; +enum Variant opt_variant = VARIANT_AUTO; +enum AlgoVariant opt_av = AV_AUTO; +enum Assembly opt_assembly = ASM_AUTO; struct AlgoData @@ -137,6 +138,7 @@ static struct option const options[] = { { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, { "variant", 1, NULL, 1021 }, + { "asm", 1, NULL, 1022 }, { NULL, 0, NULL, 0 } }; @@ -157,13 +159,21 @@ static const char *variant_names[] = { }; +static const char *asm_names[] = { + "none", + "auto", + "intel", + "ryzen" +}; + + #ifndef XMRIG_NO_AEON static int get_cryptonight_lite_variant(int variant) { - if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) { - return (cpu_info.flags & CPU_FLAG_AES) ? AEON_AV2_AESNI_DOUBLE : AEON_AV4_SOFT_AES_DOUBLE; + if (variant <= AV_AUTO || variant >= AV_MAX) { + return (cpu_info.flags & CPU_FLAG_AES) ? AV_DOUBLE : AV_DOUBLE_SOFT; } - if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AEON_AV2_AESNI_DOUBLE) { + if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) { return variant + 2; } @@ -212,6 +222,14 @@ static void parse_arg(int key, char *arg) { } break; + case 1022: /* --asm */ + for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) { + if (strcasecmp(arg, asm_names[i]) == 0) { + opt_assembly = i; + } + } + break; + case 'O': /* --userpass */ p = strchr(arg, ':'); if (!p) { diff --git a/options.h b/options.h index 4f543275..7117130b 100644 --- a/options.h +++ b/options.h @@ -27,6 +27,7 @@ #include #include + #ifndef ARRAY_SIZE # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif @@ -57,16 +58,13 @@ enum AlgoVariant { }; -#ifndef XMRIG_NO_AEON -enum aeon_algo_variant { - AEON_AV0_AUTO, - AEON_AV1_AESNI, - AEON_AV2_AESNI_DOUBLE, - AEON_AV3_SOFT_AES, - AEON_AV4_SOFT_AES_DOUBLE, - AEON_AV_MAX +enum Assembly { + ASM_NONE, + ASM_AUTO, + ASM_INTEL, + ASM_RYZEN, + ASM_MAX }; -#endif extern bool opt_colors; @@ -90,6 +88,7 @@ extern int64_t opt_affinity; extern enum Algo opt_algo; extern enum Variant opt_variant; extern enum AlgoVariant opt_av; +extern enum Assembly opt_assembly; void parse_cmdline(int argc, char *argv[]); void show_usage_and_exit(int status);