Add ASM code.

This commit is contained in:
XMRig
2018-10-05 15:02:52 +03:00
parent e0dc51edf9
commit 11748fad78
14 changed files with 1062 additions and 26 deletions

View File

@ -3,6 +3,7 @@ project(xmrig C)
option(WITH_LIBCPUID "Use Libcpuid" ON) option(WITH_LIBCPUID "Use Libcpuid" ON)
option(WITH_AEON "CryptoNight-Lite support" ON) option(WITH_AEON "CryptoNight-Lite support" ON)
option(WITH_ASM "Enable ASM PoW implementations" ON)
set(HEADERS set(HEADERS
algo/cryptonight/cryptonight.h algo/cryptonight/cryptonight.h
@ -125,6 +126,8 @@ else()
set(SOURCES_CPUID cpu_stub.c) set(SOURCES_CPUID cpu_stub.c)
endif() endif()
include(cmake/asm.cmake)
if (WITH_AEON) if (WITH_AEON)
set(SOURCES_AEON set(SOURCES_AEON
algo/cryptonight-lite/cryptonight_lite_av1.c algo/cryptonight-lite/cryptonight_lite_av1.c
@ -139,10 +142,10 @@ else()
endif() endif()
if (CMAKE_SIZEOF_VOID_P EQUAL 8) if (CMAKE_SIZEOF_VOID_P EQUAL 8)
add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
target_link_libraries(xmrig jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
else() else()
add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
target_link_libraries(xmrig32 jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
endif() endif()

View File

@ -33,6 +33,7 @@
# include "xmrig.h" # include "xmrig.h"
#endif #endif
#include "cpu.h"
#include "crypto/c_blake256.h" #include "crypto/c_blake256.h"
#include "crypto/c_groestl.h" #include "crypto/c_groestl.h"
#include "crypto/c_jh.h" #include "crypto/c_jh.h"
@ -68,6 +69,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output,
#endif #endif
#ifndef XMRIG_NO_ASM
void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#endif
static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
{ {
cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant); cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
@ -116,12 +124,46 @@ static bool self_test() {
} }
size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
{
const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
# ifndef XMRIG_NO_ASM
if (assembly == ASM_AUTO) {
assembly = cpu_info.assembly;
}
if (assembly == ASM_NONE) {
return index;
}
const size_t offset = VARIANT_MAX * 4 * 2;
if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) {
if (av == AV_SINGLE) {
return offset + assembly - 2;
}
if (av == AV_DOUBLE) {
return offset + 2;
}
}
# endif
return index;
}
cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant) cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
{ {
assert(av > AV_AUTO && av < AV_MAX); assert(av > AV_AUTO && av < AV_MAX);
assert(variant > VARIANT_AUTO && variant < VARIANT_MAX); assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
# ifndef XMRIG_NO_ASM
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = {
# else
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
# endif
cryptonight_av1_v0, cryptonight_av1_v0,
cryptonight_av2_v0, cryptonight_av2_v0,
cryptonight_av3_v0, cryptonight_av3_v0,
@ -147,13 +189,31 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL NULL,
# else
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
# endif
# ifndef XMRIG_NO_ASM
cryptonight_single_hash_asm_intel,
cryptonight_single_hash_asm_ryzen,
cryptonight_double_hash_asm
# endif # endif
}; };
const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
# ifndef NDEBUG # ifndef NDEBUG
const size_t index = fn_index(algorithm, av, variant, opt_assembly);
cn_hash_fun func = func_table[index]; cn_hash_fun func = func_table[index];
assert(index < sizeof(func_table) / sizeof(func_table[0])); assert(index < sizeof(func_table) / sizeof(func_table[0]));
@ -161,7 +221,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
return func; return func;
# else # else
return func_table[index]; return func_table[fn_index(algorithm, av, variant, opt_assembly)];
# endif # endif
} }

View File

@ -191,3 +191,57 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res
keccakf(h0, 24); keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
} }
#ifndef XMRIG_NO_ASM
/*
 * cnv2 main-loop kernels implemented in assembly and exported by the
 * cnv2_main_loop.S / cnv2_main_loop.asm sources that cmake/asm.cmake builds.
 * The single-hash kernels take one context, the double-hash kernel takes two.
 */
extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
/*
 * Single hash using the "ivybridge" assembly main loop.
 *
 * input/size  data to hash.
 * output      receives the result written by the selected extra_hashes entry.
 * ctx         context array; only ctx[0] is used.
 */
void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
    struct cryptonight_ctx *const c = ctx[0];

    /* Absorb the input into the context state. */
    keccak(input, size, c->state, 200);

    /* state -> scratchpad, assembly main loop, scratchpad -> state. */
    cn_explode_scratchpad((__m128i*) c->state, (__m128i*) c->memory);
    cnv2_mainloop_ivybridge_asm(c);
    cn_implode_scratchpad((__m128i*) c->memory, (__m128i*) c->state);

    keccakf((uint64_t*) c->state, 24);

    /* The two low bits of the first state byte pick the final hash function. */
    extra_hashes[c->state[0] & 3](c->state, 200, output);
}
/*
 * Single hash using the "ryzen" assembly main loop.
 *
 * input/size  data to hash.
 * output      receives the result written by the selected extra_hashes entry.
 * ctx         context array; only ctx[0] is used.
 */
void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
    struct cryptonight_ctx *const c = ctx[0];

    /* Absorb the input into the context state. */
    keccak(input, size, c->state, 200);

    /* state -> scratchpad, assembly main loop, scratchpad -> state. */
    cn_explode_scratchpad((__m128i*) c->state, (__m128i*) c->memory);
    cnv2_mainloop_ryzen_asm(c);
    cn_implode_scratchpad((__m128i*) c->memory, (__m128i*) c->state);

    keccakf((uint64_t*) c->state, 24);

    /* The two low bits of the first state byte pick the final hash function. */
    extra_hashes[c->state[0] & 3](c->state, 200, output);
}
/*
 * Double hash using the "sandybridge" assembly main loop, which processes
 * both contexts in one call.
 *
 * input/size  two inputs of `size` bytes each, stored back to back.
 * output      receives the first result at output and the second at output + 32.
 * ctx         context array; ctx[0] and ctx[1] are used.
 */
void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
    struct cryptonight_ctx *const first  = ctx[0];
    struct cryptonight_ctx *const second = ctx[1];

    /* Absorb each input into its own context state. */
    keccak(input, size, first->state, 200);
    keccak(input + size, size, second->state, 200);

    cn_explode_scratchpad((__m128i*) first->state, (__m128i*) first->memory);
    cn_explode_scratchpad((__m128i*) second->state, (__m128i*) second->memory);

    cnv2_double_mainloop_sandybridge_asm(first, second);

    cn_implode_scratchpad((__m128i*) first->memory, (__m128i*) first->state);
    cn_implode_scratchpad((__m128i*) second->memory, (__m128i*) second->state);

    keccakf((uint64_t*) first->state, 24);
    keccakf((uint64_t*) second->state, 24);

    /* The two low bits of each first state byte pick that context's final hash. */
    extra_hashes[first->state[0] & 3](first->state, 200, output);
    extra_hashes[second->state[0] & 3](second->state, 200, output + 32);
}
#endif

33
cmake/asm.cmake Normal file
View File

@ -0,0 +1,33 @@
# Optional assembly PoW kernels.
#
# Inputs : WITH_ASM (option), XMRIG_ARM, CMAKE_SIZEOF_VOID_P, the C compiler id.
# Outputs: XMRIG_ASM_LIBRARY - name of the static library holding the kernels,
#                              or "" when ASM is disabled;
#          XMRIG_ASM_SOURCES - always "" (kept for the add_executable() calls);
#          XMRIG_NO_ASM      - compile definition added when ASM is disabled.
#
# ASM is only built for 64-bit, non-ARM targets.
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(XMRIG_ASM_LIBRARY "xmrig-asm")

    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
        enable_language(ASM_MASM)

        # Toolset 141 and newer use the common MASM file, older ones the win64 copy.
        if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm")
        else()
            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm")
        endif()

        # The property to set is LANGUAGE; naming the language as the property
        # (as before) only created an empty, unused property.
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY LANGUAGE ASM_MASM)
    else()
        enable_language(ASM)

        # GCC on Windows needs the variant written for the Windows calling convention.
        if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S")
        else()
            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S")
        endif()

        # Explicitly the language the .S file is already built with once ASM is
        # enabled, so the effective build does not change.
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY LANGUAGE ASM)
    endif()

    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
    set(XMRIG_ASM_SOURCES "")
    # The library has no C sources, so tell CMake which linker language to use.
    set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
else()
    set(XMRIG_ASM_SOURCES "")
    set(XMRIG_ASM_LIBRARY "")
    add_definitions(/DXMRIG_NO_ASM)
endif()

10
cpu.c
View File

@ -31,6 +31,7 @@
#endif #endif
#include "cpu.h" #include "cpu.h"
#include "options.h"
#ifndef BUILD_TEST #ifndef BUILD_TEST
@ -63,6 +64,15 @@ void cpu_init_common() {
if (data.flags[CPU_FEATURE_AES]) { if (data.flags[CPU_FEATURE_AES]) {
cpu_info.flags |= CPU_FLAG_AES; cpu_info.flags |= CPU_FLAG_AES;
# ifndef XMRIG_NO_ASM
if (data.vendor == VENDOR_AMD) {
cpu_info.assembly = ASM_RYZEN;
}
else if (data.vendor == VENDOR_INTEL) {
cpu_info.assembly = ASM_INTEL;
}
# endif
} }
if (data.flags[CPU_FEATURE_BMI2]) { if (data.flags[CPU_FEATURE_BMI2]) {

7
cpu.h
View File

@ -21,8 +21,8 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#ifndef __CPU_H__ #ifndef XMRIG_CPU_H
#define __CPU_H__ #define XMRIG_CPU_H
#include <stdbool.h> #include <stdbool.h>
@ -34,6 +34,7 @@ struct cpu_info {
int l2_cache; int l2_cache;
int l3_cache; int l3_cache;
char brand[64]; char brand[64];
int assembly;
}; };
extern struct cpu_info cpu_info; extern struct cpu_info cpu_info;
@ -50,4 +51,4 @@ void cpu_init();
int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage); int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage);
int affine_to_cpu_mask(int id, unsigned long mask); int affine_to_cpu_mask(int id, unsigned long mask);
#endif /* __CPU_H__ */ #endif /* XMRIG_CPU_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,37 @@
/*
 * GNU-assembler front end for the cnv2 main-loop kernels on non-Windows
 * targets. The file is run through the C preprocessor first.
 *
 * The included .inc bodies read their context pointer from rcx and store
 * registers into [rsp+16]..[rsp+32] before pushing anything, so every wrapper
 * below reserves 48 bytes of stack and copies rdi into rcx before the body,
 * then releases the stack and returns.
 */

/* The .inc files spell alignment as "ALIGN 16". `.balign` always counts in
 * bytes, whereas `.align` is interpreted as a power of two on Mach-O. */
#define ALIGN .balign
.intel_syntax noprefix
#ifdef __APPLE__
#   define FN_PREFIX(fn) _ ## fn
.text
#else
#   define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)

ALIGN 16
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
    sub rsp, 48
    mov rcx, rdi
#include "cnv2_main_loop_ivybridge.inc"
    add rsp, 48
    ret 0

ALIGN 16
FN_PREFIX(cnv2_mainloop_ryzen_asm):
    sub rsp, 48
    mov rcx, rdi
#include "cnv2_main_loop_ryzen.inc"
    add rsp, 48
    ret 0

/* Two-context kernel: rdi -> rcx and rsi -> rdx.
 * NOTE(review): the double-hash body is not visible here; presumably it
 * expects the second context in rdx - confirm against the .inc file. */
ALIGN 16
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
    sub rsp, 48
    mov rcx, rdi
    mov rdx, rsi
#include "cnv2_double_main_loop_sandybridge.inc"
    add rsp, 48
    ret 0

/* Mark the object as not needing an executable stack; without this note the
 * GNU linker treats the whole linked binary as requiring one. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

View File

@ -0,0 +1,25 @@
; MASM front end for the cnv2 main-loop kernels.
; Each exported procedure consists of the matching .inc body followed by a
; return; no wrapper code is added around the included body.
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
; Symbols declared as extern functions on the C side.
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
; Single-hash kernel, "ivybridge" body.
ALIGN 64
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cnv2_main_loop_ivybridge.inc
ret 0
cnv2_mainloop_ivybridge_asm ENDP
; Single-hash kernel, "ryzen" body.
ALIGN 64
cnv2_mainloop_ryzen_asm PROC
INCLUDE cnv2_main_loop_ryzen.inc
ret 0
cnv2_mainloop_ryzen_asm ENDP
; Double-hash kernel, "sandybridge" body.
ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc
ret 0
cnv2_double_mainloop_sandybridge_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END

View File

@ -0,0 +1,186 @@
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 524288
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movq xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 16
main_loop_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movq rdx, xmm3
test edx, 524287
je sqrt_fixup_ivybridge
psrlq xmm3, 19
sqrt_fixup_ivybridge_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne main_loop_ivybridge
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_main_loop_ivybridge_endp
sqrt_fixup_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp sqrt_fixup_ivybridge_ret
cnv2_main_loop_ivybridge_endp:

View File

@ -0,0 +1,179 @@
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
main_loop_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
movq r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_ryzen
shr rdi, 19
sqrt_fixup_ryzen_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne main_loop_ryzen
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_ryzen_endp
sqrt_fixup_ryzen:
movq r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ryzen_ret
cnv2_main_loop_ryzen_endp:

View File

@ -0,0 +1,21 @@
/*
 * GNU-assembler front end for the cnv2 main-loop kernels, selected by
 * cmake/asm.cmake for Windows builds with GCC. The file is run through the C
 * preprocessor first.
 *
 * Unlike the non-Windows variant, the .inc bodies are included directly with
 * no stack or register adjustment around them; each is followed by a return.
 */

/* Lets the "ALIGN 16" lines in this file and in the shared .inc bodies
 * assemble as a GNU alignment directive. */
#define ALIGN .align
.intel_syntax noprefix
.section .text
/* Symbols declared as extern functions on the C side. */
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_double_mainloop_sandybridge_asm
/* Single-hash kernel, "ivybridge" body. */
ALIGN 16
cnv2_mainloop_ivybridge_asm:
#include "../cnv2_main_loop_ivybridge.inc"
ret 0
/* Single-hash kernel, "ryzen" body. */
ALIGN 16
cnv2_mainloop_ryzen_asm:
#include "../cnv2_main_loop_ryzen.inc"
ret 0
/* Double-hash kernel, "sandybridge" body. */
ALIGN 16
cnv2_double_mainloop_sandybridge_asm:
#include "../cnv2_double_main_loop_sandybridge.inc"
ret 0

View File

@ -54,9 +54,10 @@ char *opt_userpass = NULL;
char *opt_user = NULL; char *opt_user = NULL;
char *opt_pass = NULL; char *opt_pass = NULL;
enum Algo opt_algo = ALGO_CRYPTONIGHT; enum Algo opt_algo = ALGO_CRYPTONIGHT;
enum Variant opt_variant = VARIANT_AUTO; enum Variant opt_variant = VARIANT_AUTO;
enum AlgoVariant opt_av = AV_AUTO; enum AlgoVariant opt_av = AV_AUTO;
enum Assembly opt_assembly = ASM_AUTO;
struct AlgoData struct AlgoData
@ -137,6 +138,7 @@ static struct option const options[] = {
{ "userpass", 1, NULL, 'O' }, { "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' }, { "version", 0, NULL, 'V' },
{ "variant", 1, NULL, 1021 }, { "variant", 1, NULL, 1021 },
{ "asm", 1, NULL, 1022 },
{ NULL, 0, NULL, 0 } { NULL, 0, NULL, 0 }
}; };
@ -157,13 +159,21 @@ static const char *variant_names[] = {
}; };
/*
 * Values accepted by the --asm option. The option parser stores the index of
 * the matching name in opt_assembly, so the order here must stay in step with
 * enum Assembly (ASM_NONE, ASM_AUTO, ASM_INTEL, ASM_RYZEN).
 */
static const char *asm_names[] = {
"none",
"auto",
"intel",
"ryzen"
};
#ifndef XMRIG_NO_AEON #ifndef XMRIG_NO_AEON
static int get_cryptonight_lite_variant(int variant) { static int get_cryptonight_lite_variant(int variant) {
if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) { if (variant <= AV_AUTO || variant >= AV_MAX) {
return (cpu_info.flags & CPU_FLAG_AES) ? AEON_AV2_AESNI_DOUBLE : AEON_AV4_SOFT_AES_DOUBLE; return (cpu_info.flags & CPU_FLAG_AES) ? AV_DOUBLE : AV_DOUBLE_SOFT;
} }
if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AEON_AV2_AESNI_DOUBLE) { if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) {
return variant + 2; return variant + 2;
} }
@ -212,6 +222,14 @@ static void parse_arg(int key, char *arg) {
} }
break; break;
case 1022: /* --asm */
for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) {
if (strcasecmp(arg, asm_names[i]) == 0) {
opt_assembly = i;
}
}
break;
case 'O': /* --userpass */ case 'O': /* --userpass */
p = strchr(arg, ':'); p = strchr(arg, ':');
if (!p) { if (!p) {

View File

@ -27,6 +27,7 @@
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#ifndef ARRAY_SIZE #ifndef ARRAY_SIZE
# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif #endif
@ -57,16 +58,13 @@ enum AlgoVariant {
}; };
#ifndef XMRIG_NO_AEON enum Assembly {
enum aeon_algo_variant { ASM_NONE,
AEON_AV0_AUTO, ASM_AUTO,
AEON_AV1_AESNI, ASM_INTEL,
AEON_AV2_AESNI_DOUBLE, ASM_RYZEN,
AEON_AV3_SOFT_AES, ASM_MAX
AEON_AV4_SOFT_AES_DOUBLE,
AEON_AV_MAX
}; };
#endif
extern bool opt_colors; extern bool opt_colors;
@ -90,6 +88,7 @@ extern int64_t opt_affinity;
extern enum Algo opt_algo; extern enum Algo opt_algo;
extern enum Variant opt_variant; extern enum Variant opt_variant;
extern enum AlgoVariant opt_av; extern enum AlgoVariant opt_av;
extern enum Assembly opt_assembly;
void parse_cmdline(int argc, char *argv[]); void parse_cmdline(int argc, char *argv[]);
void show_usage_and_exit(int status); void show_usage_and_exit(int status);