Merge branch 'classic-dev' into classic

2019-03-05 01:16:22 +07:00
parent b834c50aba d92c1a54de
commit 30e5e4a492
42 changed files with 5459 additions and 394 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -11,6 +11,7 @@ set(HEADERS
    algo/cryptonight/cryptonight_monero.h
    algo/cryptonight/cryptonight_softaes.h
    algo/cryptonight/cryptonight_test.h
    algo/cryptonight/variant4_random_math.h
    compat.h
    cpu.h
    donate.h
@ -29,6 +30,7 @@ set(HEADERS_CRYPTO
    crypto/c_blake256.h
    crypto/c_jh.h
    crypto/c_skein.h
    crypto/soft_aes.h
   )
 set(HEADERS_COMPAT
@ -48,6 +50,10 @@ set(SOURCES
    algo/cryptonight/cryptonight_av2.c
    algo/cryptonight/cryptonight_av3.c
    algo/cryptonight/cryptonight_av4.c
    algo/cryptonight/cryptonight_r_av1.c
    algo/cryptonight/cryptonight_r_av2.c
    algo/cryptonight/cryptonight_r_av3.c
    algo/cryptonight/cryptonight_r_av4.c
    util.c
    options.c
    stratum.c
@ -61,7 +67,6 @@ set(SOURCES_CRYPTO
    crypto/c_blake256.c
    crypto/c_jh.c
    crypto/c_skein.c
    crypto/soft_aes.c
   )
 set(SOURCES_UTILS
--- a/algo/cryptonight-lite/cryptonight_lite_softaes.h
+++ b/algo/cryptonight-lite/cryptonight_lite_softaes.h
@ -4,9 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
- *
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -22,16 +22,15 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef __CRYPTONIGHT_LITE_SOFTAES_H__
+#ifndef XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
-#define __CRYPTONIGHT_LITE_SOFTAES_H__
+#define XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
 #include <x86intrin.h>
 #include <stdint.h>
-extern __m128i soft_aesenc(__m128i in, __m128i key);
+#include "crypto/soft_aes.h"
 extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@ -253,4 +252,4 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 }
-#endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */
+#endif /* XMRIG_CRYPTONIGHT_LITE_SOFTAES_H */
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@ -6,7 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -38,9 +39,13 @@
 #include "crypto/c_groestl.h"
 #include "crypto/c_jh.h"
 #include "crypto/c_skein.h"
 #include "cryptonight.h"
 #include "cryptonight_test.h"
 #include "cryptonight.h"
 #include "options.h"
 #include "persistent_memory.h"
 static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX] = {};
 void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
@ -56,6 +61,11 @@ void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, stru
 void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av3(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av4(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 #ifndef XMRIG_NO_AEON
 void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
@ -72,7 +82,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output,
 #ifndef XMRIG_NO_ASM
 void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av2_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 void cryptonight_r_av2_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 #endif
@ -89,6 +105,46 @@ static inline bool verify(enum Variant variant, uint8_t *output, struct cryptoni
 }
 /*
  * Self-test helper for height-dependent variants: hashes every entry of
  * cn_r_test_input with the function selected for (opt_algo, opt_av, variant)
  * and compares each 32-byte digest against referenceValue + i * 32.
  *
  * variant        - variant whose hash function is looked up.
  * output         - scratch buffer for the digest(s); 64 bytes are compared in
  *                  double-hash mode, 32 otherwise.
  * ctx            - contexts handed to the hash function; ctx[1] is only
  *                  touched when opt_double_hash is set.
  * referenceValue - expected digests, 32 bytes per test vector.
  *
  * Returns false when no hash function is available, when a test vector does
  * not fit the staging buffer, or on the first digest mismatch.
  */
 static inline bool verify2(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
 {
    cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
    if (func == NULL) {
        return false;
    }
    const size_t count = sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0]);
    if (opt_double_hash) {
        /* Staging area holding the same test blob twice, back to back. */
        uint8_t input[128];
        for (size_t i = 0; i < count; ++i) {
            const size_t size = cn_r_test_input[i].size;
            /* Two copies must fit; refuse instead of overflowing the stack. */
            if (size > sizeof(input) / 2) {
                return false;
            }
            memcpy(input,        cn_r_test_input[i].data, size);
            memcpy(input + size, cn_r_test_input[i].data, size);
            ctx[0]->height = ctx[1]->height = cn_r_test_input[i].height;
            func(input, size, output, ctx);
            /* Both halves hashed the same blob, so both must equal the same reference. */
            if (memcmp(output, referenceValue + i * 32, 32) != 0 || memcmp(output + 32, referenceValue + i * 32, 32) != 0) {
                return false;
            }
        }
    }
    else {
        for (size_t i = 0; i < count; ++i) {
            ctx[0]->height = cn_r_test_input[i].height;
            func(cn_r_test_input[i].data, cn_r_test_input[i].size, output, ctx);
            if (memcmp(output, referenceValue + i * 32, 32) != 0) {
                return false;
            }
        }
    }
    return true;
 }
 static bool self_test() {
    struct cryptonight_ctx *ctx[2];
    uint8_t output[64];
@ -97,15 +153,18 @@ static bool self_test() {
    const size_t size  = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
    bool result = false;
-    for (int i = 0; i < count; ++i) {
+    for (size_t i = 0; i < count; ++i) {
        ctx[i]         = _mm_malloc(sizeof(struct cryptonight_ctx), 16);
        ctx[i]->memory = _mm_malloc(size, 16);
        init_cn_r(ctx[i]);
    }
    if (opt_algo == ALGO_CRYPTONIGHT) {
-        result = verify(VARIANT_0, output, ctx, test_output_v0) &&
+        result = verify(VARIANT_0,  output, ctx, test_output_v0) &&
-                 verify(VARIANT_1, output, ctx, test_output_v1) &&
+                 verify(VARIANT_1,  output, ctx, test_output_v1) &&
-                 verify(VARIANT_2, output, ctx, test_output_v2);
+                 verify(VARIANT_2,  output, ctx, test_output_v2) &&
                 verify2(VARIANT_4, output, ctx, test_output_r);
    }
 #   ifndef XMRIG_NO_AEON
    else {
@ -115,7 +174,7 @@ static bool self_test() {
 #   endif
-    for (int i = 0; i < count; ++i) {
+    for (size_t i = 0; i < count; ++i) {
        _mm_free(ctx[i]->memory);
        _mm_free(ctx[i]);
    }
@ -124,34 +183,20 @@ static bool self_test() {
 }
-size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
+#ifndef XMRIG_NO_ASM
 cn_hash_fun cryptonight_hash_asm_fn(enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
 {
    const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
 #   ifndef XMRIG_NO_ASM
    if (assembly == ASM_AUTO) {
-        assembly = cpu_info.assembly;
+        assembly = (enum Assembly) cpu_info.assembly;
    }
    if (assembly == ASM_NONE) {
-        return index;
+        return NULL;
    }
-    const size_t offset = VARIANT_MAX * 4 * 2;
+    return asm_func_map[av][variant][assembly];
    if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) {
        if (av == AV_SINGLE) {
            return offset + assembly - 2;
        }
        if (av == AV_DOUBLE) {
            return offset + 2;
        }
    }
 #   endif
    return index;
 }
 #endif
 cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
@ -160,10 +205,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
    assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
 #   ifndef XMRIG_NO_ASM
-    static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = {
+    if (algorithm == ALGO_CRYPTONIGHT) {
-#   else
+        cn_hash_fun fun = cryptonight_hash_asm_fn(av, variant, opt_assembly);
-    static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
+        if (fun) {
            return fun;
        }
    }
 #   endif
    static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
        cryptonight_av1_v0,
        cryptonight_av2_v0,
        cryptonight_av3_v0,
@ -177,6 +227,11 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
        cryptonight_av3_v2,
        cryptonight_av4_v2,
        cryptonight_r_av1,
        cryptonight_r_av2,
        cryptonight_r_av3,
        cryptonight_r_av4,
 #       ifndef XMRIG_NO_AEON
        cryptonight_lite_av1_v0,
        cryptonight_lite_av2_v0,
@ -190,6 +245,10 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
 #       else
        NULL,
        NULL,
@ -203,16 +262,15 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
        NULL,
        NULL,
        NULL,
-#       endif
+        NULL,
-#       ifndef XMRIG_NO_ASM
+        NULL,
-        cryptonight_single_hash_asm_intel,
+        NULL,
-        cryptonight_single_hash_asm_ryzen,
+        NULL,
        cryptonight_double_hash_asm
 #       endif
    };
 #   ifndef NDEBUG
-    const size_t index = fn_index(algorithm, av, variant, opt_assembly);
+    const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
    cn_hash_fun func = func_table[index];
@ -221,7 +279,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V
    return func;
 #   else
-    return func_table[fn_index(algorithm, av, variant, opt_assembly)];
+    return func_table[VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1];
 #   endif
 }
@ -230,6 +288,24 @@ bool cryptonight_init(int av)
 {
    /* Double-hash mode is selected by the two "double" algorithm variations. */
    opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT;
 #   ifndef XMRIG_NO_ASM
    /*
     * Populate the assembly dispatch table consulted by cryptonight_hash_asm_fn().
     * Entries left untouched stay NULL, which makes the caller fall back to the
     * C implementations.
     */
    asm_func_map[AV_SINGLE][VARIANT_2][ASM_INTEL]     = cryptonight_single_hash_asm_intel;
    /* Ryzen has its own VARIANT_2 single-hash routine; do not alias it to the Intel one. */
    asm_func_map[AV_SINGLE][VARIANT_2][ASM_RYZEN]     = cryptonight_single_hash_asm_ryzen;
    asm_func_map[AV_SINGLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_single_hash_asm_bulldozer;
    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL]     = cryptonight_double_hash_asm;
    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN]     = cryptonight_double_hash_asm;
    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm;
    /* For VARIANT_4 only Intel and Bulldozer entry points are declared; Ryzen shares the Intel one. */
    asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av1_asm_intel;
    asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av1_asm_intel;
    asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer;
    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av2_asm_intel;
    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av2_asm_intel;
    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av2_asm_bulldozer;
 #   endif
    /* Initialisation succeeds only if the built-in known-answer test passes. */
    return self_test();
 }
@ -267,6 +343,10 @@ static inline enum Variant cryptonight_variant(uint8_t version)
        return VARIANT_1;
    }
    if (version >= 10) {
        return VARIANT_4;
    }
    if (version >= 8) {
        return VARIANT_2;
    }
@ -276,7 +356,7 @@ static inline enum Variant cryptonight_variant(uint8_t version)
 #ifndef BUILD_TEST
-int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
    uint32_t *nonceptr   = (uint32_t*) (((char*) blob) + 39);
    enum Variant variant = cryptonight_variant(blob[0]);
@ -296,7 +376,7 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blo
 }
-int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
    int rc               = 0;
    uint32_t *nonceptr0  = (uint32_t*) (((char*) blob) + 39);
    uint32_t *nonceptr1  = (uint32_t*) (((char*) blob) + 39 + blob_size);
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@ -6,7 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -38,9 +39,30 @@
 #define MEMORY_LITE 1048576 /* 1 MiB */
 /*
  * ABI_ATTRIBUTE is appended to the main-loop function pointer types below.
  * It expands to nothing when building with MSVC or for XMRIG_ARM, and to
  * __attribute__((ms_abi)) everywhere else.
  */
 #if defined _MSC_VER || defined XMRIG_ARM
 #define ABI_ATTRIBUTE
 #else
 #define ABI_ATTRIBUTE __attribute__((ms_abi))
 #endif
 /* Forward declaration so the pointer typedefs can refer to the context type. */
 struct cryptonight_ctx;
 /* Pointer to a single-context main loop taking one context. */
 typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE;
 /* Pointer to a two-context main loop taking a pair of contexts. */
 typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE;
 struct cryptonight_ctx {
    uint8_t state[224] __attribute__((aligned(16)));
-    uint8_t* memory    __attribute__((aligned(16)));
+    uint8_t *memory    __attribute__((aligned(16)));
    uint8_t unused[40];
    const uint32_t *saes_table;
    cn_mainloop_fun_ms_abi generated_code;
    cn_mainloop_double_fun_ms_abi generated_code_double;
    uint64_t generated_code_height;
    uint64_t generated_code_double_height;
    uint64_t height;
 };
@ -52,7 +74,8 @@ extern void (* const extra_hashes[4])(const void *, size_t, char *);
 cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant);
 bool cryptonight_init(int av);
-int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
-int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
 #endif /* XMRIG_CRYPTONIGHT_H */
--- a/algo/cryptonight/cryptonight_av1.c
+++ b/algo/cryptonight/cryptonight_av1.c
@ -196,6 +196,7 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res
 #ifndef XMRIG_NO_ASM
 extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
 extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
 extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx);
 extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
@ -225,6 +226,19 @@ void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t siz
 }
 /*
  * Single hash using the Bulldozer-tuned assembly main loop.
  *
  * input/size - data to hash.
  * output     - receives the digest written by the selected extra hash.
  * ctx        - only ctx[0] is used; its state and memory are overwritten.
  */
 void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    struct cryptonight_ctx *const self = ctx[0];
    uint8_t *const state               = self->state;
    /* Absorb the input into the 200-byte Keccak state. */
    keccak(input, size, state, 200);
    /* Expand state into the scratchpad, run the asm loop, fold the scratchpad back. */
    cn_explode_scratchpad((__m128i*) state, (__m128i*) self->memory);
    cnv2_mainloop_bulldozer_asm(self);
    cn_implode_scratchpad((__m128i*) self->memory, (__m128i*) state);
    keccakf((uint64_t*) state, 24);
    /* The low two bits of the first state byte pick the finalising hash. */
    extra_hashes[state[0] & 3](state, 200, output);
 }
 void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    keccak(input,        size, ctx[0]->state, 200);
--- a/algo/cryptonight/cryptonight_monero.h
+++ b/algo/cryptonight/cryptonight_monero.h
@ -6,8 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2018      SChernykh   <https://github.com/SChernykh>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
- * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -29,6 +29,8 @@
 #include <fenv.h>
 #include <math.h>
 #include <stdint.h>
 #include <x86intrin.h>
 static inline __m128i int_sqrt_v2(const uint64_t n0)
@ -87,6 +89,17 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
    }
 /*
  * VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c)
  *
  * Loads the three 16-byte chunks at offset ^ 0x10, ^ 0x20 and ^ 0x30 relative
  * to base_ptr, then writes them back rotated and with a 64-bit lane-wise add:
  *   [offset ^ 0x10] = chunk3 + _b1
  *   [offset ^ 0x20] = chunk1 + _b
  *   [offset ^ 0x30] = chunk2 + _a
  * Finally _c is XORed with all three original chunks, so _c must be an
  * assignable __m128i. The aligned load/store intrinsics are used, so every
  * touched address must be 16-byte aligned.
  */
 #   define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
    { \
        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
        _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
    }
 #   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
    { \
        const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
@ -99,4 +112,39 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
    }
 /*
  * NOINLINE: portable "never inline" marker. Defined only if the build has not
  * provided one already; uses the GCC-style attribute when __GNUC__ is defined,
  * __declspec(noinline) when _MSC_VER is set, and expands to nothing otherwise.
  */
 #ifndef NOINLINE
 #ifdef __GNUC__
 #define NOINLINE __attribute__ ((noinline))
 #elif _MSC_VER
 #define NOINLINE __declspec(noinline)
 #else
 #define NOINLINE
 #endif
 #endif
 #include "variant4_random_math.h"
 /*
  * VARIANT4_RANDOM_MATH_INIT(part)
  *
  * Declares, in the enclosing scope, the register file r<part>[9] and the
  * instruction buffer code<part>[256] used by VARIANT4_RANDOM_MATH(part, ...).
  * r<part>[0..3] are seeded from the low/high 32-bit halves of h<part>[12] and
  * h<part>[13]; r<part>[4..8] are left unset here and are filled on every use
  * by VARIANT4_RANDOM_MATH. The program is produced by v4_random_math_init()
  * from ctx[part]->height.
  *
  * Requires `h<part>` (64-bit words) and `ctx` to be visible at the expansion
  * site. Because it introduces declarations it cannot be wrapped in a block and
  * must be used as a statement sequence, once per part and scope.
  */
 #define VARIANT4_RANDOM_MATH_INIT(part) \
  uint32_t r##part[9]; \
  struct V4_Instruction code##part[256]; \
  { \
    r##part[0] = (uint32_t)(h##part[12]); \
    r##part[1] = (uint32_t)(h##part[12] >> 32); \
    r##part[2] = (uint32_t)(h##part[13]); \
    r##part[3] = (uint32_t)(h##part[13] >> 32); \
  } \
  v4_random_math_init(code##part, ctx[part]->height);
 /*
  * VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1)
  *
  * One step of the per-height random program for lane `part`; expects the
  * r<part>/code<part> objects created by VARIANT4_RANDOM_MATH_INIT(part).
  *   1. cl is XORed with (r0 + r1) in its low half and (r2 + r3) in its high
  *      half (32-bit sums, so they wrap), hence cl must be an assignable
  *      64-bit lvalue.
  *   2. r4..r8 are reloaded from the low 32 bits of al and ah, the low 32 bits
  *      of bx0 and bx1, and the 32 bits at byte offset 8 of bx1.
  *   3. v4_random_math() then runs code<part> over r<part>.
  */
 #define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
  { \
    cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
    r##part[4] = (uint32_t)(al); \
    r##part[5] = (uint32_t)(ah); \
    r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \
    r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \
    r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
    v4_random_math(code##part, r##part); \
  }
 #endif /* XMRIG_CRYPTONIGHT_MONERO_H */
--- a/algo/cryptonight/cryptonight_r_av1.c
+++ b/algo/cryptonight/cryptonight_r_av1.c
@ -0,0 +1,143 @@
 /* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <x86intrin.h>
 #include <string.h>
 #include "crypto/c_keccak.h"
 #include "cryptonight.h"
 #include "cryptonight_aesni.h"
 #include "cryptonight_monero.h"
 /*
  * Single-hash variant-4 ("r") implementation using the AES-NI intrinsic and
  * the interpreted random-math program.
  *
  * input/size - data to hash.
  * output     - receives the digest written by the selected extra hash.
  * ctx        - only ctx[0] is used: its state and memory are overwritten and
  *              ctx[0]->height selects the random program (see
  *              VARIANT4_RANDOM_MATH_INIT).
  */
 void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    /* Absorb the input into the 200-byte Keccak state, then fill the scratchpad from it. */
    keccak(input, size, ctx[0]->state, 200);
    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
    /* l0 is const-qualified but the loop writes through it via casts. */
    const uint8_t* l0 = ctx[0]->memory;
    uint64_t* h0 = (uint64_t*) ctx[0]->state;
    /* VARIANT2_* helpers are defined outside this view - NOTE(review): confirm what they declare. */
    VARIANT2_INIT(0);
    VARIANT2_SET_ROUNDING_MODE();
    /* Declares r0[] / code0[] and builds the program for ctx[0]->height. */
    VARIANT4_RANDOM_MATH_INIT(0);
    /* Working registers a (al0/ah0), b (bx0) and the previous b (bx1), derived from the state. */
    uint64_t al0 = h0[0] ^ h0[4];
    uint64_t ah0 = h0[1] ^ h0[5];
    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
    uint64_t idx0 = al0;
    /* 0x80000 iterations; the 0x1FFFF0 mask keeps every access 16-byte aligned and below 2 MiB. */
    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
        /* First half: one AES round on the addressed line, keyed with a. */
        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
        cx = _mm_aesenc_si128(cx, ax0);
        /* The shuffle also folds the three neighbouring chunks into cx. */
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
        /* Second half: the low 64 bits of cx select the next line. */
        idx0 = _mm_cvtsi128_si64(cx);
        uint64_t hi, lo, cl, ch;
        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
        /* Random math perturbs cl and updates r0[]; r0[0..3] are then mixed into a. */
        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
        /* 64x64 -> 128-bit multiply: the call returns the low half, hi receives the high half. */
        lo = _umul128(idx0, cl, &hi);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
        al0 += hi;
        ah0 += lo;
        /* Store a back to the scratchpad, then XOR in the values read before the store. */
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
        al0 ^= cl;
        ah0 ^= ch;
        idx0 = al0;
        /* Rotate history: b becomes previous b, c becomes b. */
        bx1 = bx0;
        bx0 = cx;
    }
    /* Fold the scratchpad back into the state and finalise. */
    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
    keccakf(h0, 24);
    /* The low two bits of the first state byte pick the finalising hash. */
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
 #ifndef XMRIG_NO_ASM
 void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
 /*
  * Single-hash variant-4 ("r") implementation that runs a main loop compiled
  * at run time with the ASM_INTEL flavour.
  *
  * The compiled loop is cached in ctx[0]->generated_code and tagged with
  * ctx[0]->generated_code_height; it is rebuilt only when ctx[0]->height
  * differs from that tag.
  */
 void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    struct cryptonight_ctx *const self = ctx[0];
    const uint64_t height              = self->height;
    if (self->generated_code_height != height) {
        /* Height changed: regenerate the random program and recompile it in place. */
        struct V4_Instruction program[256];
        const int program_size = v4_random_math_init(program, height);
        v4_compile_code(program, program_size, (void*)(self->generated_code), ASM_INTEL);
        self->generated_code_height = height;
    }
    uint8_t *const state = self->state;
    keccak(input, size, state, 200);
    cn_explode_scratchpad((__m128i*) state, (__m128i*) self->memory);
    self->generated_code(self);
    cn_implode_scratchpad((__m128i*) self->memory, (__m128i*) state);
    keccakf((uint64_t*) state, 24);
    /* The low two bits of the first state byte pick the finalising hash. */
    extra_hashes[state[0] & 3](state, 200, output);
 }
 /*
  * Single-hash variant-4 ("r") implementation that runs a main loop compiled
  * at run time with the ASM_BULLDOZER flavour.
  *
  * The compiled loop is cached in ctx[0]->generated_code and tagged with
  * ctx[0]->generated_code_height; it is rebuilt only when ctx[0]->height
  * differs from that tag.
  */
 void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    struct cryptonight_ctx *const self = ctx[0];
    const uint64_t height              = self->height;
    if (self->generated_code_height != height) {
        /* Height changed: regenerate the random program and recompile it in place. */
        struct V4_Instruction program[256];
        const int program_size = v4_random_math_init(program, height);
        v4_compile_code(program, program_size, (void*)(self->generated_code), ASM_BULLDOZER);
        self->generated_code_height = height;
    }
    uint8_t *const state = self->state;
    keccak(input, size, state, 200);
    cn_explode_scratchpad((__m128i*) state, (__m128i*) self->memory);
    self->generated_code(self);
    cn_implode_scratchpad((__m128i*) self->memory, (__m128i*) state);
    keccakf((uint64_t*) state, 24);
    /* The low two bits of the first state byte pick the finalising hash. */
    extra_hashes[state[0] & 3](state, 200, output);
 }
 #endif
--- a/algo/cryptonight/cryptonight_r_av2.c
+++ b/algo/cryptonight/cryptonight_r_av2.c
@ -0,0 +1,202 @@
 /* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <x86intrin.h>
 #include <string.h>
 #include "crypto/c_keccak.h"
 #include "cryptonight.h"
 #include "cryptonight_aesni.h"
 #include "cryptonight_monero.h"
 /*
  * Double-hash variant-4 ("r") implementation using the AES-NI intrinsic and
  * the interpreted random-math program; two independent hashes are computed
  * in an interleaved loop.
  *
  * input  - two blobs of `size` bytes each, stored back to back.
  * output - receives two digests: lane 0 at output, lane 1 at output + 32.
  * ctx    - ctx[0] and ctx[1] provide state, scratchpad and height per lane.
  */
 void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    /* Absorb each blob into its own 200-byte Keccak state. */
    keccak(input,        size, ctx[0]->state, 200);
    keccak(input + size, size, ctx[1]->state, 200);
    /* l0/l1 are const-qualified but the loop writes through them via casts. */
    const uint8_t* l0 = ctx[0]->memory;
    const uint8_t* l1 = ctx[1]->memory;
    uint64_t* h0 = (uint64_t*) ctx[0]->state;
    uint64_t* h1 = (uint64_t*) ctx[1]->state;
    /* VARIANT2_* helpers are defined outside this view - NOTE(review): confirm what they declare. */
    VARIANT2_INIT(0);
    VARIANT2_INIT(1);
    VARIANT2_SET_ROUNDING_MODE();
    /* Declares r0/code0 and r1/code1; each lane's program comes from its own ctx[part]->height. */
    VARIANT4_RANDOM_MATH_INIT(0);
    VARIANT4_RANDOM_MATH_INIT(1);
    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
    /* Per-lane working registers: a (al/ah), b (bx?0) and the previous b (bx?1). */
    uint64_t al0 = h0[0] ^ h0[4];
    uint64_t al1 = h1[0] ^ h1[4];
    uint64_t ah0 = h0[1] ^ h0[5];
    uint64_t ah1 = h1[1] ^ h1[5];
    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
    uint64_t idx0 = al0;
    uint64_t idx1 = al1;
    /* 0x80000 iterations; the 0x1FFFF0 mask keeps every access 16-byte aligned and below 2 MiB. */
    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
        /* First half, both lanes: one AES round on the addressed line, keyed with a. */
        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
        cx0 = _mm_aesenc_si128(cx0, ax0);
        cx1 = _mm_aesenc_si128(cx1, ax1);
        /* The shuffle also folds the three neighbouring chunks into cx. */
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
        /* The low 64 bits of each cx select that lane's next line. */
        idx0 = _mm_cvtsi128_si64(cx0);
        idx1 = _mm_cvtsi128_si64(cx1);
        /* hi/lo/cl/ch are shared temporaries, used first for lane 0 and then for lane 1. */
        uint64_t hi, lo, cl, ch;
        /* Second half, lane 0. */
        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
        /* 64x64 -> 128-bit multiply: the call returns the low half, hi receives the high half. */
        lo = _umul128(idx0, cl, &hi);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
        al0 += hi;
        ah0 += lo;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
        al0 ^= cl;
        ah0 ^= ch;
        idx0 = al0;
        /* Second half, lane 1 (same sequence on the lane-1 registers). */
        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
        lo = _umul128(idx1, cl, &hi);
        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
        al1 += hi;
        ah1 += lo;
        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
        al1 ^= cl;
        ah1 ^= ch;
        idx1 = al1;
        /* Rotate history in both lanes: b becomes previous b, c becomes b. */
        bx01 = bx00;
        bx11 = bx10;
        bx00 = cx0;
        bx10 = cx1;
    }
    /* Fold each scratchpad back into its state and finalise. */
    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
    keccakf(h0, 24);
    keccakf(h1, 24);
    /* The low two bits of each lane's first state byte pick its finalising hash. */
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
 }
 #ifndef XMRIG_NO_ASM
 void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
 /*
  * Double-hash variant-4 ("r") implementation that runs a two-lane main loop
  * compiled at run time with the ASM_INTEL flavour.
  *
  * input  - two blobs of `size` bytes each, stored back to back.
  * output - receives two digests: lane 0 at output, lane 1 at output + 32.
  * ctx    - ctx[0] and ctx[1]; the compiled loop and its cache tag live in
  *          ctx[0], and only ctx[0]->height selects the program.
  */
 void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    /*
     * The two-lane loop is cached in generated_code_double, so it is tagged
     * with generated_code_double_height. Using the single-hash tag
     * (generated_code_height) would let a single-hash compile at the same
     * height make this buffer look up to date when it was never built.
     * NOTE(review): relies on the context initialiser seeding
     * generated_code_double_height - confirm in init_cn_r().
     */
    if (ctx[0]->generated_code_double_height != ctx[0]->height) {
        struct V4_Instruction code[256];
        const int code_size = v4_random_math_init(code, ctx[0]->height);
        v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_INTEL);
        ctx[0]->generated_code_double_height = ctx[0]->height;
    }
    keccak(input,        size, ctx[0]->state, 200);
    keccak(input + size, size, ctx[1]->state, 200);
    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
    cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
    ctx[0]->generated_code_double(ctx[0], ctx[1]);
    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
    cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
    keccakf((uint64_t *) ctx[0]->state, 24);
    keccakf((uint64_t *) ctx[1]->state, 24);
    /* The low two bits of each lane's first state byte pick its finalising hash. */
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
 }
 /*
  * Double-hash variant-4 ("r") implementation that runs a two-lane main loop
  * compiled at run time with the ASM_BULLDOZER flavour.
  *
  * input  - two blobs of `size` bytes each, stored back to back.
  * output - receives two digests: lane 0 at output, lane 1 at output + 32.
  * ctx    - ctx[0] and ctx[1]; the compiled loop and its cache tag live in
  *          ctx[0], and only ctx[0]->height selects the program.
  */
 void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    /*
     * The two-lane loop is cached in generated_code_double, so it is tagged
     * with generated_code_double_height. Using the single-hash tag
     * (generated_code_height) would let a single-hash compile at the same
     * height make this buffer look up to date when it was never built.
     * NOTE(review): relies on the context initialiser seeding
     * generated_code_double_height - confirm in init_cn_r().
     */
    if (ctx[0]->generated_code_double_height != ctx[0]->height) {
        struct V4_Instruction code[256];
        const int code_size = v4_random_math_init(code, ctx[0]->height);
        v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_BULLDOZER);
        ctx[0]->generated_code_double_height = ctx[0]->height;
    }
    keccak(input,        size, ctx[0]->state, 200);
    keccak(input + size, size, ctx[1]->state, 200);
    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
    cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
    ctx[0]->generated_code_double(ctx[0], ctx[1]);
    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
    cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
    keccakf((uint64_t *) ctx[0]->state, 24);
    keccakf((uint64_t *) ctx[1]->state, 24);
    /* The low two bits of each lane's first state byte pick its finalising hash. */
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
 }
 #endif
--- a/algo/cryptonight/cryptonight_r_av3.c
+++ b/algo/cryptonight/cryptonight_r_av3.c
@ -0,0 +1,112 @@
 /* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <x86intrin.h>
 #include <string.h>
 #include "crypto/c_keccak.h"
 #include "cryptonight.h"
 #include "cryptonight_monero.h"
 #include "cryptonight_softaes.h"
 #ifndef XMRIG_NO_ASM
 void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
 #endif
 /*
  * CryptonightR single-hash entry point using software AES.
  *
  * input  - blob of `size` bytes to hash
  * output - receives the result written by the selected extra_hashes[] function
  * ctx    - only ctx[0] is used (state, memory, height and, in the ASM build,
  *          the generated code and the soft-AES table pointer)
  *
  * When XMRIG_NO_ASM is not defined the main loop is machine code produced by
  * v4_soft_aes_compile_code(); otherwise the loop below is run in C.
  */
 void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    keccak(input, size, ctx[0]->state, 200);
    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
 #   ifndef XMRIG_NO_ASM
    // Rebuild the generated loop only when the height it was compiled for changed.
    if (ctx[0]->generated_code_height != ctx[0]->height) {
        struct V4_Instruction code[256];
        const int code_size = v4_random_math_init(code, ctx[0]->height);
        v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE);
        ctx[0]->generated_code_height = ctx[0]->height;
    }
    // The table pointer is refreshed on every call before the generated code runs.
    ctx[0]->saes_table = (const uint32_t*)saes_table;
    ctx[0]->generated_code(ctx[0]);
 #   else
    // NOTE(review): l0 is declared const but is written through casts below.
    const uint8_t* l0 = ctx[0]->memory;
    uint64_t* h0 = (uint64_t*) ctx[0]->state;
    // The "0" argument is the lane suffix; the macros refer to l0/h0/r0 by name.
    VARIANT2_INIT(0);
    VARIANT2_SET_ROUNDING_MODE();
    VARIANT4_RANDOM_MATH_INIT(0);
    uint64_t al0 = h0[0] ^ h0[4];
    uint64_t ah0 = h0[1] ^ h0[5];
    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
    uint64_t idx0 = al0;
    // 0x80000 iterations; 0x1FFFF0 keeps the offset 16-byte aligned and below 2 MiB.
    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
        cx = soft_aesenc(cx, ax0);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
        // The low 64 bits of cx select the second scratchpad location of this iteration.
        idx0 = _mm_cvtsi128_si64(cx);
        uint64_t hi, lo, cl, ch;
        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
        // Fold the four 32-bit r0[] values back into the 128-bit a register.
        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
        // 64x64 -> 128-bit multiply: lo is returned, hi is written through the pointer.
        lo = _umul128(idx0, cl, &hi);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
        al0 += hi;
        ah0 += lo;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
        al0 ^= cl;
        ah0 ^= ch;
        idx0 = al0;
        // Shift the b history: previous bx0 becomes bx1, this iteration's cx becomes bx0.
        bx1 = bx0;
        bx0 = cx;
    }
 #   endif
    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
    keccakf((uint64_t *) ctx[0]->state, 24);
    // The low two bits of the first state byte pick the final hash function.
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
--- a/algo/cryptonight/cryptonight_r_av4.c
+++ b/algo/cryptonight/cryptonight_r_av4.c
@ -0,0 +1,143 @@
 /* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <x86intrin.h>
 #include <string.h>
 #include "crypto/c_keccak.h"
 #include "cryptonight.h"
 #include "cryptonight_monero.h"
 #include "cryptonight_softaes.h"
 /*
  * CryptonightR double-hash entry point using software AES, implemented in C.
  *
  * input  - two blobs of `size` bytes each, stored back to back
  * output - receives the first result at output and the second at output + 32
  * ctx    - ctx[0] and ctx[1] provide state and scratchpad for lane 0 and lane 1
  *
  * Both lanes run inside the same loop; every step below appears once per lane
  * with the 0/1 suffix selecting the lane's variables.
  */
 void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
 {
    keccak(input,        size, ctx[0]->state, 200);
    keccak(input + size, size, ctx[1]->state, 200);
    // NOTE(review): l0/l1 are declared const but are written through casts below.
    const uint8_t* l0 = ctx[0]->memory;
    const uint8_t* l1 = ctx[1]->memory;
    uint64_t* h0 = (uint64_t*) ctx[0]->state;
    uint64_t* h1 = (uint64_t*) ctx[1]->state;
    // The numeric argument is the lane suffix; the macros refer to lN/hN/rN by name.
    VARIANT2_INIT(0);
    VARIANT2_INIT(1);
    VARIANT2_SET_ROUNDING_MODE();
    VARIANT4_RANDOM_MATH_INIT(0);
    VARIANT4_RANDOM_MATH_INIT(1);
    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
    uint64_t al0 = h0[0] ^ h0[4];
    uint64_t al1 = h1[0] ^ h1[4];
    uint64_t ah0 = h0[1] ^ h0[5];
    uint64_t ah1 = h1[1] ^ h1[5];
    // bxN0 is the current b value of lane N, bxN1 the one from the previous iteration.
    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
    uint64_t idx0 = al0;
    uint64_t idx1 = al1;
    // 0x80000 iterations; 0x1FFFF0 keeps the offset 16-byte aligned and below 2 MiB.
    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
        cx0 = soft_aesenc(cx0, ax0);
        cx1 = soft_aesenc(cx1, ax1);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
        // The low 64 bits of each cx select that lane's second scratchpad location.
        idx0 = _mm_cvtsi128_si64(cx0);
        idx1 = _mm_cvtsi128_si64(cx1);
        // hi/lo/cl/ch are scratch values reused by lane 0 first and lane 1 afterwards.
        uint64_t hi, lo, cl, ch;
        // ---- lane 0: random math, multiply, write-back ----
        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
        lo = _umul128(idx0, cl, &hi);
        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
        al0 += hi;
        ah0 += lo;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
        al0 ^= cl;
        ah0 ^= ch;
        idx0 = al0;
        // ---- lane 1: same sequence on the second context ----
        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
        lo = _umul128(idx1, cl, &hi);
        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
        al1 += hi;
        ah1 += lo;
        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
        al1 ^= cl;
        ah1 ^= ch;
        idx1 = al1;
        // Shift the b history of both lanes for the next iteration.
        bx01 = bx00;
        bx11 = bx10;
        bx00 = cx0;
        bx10 = cx1;
    }
    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
    keccakf(h0, 24);
    keccakf(h1, 24);
    // The low two bits of each lane's first state byte pick its final hash function.
    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
 }
--- a/algo/cryptonight/cryptonight_softaes.h
+++ b/algo/cryptonight/cryptonight_softaes.h
@ -4,9 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
- *
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -30,8 +30,7 @@
 #include <stdint.h>
-extern __m128i soft_aesenc(__m128i in, __m128i key);
+#include "crypto/soft_aes.h"
 extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
--- a/algo/cryptonight/cryptonight_test.h
+++ b/algo/cryptonight/cryptonight_test.h
@ -6,8 +6,8 @@
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
- * Copyright 2018      SChernykh   <https://github.com/SChernykh>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
- * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -27,6 +27,9 @@
 #define XMRIG_CRYPTONIGHT_TEST_H
 #include <stdint.h>
 const static uint8_t test_input[152] = {
    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
@ -67,6 +70,42 @@ const static uint8_t test_output_v2[64] = {
 };
 // One "cn/r" self-test input: a block height plus the message to hash.
 struct cn_r_test_input_data
 {
    uint64_t height;    // block height associated with this input
    size_t size;        // number of meaningful bytes at the start of data
    uint8_t data[64];   // message bytes; entries in cn_r_test_input use at most 62 of them
 };
 // "cn/r" self-test inputs: ten consecutive heights (1806260..1806269), each paired
 // with a short ASCII message (the first entry spells "This is a test This is a test
 // This is a test"; the rest are "Lorem ipsum ..." fragments).
 const static struct cn_r_test_input_data cn_r_test_input[] = {
    { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
    { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
    { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
    { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
    { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
    { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
    { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
    { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
    { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
    { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
 };
 // "cn/r"
 // Ten rows of 32 bytes each; presumably row i is the expected hash of
 // cn_r_test_input[i] (TODO confirm against the self-test code that consumes it).
 const static uint8_t test_output_r[] = {
    0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
    0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
    0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
    0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
    0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
    0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
    0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
    0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
    0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
    0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
 };
 #ifndef XMRIG_NO_AEON
 const static uint8_t test_output_v0_lite[64] = {
--- a/algo/cryptonight/variant4_random_math.h
+++ b/algo/cryptonight/variant4_random_math.h
--- a/cmake/asm.cmake
+++ b/cmake/asm.cmake
@ -1,30 +1,24 @@
 if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(XMRIG_ASM_LIBRARY "xmrig-asm")
-    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
+    enable_language(ASM)
        enable_language(ASM_MASM)
-        if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
+    if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
-            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm")
+        set(XMRIG_ASM_FILES
-        else()
+            "crypto/asm/win64/cn_main_loop.S"
-            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm")
+            "crypto/asm/CryptonightR_template.S"
-        endif()
+        )
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
    else()
-        enable_language(ASM)
+        set(XMRIG_ASM_FILES
-
+            "crypto/asm/cn_main_loop.S"
-        if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
+            "crypto/asm/CryptonightR_template.S"
-            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S")
+        )
        else()
            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S")
        endif()
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
    endif()
-    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
+    set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C)
-    set(XMRIG_ASM_SOURCES "")
+
    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES})
    set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c")
    set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
 else()
    set(XMRIG_ASM_SOURCES "")
--- a/cpu.c
+++ b/cpu.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
- *
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -64,20 +65,20 @@ void cpu_init_common() {
    if (data.flags[CPU_FEATURE_AES]) {
        cpu_info.flags |= CPU_FLAG_AES;
 #       ifndef XMRIG_NO_ASM
        if (data.vendor == VENDOR_AMD) {
            cpu_info.assembly = ASM_RYZEN;
        }
        else if (data.vendor == VENDOR_INTEL) {
            cpu_info.assembly = ASM_INTEL;
        }
 #       endif
    }
    if (data.flags[CPU_FEATURE_BMI2]) {
        cpu_info.flags |= CPU_FLAG_BMI2;
    }
 #   ifndef XMRIG_NO_ASM
    if (data.vendor == VENDOR_AMD) {
        cpu_info.assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
    }
    else if (data.vendor == VENDOR_INTEL) {
        cpu_info.assembly = ASM_INTEL;
    }
 #   endif
 }
 #endif
--- a/cpu.h
+++ b/cpu.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
- *
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
--- a/crypto/CryptonightR_gen.c
+++ b/crypto/CryptonightR_gen.c
@ -0,0 +1,146 @@
 /* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
 * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <string.h>
 #include "algo/cryptonight/cryptonight_monero.h"
 #include "crypto/asm/CryptonightR_template.h"
 #include "persistent_memory.h"
 /*
  * Copy the bytes located between two code labels to the write cursor.
  *
  * p  - in/out write cursor; advanced by the number of bytes copied
  * p1 - address of the first byte to copy
  * p2 - address one past the last byte to copy
  *
  * Nothing is copied and *p is left untouched when p2 is not above p1.
  */
 static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
 {
    const uint8_t *const first = (const uint8_t*)(p1);
    const uint8_t *const last  = (const uint8_t*)(p2);
    const ptrdiff_t size = last - first;

    if (size <= 0) {
        return;
    }

    memcpy(*p, (const void *) first, size);
    *p += size;
 }
 /*
  * Append the machine code of one random-math program to the write cursor.
  *
  * Walks `code` until an instruction with opcode RET is found. For every
  * instruction before it, the bytes between instructions[c] and
  * instructions[c + 1] are copied to *p, where c is the byte built from the
  * opcode, destination index and source index.
  *
  * p                - in/out write cursor; advanced past everything emitted
  * code             - program to translate; must contain a RET terminator
  * code_size        - not used; the loop relies on the RET terminator
  * instructions     - snippet boundaries, indexed by the encoded byte c
  * instructions_mov - snippets emitted before a ROR/ROL whose source differs
  *                    from the source of the previously emitted rotate
  * is_64_bit        - selects where the ADD constant is patched and whether the
  *                    Bulldozer IMUL shortening may be applied
  * ASM              - assembly flavour; only ASM_BULLDOZER alters the output
  */
 static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM)
 {
    (void) code_size;

    // Source index of the last rotate whose helper snippet is still valid; -1 means none.
    uint32_t prev_rot_src = (uint32_t)(-1);
    for (int i = 0;; ++i) {
        const struct V4_Instruction inst = code[i];
        if (inst.opcode == RET) {
            break;
        }
        // MUL keeps its opcode value; every other opcode is shifted by 2 in the snippet table.
        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
        uint8_t dst_index = inst.dst_index;
        uint8_t src_index = inst.src_index;
        const uint32_t a = inst.dst_index;
        const uint32_t b = inst.src_index;
        // A source index of 8 is encoded as "source equals destination".
        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
        switch (inst.opcode) {
        case ROR:
        case ROL:
            if (b != prev_rot_src) {
                prev_rot_src = b;
                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
            }
            break;
        default:
            break;
        }
        // Writing to the register that was the rotate source invalidates the cached helper.
        if (a == prev_rot_src) {
            prev_rot_src = (uint32_t)(-1);
        }
        void_func begin = instructions[c];
        // Compare, do not assign: the shortening below is meant for Bulldozer only.
        if ((ASM == ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
            uint8_t* prefix = (uint8_t*) begin;
            if (*prefix == 0x49) {
                **p = 0x41;
                *p += 1;
            }
            begin = (void_func)(prefix + 1);
        }
        add_code(p, begin, instructions[c + 1]);
        if (inst.opcode == ADD) {
            // Patch the 32-bit constant of the snippet just emitted; in 64-bit mode
            // it sits 3 bytes before the end of the snippet.
            *(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
            if (is_64_bit) {
                prev_rot_src = (uint32_t)(-1);
            }
        }
    }
 }
 /*
  * Assemble the single-hash CryptonightR main loop into machine_code.
  *
  * The output is the template bytes part1..part2, the random-math program,
  * the template bytes part2..part3 (whose final 4 bytes are rewritten with the
  * distance back to the main-loop label) and finally part3..end.
  */
 void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
 {
    uint8_t* const base = machine_code;
    uint8_t* cursor = base;

    add_code(&cursor, CryptonightR_template_part1, CryptonightR_template_part2);
    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
    add_code(&cursor, CryptonightR_template_part2, CryptonightR_template_part3);

    // Offset of the loop label inside the template minus the current output
    // position gives the value stored in the last 4 bytes emitted so far.
    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1);
    const ptrdiff_t emitted = cursor - base;
    *(int*)(cursor - 4) = (int)(loop_offset - emitted);

    add_code(&cursor, CryptonightR_template_part3, CryptonightR_template_end);
    flush_instruction_cache(machine_code, cursor - base);
 }
 /*
  * Assemble the double-hash CryptonightR main loop into machine_code.
  *
  * The same random-math program is emitted twice, once after the template
  * bytes part1..part2 and once after part2..part3. The final 4 bytes of
  * part3..part4 are rewritten with the distance back to the main-loop label
  * before part4..end is appended.
  */
 void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
 {
    uint8_t* const base = (uint8_t*) machine_code;
    uint8_t* cursor = base;

    add_code(&cursor, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);

    add_code(&cursor, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);

    add_code(&cursor, CryptonightR_template_double_part3, CryptonightR_template_double_part4);

    // Offset of the loop label inside the template minus the current output
    // position gives the value stored in the last 4 bytes emitted so far.
    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1);
    const ptrdiff_t emitted = cursor - base;
    *(int*)(cursor - 4) = (int)(loop_offset - emitted);

    add_code(&cursor, CryptonightR_template_double_part4, CryptonightR_template_double_end);
    flush_instruction_cache(machine_code, cursor - base);
 }
 /*
  * Assemble the single-hash software-AES CryptonightR main loop into machine_code.
  *
  * The output is the soft-AES template bytes part1..part2, the random-math
  * program, the template bytes part2..part3 (whose final 4 bytes are rewritten
  * with the distance back to the main-loop label) and finally part3..end.
  */
 void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
 {
    uint8_t* const base = machine_code;
    uint8_t* cursor = base;

    add_code(&cursor, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
    add_code(&cursor, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);

    // Offset of the loop label inside the template minus the current output
    // position gives the value stored in the last 4 bytes emitted so far.
    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1);
    const ptrdiff_t emitted = cursor - base;
    *(int*)(cursor - 4) = (int)(loop_offset - emitted);

    add_code(&cursor, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
    flush_instruction_cache(machine_code, cursor - base);
 }
--- a/crypto/asm/CryptonightR_soft_aes_template.inc
+++ b/crypto/asm/CryptonightR_soft_aes_template.inc
@ -0,0 +1,279 @@
 PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
 PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
 PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
 PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
 PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
 ALIGN(64)
 	/* Prologue and register setup. rcx carries the single argument; all state
 	   offsets below are relative to it (presumably the cryptonight_ctx pointer
 	   with the hash state at offset 0 - TODO confirm against the struct). */
 FN_PREFIX(CryptonightR_soft_aes_template_part1):
 	/* Keep the argument in the slot just above the return address; it is
 	   reloaded from [rsp+304] later (232 bytes of locals + 8 pushes + 8). */
 	mov	QWORD PTR [rsp+8], rcx
 	push	rbx
 	push	rbp
 	push	rsi
 	push	rdi
 	push	r12
 	push	r13
 	push	r14
 	push	r15
 	sub	rsp, 232
 	/* Copy four dwords from [rcx+96..111] to the locals at [rsp+144..159];
 	   these are the values handed to the random-math code in ebx/esi/edi/ebp. */
 	mov	eax, [rcx+96]
 	mov	ebx, [rcx+100]
 	mov	esi, [rcx+104]
 	mov	edx, [rcx+108]
 	mov [rsp+144], eax
 	mov [rsp+148], ebx
 	mov [rsp+152], esi
 	mov [rsp+156], edx
 	/* r8 = [rcx+32]^[rcx], r9 = [rcx+40]^[rcx+8];
 	   xmm4 = ([rcx+48]^[rcx+16], [rcx+56]^[rcx+24]);
 	   xmm5 = ([r10+80]^[r10+64], [rcx+88]^[r10+72]); r11 = [rcx+224]. */
 	mov	rax, QWORD PTR [rcx+48]
 	mov	r10, rcx
 	xor	rax, QWORD PTR [rcx+16]
 	mov	r8, QWORD PTR [rcx+32]
 	xor	r8, QWORD PTR [rcx]
 	mov	r9, QWORD PTR [rcx+40]
 	xor	r9, QWORD PTR [rcx+8]
 	movq	xmm4, rax
 	mov	rdx, QWORD PTR [rcx+56]
 	xor	rdx, QWORD PTR [rcx+24]
 	mov	r11, QWORD PTR [rcx+224]
 	mov	rcx, QWORD PTR [rcx+88]
 	xor	rcx, QWORD PTR [r10+72]
 	mov	rax, QWORD PTR [r10+80]
 	movq	xmm0, rdx
 	xor	rax, QWORD PTR [r10+64]
 	/* Save xmm6-xmm13; they are restored from the same slots in part3. */
 	movaps	XMMWORD PTR [rsp+16], xmm6
 	movaps	XMMWORD PTR [rsp+32], xmm7
 	movaps	XMMWORD PTR [rsp+48], xmm8
 	movaps	XMMWORD PTR [rsp+64], xmm9
 	movaps	XMMWORD PTR [rsp+80], xmm10
 	movaps	XMMWORD PTR [rsp+96], xmm11
 	movaps	XMMWORD PTR [rsp+112], xmm12
 	movaps	XMMWORD PTR [rsp+128], xmm13
 	movq	xmm5, rax
 	mov	rax, r8
 	punpcklqdq xmm4, xmm0
 	/* 2097136 = 0x1FFFF0: first memory offset, 16-byte aligned. */
 	and	eax, 2097136
 	movq	xmm10, QWORD PTR [r10+96]
 	movq	xmm0, rcx
 	mov	rcx, QWORD PTR [r10+104]
 	xorps	xmm9, xmm9
 	/* [rsp+328] keeps the current offset, [rsp+320] the current r9 value. */
 	mov	QWORD PTR [rsp+328], rax
 	/* xmm12 keeps a copy of r11 so the main loop can reuse r11 as scratch. */
 	movq	xmm12, r11
 	mov	QWORD PTR [rsp+320], r9
 	punpcklqdq xmm5, xmm0
 	movq xmm13, rcx
 	/* 524288 = 0x80000 loop iterations. */
 	mov r12d, 524288
 	ALIGN(64)
 	/* First half of one iteration: table-driven AES round, first shuffle, and
 	   loading the inputs for the random-math code that is spliced in after
 	   this block (before part2) by v4_soft_aes_compile_code(). */
 FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
 	/* Park the loop counter in xmm11; r12 becomes the lookup-table base read
 	   from [r10+272] (presumably ctx->saes_table - TODO confirm offset). */
 	movd xmm11, r12d
 	mov	r12, QWORD PTR [r10+272]
 	/* r13 = address of the 16-byte line at the current offset (rax) in memory (r11). */
 	lea	r13, QWORD PTR [rax+r11]
 	mov	esi, DWORD PTR [r13]
 	movq	xmm0, r9
 	mov	r10d, DWORD PTR [r13+4]
 	movq	xmm7, r8
 	mov	ebp, DWORD PTR [r13+12]
 	mov	r14d, DWORD PTR [r13+8]
 	mov	rdx, QWORD PTR [rsp+328]
 	/* Byte-wise lookups: each dword of the line is consumed one byte at a time
 	   (movzx + shr 8) and used as an index into 256-entry dword tables located
 	   at r12, r12+1024, r12+2048 and r12+3072 (index + 256 after add r12, 2048). */
 	movzx	ecx, sil
 	shr	esi, 8
 	/* xmm7 = (r8, r9): the 128-bit value xored into the round result below. */
 	punpcklqdq xmm7, xmm0
 	mov	r15d, DWORD PTR [r12+rcx*4]
 	movzx	ecx, r10b
 	shr	r10d, 8
 	mov	edi, DWORD PTR [r12+rcx*4]
 	movzx	ecx, r14b
 	shr	r14d, 8
 	mov	ebx, DWORD PTR [r12+rcx*4]
 	movzx	ecx, bpl
 	shr	ebp, 8
 	mov	r9d, DWORD PTR [r12+rcx*4]
 	movzx	ecx, r10b
 	shr	r10d, 8
 	xor	r15d, DWORD PTR [r12+rcx*4+1024]
 	movzx	ecx, r14b
 	shr	r14d, 8
 	mov	eax, r14d
 	shr	eax, 8
 	xor	edi, DWORD PTR [r12+rcx*4+1024]
 	add	eax, 256
 	movzx	ecx, bpl
 	shr	ebp, 8
 	xor	ebx, DWORD PTR [r12+rcx*4+1024]
 	movzx	ecx, sil
 	shr	esi, 8
 	xor	r9d, DWORD PTR [r12+rcx*4+1024]
 	add	r12, 2048
 	movzx	ecx, r10b
 	shr	r10d, 8
 	add	r10d, 256
 	mov	r11d, DWORD PTR [r12+rax*4]
 	xor	r11d, DWORD PTR [r12+rcx*4]
 	xor	r11d, r9d
 	movzx	ecx, sil
 	mov	r10d, DWORD PTR [r12+r10*4]
 	shr	esi, 8
 	add	esi, 256
 	xor	r10d, DWORD PTR [r12+rcx*4]
 	movzx	ecx, bpl
 	xor	r10d, ebx
 	shr	ebp, 8
 	movd	xmm1, r11d
 	add	ebp, 256
 	/* r11 was used as scratch above; restore the memory base from xmm12. */
 	movq	r11, xmm12
 	mov	r9d, DWORD PTR [r12+rcx*4]
 	xor	r9d, DWORD PTR [r12+rsi*4]
 	mov	eax, DWORD PTR [r12+rbp*4]
 	xor	r9d, edi
 	movzx	ecx, r14b
 	movd	xmm0, r10d
 	movd	xmm2, r9d
 	xor	eax, DWORD PTR [r12+rcx*4]
 	mov	rcx, rdx
 	xor	eax, r15d
 	punpckldq xmm2, xmm1
 	/* rcx/rax/rdx = current offset ^ 16 / ^ 32 / ^ 48: the three neighbouring
 	   16-byte chunks read and rewritten by the shuffle step. */
 	xor	rcx, 16
 	movd	xmm6, eax
 	mov	rax, rdx
 	punpckldq xmm6, xmm0
 	xor	rax, 32
 	/* xmm6 now holds the four combined lookup dwords. */
 	punpckldq xmm6, xmm2
 	xor	rdx, 48
 	/* xmm6 ^= all three chunks and xmm7; the chunks are written back rotated:
 	   [^16] = chunk(^48) + xmm5, [^32] = chunk(^16) + xmm4, [^48] = chunk(^32) + xmm7. */
 	movdqu	xmm2, XMMWORD PTR [rcx+r11]
 	pxor xmm6, xmm2
 	pxor	xmm6, xmm7
 	paddq	xmm2, xmm4
 	movdqu	xmm1, XMMWORD PTR [rax+r11]
 	movdqu	xmm0, XMMWORD PTR [rdx+r11]
 	pxor xmm6, xmm1
 	pxor xmm6, xmm0
 	paddq	xmm0, xmm5
 	movdqu	XMMWORD PTR [rcx+r11], xmm0
 	movdqu	XMMWORD PTR [rax+r11], xmm2
 	movq rcx, xmm13
 	paddq	xmm1, xmm7
 	movdqu	XMMWORD PTR [rdx+r11], xmm1
 	/* rdi = low 64 bits of xmm6; r10 = rdi & 0x1FFFF0 is the second offset. */
 	movq	rdi, xmm6
 	mov	r10, rdi
 	and	r10d, 2097136
 	/* Store xmm6 ^ xmm4 back to the line loaded at the top of the iteration. */
 	movdqa	xmm0, xmm6
 	pxor	xmm0, xmm4
 	movdqu	XMMWORD PTR [r13], xmm0
 	/* rbx = ([rsp+144]+[rsp+148]) | (([rsp+152]+[rsp+156]) << 32), xored with
 	   the qword at the second offset. */
 	mov ebx, [rsp+144]
 	mov ebp, [rsp+152]
 	add ebx, [rsp+148]
 	add ebp, [rsp+156]
 	shl rbp, 32
 	or rbx, rbp
 	xor rbx, QWORD PTR [r10+r11]
 	lea	r14, QWORD PTR [r10+r11]
 	mov	rbp, QWORD PTR [r14+8]
 	/* Spill rbx, rdi, rbp and r10 to [rsp+160..191]; part2 reloads them. */
 	mov [rsp+160], rbx
 	mov [rsp+168], rdi
 	mov [rsp+176], rbp
 	mov [rsp+184], r10
 	/* Save the stack pointer in r10: esp is loaded with data below, so the
 	   stack must not be touched until part2 restores rsp. */
 	mov r10, rsp
 	/* Random-math inputs: ebx/esi/edi/ebp from the four locals, and
 	   esp/r15d/eax/edx/r9d from dwords of xmm7, xmm4 and xmm5. */
 	mov ebx, [rsp+144]
 	mov esi, [rsp+148]
 	mov edi, [rsp+152]
 	mov ebp, [rsp+156]
 	movd esp, xmm7
 	movaps xmm0, xmm7
 	psrldq xmm0, 8
 	movd r15d, xmm0
 	movd eax, xmm4
 	movd edx, xmm5
 	movaps xmm0, xmm5
 	psrldq xmm0, 8
 	movd r9d, xmm0
 	/* Second half of one iteration; runs right after the spliced-in
 	   random-math code. */
 FN_PREFIX(CryptonightR_soft_aes_template_part2):
 	/* Restore the real stack pointer saved in r10 before the random math. */
 	mov rsp, r10
 	/* Write the updated random-math values back to their locals. */
 	mov [rsp+144], ebx
 	mov [rsp+148], esi
 	mov [rsp+152], edi
 	mov [rsp+156], ebp
 	/* mov edi, edi / mov ebx, ebx clear the upper 32 bits.
 	   r8 ^= edi | (rbp << 32); [rsp+320] ^= ebx | (rsi << 32). */
 	mov edi, edi
 	shl rbp, 32
 	or rbp, rdi
 	xor r8, rbp
 	mov ebx, ebx
 	shl rsi, 32
 	or rsi, rbx
 	xor QWORD PTR [rsp+320], rsi
 	/* Reload the values spilled to [rsp+160..191] in the first half. */
 	mov rbx, [rsp+160]
 	mov rdi, [rsp+168]
 	mov rbp, [rsp+176]
 	mov r10, [rsp+184]
 	/* r9/rcx/r10 = second offset ^ 16 / ^ 32 / ^ 48 for the second shuffle. */
 	mov	r9, r10
 	xor	r9, 16
 	mov	rcx, r10
 	xor	rcx, 32
 	xor	r10, 48
 	/* rdx:rax = rbx * rdi (unsigned 64x64 -> 128). */
 	mov	rax, rbx
 	mul	rdi
 	movdqu	xmm2, XMMWORD PTR [r9+r11]
 	movdqu	xmm1, XMMWORD PTR [rcx+r11]
 	pxor xmm6, xmm2
 	pxor xmm6, xmm1
 	paddq	xmm1, xmm7
 	/* High half of the product is added to r8. */
 	add	r8, rdx
 	movdqu	xmm0, XMMWORD PTR [r10+r11]
 	pxor xmm6, xmm0
 	paddq	xmm0, xmm5
 	paddq	xmm2, xmm4
 	movdqu	XMMWORD PTR [r9+r11], xmm0
 	/* Rotate the history: xmm5 takes the old xmm4, xmm4 takes xmm6. */
 	movdqa	xmm5, xmm4
 	mov	r9, QWORD PTR [rsp+320]
 	movdqa	xmm4, xmm6
 	/* Low half of the product is added to r9. */
 	add	r9, rax
 	movdqu	XMMWORD PTR [rcx+r11], xmm2
 	movdqu	XMMWORD PTR [r10+r11], xmm1
 	/* Reload the function argument stored at entry ([rsp+8] before the pushes). */
 	mov	r10, QWORD PTR [rsp+304]
 	/* Bring the loop counter back from xmm11. */
 	movd r12d, xmm11
 	/* Write r8:r9 to the second line, then xor in the values read from it
 	   earlier (rbx, rbp) to form the next iteration's r8/r9 and offset. */
 	mov	QWORD PTR [r14], r8
 	xor	r8, rbx
 	mov	rax, r8
 	mov	QWORD PTR [r14+8], r9
 	and	eax, 2097136
 	xor	r9, rbp
 	mov	QWORD PTR [rsp+320], r9
 	mov	QWORD PTR [rsp+328], rax
 	sub	r12d, 1
 	/* The 4-byte displacement of this jump is rewritten by
 	   v4_soft_aes_compile_code() after the code is copied. */
 	jne	FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
 	/* Epilogue: undo everything part1 set up, in reverse order. */
 FN_PREFIX(CryptonightR_soft_aes_template_part3):
 	/* Restore xmm6-xmm13 from the slots part1 saved them to. */
 	movaps	xmm6, XMMWORD PTR [rsp+16]
 	movaps	xmm7, XMMWORD PTR [rsp+32]
 	movaps	xmm8, XMMWORD PTR [rsp+48]
 	movaps	xmm9, XMMWORD PTR [rsp+64]
 	movaps	xmm10, XMMWORD PTR [rsp+80]
 	movaps	xmm11, XMMWORD PTR [rsp+96]
 	movaps	xmm12, XMMWORD PTR [rsp+112]
 	movaps	xmm13, XMMWORD PTR [rsp+128]
 	/* Release the 232 bytes of locals and pop the eight saved registers. */
 	add	rsp, 232
 	pop	r15
 	pop	r14
 	pop	r13
 	pop	r12
 	pop	rdi
 	pop	rsi
 	pop	rbp
 	pop	rbx
 	ret
 	/* End marker: only its address is used, as the copy limit for part3. */
 FN_PREFIX(CryptonightR_soft_aes_template_end):
--- a/crypto/asm/CryptonightR_template.S
+++ b/crypto/asm/CryptonightR_template.S
--- a/crypto/asm/CryptonightR_template.h
+++ b/crypto/asm/CryptonightR_template.h
--- a/crypto/asm/CryptonightR_template.inc
+++ b/crypto/asm/CryptonightR_template.inc
--- a/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
+++ b/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
@ -94,7 +94,7 @@
 	lea	r9, QWORD PTR [rdx+r13]
 	movdqu	xmm15, XMMWORD PTR [r9]
-	ALIGN 16
+	ALIGN(64)
 main_loop_double_sandybridge:
 	movdqu	xmm9, xmm15
 	mov eax, edx
--- a/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
@ -0,0 +1,180 @@
 	/*
 	 * Prologue of the bulldozer main-loop body. rcx holds a pointer to the
 	 * input block on entry. After this section:
 	 *   rbx  = [rcx+224]
 	 *   ebp  = 524288 (0x80000), the loop counter
 	 *   r8   = [rcx+32] ^ [rcx],    r11 = [rcx+40] ^ [rcx+8]
 	 *   r10  = r8 & 2097136 (0x1FFFF0)
 	 *   xmm3 = { [rcx+48]^[rcx+16], [rcx+56]^[rcx+24] }
 	 *   xmm4 = { [rcx+80]^[rcx+64], [rcx+88]^[rcx+72] }
 	 *   rdi  = [rcx+104],           r15 = [rcx+96]
 	 *   xmm8 = 0,                   xmm7 = 1023 << 52
 	 * The loads are interleaved, so the instruction order below does not
 	 * follow this list.
 	 */
 	/* Spill rbx/rbp/rsi to [rsp+16..39]; the epilogue reloads them from there. */
 	mov	QWORD PTR [rsp+16], rbx
 	mov	QWORD PTR [rsp+24], rbp
 	mov	QWORD PTR [rsp+32], rsi
 	push	rdi
 	push	r12
 	push	r13
 	push	r14
 	push	r15
 	/* 64 bytes of locals: MXCSR scratch at [rsp]/[rsp+4], xmm6-xmm8 at +16..+63. */
 	sub	rsp, 64
 	/*
 	 * Save the current MXCSR at [rsp] and load 24448 (0x5F80) instead:
 	 * exception mask bits set and rounding-control field = 10b
 	 * (round toward +infinity in the Intel SDM MXCSR layout).
 	 */
 	stmxcsr DWORD PTR [rsp]
 	mov DWORD PTR [rsp+4], 24448
 	ldmxcsr DWORD PTR [rsp+4]
 	mov	rax, QWORD PTR [rcx+48]
 	/* r9 keeps the input pointer because rcx is overwritten further down. */
 	mov	r9, rcx
 	xor	rax, QWORD PTR [rcx+16]
 	mov	ebp, 524288
 	mov	r8, QWORD PTR [rcx+32]
 	xor	r8, QWORD PTR [rcx]
 	mov	r11, QWORD PTR [rcx+40]
 	mov	r10, r8
 	mov	rdx, QWORD PTR [rcx+56]
 	movq	xmm3, rax
 	xor	rdx, QWORD PTR [rcx+24]
 	xor	r11, QWORD PTR [rcx+8]
 	mov	rbx, QWORD PTR [rcx+224]
 	mov	rax, QWORD PTR [r9+80]
 	xor	rax, QWORD PTR [r9+64]
 	movq	xmm0, rdx
 	mov	rcx, QWORD PTR [rcx+88]
 	xor	rcx, QWORD PTR [r9+72]
 	mov	rdi, QWORD PTR [r9+104]
 	and	r10d, 2097136
 	/* Save xmm6, xmm7, xmm8 before they are overwritten. */
 	movaps	XMMWORD PTR [rsp+48], xmm6
 	movq	xmm4, rax
 	movaps	XMMWORD PTR [rsp+32], xmm7
 	movaps	XMMWORD PTR [rsp+16], xmm8
 	xorps	xmm8, xmm8
 	/*
 	 * xmm7 = 1023 << 52 (bit pattern of the double 1.0). "mov ax" only writes
 	 * the low 16 bits of rax, but the shift by 52 discards everything above
 	 * bit 11, so the stale upper bits do not matter.
 	 */
 	mov ax, 1023
 	shl rax, 52
 	movq xmm7, rax
 	mov	r15, QWORD PTR [r9+96]
 	punpcklqdq xmm3, xmm0
 	movq	xmm0, rcx
 	punpcklqdq xmm4, xmm0
 	/* Align the loop entry that follows. */
 	ALIGN(64)
 /*
  * One iteration of the bulldozer main loop; runs until ebp counts down to 0.
  * Loop-carried state: r8/r11 (scalar pair), r10 (masked offset into the
  * buffer at rbx), xmm3/xmm4 (previous vector values), rdi and r15
  * (integer results fed into the next iteration).
  */
 cnv2_main_loop_bulldozer:
 	/* xmm5 = 16 bytes at rbx+r10; xmm6 = { r8, r11 }; rdx = &buffer[r10]. */
 	movdqa	xmm5, XMMWORD PTR [r10+rbx]
 	movq xmm6, r8
 	pinsrq xmm6, r11, 1
 	lea	rdx, QWORD PTR [r10+rbx]
 	/* r9 = 2 * rdi, then rdi <<= 32 (both derived from last iteration's rdi). */
 	lea	r9, QWORD PTR [rdi+rdi]
 	shl	rdi, 32
 	/* Sibling offsets: ecx = r10^16, eax = r10^32, r10d = r10^48. */
 	mov	ecx, r10d
 	mov	eax, r10d
 	xor	ecx, 16
 	xor	eax, 32
 	xor	r10d, 48
 	/* One hardware AES round of xmm5 with xmm6 as the round key. */
 	aesenc	xmm5, xmm6
 	/*
 	 * Load the three sibling chunks, add xmm3 / xmm6 / xmm4 and store them
 	 * back rotated by one position:
 	 *   [rbx+rcx] <- old [rbx+r10] + xmm4
 	 *   [rbx+rax] <- old [rbx+rcx] + xmm3
 	 *   [rbx+r10] <- old [rbx+rax] + xmm6
 	 */
 	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
 	movdqa	xmm1, XMMWORD PTR [rax+rbx]
 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	paddq	xmm2, xmm3
 	paddq	xmm1, xmm6
 	paddq	xmm0, xmm4
 	movdqa	XMMWORD PTR [rcx+rbx], xmm0
 	movdqa	XMMWORD PTR [rax+rbx], xmm2
 	movdqa	XMMWORD PTR [r10+rbx], xmm1
 	/* xmm1 = 0 (copy of xmm8) so sqrtsd below leaves a clean upper half. */
 	movaps	xmm1, xmm8
 	/* rsi = r15 ^ (old rdi << 32). */
 	mov	rsi, r15
 	xor	rsi, rdi
 	/* rdi = 1023 << 52, the exponent bias added before the square root. */
 	mov edi, 1023
 	shl rdi, 52
 	/* r14 = low qword of the AES result, rax = high qword. */
 	movq	r14, xmm5
 	pextrq rax, xmm5, 1
 	/* Write (AES result ^ xmm3) back to the chunk that was read at loop entry. */
 	movdqa	xmm0, xmm5
 	pxor	xmm0, xmm3
 	/* Next offset: r10 = r14 & 0x1FFFF0. */
 	mov	r10, r14
 	and	r10d, 2097136
 	movdqa	XMMWORD PTR [rdx], xmm0
 	/* rsi ^= [rbx+r10]; r12 = &buffer[r10]; r13 = [rbx+r10+8]. */
 	xor	rsi, QWORD PTR [r10+rbx]
 	lea	r12, QWORD PTR [r10+rbx]
 	mov	r13, QWORD PTR [r10+rbx+8]
 	/*
 	 * Divisor: r9d = (r9d + r14d) | 0x80000001 (-2147483647 as 32 bits), so it
 	 * is odd, has bit 31 set and the upper half of r9 is zero.
 	 * div: rax = (0:rax) / r9, rdx = remainder.
 	 */
 	add	r9d, r14d
 	or	r9d, -2147483647
 	xor	edx, edx
 	div	r9
 	/* r15 = (quotient truncated to 32 bits) + (remainder << 32). */
 	mov	eax, eax
 	shl	rdx, 32
 	lea	r15, [rax+rdx]
 	/* Square-root input: ((r14 + r15) >> 12) + (1023 << 52), taken as a double. */
 	lea	rax, [r14+r15]
 	shr	rax, 12
 	add	rax, rdi
 	movq	xmm0, rax
 	sqrtsd	xmm1, xmm0
 	movq	rdi, xmm1
 	/*
 	 * If the low 19 bits (524287 = 0x7FFFF) of the result are all zero, take
 	 * the out-of-line fixup path; otherwise rdi = result bits >> 19.
 	 */
 	test	rdi, 524287
 	je	sqrt_fixup_bulldozer
 	shr	rdi, 19
 /* Both the fast path and the fixup path continue here with rdi final. */
 sqrt_fixup_bulldozer_ret:
 	/* rdx:rax = rsi * r14; xmm0 = { high half, low half }. */
 	mov	rax, rsi
 	mul	r14
 	movq xmm1, rax
 	movq xmm0, rdx
 	punpcklqdq xmm0, xmm1
 	/* Sibling offsets of the new r10: r9d = ^16, ecx = ^32, r10d = ^48. */
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
 	/* Fold the ^32 chunk into the product halves before it is overwritten. */
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
 	xor rdx, [rcx+rbx]
 	xor rax, [rcx+rbx+8]
 	/*
 	 * Second rotated shuffle:
 	 *   [rbx+r9]  <- old [rbx+r10] + xmm4
 	 *   [rbx+rcx] <- (old [rbx+r9] ^ xmm0) + xmm3
 	 *   [rbx+r10] <- old [rbx+rcx] + xmm6
 	 */
 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
 	pxor xmm2, xmm0
 	paddq xmm4, XMMWORD PTR [r10+rbx]
 	paddq	xmm2, xmm3
 	paddq	xmm1, xmm6
 	movdqa	XMMWORD PTR [r9+rbx], xmm4
 	movdqa	XMMWORD PTR [rcx+rbx], xmm2
 	movdqa	XMMWORD PTR [r10+rbx], xmm1
 	/* Shift the vector history: xmm4 <- xmm3 (xmm3 <- xmm5 follows below). */
 	movdqa	xmm4, xmm3
 	/* r8 += rdx, r11 += rax; store the pair at [r12], then XOR in rsi / r13. */
 	add	r8, rdx
 	add	r11, rax
 	mov	QWORD PTR [r12], r8
 	xor	r8, rsi
 	mov	QWORD PTR [r12+8], r11
 	mov	r10, r8
 	xor	r11, r13
 	/* Offset for the next iteration: r10 = r8 & 0x1FFFF0. */
 	and	r10d, 2097136
 	movdqa	xmm3, xmm5
 	dec	ebp
 	jne	cnv2_main_loop_bulldozer
 	/*
 	 * Epilogue: undo everything the prologue of this body set up.
 	 * Restore the MXCSR value that was saved at [rsp].
 	 */
 	ldmxcsr DWORD PTR [rsp]
 	movaps	xmm6, XMMWORD PTR [rsp+48]
 	/* r11 = rsp + 64, i.e. the stack pointer as it was before "sub rsp, 64". */
 	lea	r11, QWORD PTR [rsp+64]
 	/*
 	 * rbx/rbp/rsi were spilled at entry-rsp+16/+24/+32. Five pushes (40 bytes)
 	 * sit between r11 and the entry rsp, hence the offsets 56/64/72.
 	 */
 	mov	rbx, QWORD PTR [r11+56]
 	mov	rbp, QWORD PTR [r11+64]
 	mov	rsi, QWORD PTR [r11+72]
 	/* [r11-48] is the same slot as [rsp+16], where xmm8 was saved. */
 	movaps	xmm8, XMMWORD PTR [r11-48]
 	movaps	xmm7, XMMWORD PTR [rsp+32]
 	mov	rsp, r11
 	pop	r15
 	pop	r14
 	pop	r13
 	pop	r12
 	pop	rdi
 	/* Skip over the out-of-line sqrt fixup code that follows. */
 	jmp cnv2_main_loop_bulldozer_endp
 /*
  * Out-of-line path taken from the main loop when the low 19 bits of the
  * sqrtsd result are all zero. It derives rdi with integer arithmetic and
  * jumps back to sqrt_fixup_bulldozer_ret.
  * NOTE(review): presumably this corrects the rounded square root against
  * the exact integer input - confirm against the reference implementation.
  */
 sqrt_fixup_bulldozer:
 	/* r9 = low qword of xmm5 + r15 (the same sum the loop fed to the sqrt). */
 	movq r9, xmm5
 	add r9, r15
 	dec	rdi
 	/* rdx = 0xFFFFFC02 << 32 ("mov edx, -1022" zero-extends into rdx). */
 	mov edx, -1022
 	shl rdx, 32
 	/* rdi = (bits - 1) >> 19, rax = (bits - 1) >> 20. */
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	/* rcx = (rdi - rax + rdx + 1) * (rax + rdx). */
 	mov	rcx, rdi
 	sub	rcx, rax
 	lea	rcx, [rcx+rdx+1]
 	add	rax, rdx
 	imul	rcx, rax
 	/* The subtraction sets CF when rcx < r9 (unsigned); adc adds that bit to rdi. */
 	sub	rcx, r9
 	adc	rdi, 0
 	jmp	sqrt_fixup_bulldozer_ret
 /* End label; the epilogue jumps here to skip the fixup code. */
 cnv2_main_loop_bulldozer_endp:
--- a/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
@ -50,7 +50,7 @@
 	punpcklqdq xmm5, xmm0
 	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
-	ALIGN 16
+	ALIGN(64)
 main_loop_ivybridge:
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
--- a/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
@ -45,7 +45,7 @@
 	movq	xmm0, rcx
 	punpcklqdq xmm4, xmm0
-	ALIGN 16
+	ALIGN(64)
 main_loop_ryzen:
 	movdqa	xmm5, XMMWORD PTR [r10+rbx]
 	movq	xmm0, r11
--- a/crypto/asm/cnv2_main_loop.S
+++ b/crypto/asm/cnv2_main_loop.S
@ -1,4 +1,8 @@
-#define ALIGN .align
+#ifdef __APPLE__
 #   define ALIGN(x) .align 6
 #else
 #   define ALIGN(x) .align 64
 #endif
 .intel_syntax noprefix
 #ifdef __APPLE__
 #   define FN_PREFIX(fn) _ ## fn
@ -9,29 +13,42 @@
 #endif
 .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
 .global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
 .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
-ALIGN 16
+ALIGN(64)
 FN_PREFIX(cnv2_mainloop_ivybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cnv2_main_loop_ivybridge.inc"
+	#include "cn2/cnv2_main_loop_ivybridge.inc"
 	add rsp, 48
 	ret 0
 	mov eax, 3735929054
-ALIGN 16
+ALIGN(64)
 FN_PREFIX(cnv2_mainloop_ryzen_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cnv2_main_loop_ryzen.inc"
+	#include "cn2/cnv2_main_loop_ryzen.inc"
 	add rsp, 48
 	ret 0
 	mov eax, 3735929054
-ALIGN 16
+ALIGN(64)
 FN_PREFIX(cnv2_mainloop_bulldozer_asm):
 	sub rsp, 48
 	mov rcx, rdi
 	#include "cn2/cnv2_main_loop_bulldozer.inc"
 	add rsp, 48
 	ret 0
 	mov eax, 3735929054
 ALIGN(64)
 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
 	mov rdx, rsi
-	#include "cnv2_double_main_loop_sandybridge.inc"
+	#include "cn2/cnv2_double_main_loop_sandybridge.inc"
 	add rsp, 48
 	ret 0
 	mov eax, 3735929054
--- a/crypto/asm/cnv2_main_loop.asm
+++ b/crypto/asm/cnv2_main_loop.asm
@ -1,25 +0,0 @@
 ; MASM wrapper for the CryptoNight v2 main loops.
 ; Each PROC consists of one included loop body followed by "ret 0".
 ; All three live in a dedicated page-aligned, read/execute segment.
 _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
 ; Entry points exported to the linker.
 PUBLIC cnv2_mainloop_ivybridge_asm
 PUBLIC cnv2_mainloop_ryzen_asm
 PUBLIC cnv2_double_mainloop_sandybridge_asm
 ; Every entry point starts on a 64-byte boundary.
 ALIGN 64
 cnv2_mainloop_ivybridge_asm PROC
 	INCLUDE cnv2_main_loop_ivybridge.inc
 	ret 0
 cnv2_mainloop_ivybridge_asm ENDP
 ALIGN 64
 cnv2_mainloop_ryzen_asm PROC
 	INCLUDE cnv2_main_loop_ryzen.inc
 	ret 0
 cnv2_mainloop_ryzen_asm ENDP
 ALIGN 64
 ; Variant built from the "double" Sandy Bridge loop body.
 cnv2_double_mainloop_sandybridge_asm PROC
 	INCLUDE cnv2_double_main_loop_sandybridge.inc
 	ret 0
 cnv2_double_mainloop_sandybridge_asm ENDP
 _TEXT_CNV2_MAINLOOP ENDS
 END
--- a/crypto/asm/win64/cn_main_loop.S
+++ b/crypto/asm/win64/cn_main_loop.S
@ -0,0 +1,31 @@
 /*
  * GNU-assembler (Intel syntax) wrapper for the CryptoNight v2 main loops in
  * the win64 directory. Each entry point is one shared loop body from
  * ../cn2/ followed by "ret 0"; no register shuffling is done here, the
  * bodies are included as-is.
  */
 /* ALIGN(x) ignores its argument and always emits ".align 64". */
 #define ALIGN(x) .align 64
 .intel_syntax noprefix
 .section .text
 .global cnv2_mainloop_ivybridge_asm
 .global cnv2_mainloop_ryzen_asm
 .global cnv2_mainloop_bulldozer_asm
 .global cnv2_double_mainloop_sandybridge_asm
 ALIGN(64)
 cnv2_mainloop_ivybridge_asm:
 	#include "../cn2/cnv2_main_loop_ivybridge.inc"
 	ret 0
 	/*
 	 * 3735929054 == 0xDEADC0DE. This instruction sits after the ret, so it is
 	 * never reached by fall-through.
 	 * NOTE(review): looks like an end-of-function marker - confirm who scans for it.
 	 */
 	mov eax, 3735929054
 ALIGN(64)
 cnv2_mainloop_ryzen_asm:
 	#include "../cn2/cnv2_main_loop_ryzen.inc"
 	ret 0
 	mov eax, 3735929054
 ALIGN(64)
 cnv2_mainloop_bulldozer_asm:
 	#include "../cn2/cnv2_main_loop_bulldozer.inc"
 	ret 0
 	mov eax, 3735929054
 ALIGN(64)
 /* Variant built from the "double" Sandy Bridge loop body. */
 cnv2_double_mainloop_sandybridge_asm:
 	#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
 	ret 0
 	mov eax, 3735929054
--- a/crypto/asm/win64/cnv2_main_loop.S
+++ b/crypto/asm/win64/cnv2_main_loop.S
@ -1,21 +0,0 @@
 /*
  * GNU-assembler (Intel syntax) wrapper for the CryptoNight v2 main loops in
  * the win64 directory: three entry points, each one included loop body
  * followed by "ret 0".
  */
 /* ALIGN is an object-like macro here, so "ALIGN 16" becomes ".align 16". */
 #define ALIGN .align
 .intel_syntax noprefix
 .section .text
 .global cnv2_mainloop_ivybridge_asm
 .global cnv2_mainloop_ryzen_asm
 .global cnv2_double_mainloop_sandybridge_asm
 ALIGN 16
 cnv2_mainloop_ivybridge_asm:
 	#include "../cnv2_main_loop_ivybridge.inc"
 	ret 0
 ALIGN 16
 cnv2_mainloop_ryzen_asm:
 	#include "../cnv2_main_loop_ryzen.inc"
 	ret 0
 ALIGN 16
 /* Variant built from the "double" Sandy Bridge loop body. */
 cnv2_double_mainloop_sandybridge_asm:
 	#include "../cnv2_double_main_loop_sandybridge.inc"
 	ret 0
--- a/crypto/soft_aes.c
+++ b/crypto/soft_aes.c
@ -1,212 +0,0 @@
 /*
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  * Additional permission under GNU GPL version 3 section 7
  *
  * If you modify this Program, or any covered work, by linking or combining
  * it with OpenSSL (or a modified version of that library), containing parts
  * covered by the terms of OpenSSL License and SSLeay License, the licensors
  * of this Program grant you additional permission to convey the resulting work.
  *
  */
 /*
 * The original author of this AES implementation is Karl Malbrain.
 */
 #ifdef __GNUC__
 #include <x86intrin.h>
 #else
 #include <intrin.h>
 #endif // __GNUC__
 #include <inttypes.h>
 /* Alignment, in bytes, requested for the lookup tables declared with ALIGN. */
 #define TABLE_ALIGN     32
 /* Reduction polynomial used by the f2/f4/f8 multiplication helpers. */
 #define WPOLY           0x011b
 #define N_COLS          4
 #define AES_BLOCK_SIZE  16
 /* Evaluates to 10, the number of entries listed in rc_data. */
 #define RC_LENGTH       (5 * (AES_BLOCK_SIZE / 4 - 2))
 /*
  * ALIGN: storage-alignment qualifier for the tables.
  * Every compiler branch now honours TABLE_ALIGN; previously the GCC/Clang
  * branch hard-coded 16 and so disagreed with the MSVC branch.
  * Compilers that match neither branch get no alignment request.
  */
 #if defined(_MSC_VER)
 #define ALIGN __declspec(align(TABLE_ALIGN))
 #elif defined(__GNUC__)
 #define ALIGN __attribute__ ((aligned(TABLE_ALIGN)))
 #else
 #define ALIGN
 #endif
 /*
  * Helper macros for moving a 4-word AES state in and out of byte buffers
  * and for expressing one table-driven round.
  */
 /* Byte selector handed to four_tables(): picks byte r, ignoring column c. */
 #define rf1(r,c) (r)
 /* Read / write the c-th 32-bit word of buffer x (x is cast to uint32_t*). */
 #define word_in(x,c) (*((uint32_t*)(x)+(c)))
 #define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
 /* State element access: s(x,c) is simply x[c]. */
 #define s(x,c) x[c]
 /* si: load word c of buffer x into state y; so: store state word c of x into buffer y. */
 #define si(y,x,c) (s(y,c) = word_in(x, c))
 #define so(y,x,c) word_out(y, c, s(x,c))
 /* Copy all four words. These expand to several statements, not one expression. */
 #define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
 #define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
 /*
  * round(y,x,k): y[i] = k[i] ^ (four t_fn lookups). Output word i takes
  * byte 0 of x[i], byte 1 of x[i+1], byte 2 of x[i+2] and byte 3 of x[i+3],
  * with the indices taken modulo 4.
  */
 #define round(y,x,k) \
 y[0] = (k)[0]  ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \
 y[1] = (k)[1]  ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \
 y[2] = (k)[2]  ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \
 y[3] = (k)[3]  ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]);
 /* Low byte of x, and byte n of x (n = 0 is the least significant byte). */
 #define to_byte(x) ((x) & 0xff)
 #define bval(x,n) to_byte((x) >> (8 * (n)))
 /* fwd_var(x,r,c) selects state word (r + c) mod 4 for constant r and c. */
 #define fwd_var(x,r,c)\
 ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
 : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
 : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
 :          ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
 /* One output column of a forward round, built from four_tables() and the t_fn table. */
 #define fwd_rnd(y,x,k,c)  (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
 /*
  * sb_data(w): brace-enclosed initializer holding the 256 forward S-box
  * bytes, each wrapped in the caller-supplied macro w(). d_4() expands it
  * once per table with a different w to build the four lookup tables.
  */
 #define sb_data(w) {\
    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
 /* rc_data(w): initializer with ten constants, each wrapped in w(). */
 #define rc_data(w) {\
    w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
    w(0x1b), w(0x36) }
 /* Pack four bytes into a 32-bit word; b0 becomes the least significant byte. */
 #define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
 /* Element wrappers for sb_data()/rc_data(): h0 is the identity, wN places p in byte N. */
 #define h0(x)   (x)
 #define w0(p)   bytes2word(p, 0, 0, 0)
 #define w1(p)   bytes2word(0, p, 0, 0)
 #define w2(p)   bytes2word(0, 0, p, 0)
 #define w3(p)   bytes2word(0, 0, 0, p)
 /*
  * u0 packs the bytes (2*p, p, p, 3*p), least significant first, using the
  * f2/f3 multiplications below; u1..u3 are the same word rotated by one
  * byte each. These are the wrappers used to build t_fn.
  */
 #define u0(p)   bytes2word(f2(p), p, p, f3(p))
 #define u1(p)   bytes2word(f3(p), f2(p), p, p)
 #define u2(p)   bytes2word(p, f3(p), f2(p), p)
 #define u3(p)   bytes2word(p, p, f3(p), f2(p))
 /* v0..v3: the same construction with the multipliers 0x0e, 0x09, 0x0d, 0x0b. */
 #define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))
 #define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))
 #define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))
 #define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))
 /*
  * Multiplication of a byte by 2, 4 and 8: shift left, then XOR in WPOLY once
  * for each bit that was shifted past bit 7. The arguments are not
  * parenthesized and are evaluated several times, so pass simple constants only.
  */
 #define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
 #define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
 #define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
 /* Remaining multipliers as XOR combinations of x, f2, f4 and f8. */
 #define f3(x)   (f2(x) ^ x)
 #define f9(x)   (f8(x) ^ x)
 #define fb(x)   (f8(x) ^ f2(x) ^ x)
 #define fd(x)   (f8(x) ^ f4(x) ^ x)
 #define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
 /* Table-name builders: all three paste to t_<m><n>, e.g. t_dec(f,n) -> t_fn. */
 #define t_dec(m,n) t_##m##n
 #define t_set(m,n) t_##m##n
 #define t_use(m,n) t_##m##n
 /* d_4: define "ALIGN const t n[4][256]" from data set b and four element wrappers. */
 #define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
 /* XOR of four table lookups; vf picks the state word and rf the byte index for each table. */
 #define four_tables(x,tab,vf,rf,c) \
    (tab[0][bval(vf(x,0,c),rf(0,c))] \
    ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
    ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
    ^ tab[3][bval(vf(x,3,c),rf(3,c))])
 /* The forward round table: uint32_t t_fn[4][256], built from sb_data with u0..u3. */
 d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
 /*
  * Table-driven software AES round.
  *
  * Splits `in` into four 32-bit columns, combines four t_fn lookups per
  * output column (byte 0 of column i, byte 1 of column i+1, byte 2 of
  * column i+2, byte 3 of column i+3, indices modulo 4) and XORs the
  * result with `key`.
  */
 __m128i soft_aesenc(__m128i in, __m128i key)
 {
    /* Extract the four 32-bit lanes, lane 0 first. */
    const uint32_t col0 = _mm_cvtsi128_si32(in);
    const uint32_t col1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
    const uint32_t col2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
    const uint32_t col3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));

    const uint32_t out0 = t_fn[0][col0 & 0xff] ^ t_fn[1][(col1 >> 8) & 0xff]
                        ^ t_fn[2][(col2 >> 16) & 0xff] ^ t_fn[3][col3 >> 24];
    const uint32_t out1 = t_fn[0][col1 & 0xff] ^ t_fn[1][(col2 >> 8) & 0xff]
                        ^ t_fn[2][(col3 >> 16) & 0xff] ^ t_fn[3][col0 >> 24];
    const uint32_t out2 = t_fn[0][col2 & 0xff] ^ t_fn[1][(col3 >> 8) & 0xff]
                        ^ t_fn[2][(col0 >> 16) & 0xff] ^ t_fn[3][col1 >> 24];
    const uint32_t out3 = t_fn[0][col3 & 0xff] ^ t_fn[1][(col0 >> 8) & 0xff]
                        ^ t_fn[2][(col1 >> 16) & 0xff] ^ t_fn[3][col2 >> 24];

    /* _mm_set_epi32 takes the highest lane first. */
    const __m128i mixed = _mm_set_epi32(out3, out2, out1, out0);
    return _mm_xor_si128(mixed, key);
 }
 /*
  * Byte substitution table indexed by the input byte value; sub_word()
  * below reads it to substitute each byte of a 32-bit word.
  */
 uint8_t Sbox[256] = {       // forward s-box
 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
 /* Replace each of the four bytes at `key` with its Sbox entry, in place. */
 static inline void sub_word(uint8_t* key)
 {
    for (int i = 0; i < 4; ++i) {
        key[i] = Sbox[key[i]];
    }
 }
 #ifdef __clang__
 /*
  * Rotate `value` right by `amount` bits (amount is expected to be 0..31).
  * Only compiled for clang; the left-shift count is masked with 31 so that
  * amount == 0 does not shift by 32.
  */
 uint32_t _rotr(uint32_t value, uint32_t amount)
 {
    const uint32_t left_shift = (32 - amount) & 31;
    const uint32_t low_part   = value >> amount;
    const uint32_t high_part  = value << left_shift;

    return low_part | high_part;
 }
 #endif
 /*
  * Software key-generation assist step.
  *
  * Takes 32-bit lanes 1 and 3 of `key`, substitutes their bytes through
  * sub_word(), and returns, from lane 0 upwards:
  *   { sub1, rotr(sub1, 8) ^ rcon, sub3, rotr(sub3, 8) ^ rcon }.
  */
 __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
 {
    uint32_t lane1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t lane3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));

    sub_word((uint8_t*)&lane1);
    sub_word((uint8_t*)&lane3);

    const uint32_t rotated1 = _rotr(lane1, 8) ^ rcon;
    const uint32_t rotated3 = _rotr(lane3, 8) ^ rcon;

    /* _mm_set_epi32 takes the highest lane first. */
    return _mm_set_epi32(rotated3, lane3, rotated1, lane1);
 }
--- a/Show More
+++ b/Show More