From a4d5d0a75a00f7463215d2c57a33ee982e68f4b3 Mon Sep 17 00:00:00 2001
From: SChernykh
Date: Sun, 28 Nov 2021 16:10:09 +0100
Subject: [PATCH] Added VAES support for Cryptonight variants

---
 CHANGELOG.md                              |  12 +
 CMakeLists.txt                            |   8 +
 src/backend/cpu/CpuWorker.cpp             |  37 +-
 src/backend/cpu/interfaces/ICpuInfo.h     |   2 +
 src/backend/cpu/platform/BasicCpuInfo.cpp |   6 +-
 src/backend/cpu/platform/BasicCpuInfo.h   |   1 +
 src/crypto/cn/CnHash.cpp                  |   2 +-
 src/crypto/cn/CryptoNight_test.h          |  23 +
 src/crypto/cn/CryptoNight_x86.h           | 103 ++++-
 src/crypto/cn/CryptoNight_x86_vaes.cpp    | 513 ++++++++++++++++++++++
 src/crypto/cn/CryptoNight_x86_vaes.h      |  48 ++
 src/crypto/ghostrider/README.md           |   4 +-
 src/crypto/ghostrider/ghostrider.cpp      |  28 +-
 src/crypto/ghostrider/ghostrider.h        |   2 +-
 14 files changed, 749 insertions(+), 40 deletions(-)
 create mode 100644 src/crypto/cn/CryptoNight_x86_vaes.cpp
 create mode 100644 src/crypto/cn/CryptoNight_x86_vaes.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 309fa592..e0d97b6d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+# v6.16.1
+- [#2729](https://github.com/xmrig/xmrig/pull/2729) GhostRider fixes:
+  - Added average hashrate display
+  - Fixed `--threads` or `-t` command line option (though `--cpu-max-threads-hint` is the recommended way to limit threads)
+  - Fixed the number of threads shown at startup
+- [#2738](https://github.com/xmrig/xmrig/pull/2738) GhostRider fixes:
+  - Fixed "difficulty is not a number" error when diff is high on some pools
+  - Fixed GhostRider compilation when WITH_KAWPOW=OFF
+- [#2740](https://github.com/xmrig/xmrig/pull/2740) **Added VAES support for Cryptonight variants** (+4% speedup on Zen3)
+  - VAES instructions are available on Intel Ice Lake/AMD Zen3 and newer CPUs.
+  - +4% speedup on Ryzen 5 5600X.
+
 # v6.16.0
 - [#2712](https://github.com/xmrig/xmrig/pull/2712) **GhostRider algorithm (Raptoreum) support**: read the [RELEASE NOTES](src/crypto/ghostrider/README.md) for quick start guide and performance comparisons.
 - [#2682](https://github.com/xmrig/xmrig/pull/2682) Fixed: use cn-heavy optimization only for Vermeer CPUs.
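For context on the new `vaes` feature flag wired through `ICpuInfo`/`BasicCpuInfo` below: VAES presence is reported by CPUID leaf 7 (sub-leaf 0), ECX bit 9, which is exactly the bit the patch's `has_vaes()` queries. A minimal standalone sketch of the same check, assuming GCC/Clang and `<cpuid.h>` (the helper name `cpu_has_vaes` is illustrative, not from the patch):

```cpp
#include <cpuid.h>
#include <cstdio>

// Query CPUID leaf 7, sub-leaf 0: VAES is ECX bit 9 (same bit has_vaes() tests).
static bool cpu_has_vaes()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        return false; // CPUID leaf 7 not supported on this CPU
    }
    return (ecx & (1u << 9)) != 0;
}

int main()
{
    std::printf("VAES: %s\n", cpu_has_vaes() ? "yes" : "no");
    return 0;
}
```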
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7f9c98b..b8c8d1c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,14 @@ if (CMAKE_C_COMPILER_ID MATCHES GNU)
     set_source_files_properties(src/crypto/cn/CnHash.cpp PROPERTIES COMPILE_FLAGS "-Ofast -fno-tree-vectorize")
 endif()
 
+if (NOT XMRIG_ARM)
+    set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_x86_vaes.h)
+    set(SOURCES_CRYPTO "${SOURCES_CRYPTO}" src/crypto/cn/CryptoNight_x86_vaes.cpp)
+    if (CMAKE_C_COMPILER_ID MATCHES GNU)
+        set_source_files_properties(src/crypto/cn/CryptoNight_x86_vaes.cpp PROPERTIES COMPILE_FLAGS "-Ofast -fno-tree-vectorize -mavx2 -mvaes")
+    endif()
+endif()
+
 if (WITH_HWLOC)
     list(APPEND HEADERS_CRYPTO
         src/crypto/common/NUMAMemoryPool.h
diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp
index 9d3fa093..32a7a6be 100644
--- a/src/backend/cpu/CpuWorker.cpp
+++ b/src/backend/cpu/CpuWorker.cpp
@@ -161,14 +161,14 @@ bool xmrig::CpuWorker<N>::selfTest()
     }
 #   endif
 
+    allocateCnCtx();
+
 #   ifdef XMRIG_ALGO_GHOSTRIDER
     if (m_algorithm.family() == Algorithm::GHOSTRIDER) {
-        return N == 8;
+        return (N == 8) && verify(Algorithm::GHOSTRIDER_RTM, test_output_gr);
     }
 #   endif
 
-    allocateCnCtx();
-
     if (m_algorithm.family() == Algorithm::CN) {
         const bool rc = verify(Algorithm::CN_0, test_output_v0) &&
                         verify(Algorithm::CN_1, test_output_v1) &&
@@ -397,6 +397,37 @@ bool xmrig::CpuWorker<N>::nextRound()
 template<size_t N>
 bool xmrig::CpuWorker<N>::verify(const Algorithm &algorithm, const uint8_t *referenceValue)
 {
+#   ifdef XMRIG_ALGO_GHOSTRIDER
+    if (algorithm == Algorithm::GHOSTRIDER_RTM) {
+        uint8_t blob[N * 80] = {};
+        for (size_t i = 0; i < N; ++i) {
+            blob[i * 80 + 0] = static_cast<uint8_t>(i);
+            blob[i * 80 + 4] = 0x10;
+            blob[i * 80 + 5] = 0x02;
+        }
+
+        uint8_t hash1[N * 32] = {};
+        ghostrider::hash_octa(blob, 80, hash1, m_ctx, 0, false);
+
+        for (size_t i = 0; i < N; ++i) {
+            blob[i * 80 + 0] = static_cast<uint8_t>(i);
+            blob[i * 80 + 4] = 0x43;
+            blob[i * 80 + 5] = 0x05;
+        }
+
+        uint8_t hash2[N * 32] = {};
+        ghostrider::hash_octa(blob, 80, hash2, m_ctx, 0, false);
+
+        for (size_t i = 0; i < N * 32; ++i) {
+            if ((hash1[i] ^ hash2[i]) != referenceValue[i]) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+#   endif
+
     cn_hash_fun func = fn(algorithm);
     if (!func) {
         return false;
diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h
index 54e6aabf..f2b56009 100644
--- a/src/backend/cpu/interfaces/ICpuInfo.h
+++ b/src/backend/cpu/interfaces/ICpuInfo.h
@@ -61,6 +61,7 @@ public:
     enum Flag : uint32_t {
         FLAG_AES,
+        FLAG_VAES,
         FLAG_AVX,
         FLAG_AVX2,
         FLAG_AVX512F,
@@ -90,6 +91,7 @@ public:
     virtual Assembly::Id assembly() const = 0;
     virtual bool has(Flag feature) const = 0;
    virtual bool hasAES() const = 0;
+    virtual bool hasVAES() const = 0;
     virtual bool hasAVX() const = 0;
     virtual bool hasAVX2() const = 0;
     virtual bool hasBMI2() const = 0;
diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp
index 0a5c17db..d64612e5 100644
--- a/src/backend/cpu/platform/BasicCpuInfo.cpp
+++ b/src/backend/cpu/platform/BasicCpuInfo.cpp
@@ -52,8 +52,8 @@ namespace xmrig {
 
-constexpr size_t kCpuFlagsSize = 14;
-static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
+constexpr size_t kCpuFlagsSize = 15;
+static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
 
 static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
@@ -140,6 +140,7 @@ static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO,
 static inline bool has_aes_ni()  { return has_feature(PROCESSOR_INFO,     ECX_Reg, 1 << 25); }
 static inline bool has_avx()     { return has_feature(PROCESSOR_INFO,     ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); }
 static inline bool has_avx2()    { return has_feature(EXTENDED_FEATURES,  EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); }
+static inline bool has_vaes()    { return has_feature(EXTENDED_FEATURES,  ECX_Reg, 1 << 9); }
 static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES,  EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); }
 static inline bool has_bmi2()    { return has_feature(EXTENDED_FEATURES,  EBX_Reg, 1 << 8); }
 static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); }
@@ -178,6 +179,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
     m_flags.set(FLAG_AES,     has_aes_ni());
     m_flags.set(FLAG_AVX,     has_avx());
     m_flags.set(FLAG_AVX2,    has_avx2());
+    m_flags.set(FLAG_VAES,    has_vaes());
     m_flags.set(FLAG_AVX512F, has_avx512f());
     m_flags.set(FLAG_BMI2,    has_bmi2());
     m_flags.set(FLAG_OSXSAVE, has_osxsave());
diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h
index 574b8c9d..9405fa75 100644
--- a/src/backend/cpu/platform/BasicCpuInfo.h
+++ b/src/backend/cpu/platform/BasicCpuInfo.h
@@ -44,6 +44,7 @@ protected:
     inline Assembly::Id assembly() const override { return m_assembly; }
     inline bool has(Flag flag) const override     { return m_flags.test(flag); }
     inline bool hasAES() const override           { return has(FLAG_AES); }
+    inline bool hasVAES() const override          { return has(FLAG_VAES); }
     inline bool hasAVX() const override           { return has(FLAG_AVX); }
     inline bool hasAVX2() const override          { return has(FLAG_AVX2); }
     inline bool hasBMI2() const override          { return has(FLAG_BMI2); }
diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp
index 2f57e44d..0df12bca 100644
--- a/src/crypto/cn/CnHash.cpp
+++ b/src/crypto/cn/CnHash.cpp
@@ -348,7 +348,7 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av,
 
 #   ifdef XMRIG_ALGO_CN_HEAVY
     // cn-heavy optimization for Zen3 CPUs
-    if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3)) {
+    if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) && (Cpu::info()->model() == 0x21)) {
         switch (algorithm.id()) {
         case Algorithm::CN_HEAVY_0:
             return cryptonight_single_hash<Algorithm::CN_HEAVY_0, false, 3>;
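The CnHash.cpp change above tightens the cn-heavy tweak from "any Zen 3" to Vermeer specifically by also requiring `Cpu::info()->model() == 0x21`: desktop Vermeer parts (e.g. the Ryzen 5 5600X) report CPUID family 0x19, model 0x21, while other Zen 3 designs such as Cezanne APUs report a different model. A sketch of how that display model is derived from CPUID leaf 1 on AMD (standalone and illustrative, not the patch's own helper):

```cpp
#include <cpuid.h>
#include <cstdio>

int main()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    __get_cpuid(1, &eax, &ebx, &ecx, &edx);

    const unsigned base_family = (eax >> 8)  & 0xF;
    const unsigned base_model  = (eax >> 4)  & 0xF;
    const unsigned ext_family  = (eax >> 20) & 0xFF;
    const unsigned ext_model   = (eax >> 16) & 0xF;

    // On AMD the extended fields are folded in when the base family is 0xF.
    const unsigned family = (base_family == 0xF) ? base_family + ext_family : base_family;
    const unsigned model  = (base_family == 0xF) ? (ext_model << 4) | base_model : base_model;

    std::printf("family 0x%X model 0x%X (Vermeer reports 0x19 / 0x21)\n", family, model);
    return 0;
}
```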
diff --git a/src/crypto/cn/CryptoNight_test.h b/src/crypto/cn/CryptoNight_test.h
index 41adab67..1df168e0 100644
--- a/src/crypto/cn/CryptoNight_test.h
+++ b/src/crypto/cn/CryptoNight_test.h
@@ -450,6 +450,29 @@ const static uint8_t astrobwt_dero_test_out[256] = {
 #endif
 
 
+#ifdef XMRIG_ALGO_GHOSTRIDER
+// "GhostRider"
+const static uint8_t test_output_gr[256] = {
+    0x42, 0x17, 0x0C, 0xC1, 0x85, 0xE6, 0x76, 0x3C, 0xC7, 0xCB, 0x27, 0xC4, 0x17, 0x39, 0x2D, 0xE2,
+    0x29, 0x6B, 0x40, 0x66, 0x85, 0xA4, 0xE3, 0xD3, 0x8C, 0xE9, 0xA5, 0x8F, 0x10, 0xFC, 0x81, 0xE4,
+    0x90, 0x56, 0xF2, 0x9E, 0x00, 0xD0, 0xF8, 0xA1, 0x88, 0x82, 0x86, 0xC0, 0x86, 0x04, 0x6B, 0x0E,
+    0x9A, 0xDB, 0xDB, 0xFD, 0x23, 0x16, 0x77, 0x94, 0xFE, 0x58, 0x93, 0x05, 0x10, 0x3F, 0x27, 0x75,
+    0x51, 0x44, 0xF3, 0x5F, 0xE2, 0xF9, 0x61, 0xBE, 0xC0, 0x30, 0xB5, 0x8E, 0xB1, 0x1B, 0xA1, 0xF7,
+    0x06, 0x4E, 0xF1, 0x6A, 0xFD, 0xA5, 0x44, 0x8E, 0x64, 0x47, 0x8C, 0x67, 0x51, 0xE2, 0x5C, 0x55,
+    0x3E, 0x39, 0xA6, 0xA5, 0xF7, 0xB8, 0xD0, 0x5E, 0xE2, 0xBF, 0x92, 0x44, 0xD9, 0xAA, 0x76, 0x22,
+    0xE3, 0x3E, 0x15, 0x96, 0xD8, 0x6A, 0x78, 0x2D, 0xA9, 0x77, 0x24, 0x1A, 0x4B, 0xE7, 0x5A, 0x2E,
+    0x89, 0x77, 0xAE, 0x92, 0xE4, 0xA4, 0x2D, 0xAF, 0x0B, 0x27, 0x09, 0xB2, 0x5F, 0x95, 0x61, 0xA9,
+    0xA8, 0xBE, 0x5D, 0x39, 0xBE, 0x41, 0x5F, 0x9C, 0x67, 0x28, 0x48, 0x4F, 0xAE, 0x2A, 0x50, 0x2B,
+    0xB8, 0xC7, 0x42, 0x73, 0x51, 0x60, 0x59, 0xD8, 0x9C, 0xBA, 0x22, 0x2F, 0x8E, 0x34, 0xDE, 0xC8,
+    0x1B, 0xAE, 0x9E, 0xBD, 0xF7, 0xE8, 0xFD, 0x8A, 0x97, 0xBE, 0xF0, 0x47, 0xAC, 0x27, 0xDD, 0x28,
+    0xC9, 0x28, 0xA8, 0x7B, 0x2A, 0xB8, 0x90, 0x3E, 0xCA, 0xB4, 0x78, 0x44, 0xCE, 0xCD, 0x91, 0xEC,
+    0xC2, 0x5A, 0x17, 0x59, 0x7C, 0x14, 0xF8, 0x95, 0x28, 0x14, 0xC3, 0xAD, 0xC4, 0xE1, 0x13, 0x5A,
+    0xC4, 0xA7, 0xC7, 0x77, 0xAD, 0xF8, 0x09, 0x61, 0x16, 0xBB, 0xAA, 0x7E, 0xAB, 0xC3, 0x00, 0x25,
+    0xBA, 0xA8, 0x97, 0xC7, 0x7D, 0x38, 0x46, 0x0E, 0x59, 0xAC, 0xCB, 0xAE, 0xFE, 0x3C, 0x6F, 0x01
+};
+#endif
+
+
 } // namespace xmrig
diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h
index e00ebba1..732d47fa 100644
--- a/src/crypto/cn/CryptoNight_x86.h
+++ b/src/crypto/cn/CryptoNight_x86.h
@@ -40,6 +40,7 @@
 #include "crypto/cn/CnAlgo.h"
 #include "crypto/cn/CryptoNight_monero.h"
 #include "crypto/cn/CryptoNight.h"
+#include "crypto/cn/CryptoNight_x86_vaes.h"
 #include "crypto/cn/soft_aes.h"
 
 
@@ -289,6 +290,11 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
 {
     constexpr CnAlgo<ALGO> props;
 
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes<ALGO>(ctx);
+        return;
+    }
+
     constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
 
     __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
@@ -341,7 +347,7 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
     constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
     constexpr int prefetch_dist = 2048 / sizeof(__m128i);
 
-    __m128i* e = output + N - prefetch_dist;
+    __m128i* e = output + (N << interleave) - prefetch_dist;
     __m128i* prefetch_ptr = output + prefetch_dist;
 
     for (int i = 0; i < 2; ++i) {
@@ -396,6 +402,11 @@ static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx)
 {
     constexpr CnAlgo<ALGO> props;
 
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes<ALGO>(ctx);
+        return;
+    }
+
     constexpr bool IS_HEAVY = props.isHeavy();
     constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
@@ -996,8 +1007,14 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
         ctx[0]->first_half = true;
         ctx[1]->first_half = true;
     }
-    cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
+
+    if (!props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else {
+        cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
+    }
 
     if (ALGO == Algorithm::CN_2) {
         cnv2_double_mainloop_sandybridge_asm(ctx);
@@ -1036,8 +1053,13 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
         ctx[0]->generated_code(ctx);
     }
 
-    cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
+    if (!props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else {
+        cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
+    }
 
     keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
     keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
@@ -1092,8 +1114,14 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
         ctx[0]->first_half = true;
         ctx[1]->first_half = true;
     }
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+    }
 
     uint64_t al0 = h0[0] ^ h0[4];
     uint64_t al1 = h1[0] ^ h1[4];
@@ -1288,8 +1316,13 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
         bx10 = cx1;
     }
 
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+    }
+    else {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+    }
 
     keccakf(h0, 24);
     keccakf(h1, 24);
@@ -1350,10 +1383,16 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
         ctx[3]->first_half = true;
     }
 
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
-    cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
 
     uint64_t al0 = h0[0] ^ h0[4];
     uint64_t al1 = h1[0] ^ h1[4];
@@ -1474,10 +1513,16 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
         if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
     }
 
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
-    cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
 
     keccakf(h0, 24);
     keccakf(h1, 24);
@@ -1714,7 +1759,17 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
         if (props.half_mem()) {
             ctx[i]->first_half = true;
         }
-        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
+    }
+
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else {
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
     }
 
     uint8_t* l0 = ctx[0]->memory;
@@ -1766,8 +1821,18 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
         CN_STEP4(3, ax3, bx30, bx31, cx3, l3, mc3, ptr3, idx3);
     }
 
+    if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
+        cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
+    }
+    else {
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
+        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
+    }
+
     for (size_t i = 0; i < 4; i++) {
-        cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
         keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
         extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
     }
diff --git a/src/crypto/cn/CryptoNight_x86_vaes.cpp b/src/crypto/cn/CryptoNight_x86_vaes.cpp
new file mode 100644
index 00000000..b0b41147
--- /dev/null
+++ b/src/crypto/cn/CryptoNight_x86_vaes.cpp
@@ -0,0 +1,513 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "CryptoNight_x86_vaes.h"
+#include "CryptoNight_monero.h"
+#include "CryptoNight.h"
+
+
+#ifdef __GNUC__
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#endif
+
+
+// This will shift and xor tmp1 into itself as 4 32-bit vals such as
+// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+static FORCEINLINE __m128i sl_xor(__m128i tmp1)
+{
+    __m128i tmp4;
+    tmp4 = _mm_slli_si128(tmp1, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    tmp4 = _mm_slli_si128(tmp4, 0x04);
+    tmp1 = _mm_xor_si128(tmp1, tmp4);
+    return tmp1;
+}
+
+
+template<uint8_t rcon>
+static FORCEINLINE void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
+{
+    __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
+    xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
+    *xout0 = sl_xor(*xout0);
+    *xout0 = _mm_xor_si128(*xout0, xout1);
+    xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00);
+    xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
+    *xout2 = sl_xor(*xout2);
+    *xout2 = _mm_xor_si128(*xout2, xout1);
+}
+
+
+static NOINLINE void vaes_genkey(const __m128i* memory, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory);
+    __m128i xout2 = _mm_load_si128(memory + 1);
+    *k0 = _mm256_set_m128i(xout0, xout0);
+    *k1 = _mm256_set_m128i(xout2, xout2);
+
+    aes_genkey_sub<0x01>(&xout0, &xout2);
+    *k2 = _mm256_set_m128i(xout0, xout0);
+    *k3 = _mm256_set_m128i(xout2, xout2);
+
+    aes_genkey_sub<0x02>(&xout0, &xout2);
+    *k4 = _mm256_set_m128i(xout0, xout0);
+    *k5 = _mm256_set_m128i(xout2, xout2);
+
+    aes_genkey_sub<0x04>(&xout0, &xout2);
+    *k6 = _mm256_set_m128i(xout0, xout0);
+    *k7 = _mm256_set_m128i(xout2, xout2);
+
+    aes_genkey_sub<0x08>(&xout0, &xout2);
+    *k8 = _mm256_set_m128i(xout0, xout0);
+    *k9 = _mm256_set_m128i(xout2, xout2);
+}
+
+
+static NOINLINE void vaes_genkey_double(const __m128i* memory1, const __m128i* memory2, __m256i* k0, __m256i* k1, __m256i* k2, __m256i* k3, __m256i* k4, __m256i* k5, __m256i* k6, __m256i* k7, __m256i* k8, __m256i* k9)
+{
+    __m128i xout0 = _mm_load_si128(memory1);
+    __m128i xout1 = _mm_load_si128(memory1 + 1);
+    __m128i xout2 = _mm_load_si128(memory2);
+    __m128i xout3 = _mm_load_si128(memory2 + 1);
+    *k0 = _mm256_set_m128i(xout2, xout0);
+    *k1 = _mm256_set_m128i(xout3, xout1);
+
+    aes_genkey_sub<0x01>(&xout0, &xout1);
+    aes_genkey_sub<0x01>(&xout2, &xout3);
+    *k2 = _mm256_set_m128i(xout2, xout0);
+    *k3 = _mm256_set_m128i(xout3, xout1);
+
+    aes_genkey_sub<0x02>(&xout0, &xout1);
+    aes_genkey_sub<0x02>(&xout2, &xout3);
+    *k4 = _mm256_set_m128i(xout2, xout0);
+    *k5 = _mm256_set_m128i(xout3, xout1);
+
+    aes_genkey_sub<0x04>(&xout0, &xout1);
+    aes_genkey_sub<0x04>(&xout2, &xout3);
+    *k6 = _mm256_set_m128i(xout2, xout0);
+    *k7 = _mm256_set_m128i(xout3, xout1);
+
+    aes_genkey_sub<0x08>(&xout0, &xout1);
+    aes_genkey_sub<0x08>(&xout2, &xout3);
+    *k8 = _mm256_set_m128i(xout2, xout0);
+    *k9 = _mm256_set_m128i(xout3, xout1);
+}
+
+
+static FORCEINLINE void vaes_round(__m256i key, __m256i& x01, __m256i& x23, __m256i& x45, __m256i& x67)
+{
+    x01 = _mm256_aesenc_epi128(x01, key);
+    x23 = _mm256_aesenc_epi128(x23, key);
+    x45 = _mm256_aesenc_epi128(x45, key);
+    x67 = _mm256_aesenc_epi128(x67, key);
+}
+
+
+static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256i& x2, __m256i& x3, __m256i& x4, __m256i& x5, __m256i& x6, __m256i& x7)
+{
+    x0 = _mm256_aesenc_epi128(x0, key);
+    x1 = _mm256_aesenc_epi128(x1, key);
+    x2 = _mm256_aesenc_epi128(x2, key);
+    x3 = _mm256_aesenc_epi128(x3, key);
+    x4 = _mm256_aesenc_epi128(x4, key);
+    x5 = _mm256_aesenc_epi128(x5, key);
+    x6 = _mm256_aesenc_epi128(x6, key);
+    x7 = _mm256_aesenc_epi128(x7, key);
+}
+
+
+namespace xmrig {
+
+
+template<Algorithm::Id ALGO>
+NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
+{
+    constexpr CnAlgo<ALGO> props;
+
+    constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);
+
+    __m256i xin01, xin23, xin45, xin67;
+    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    const __m128i* input = reinterpret_cast<const __m128i*>(ctx->state);
+    __m256i* output = reinterpret_cast<__m256i*>(ctx->memory);
+
+    vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    if (props.half_mem() && !ctx->first_half) {
+        const __m256i* p = reinterpret_cast<const __m256i*>(ctx->save_state);
+        xin01 = _mm256_load_si256(p + 0);
+        xin23 = _mm256_load_si256(p + 1);
+        xin45 = _mm256_load_si256(p + 2);
+        xin67 = _mm256_load_si256(p + 3);
+    }
+    else {
+        xin01 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 4));
+        xin23 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 6));
+        xin45 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 8));
+        xin67 = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + 10));
+    }
+
+    constexpr int output_increment = 64 / sizeof(__m256i);
+    constexpr int prefetch_dist = 2048 / sizeof(__m256i);
+
+    __m256i* e = output + N - prefetch_dist;
+    __m256i* prefetch_ptr = output + prefetch_dist;
+
+    for (int i = 0; i < 2; ++i) {
+        do {
+            _mm_prefetch((const char*)(prefetch_ptr), _MM_HINT_T0);
+            _mm_prefetch((const char*)(prefetch_ptr + output_increment), _MM_HINT_T0);
+
+            vaes_round(k0, xin01, xin23, xin45, xin67);
+            vaes_round(k1, xin01, xin23, xin45, xin67);
+            vaes_round(k2, xin01, xin23, xin45, xin67);
+            vaes_round(k3, xin01, xin23, xin45, xin67);
+            vaes_round(k4, xin01, xin23, xin45, xin67);
+            vaes_round(k5, xin01, xin23, xin45, xin67);
+            vaes_round(k6, xin01, xin23, xin45, xin67);
+            vaes_round(k7, xin01, xin23, xin45, xin67);
+            vaes_round(k8, xin01, xin23, xin45, xin67);
+            vaes_round(k9, xin01, xin23, xin45, xin67);
+
+            _mm256_store_si256(output + 0, xin01);
+            _mm256_store_si256(output + 1, xin23);
+
+            _mm256_store_si256(output + output_increment + 0, xin45);
+            _mm256_store_si256(output + output_increment + 1, xin67);
+
+            output += output_increment * 2;
+            prefetch_ptr += output_increment * 2;
+        } while (output < e);
+        e += prefetch_dist;
+        prefetch_ptr = output;
+    }
+
+    if (props.half_mem() && ctx->first_half) {
+        __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state);
+        _mm256_store_si256(p + 0, xin01);
+        _mm256_store_si256(p + 1, xin23);
+        _mm256_store_si256(p + 2, xin45);
+        _mm256_store_si256(p + 3, xin67);
+    }
+
+    _mm256_zeroupper();
+}
+
+
+template<Algorithm::Id ALGO>
+NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
+{
+    constexpr CnAlgo<ALGO> props;
+
+    constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
+
+    __m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
+    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->state);
+    const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->state);
+
+    __m128i* output1 = reinterpret_cast<__m128i*>(ctx1->memory);
+    __m128i* output2 = reinterpret_cast<__m128i*>(ctx2->memory);
+
+    vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    {
+        const bool b = props.half_mem() && !ctx1->first_half && !ctx2->first_half;
+        const __m128i* p1 = b ? reinterpret_cast<const __m128i*>(ctx1->save_state) : (input1 + 4);
+        const __m128i* p2 = b ? reinterpret_cast<const __m128i*>(ctx2->save_state) : (input2 + 4);
+        xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0);
+        xin1 = _mm256_loadu2_m128i(p2 + 1, p1 + 1);
+        xin2 = _mm256_loadu2_m128i(p2 + 2, p1 + 2);
+        xin3 = _mm256_loadu2_m128i(p2 + 3, p1 + 3);
+        xin4 = _mm256_loadu2_m128i(p2 + 4, p1 + 4);
+        xin5 = _mm256_loadu2_m128i(p2 + 5, p1 + 5);
+        xin6 = _mm256_loadu2_m128i(p2 + 6, p1 + 6);
+        xin7 = _mm256_loadu2_m128i(p2 + 7, p1 + 7);
+    }
+
+    constexpr int output_increment = 64 / sizeof(__m128i);
+    constexpr int prefetch_dist = 2048 / sizeof(__m128i);
+
+    __m128i* e = output1 + N - prefetch_dist;
+    __m128i* prefetch_ptr1 = output1 + prefetch_dist;
+    __m128i* prefetch_ptr2 = output2 + prefetch_dist;
+
+    for (int i = 0; i < 2; ++i) {
+        do {
+            _mm_prefetch((const char*)(prefetch_ptr1), _MM_HINT_T0);
+            _mm_prefetch((const char*)(prefetch_ptr1 + output_increment), _MM_HINT_T0);
+            _mm_prefetch((const char*)(prefetch_ptr2), _MM_HINT_T0);
+            _mm_prefetch((const char*)(prefetch_ptr2 + output_increment), _MM_HINT_T0);
+
+            vaes_round(k0, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k1, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k2, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k3, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k4, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k5, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k6, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k7, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k8, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+            vaes_round(k9, xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+
+            _mm256_storeu2_m128i(output2 + 0, output1 + 0, xin0);
+            _mm256_storeu2_m128i(output2 + 1, output1 + 1, xin1);
+            _mm256_storeu2_m128i(output2 + 2, output1 + 2, xin2);
+            _mm256_storeu2_m128i(output2 + 3, output1 + 3, xin3);
+
+            _mm256_storeu2_m128i(output2 + output_increment + 0, output1 + output_increment + 0, xin4);
+            _mm256_storeu2_m128i(output2 + output_increment + 1, output1 + output_increment + 1, xin5);
+            _mm256_storeu2_m128i(output2 + output_increment + 2, output1 + output_increment + 2, xin6);
+            _mm256_storeu2_m128i(output2 + output_increment + 3, output1 + output_increment + 3, xin7);
+
+            output1 += output_increment * 2;
+            prefetch_ptr1 += output_increment * 2;
+            output2 += output_increment * 2;
+            prefetch_ptr2 += output_increment * 2;
+        } while (output1 < e);
+        e += prefetch_dist;
+        prefetch_ptr1 = output1;
+        prefetch_ptr2 = output2;
+    }
+
+    if (props.half_mem() && ctx1->first_half && ctx2->first_half) {
+        __m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state);
+        __m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state);
+        _mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0);
+        _mm256_storeu2_m128i(p2 + 1, p1 + 1, xin1);
+        _mm256_storeu2_m128i(p2 + 2, p1 + 2, xin2);
+        _mm256_storeu2_m128i(p2 + 3, p1 + 3, xin3);
+        _mm256_storeu2_m128i(p2 + 4, p1 + 4, xin4);
+        _mm256_storeu2_m128i(p2 + 5, p1 + 5, xin5);
+        _mm256_storeu2_m128i(p2 + 6, p1 + 6, xin6);
+        _mm256_storeu2_m128i(p2 + 7, p1 + 7, xin7);
+    }
+
+    _mm256_zeroupper();
+}
+
+
+template<Algorithm::Id ALGO>
+NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
+{
+    constexpr CnAlgo<ALGO> props;
+
+    constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);
+
+    __m256i xout01, xout23, xout45, xout67;
+    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    const __m256i* input = reinterpret_cast<const __m256i*>(ctx->memory);
+    __m256i* output = reinterpret_cast<__m256i*>(ctx->state);
+
+    vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout01 = _mm256_load_si256(output + 2);
+    xout23 = _mm256_load_si256(output + 3);
+    xout45 = _mm256_load_si256(output + 4);
+    xout67 = _mm256_load_si256(output + 5);
+
+    const __m256i* input_begin = input;
+    for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
+        if (props.half_mem() && (part == 1)) {
+            input = input_begin;
+            ctx->first_half = false;
+            cn_explode_scratchpad_vaes<ALGO>(ctx);
+        }
+
+        for (size_t i = 0; i < N;) {
+            xout01 = _mm256_xor_si256(xout01, input[0]);
+            xout23 = _mm256_xor_si256(xout23, input[1]);
+
+            constexpr int input_increment = 64 / sizeof(__m256i);
+
+            xout45 = _mm256_xor_si256(xout45, input[input_increment]);
+            xout67 = _mm256_xor_si256(xout67, input[input_increment + 1]);
+
+            input += input_increment * 2;
+            i += 4;
+
+            if (i < N) {
+                _mm_prefetch((const char*)(input), _MM_HINT_T0);
+                _mm_prefetch((const char*)(input + input_increment), _MM_HINT_T0);
+            }
+
+            vaes_round(k0, xout01, xout23, xout45, xout67);
+            vaes_round(k1, xout01, xout23, xout45, xout67);
+            vaes_round(k2, xout01, xout23, xout45, xout67);
+            vaes_round(k3, xout01, xout23, xout45, xout67);
+            vaes_round(k4, xout01, xout23, xout45, xout67);
+            vaes_round(k5, xout01, xout23, xout45, xout67);
+            vaes_round(k6, xout01, xout23, xout45, xout67);
+            vaes_round(k7, xout01, xout23, xout45, xout67);
+            vaes_round(k8, xout01, xout23, xout45, xout67);
+            vaes_round(k9, xout01, xout23, xout45, xout67);
+        }
+    }
+
+    _mm256_store_si256(output + 2, xout01);
+    _mm256_store_si256(output + 3, xout23);
+    _mm256_store_si256(output + 4, xout45);
+    _mm256_store_si256(output + 5, xout67);
+
+    _mm256_zeroupper();
+}
+
+
+template<Algorithm::Id ALGO>
+NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
+{
+    constexpr CnAlgo<ALGO> props;
+
+    constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
+
+    __m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
+    __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
+
+    const __m128i* input1 = reinterpret_cast<const __m128i*>(ctx1->memory);
+    const __m128i* input2 = reinterpret_cast<const __m128i*>(ctx2->memory);
+
+    __m128i* output1 = reinterpret_cast<__m128i*>(ctx1->state);
+    __m128i* output2 = reinterpret_cast<__m128i*>(ctx2->state);
+
+    vaes_genkey_double(output1 + 2, output2 + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+
+    xout0 = _mm256_loadu2_m128i(output2 + 4, output1 + 4);
+    xout1 = _mm256_loadu2_m128i(output2 + 5, output1 + 5);
+    xout2 = _mm256_loadu2_m128i(output2 + 6, output1 + 6);
+    xout3 = _mm256_loadu2_m128i(output2 + 7, output1 + 7);
+    xout4 = _mm256_loadu2_m128i(output2 + 8, output1 + 8);
+    xout5 = _mm256_loadu2_m128i(output2 + 9, output1 + 9);
+    xout6 = _mm256_loadu2_m128i(output2 + 10, output1 + 10);
+    xout7 = _mm256_loadu2_m128i(output2 + 11, output1 + 11);
+
+    const __m128i* input_begin1 = input1;
+    const __m128i* input_begin2 = input2;
+    for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
+        if (props.half_mem() && (part == 1)) {
+            input1 = input_begin1;
+            input2 = input_begin2;
+            ctx1->first_half = false;
+            ctx2->first_half = false;
+            cn_explode_scratchpad_vaes_double<ALGO>(ctx1, ctx2);
+        }
+
+        for (size_t i = 0; i < N;) {
+            xout0 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 0, input1 + 0), xout0);
+            xout1 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 1, input1 + 1), xout1);
+            xout2 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 2, input1 + 2), xout2);
+            xout3 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + 3, input1 + 3), xout3);
+
+            constexpr int input_increment = 64 / sizeof(__m128i);
+
+            xout4 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 0, input1 + input_increment + 0), xout4);
+            xout5 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 1, input1 + input_increment + 1), xout5);
+            xout6 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 2, input1 + input_increment + 2), xout6);
+            xout7 = _mm256_xor_si256(_mm256_loadu2_m128i(input2 + input_increment + 3, input1 + input_increment + 3), xout7);
+
+            input1 += input_increment * 2;
+            input2 += input_increment * 2;
+            i += 8;
+
+            if (i < N) {
+                _mm_prefetch((const char*)(input1), _MM_HINT_T0);
+                _mm_prefetch((const char*)(input1 + input_increment), _MM_HINT_T0);
+                _mm_prefetch((const char*)(input2), _MM_HINT_T0);
+                _mm_prefetch((const char*)(input2 + input_increment), _MM_HINT_T0);
+            }
+
+            vaes_round(k0, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k1, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k2, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k3, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k4, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k5, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k6, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k7, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k8, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+            vaes_round(k9, xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+        }
+    }
+
+    _mm256_storeu2_m128i(output2 + 4, output1 + 4, xout0);
+    _mm256_storeu2_m128i(output2 + 5, output1 + 5, xout1);
+    _mm256_storeu2_m128i(output2 + 6, output1 + 6, xout2);
+    _mm256_storeu2_m128i(output2 + 7, output1 + 7, xout3);
+    _mm256_storeu2_m128i(output2 + 8, output1 + 8, xout4);
+    _mm256_storeu2_m128i(output2 + 9, output1 + 9, xout5);
+    _mm256_storeu2_m128i(output2 + 10, output1 + 10, xout6);
+    _mm256_storeu2_m128i(output2 + 11, output1 + 11, xout7);
+
+    _mm256_zeroupper();
+}
+
+
+template<Algorithm::Id ALGO>
+void VAES_Instance()
+{
+    cn_explode_scratchpad_vaes<ALGO>(nullptr);
+    cn_explode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
+    cn_implode_scratchpad_vaes<ALGO>(nullptr);
+    cn_implode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
+}
+
+
+void (*vaes_instances[])() = {
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+    VAES_Instance,
+};
+
+
+} // xmrig
diff --git a/src/crypto/cn/CryptoNight_x86_vaes.h b/src/crypto/cn/CryptoNight_x86_vaes.h
new file mode 100644
index 00000000..475780b8
--- /dev/null
+++ b/src/crypto/cn/CryptoNight_x86_vaes.h
@@ -0,0 +1,48 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_X86_VAES_H
+#define XMRIG_CRYPTONIGHT_X86_VAES_H
+
+
+#include "crypto/cn/CnAlgo.h"
+
+
+struct cryptonight_ctx;
+
+
+namespace xmrig {
+
+
+template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx);
+template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);
+template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx);
+template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);
+
+
+} // xmrig
+
+
+#endif /* XMRIG_CRYPTONIGHT_X86_VAES_H */
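The core trick in `CryptoNight_x86_vaes.cpp` above is that `vaes_genkey_double` packs the round keys of two independent scratchpads into the low and high 128-bit lanes of each YMM register, so every `_mm256_aesenc_epi128` performs one AES round for both hashes at once. A minimal standalone sketch of that lane-for-lane equivalence, assuming a VAES-capable CPU and a GCC/Clang build with `-maes -mavx2 -mvaes` (illustrative only, not part of the patch):

```cpp
#include <immintrin.h>
#include <cstring>
#include <cstdio>

int main()
{
    const __m128i a   = _mm_set1_epi32(0x11111111);
    const __m128i b   = _mm_set1_epi32(0x22222222);
    const __m128i key = _mm_set1_epi32(0x33333333);

    // Two independent 128-bit AES rounds (the pre-VAES code path).
    const __m128i ra = _mm_aesenc_si128(a, key);
    const __m128i rb = _mm_aesenc_si128(b, key);

    // One 256-bit VAES round over both lanes at once.
    const __m256i ab = _mm256_set_m128i(b, a);     // high lane = b, low lane = a
    const __m256i k2 = _mm256_set_m128i(key, key); // same key in both lanes
    const __m256i r2 = _mm256_aesenc_epi128(ab, k2);

    const __m128i lo = _mm256_castsi256_si128(r2);
    const __m128i hi = _mm256_extracti128_si256(r2, 1);

    std::printf("lanes match: %s\n",
                (!std::memcmp(&ra, &lo, 16) && !std::memcmp(&rb, &hi, 16)) ? "yes" : "no");
    return 0;
}
```

On CPUs where the 256-bit form costs about the same as the 128-bit one, this roughly halves the AES instruction count of the explode/implode phases, which is consistent with the +4% speedup reported in the changelog.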
diff --git a/src/crypto/ghostrider/README.md b/src/crypto/ghostrider/README.md
index bce065c0..e91fc5bd 100644
--- a/src/crypto/ghostrider/README.md
+++ b/src/crypto/ghostrider/README.md
@@ -26,13 +26,13 @@ While individual algorithm implementations are a bit unoptimized, XMRig achieves
 
 For the same reason, XMRig can sometimes use less than 100% CPU on Ryzen 3000/5000 CPUs if it finds that running 1 thread per core is faster for some Cryptonight variants on your system.
 
-**Windows** (detailed results [here](https://imgur.com/a/GCjEWpl))
+**Windows** (detailed results [here](https://imgur.com/a/uRU1yO2))
 
 CPU|cpuminer-gr-avx2 (tuned), h/s|XMRig (MSVC build), h/s|Speedup
 -|-|-|-
 AMD Ryzen 7 4700U|632.6|731|+15.5%
 Intel Core i7-2600|496.4|533.6|+7.5%
 AMD Ryzen 7 3700X @ 4.1 GHz|2453.0|2469.1|+0.65%
-AMD Ryzen 5 5600X @ 4.65 GHz|2112.6|2221.2|+5.1%
+AMD Ryzen 5 5600X @ 4.65 GHz|2112.6|2313.2|+9.5%
 
 **Linux** (tested by **Delgon**, detailed results [here](https://cdn.discordapp.com/attachments/604375870236524574/913167614749048872/unknown.png))
 CPU|cpuminer-gr-avx2 (tuned), h/s|XMRig (GCC build), h/s|Speedup
diff --git a/src/crypto/ghostrider/ghostrider.cpp b/src/crypto/ghostrider/ghostrider.cpp
index 58882eda..a23150fc 100644
--- a/src/crypto/ghostrider/ghostrider.cpp
+++ b/src/crypto/ghostrider/ghostrider.cpp
@@ -538,7 +538,7 @@ void destroy_helper_thread(HelperThread* t)
 }
 
 
-void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper)
+void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper, bool verbose)
 {
     enum { N = 8 };
 
@@ -554,11 +554,13 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct
     uint32_t cn_indices[6];
     select_indices(cn_indices, data + 4);
 
-    static uint32_t prev_indices[3];
-    if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
-        memcpy(prev_indices, cn_indices, sizeof(prev_indices));
-        for (int i = 0; i < 3; ++i) {
-            LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
+    if (verbose) {
+        static uint32_t prev_indices[3];
+        if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
+            memcpy(prev_indices, cn_indices, sizeof(prev_indices));
+            for (int i = 0; i < 3; ++i) {
+                LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
+            }
         }
     }
 
@@ -765,7 +767,7 @@ HelperThread* create_helper_thread(int64_t, const std::vector<int64_t>&) { return nullptr; }
 void destroy_helper_thread(HelperThread*) {}
 
 
-void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread*)
+void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread*, bool verbose)
 {
     constexpr uint32_t N = 8;
 
@@ -784,11 +786,13 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct
     uint32_t step[6] = { 4, 4, 1, 2, 4, 4 };
 #endif
 
-    static uint32_t prev_indices[3];
-    if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
-        memcpy(prev_indices, cn_indices, sizeof(prev_indices));
-        for (int i = 0; i < 3; ++i) {
-            LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
+    if (verbose) {
+        static uint32_t prev_indices[3];
+        if (memcmp(cn_indices, prev_indices, sizeof(prev_indices)) != 0) {
+            memcpy(prev_indices, cn_indices, sizeof(prev_indices));
+            for (int i = 0; i < 3; ++i) {
+                LOG_INFO("%s GhostRider algo %d: %s", Tags::cpu(), i + 1, cn_names[cn_indices[i]]);
+            }
         }
     }
 
diff --git a/src/crypto/ghostrider/ghostrider.h b/src/crypto/ghostrider/ghostrider.h
index f37c46f3..081e4f6f 100644
--- a/src/crypto/ghostrider/ghostrider.h
+++ b/src/crypto/ghostrider/ghostrider.h
@@ -41,7 +41,7 @@ struct HelperThread;
 void benchmark();
 HelperThread* create_helper_thread(int64_t cpu_index, const std::vector<int64_t>& affinities);
 void destroy_helper_thread(HelperThread* t);
-void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper);
+void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ctx** ctx, HelperThread* helper, bool verbose = true);
 
 
 } // namespace ghostrider
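One usage note on the new `verbose` parameter: it defaults to `true`, so every existing `hash_octa()` call site keeps logging the GhostRider algorithm rotation, while the new self-test in `CpuWorker::verify()` passes `false` to stay quiet. A hypothetical caller (the names `job_blob`, `test_blob`, `hash`, `ctx`, and `helper` are placeholders, not from the patch):

```cpp
// Existing call sites are unchanged; the default argument keeps logging on.
ghostrider::hash_octa(job_blob, 80, hash, ctx, helper);

// The self-test opts out of the per-rotation log lines.
ghostrider::hash_octa(test_blob, 80, hash, ctx, nullptr, false);
```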