AVX2 optimized code for AstroBWT

Added an "astrobwt-avx2" parameter to config.json; it is turned off ("false") by default.

4-5% speedup on CPUs with proper AVX2 support (AMD Ryzen starting with Zen2, Intel Core starting with Haswell).

There will be no speedup on the following CPUs:

- Intel Pentium/Celeron don't support AVX2
- AMD Zen/Zen+ have only half-speed AVX

The GCC-compiled version is faster without AVX2; the MSVC-compiled version is faster with AVX2.
This commit is contained in:
SChernykh
2020-03-10 22:03:16 +01:00
parent 8698b73036
commit e22f798085
14 changed files with 563 additions and 15 deletions

View File

@ -23,6 +23,16 @@ if (WITH_ASTROBWT)
src/crypto/astrobwt/salsa20_ref/salsa20.c
)
else()
# The AVX2 SHA3-256 assembly is only built for 64-bit targets.
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
    add_definitions(/DASTROBWT_AVX2)
    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
        # MASM ships only with the MSVC toolchain, so request it in this branch alone;
        # enabling it unconditionally breaks configuration with GCC/Clang.
        enable_language(ASM_MASM)
        list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.asm)
    else()
        # The .S file goes through the C preprocessor and the compiler driver,
        # which requires the generic ASM language to be enabled.
        enable_language(ASM)
        list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.S)
    endif()
endif()
list(APPEND HEADERS_CRYPTO
src/crypto/astrobwt/Salsa20.hpp
)

View File

@ -52,6 +52,7 @@ static const char *kArgon2Impl = "argon2-impl";
#ifdef XMRIG_ALGO_ASTROBWT
static const char* kAstroBWTMaxSize = "astrobwt-max-size";
static const char* kAstroBWTAVX2 = "astrobwt-avx2";
#endif
@ -94,6 +95,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const
# ifdef XMRIG_ALGO_ASTROBWT
obj.AddMember(StringRef(kAstroBWTMaxSize), m_astrobwtMaxSize, allocator);
obj.AddMember(StringRef(kAstroBWTAVX2), m_astrobwtAVX2, allocator);
# endif
m_threads.toJSON(obj, doc);
@ -148,12 +150,20 @@ void xmrig::CpuConfig::read(const rapidjson::Value &value)
# endif
# ifdef XMRIG_ALGO_ASTROBWT
const auto& obj = Json::getValue(value, kAstroBWTMaxSize);
if (obj.IsNull() || !obj.IsInt()) {
const auto& astroBWTMaxSize = Json::getValue(value, kAstroBWTMaxSize);
if (astroBWTMaxSize.IsNull() || !astroBWTMaxSize.IsInt()) {
m_shouldSave = true;
}
else {
m_astrobwtMaxSize = std::min(std::max(obj.GetInt(), 400), 1200);
m_astrobwtMaxSize = std::min(std::max(astroBWTMaxSize.GetInt(), 400), 1200);
}
const auto& astroBWTAVX2 = Json::getValue(value, kAstroBWTAVX2);
if (astroBWTAVX2.IsNull() || !astroBWTAVX2.IsBool()) {
m_shouldSave = true;
}
else {
m_astrobwtAVX2 = astroBWTAVX2.GetBool();
}
# endif

View File

@ -60,6 +60,7 @@ public:
inline const String &argon2Impl() const { return m_argon2Impl; }
inline const Threads<CpuThreads> &threads() const { return m_threads; }
inline int astrobwtMaxSize() const { return m_astrobwtMaxSize; }
inline bool astrobwtAVX2() const { return m_astrobwtAVX2; }
inline int priority() const { return m_priority; }
inline uint32_t limit() const { return m_limit; }
@ -77,6 +78,7 @@ private:
bool m_shouldSave = false;
bool m_yield = true;
int m_astrobwtMaxSize = 550;
bool m_astrobwtAVX2 = false;
int m_memoryPool = 0;
int m_priority = -1;
String m_argon2Impl;

View File

@ -39,6 +39,7 @@ xmrig::CpuLaunchData::CpuLaunchData(const Miner *miner, const Algorithm &algorit
hwAES(config.isHwAES()),
yield(config.isYield()),
astrobwtMaxSize(config.astrobwtMaxSize()),
astrobwtAVX2(config.astrobwtAVX2()),
priority(config.priority()),
affinity(thread.affinity()),
miner(miner),

View File

@ -62,6 +62,7 @@ public:
const bool hwAES;
const bool yield;
const int astrobwtMaxSize;
const bool astrobwtAVX2;
const int priority;
const int64_t affinity;
const Miner *miner;

View File

@ -81,6 +81,7 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
m_yield(data.yield),
m_av(data.av()),
m_astrobwtMaxSize(data.astrobwtMaxSize * 1000),
m_astrobwtAVX2(data.astrobwtAVX2),
m_miner(data.miner),
m_ctx()
{
@ -262,7 +263,7 @@ void xmrig::CpuWorker<N>::start()
{
# ifdef XMRIG_ALGO_ASTROBWT
if (job.algorithm().family() == Algorithm::ASTROBWT) {
if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize))
if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize, m_astrobwtAVX2))
valid = false;
}
else

View File

@ -74,6 +74,7 @@ private:
const bool m_yield;
const CnHash::AlgoVariant m_av;
const int m_astrobwtMaxSize;
const bool m_astrobwtAVX2;
const Miner *m_miner;
cryptonight_ctx *m_ctx[N];
uint8_t m_hash[N * 32]{ 0 };

View File

@ -41,6 +41,7 @@
#include "core/Miner.h"
#include "crypto/common/Nonce.h"
#include "crypto/rx/Rx.h"
#include "crypto/astrobwt/AstroBWT.h"
#include "rapidjson/document.h"
#include "version.h"
@ -242,6 +243,10 @@ public:
# endif
# ifdef XMRIG_ALGO_ASTROBWT
// Hands `job` to astrobwt::init() and returns whatever that call reports.
inline bool initAstroBWT()
{
    return astrobwt::init(job);
}
# endif
Algorithm algorithm;
Algorithms algorithms;
bool active = false;
@ -454,10 +459,14 @@ void xmrig::Miner::setJob(const Job &job, bool donate)
d_ptr->userJobId = job.id();
}
bool ready = true;
# ifdef XMRIG_ALGO_RANDOMX
const bool ready = d_ptr->initRX();
# else
constexpr const bool ready = true;
ready &= d_ptr->initRX();
# endif
# ifdef XMRIG_ALGO_ASTROBWT
ready &= d_ptr->initAstroBWT();
# endif
mutex.unlock();

View File

@ -30,6 +30,10 @@
#include "AstroBWT.h"
#include "sha3.h"
#include "crypto/cn/CryptoNight.h"
#include "base/net/stratum/Job.h"
#include "base/crypto/Algorithm.h"
#include "base/io/log/Log.h"
#include "backend/cpu/Cpu.h"
#include <limits>
constexpr int STAGE1_SIZE = 147253;
@ -38,6 +42,18 @@ constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE &
constexpr int COUNTING_SORT_BITS = 10;
constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;
// Set by init() once an AstroBWT job has been processed; later init() calls return early.
static bool astrobwtInitialized = false;
#ifdef ASTROBWT_AVX2
// Switched to true by init() when Cpu::info() reports AVX2; gates the assembly SHA3 path.
static bool hasAVX2 = false;
// Hand-written SHA3-256 routine (sha3_256_avx2.asm / sha3_256_avx2.S): hashes `inBytes`
// bytes starting at `in` and stores 32 bytes at `out`.
// The assembly reads its arguments from rcx, rdx and r8, so GCC-compatible compilers are
// told to call it with the Microsoft x64 calling convention.
extern "C"
#ifdef __GNUC__
__attribute__((ms_abi))
#endif
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
#endif
#ifdef _MSC_VER
#include <stdlib.h>
@ -155,7 +171,25 @@ void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indi
}
}
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size)
/**
 * One-time AstroBWT preparation.
 *
 * Does nothing for jobs of other algorithm families or when the setup has
 * already run. Otherwise records whether the CPU reports AVX2 (only when
 * built with ASTROBWT_AVX2) and marks the module as initialized.
 *
 * @param job  job whose algorithm family decides whether setup is needed
 * @return always true
 */
bool xmrig::astrobwt::init(const xmrig::Job& job)
{
    const bool isAstroBWTJob = (job.algorithm().family() == xmrig::Algorithm::ASTROBWT);
    if (!isAstroBWTJob || astrobwtInitialized) {
        return true;
    }

#ifdef ASTROBWT_AVX2
    // The flag is only ever raised here, never cleared.
    const bool cpuReportsAVX2 = xmrig::Cpu::info()->hasAVX2();
    if (cpuReportsAVX2) {
        hasAVX2 = true;
    }
#endif

    astrobwtInitialized = true;
    return true;
}
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2)
{
uint8_t key[32];
uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
@ -166,7 +200,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
uint8_t* stage1_result = (uint8_t*)(tmp_indices);
uint8_t* stage2_result = (uint8_t*)(tmp_indices);
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(input_data, input_size, key);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
@ -178,7 +217,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff);
if (stage2_size > stage2_max_size)
@ -203,7 +247,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
return true;
}
@ -211,5 +260,5 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max());
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true);
}

View File

@ -33,9 +33,14 @@
struct cryptonight_ctx;
namespace xmrig { namespace astrobwt {
namespace xmrig {
bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size);
class Job;
namespace astrobwt {
bool init(const Job&);
bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2);
template<Algorithm::Id ALGO>
void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
@ -44,4 +49,4 @@ template<>
void single_hash<Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
}} // namespace xmrig::argon2
}} // namespace xmrig::astrobwt

View File

@ -0,0 +1,50 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
;# GAS (Intel syntax) front end for the AVX2 SHA3-256 code shared with the MASM build.
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif
;# Map the MASM spellings used by the shared include files onto GAS directives.
#define ALIGN .balign
#define dq .quad
.global DECL(SHA3_256_AVX2_ASM)
#if defined(__APPLE__)
;# The shared include defines the entry point without a leading underscore, while the
;# symbol exported above has one on Apple targets. Define the exported name here as
;# well, at the same 64-byte aligned address, so the global symbol is not left undefined.
ALIGN 64
DECL(SHA3_256_AVX2_ASM):
#endif
#include "sha3_256_avx2.inc"
;# Local entry to the permutation: load the table pointers the shared body expects.
;# The +96 bias lets the body reach all six table rows with offsets from -96 to +64.
KeccakF1600_AVX2_ASM:
lea r8,[rip+rot_left+96]
lea r9,[rip+rot_right+96]
lea r10,[rip+rndc]
#include "sha3_256_keccakf1600_avx2.inc"

View File

@ -0,0 +1,42 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
;# MASM front end: places the AVX2 SHA3-256 code in its own page-aligned,
;# readable and executable segment.
_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE
;# Export the entry point; its label is defined inside sha3_256_avx2.inc.
PUBLIC SHA3_256_AVX2_ASM
include sha3_256_avx2.inc
;# Local entry to the permutation: load the table pointers the shared body expects.
;# The +96 bias lets the body reach all six table rows with offsets from -96 to +64.
KeccakF1600_AVX2_ASM:
lea r8,[rot_left+96]
lea r9,[rot_right+96]
lea r10,[rndc]
include sha3_256_keccakf1600_avx2.inc
_SHA3_256_AVX2_ASM ENDS
END

View File

@ -0,0 +1,164 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
;# SHA3_256_AVX2_ASM(in = rcx, inBytes = rdx, out = r8)
;# Absorbs inBytes bytes from in into a Keccak state kept on the stack, applies the
;# final padding and stores the first 32 bytes of the resulting state to out.
;# Arguments arrive in rcx, rdx and r8 (Microsoft x64 calling convention).
ALIGN 64
SHA3_256_AVX2_ASM:
vzeroupper
;# Spill rbx, rsi and rdi into the slots just above the return address.
mov qword ptr [rsp+8],rbx
mov qword ptr [rsp+16],rsi
mov qword ptr [rsp+24],rdi
push rbp
push r12
push r13
push r14
push r15
;# Save xmm6-xmm15: the permutation code overwrites ymm6-ymm15.
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
;# Reserve 320 bytes of scratch and point rbp at a 32-byte aligned address inside it.
sub rsp,320
lea rbp,[rsp+64]
and rbp,-32
vpxor xmm0,xmm0,xmm0
xor edi,edi
;# The stores to [rbp+0..16] build a 17-entry byte table: entry i is the qword slot,
;# inside the state at rbp+32, that input qword i of a block is XORed into.
;# Interleaved register setup:
;#   r14 = number of whole input qwords, r12 = leftover byte count (0..7),
;#   r13 = out, rsi = input cursor, rbx = position within the current block,
;#   rdi = 0 (zero source in the main loop, tail accumulator afterwards).
mov dword ptr [rbp],50462976
mov r12,rdx
mov dword ptr [rbp+4],169150212
mov r14,rdx
mov dword ptr [rbp+8],218436623
shr r14,3
and r12d,7
mov dword ptr [rbp+12],135009046
mov r13,r8
mov byte ptr [rbp+16],9
mov rsi,rcx
mov ebx,edi
;# Zero 224 bytes of state storage starting at rbp+32.
vmovdqa ymmword ptr [rbp+32],ymm0
vmovdqa ymmword ptr [rbp+64],ymm0
vmovdqa ymmword ptr [rbp+96],ymm0
vmovdqa ymmword ptr [rbp+128],ymm0
vmovdqa ymmword ptr [rbp+160],ymm0
vmovdqa ymmword ptr [rbp+192],ymm0
vmovdqa ymmword ptr [rbp+224],ymm0
test r14,r14
je sha3_main_loop_end
;# Absorb loop: XOR one input qword per iteration into its mapped state slot.
sha3_main_loop:
movzx eax,byte ptr [rbp+rbx]
lea rcx,[rbp+32]
lea rcx,[rcx+rax*8]
mov rax,qword ptr [rsi]
xor qword ptr [rcx],rax
lea r15,[rbx+1]
;# After the 17th qword of a block (position 16) run the permutation on the state.
cmp rbx,16
jne skip_keccak
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
skip_keccak:
;# Next position: rbx+1, or back to 0 (rdi) when a block was just completed.
;# The compare is repeated because the call changes the flags.
cmp rbx,16
mov rax,rdi
cmovne rax,r15
add rsi,8
mov rbx,rax
sub r14,1
jne sha3_main_loop
sha3_main_loop_end:
mov rdx,rdi
test r12,r12
je sha3_tail_loop_end
mov r8,rdi
;# Pack the r12 leftover bytes into rdi, first byte in the lowest 8 bits.
;# NOTE(review): shlx is a BMI2 instruction - confirm every CPU that takes this
;# path also supports BMI2.
sha3_tail_loop:
movzx eax,byte ptr [rdx+rsi]
inc rdx
shlx rcx,rax,r8
or rdi,rcx
add r8,8
cmp rdx,r12
jb sha3_tail_loop
sha3_tail_loop_end:
;# Final block: XOR the leftover bytes plus the value 6 placed directly after them
;# into the current state slot, then flip the top bit of the slot that input
;# qword 16 maps to (slot 9, at rbp+104).
movzx eax,byte ptr [rbp+rbx]
lea rdx,[rbp+32]
lea rdx,[rdx+rax*8]
mov ecx,6
lea rax,[r12*8]
shlx rcx,rcx,rax
xor rcx,qword ptr [rdx]
mov eax,1
shl rax,63
xor rcx,rdi
mov qword ptr [rdx],rcx
xor qword ptr [rbp+104],rax
lea rcx,[rbp+32]
call KeccakF1600_AVX2_ASM
;# Copy the first 32 bytes of the state to out.
vmovups ymm0,ymmword ptr [rbp+32]
vmovups ymmword ptr [r13],ymm0
vzeroupper
;# Release the scratch area and restore the saved registers in reverse order.
add rsp,320
movdqu xmm15, xmmword ptr [rsp]
movdqu xmm14, xmmword ptr [rsp+16]
movdqu xmm13, xmmword ptr [rsp+32]
movdqu xmm12, xmmword ptr [rsp+48]
movdqu xmm11, xmmword ptr [rsp+64]
add rsp, 80
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
movdqu xmm7, xmmword ptr [rsp+48]
movdqu xmm6, xmmword ptr [rsp+64]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rbp
mov rbx,qword ptr [rsp+8]
mov rsi,qword ptr [rsp+16]
mov rdi,qword ptr [rsp+24]
ret

View File

@ -0,0 +1,203 @@
;# XMRig
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
;# Copyright 2018-2019 tevador <tevador@gmail.com>
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
;#
;# This program is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
;#
;# Body of the 24-round permutation; entered by falling through from the label that the
;# including file defines just before this include.
;# Expects: rcx = state (one qword, then six 32-byte rows starting at offset 8),
;#          r8 = rot_left+96, r9 = rot_right+96, r10 = rndc.
;# Overwrites eax, rcx, r10 and ymm0-ymm15; the state is updated in place.
;# NOTE(review): the instruction sequence appears to follow the Theta, Rho+Pi, Chi
;# and Iota phases of the OpenSSL keccak1600-avx2 code - confirm the origin.
mov eax,24
lea rcx,[rcx+96]
;# Load the state: the first qword is broadcast to all four lanes of ymm0,
;# the six rows go to ymm1-ymm6.
vpbroadcastq ymm0,QWORD PTR [rcx-96]
vmovdqu ymm1,YMMWORD PTR [rcx-88]
vmovdqu ymm2,YMMWORD PTR [rcx-56]
vmovdqu ymm3,YMMWORD PTR [rcx-24]
vmovdqu ymm4,YMMWORD PTR [rcx+8]
vmovdqu ymm5,YMMWORD PTR [rcx+40]
vmovdqu ymm6,YMMWORD PTR [rcx+72]
ALIGN 64
Loop_avx2:
;# Theta: XOR the rows together, rotate the sums left by one (shift right 63,
;# add to itself, OR) and combine them into the correction values in ymm14 and ymm15.
vpshufd ymm13,ymm2,78
vpxor ymm12,ymm5,ymm3
vpxor ymm9,ymm4,ymm6
vpxor ymm12,ymm12,ymm1
vpxor ymm12,ymm12,ymm9
vpermq ymm11,ymm12,147
vpxor ymm13,ymm13,ymm2
vpermq ymm7,ymm13,78
vpsrlq ymm8,ymm12,63
vpaddq ymm9,ymm12,ymm12
vpor ymm8,ymm8,ymm9
vpermq ymm15,ymm8,57
vpxor ymm14,ymm8,ymm11
vpermq ymm14,ymm14,0
vpxor ymm13,ymm13,ymm0
vpxor ymm13,ymm13,ymm7
vpsrlq ymm7,ymm13,63
vpaddq ymm8,ymm13,ymm13
vpor ymm8,ymm8,ymm7
vpxor ymm2,ymm2,ymm14
vpxor ymm0,ymm0,ymm14
vpblendd ymm15,ymm15,ymm8,192
vpblendd ymm11,ymm11,ymm13,3
vpxor ymm15,ymm15,ymm11
;# Rho and lane shuffles: each row is XORed with ymm15 and rotated per lane by
;# combining a variable left shift (table at r8) with the complementary right
;# shift (table at r9); vpermq then reorders lanes for the next phase.
vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96]
vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96]
vpor ymm2,ymm2,ymm10
vpxor ymm3,ymm3,ymm15
vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32]
vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32]
vpor ymm3,ymm3,ymm11
vpxor ymm4,ymm4,ymm15
vpsllvq ymm12,ymm4,YMMWORD PTR [r8]
vpsrlvq ymm4,ymm4,YMMWORD PTR [r9]
vpor ymm4,ymm4,ymm12
vpxor ymm5,ymm5,ymm15
vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32]
vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32]
vpor ymm5,ymm5,ymm13
vpxor ymm6,ymm6,ymm15
vpermq ymm10,ymm2,141
vpermq ymm11,ymm3,141
vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64]
vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64]
vpor ymm8,ymm8,ymm14
vpxor ymm1,ymm1,ymm15
vpermq ymm12,ymm4,27
vpermq ymm13,ymm5,114
vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64]
vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64]
vpor ymm9,ymm9,ymm15
;# Chi: blends gather the neighbouring lanes, vpandn forms (NOT a) AND b, and the
;# result is XORed into the rotated rows held in ymm8-ymm13.
vpsrldq ymm14,ymm8,8
vpandn ymm7,ymm8,ymm14
vpblendd ymm3,ymm9,ymm13,12
vpblendd ymm15,ymm11,ymm9,12
vpblendd ymm5,ymm10,ymm11,12
vpblendd ymm14,ymm9,ymm10,12
vpblendd ymm3,ymm3,ymm11,48
vpblendd ymm15,ymm15,ymm12,48
vpblendd ymm5,ymm5,ymm9,48
vpblendd ymm14,ymm14,ymm13,48
vpblendd ymm3,ymm3,ymm12,192
vpblendd ymm15,ymm15,ymm13,192
vpblendd ymm5,ymm5,ymm13,192
vpblendd ymm14,ymm14,ymm11,192
vpandn ymm3,ymm3,ymm15
vpandn ymm5,ymm5,ymm14
vpblendd ymm6,ymm12,ymm9,12
vpblendd ymm15,ymm10,ymm12,12
vpxor ymm3,ymm3,ymm10
vpblendd ymm6,ymm6,ymm10,48
vpblendd ymm15,ymm15,ymm11,48
vpxor ymm5,ymm5,ymm12
vpblendd ymm6,ymm6,ymm11,192
vpblendd ymm15,ymm15,ymm9,192
vpandn ymm6,ymm6,ymm15
vpxor ymm6,ymm6,ymm13
vpermq ymm4,ymm8,30
vpblendd ymm15,ymm4,ymm0,48
vpermq ymm1,ymm8,57
vpblendd ymm1,ymm1,ymm0,192
vpandn ymm1,ymm1,ymm15
vpblendd ymm2,ymm11,ymm12,12
vpblendd ymm14,ymm13,ymm11,12
vpblendd ymm2,ymm2,ymm13,48
vpblendd ymm14,ymm14,ymm10,48
vpblendd ymm2,ymm2,ymm10,192
vpblendd ymm14,ymm14,ymm12,192
vpandn ymm2,ymm2,ymm14
vpxor ymm2,ymm2,ymm9
vpermq ymm7,ymm7,0
vpermq ymm3,ymm3,27
vpermq ymm5,ymm5,141
vpermq ymm6,ymm6,114
vpblendd ymm4,ymm13,ymm10,12
vpblendd ymm14,ymm12,ymm13,12
vpblendd ymm4,ymm4,ymm12,48
vpblendd ymm14,ymm14,ymm9,48
vpblendd ymm4,ymm4,ymm9,192
vpblendd ymm14,ymm14,ymm10,192
vpandn ymm4,ymm4,ymm14
vpxor ymm0,ymm0,ymm7
vpxor ymm1,ymm1,ymm8
vpxor ymm4,ymm4,ymm11
;# Iota: XOR this round's constant into ymm0 and step r10 to the next 32-byte row.
vpxor ymm0,ymm0,YMMWORD PTR [r10]
lea r10,[r10+32]
dec eax
jnz Loop_avx2
;# Write the state back using the same layout it was loaded with.
vmovq QWORD PTR [rcx-96],xmm0
vmovdqu YMMWORD PTR [rcx-88],ymm1
vmovdqu YMMWORD PTR [rcx-56],ymm2
vmovdqu YMMWORD PTR [rcx-24],ymm3
vmovdqu YMMWORD PTR [rcx+8],ymm4
vmovdqu YMMWORD PTR [rcx+40],ymm5
vmovdqu YMMWORD PTR [rcx+72],ymm6
ret
;# Per-lane left shift counts: six rows of four qwords, one row per state register.
;# The permutation reads the rows through r8 at offsets -96 to +64 in steps of 32.
ALIGN 32
rot_left:
dq 3, 18, 36, 41
dq 1, 62, 28, 27
dq 45, 6, 56, 39
dq 10, 61, 55, 8
dq 2, 15, 25, 20
dq 44, 43, 21, 14
;# Complementary right shift counts (64 minus the entry above), read through r9.
;# A left shift ORed with the matching right shift gives a 64-bit rotate.
ALIGN 32
rot_right:
dq 64-3, 64-18, 64-36, 64-41
dq 64-1, 64-62, 64-28, 64-27
dq 64-45, 64-6, 64-56, 64-39
dq 64-10, 64-61, 64-55, 64-8
dq 64-2, 64-15, 64-25, 64-20
dq 64-44, 64-43, 64-21, 64-14
;# Round constants: 24 rows, one per round, each value repeated in all four lanes.
;# The permutation reads one row per round through r10 and then advances r10 by 32.
ALIGN 32
rndc:
dq 1, 1, 1, 1
dq 32898, 32898, 32898, 32898
dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714
dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224
dq 32907, 32907, 32907, 32907
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585
dq 138, 138, 138, 138
dq 136, 136, 136, 136
dq 2147516425, 2147516425, 2147516425, 2147516425
dq 2147483658, 2147483658, 2147483658, 2147483658
dq 2147516555, 2147516555, 2147516555, 2147516555
dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947
dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713
dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579
dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578
dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936
dq 32778, 32778, 32778, 32778
dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704
dq 2147483649, 2147483649, 2147483649, 2147483649
dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232