crypto-native: add AArch64 AES-GCM native implementation
Type: feature

Change-Id: I4f96b0af13b875d491704b010328a1814e1dbda1
Signed-off-by: Damjan Marion <dmarion@me.com>
Committed by: Neale Ranns
Parent: 4e96ddaec8
Commit: 622b5ce619

@@ -26,7 +26,7 @@ endif()
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
   list(APPEND VARIANTS "armv8\;-march=armv8.1-a+crc+crypto")
-  set (COMPILE_FILES aes_cbc.c)
+  set (COMPILE_FILES aes_cbc.c aes_gcm.c)
   set (COMPILE_OPTS -Wall -fno-common)
 endif()
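
The armv8 variant above now compiles aes_gcm.c in addition to aes_cbc.c, with -march=armv8.1-a+crc+crypto, which is what exposes the Arm AES and polynomial-multiply instructions the code below relies on. A tiny stand-alone check (not part of the patch; it only shows which feature-test macro that flag turns on):

    #include <stdio.h>

    int
    main (void)
    {
    #if defined (__ARM_FEATURE_AES)
      puts ("Arm AES instructions available at compile time");
    #else
      puts ("no Arm AES instructions; build with -march=armv8-a+crypto or newer");
    #endif
      return 0;
    }
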
@@ -28,8 +28,6 @@ typedef enum
 #define AES_KEY_ROUNDS(x) (10 + x * 2)
 #define AES_KEY_BYTES(x) (16 + x * 8)
 
-#ifdef __x86_64__
-
 static const u8x16 byte_mask_scale = {
   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
@@ -37,21 +35,31 @@ static const u8x16 byte_mask_scale = {
 static_always_inline u8x16
 aes_block_load (u8 * p)
 {
-  return (u8x16) _mm_loadu_si128 ((__m128i *) p);
+  return *(u8x16u *) p;
 }
 
 static_always_inline u8x16
 aes_enc_round (u8x16 a, u8x16 k)
 {
+#if defined (__AES__)
   return (u8x16) _mm_aesenc_si128 ((__m128i) a, (__m128i) k);
+#elif defined (__ARM_FEATURE_AES)
+  return vaesmcq_u8 (vaeseq_u8 (a, u8x16_splat (0))) ^ k;
+#endif
 }
 
 static_always_inline u8x16
 aes_enc_last_round (u8x16 a, u8x16 k)
 {
+#if defined (__AES__)
   return (u8x16) _mm_aesenclast_si128 ((__m128i) a, (__m128i) k);
+#elif defined (__ARM_FEATURE_AES)
+  return vaeseq_u8 (a, u8x16_splat (0)) ^ k;
+#endif
 }
 
+#ifdef __x86_64__
+
 static_always_inline u8x16
 aes_dec_round (u8x16 a, u8x16 k)
 {
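
The interesting part of the hunk above is how the Arm path matches the x86 round semantics: AESE xors its key operand into the state before SubBytes/ShiftRows, while x86 AESENC applies SubBytes/ShiftRows/MixColumns first and xors the round key at the end. Passing an all-zero key to AESE and xoring the real round key after AESMC therefore reproduces the AESENC ordering. A minimal stand-alone sketch of that equivalence (the helper name is illustrative only; build with -march=armv8-a+crypto):

    #include <arm_neon.h>

    static inline uint8x16_t
    neon_aesenc_equiv (uint8x16_t state, uint8x16_t round_key)
    {
      /* AESE with a zero key makes AddRoundKey a no-op, leaving only
         SubBytes + ShiftRows */
      uint8x16_t t = vaeseq_u8 (state, vdupq_n_u8 (0));
      t = vaesmcq_u8 (t);              /* MixColumns */
      return veorq_u8 (t, round_key);  /* AddRoundKey last, as AESENC does */
    }

The last-round helper does the same without the AESMC step, matching AESENCLAST.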
@@ -63,11 +71,12 @@ aes_dec_last_round (u8x16 a, u8x16 k)
 {
   return (u8x16) _mm_aesdeclast_si128 ((__m128i) a, (__m128i) k);
 }
+#endif
 
 static_always_inline void
 aes_block_store (u8 * p, u8x16 r)
 {
-  _mm_storeu_si128 ((__m128i *) p, (__m128i) r);
+  *(u8x16u *) p = r;
 }
 
 static_always_inline u8x16
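
aes_block_load()/aes_block_store() now avoid the SSE loadu/storeu intrinsics by dereferencing an unaligned vector pointer (u8x16u), which both GCC and Clang lower to an unaligned 128-bit load/store on x86 and AArch64 alike. A self-contained sketch of the idea, with a local typedef standing in for what u8x16u is assumed to look like:

    #include <stdint.h>

    typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
    typedef uint8_t v16u8u __attribute__ ((vector_size (16), aligned (1)));

    static inline v16u8
    block_load (const uint8_t *p)
    {
      return *(const v16u8u *) p;   /* movdqu on x86, ldr q on AArch64 */
    }

    static inline void
    block_store (uint8_t *p, v16u8 r)
    {
      *(v16u8u *) p = r;
    }
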
@@ -91,31 +100,40 @@ aes_load_partial (u8x16u * p, int n_bytes)
 static_always_inline void
 aes_store_partial (void *p, u8x16 r, int n_bytes)
 {
+#if __aarch64__
+  clib_memcpy_fast (p, &r, n_bytes);
+#else
 #ifdef __AVX512F__
   _mm_mask_storeu_epi8 (p, (1 << n_bytes) - 1, (__m128i) r);
 #else
   u8x16 mask = u8x16_is_greater (u8x16_splat (n_bytes), byte_mask_scale);
   _mm_maskmoveu_si128 ((__m128i) r, (__m128i) mask, p);
 #endif
+#endif
 }
 
 
 static_always_inline u8x16
 aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks)
 {
-  int i;
+  int rounds = AES_KEY_ROUNDS (ks);
   block ^= round_keys[0];
-  for (i = 1; i < AES_KEY_ROUNDS (ks); i += 1)
+  for (int i = 1; i < rounds; i += 1)
     block = aes_enc_round (block, round_keys[i]);
-  return aes_enc_last_round (block, round_keys[i]);
+  return aes_enc_last_round (block, round_keys[rounds]);
 }
 
 static_always_inline u8x16
 aes_inv_mix_column (u8x16 a)
 {
+#if defined (__AES__)
   return (u8x16) _mm_aesimc_si128 ((__m128i) a);
+#elif defined (__ARM_FEATURE_AES)
+  return vaesimcq_u8 (a);
+#endif
 }
 
+#ifdef __x86_64__
 #define aes_keygen_assist(a, b) \
   (u8x16) _mm_aeskeygenassist_si128((__m128i) a, b)
 
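
On AArch64 there is no direct equivalent of the SSE byte-masked store, so aes_store_partial() falls back to copying the first n bytes of the register out through memory. A portable stand-alone version of that path (clib_memcpy_fast is assumed to behave like memcpy here):

    #include <stdint.h>
    #include <string.h>

    typedef uint8_t v16u8 __attribute__ ((vector_size (16)));

    static inline void
    store_partial (void *p, v16u8 r, int n_bytes)
    {
      memcpy (p, &r, n_bytes);   /* n_bytes is 1..15 for a trailing block */
    }

The same hunk also tidies aes_encrypt_block() so the final round key is indexed with the precomputed round count rather than a loop variable that is now local to the for statement.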
@@ -244,12 +262,6 @@ aes256_key_expand (u8x16 * rk, u8x16u const *k)
 
 #ifdef __aarch64__
 
-static_always_inline u8x16
-aes_inv_mix_column (u8x16 a)
-{
-  return vaesimcq_u8 (a);
-}
-
 static const u8x16 aese_prep_mask1 =
   { 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 };
 static const u8x16 aese_prep_mask2 =
@@ -18,7 +18,6 @@
 #include <vlib/vlib.h>
 #include <vnet/plugin/plugin.h>
 #include <vnet/crypto/crypto.h>
-#include <x86intrin.h>
 #include <crypto_native/crypto_native.h>
 #include <crypto_native/aes.h>
 #include <crypto_native/ghash.h>
@@ -35,18 +34,6 @@ typedef struct
   const u8x16 Ke[15];
 } aes_gcm_key_data_t;
 
-static const u32x4 last_byte_one = { 0, 0, 0, 1 << 24 };
-
-static const u8x16 bswap_mask = {
-  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-};
-
-static_always_inline u8x16
-aesni_gcm_bswap (u8x16 x)
-{
-  return (u8x16) _mm_shuffle_epi8 ((__m128i) x, (__m128i) bswap_mask);
-}
-
 static_always_inline void
 aesni_gcm_load (u8x16 * d, u8x16u * inv, int n, int n_bytes)
 {
@@ -70,6 +57,8 @@ static_always_inline void
 aesni_gcm_enc_first_round (u8x16 * r, u32x4 * Y, u32 * ctr, u8x16 k,
                            int n_blocks)
 {
+  static const u32x4 last_byte_one = { 0, 0, 0, 1 << 24 };
+
   if (PREDICT_TRUE ((u8) ctr[0] < (256 - n_blocks)))
     {
       for (int i = 0; i < n_blocks; i++)
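
The counter block Y is kept in the byte order GCM transmits it in (big-endian), so on a little-endian host adding (1 << 24) to the last 32-bit lane touches only the lane's top memory byte, i.e. the low byte of the big-endian counter; ctr[] apparently mirrors the counter in host order so the (u8) ctr[0] guard can tell when that byte would wrap and the cheap increment would be wrong. A small stand-alone illustration of the trick:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t v4u32 __attribute__ ((vector_size (16)));

    int
    main (void)
    {
      v4u32 Y = { 0, 0, 0, 0x01000000 };           /* big-endian counter = 1 */
      v4u32 last_byte_one = { 0, 0, 0, 1 << 24 };

      Y += last_byte_one;                          /* big-endian counter = 2 */
      printf ("last lane: 0x%08x\n", (unsigned) Y[3]);   /* prints 0x02000000 */
      return 0;
    }
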
@@ -115,9 +104,9 @@ aesni_gcm_ghash_blocks (u8x16 T, aes_gcm_key_data_t * kd,
 {
   ghash_data_t _gd, *gd = &_gd;
   const u8x16 *Hi = kd->Hi + n_blocks - 1;
-  ghash_mul_first (gd, aesni_gcm_bswap (in[0]) ^ T, Hi[0]);
+  ghash_mul_first (gd, u8x16_reflect (in[0]) ^ T, Hi[0]);
   for (int i = 1; i < n_blocks; i++)
-    ghash_mul_next (gd, aesni_gcm_bswap ((in[i])), Hi[-i]);
+    ghash_mul_next (gd, u8x16_reflect ((in[i])), Hi[-i]);
   ghash_reduce (gd);
   ghash_reduce2 (gd);
   return ghash_final (gd);
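
For reference, the aggregation that ghash_mul_first()/ghash_mul_next()/ghash_reduce() implement folds n byte-reflected blocks X_1..X_n into the running state T with a single reduction at the end; assuming Hi[i] holds the (i+1)-th power of the hash key H (all arithmetic in GF(2^128)):

    T' = (X_1 \oplus T) \cdot H^{n} \oplus X_2 \cdot H^{n-1} \oplus \cdots \oplus X_n \cdot H

which is the same value per-block Horner-style multiplication by H would produce, but with one reduction instead of n.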
@@ -158,7 +147,7 @@ aesni_gcm_ghash (u8x16 T, aes_gcm_key_data_t * kd, u8x16u * in, u32 n_left)
   if (n_left)
     {
       u8x16 r = aes_load_partial (in, n_left);
-      T = ghash_mul (aesni_gcm_bswap (r) ^ T, kd->Hi[0]);
+      T = ghash_mul (u8x16_reflect (r) ^ T, kd->Hi[0]);
     }
   return T;
 }
@@ -174,7 +163,7 @@ aesni_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   const u8x16 *rk = (u8x16 *) kd->Ke;
   int hidx = is_encrypt ? 4 : n, didx = 0;
 
-  _mm_prefetch (inv + 4, _MM_HINT_T0);
+  clib_prefetch_load (inv + 4);
 
   /* AES rounds 0 and 1 */
   aesni_gcm_enc_first_round (r, Y, ctr, rk[0], n);
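
_mm_prefetch() is x86-only, so the data prefetches in this file switch to VPP's portable helper. A sketch of what a clib_prefetch_load-style wrapper is assumed to boil down to, using the GCC/Clang builtin (PREFETCHT0 on x86, PRFM PLDL1KEEP on AArch64):

    static inline void
    prefetch_load (const void *p)
    {
      __builtin_prefetch (p, 0 /* read */, 3 /* keep in all cache levels */);
    }
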
@@ -186,7 +175,7 @@ aesni_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
 
   /* GHASH multiply block 1 */
   if (with_ghash)
-    ghash_mul_first (gd, aesni_gcm_bswap (d[didx++]) ^ T, kd->Hi[--hidx]);
+    ghash_mul_first (gd, u8x16_reflect (d[didx++]) ^ T, kd->Hi[--hidx]);
 
   /* AES rounds 2 and 3 */
   aesni_gcm_enc_round (r, rk[2], n);
@@ -194,7 +183,7 @@ aesni_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
 
   /* GHASH multiply block 2 */
   if (with_ghash && hidx)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[didx++]), kd->Hi[--hidx]);
+    ghash_mul_next (gd, u8x16_reflect (d[didx++]), kd->Hi[--hidx]);
 
   /* AES rounds 4 and 5 */
   aesni_gcm_enc_round (r, rk[4], n);
@@ -202,7 +191,7 @@ aesni_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
 
   /* GHASH multiply block 3 */
   if (with_ghash && hidx)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[didx++]), kd->Hi[--hidx]);
+    ghash_mul_next (gd, u8x16_reflect (d[didx++]), kd->Hi[--hidx]);
 
   /* AES rounds 6 and 7 */
   aesni_gcm_enc_round (r, rk[6], n);
@@ -210,7 +199,7 @@ aesni_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
 
   /* GHASH multiply block 4 */
   if (with_ghash && hidx)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[didx++]), kd->Hi[--hidx]);
+    ghash_mul_next (gd, u8x16_reflect (d[didx++]), kd->Hi[--hidx]);
 
   /* AES rounds 8 and 9 */
   aesni_gcm_enc_round (r, rk[8], n);
@@ -259,28 +248,28 @@ aesni_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   aesni_gcm_load (d, inv, 4, 0);
 
   /* GHASH multiply block 0 */
-  ghash_mul_first (gd, aesni_gcm_bswap (d[0]) ^ T, kd->Hi[7]);
+  ghash_mul_first (gd, u8x16_reflect (d[0]) ^ T, kd->Hi[7]);
 
   /* AES rounds 2 and 3 */
   aesni_gcm_enc_round (r, rk[2], 4);
   aesni_gcm_enc_round (r, rk[3], 4);
 
   /* GHASH multiply block 1 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[1]), kd->Hi[6]);
+  ghash_mul_next (gd, u8x16_reflect (d[1]), kd->Hi[6]);
 
   /* AES rounds 4 and 5 */
   aesni_gcm_enc_round (r, rk[4], 4);
   aesni_gcm_enc_round (r, rk[5], 4);
 
   /* GHASH multiply block 2 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[2]), kd->Hi[5]);
+  ghash_mul_next (gd, u8x16_reflect (d[2]), kd->Hi[5]);
 
   /* AES rounds 6 and 7 */
   aesni_gcm_enc_round (r, rk[6], 4);
   aesni_gcm_enc_round (r, rk[7], 4);
 
   /* GHASH multiply block 3 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[3]), kd->Hi[4]);
+  ghash_mul_next (gd, u8x16_reflect (d[3]), kd->Hi[4]);
 
   /* AES rounds 8 and 9 */
   aesni_gcm_enc_round (r, rk[8], 4);
@@ -301,7 +290,7 @@ aesni_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   aesni_gcm_load (d, inv + 4, 4, 0);
 
   /* GHASH multiply block 4 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[0]), kd->Hi[3]);
+  ghash_mul_next (gd, u8x16_reflect (d[0]), kd->Hi[3]);
 
   /* AES rounds 0, 1 and 2 */
   aesni_gcm_enc_first_round (r, Y, ctr, rk[0], 4);
@@ -309,21 +298,21 @@ aesni_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   aesni_gcm_enc_round (r, rk[2], 4);
 
   /* GHASH multiply block 5 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[1]), kd->Hi[2]);
+  ghash_mul_next (gd, u8x16_reflect (d[1]), kd->Hi[2]);
 
   /* AES rounds 3 and 4 */
   aesni_gcm_enc_round (r, rk[3], 4);
   aesni_gcm_enc_round (r, rk[4], 4);
 
   /* GHASH multiply block 6 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[2]), kd->Hi[1]);
+  ghash_mul_next (gd, u8x16_reflect (d[2]), kd->Hi[1]);
 
   /* AES rounds 5 and 6 */
   aesni_gcm_enc_round (r, rk[5], 4);
   aesni_gcm_enc_round (r, rk[6], 4);
 
   /* GHASH multiply block 7 */
-  ghash_mul_next (gd, aesni_gcm_bswap (d[3]), kd->Hi[0]);
+  ghash_mul_next (gd, u8x16_reflect (d[3]), kd->Hi[0]);
 
   /* AES rounds 7 and 8 */
   aesni_gcm_enc_round (r, rk[7], 4);
@@ -361,13 +350,13 @@ aesni_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   if (n_bytes)
     d[n_blocks - 1] = aes_byte_mask (d[n_blocks - 1], n_bytes);
 
-  ghash_mul_first (gd, aesni_gcm_bswap (d[0]) ^ T, kd->Hi[n_blocks - 1]);
+  ghash_mul_first (gd, u8x16_reflect (d[0]) ^ T, kd->Hi[n_blocks - 1]);
   if (n_blocks > 1)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[1]), kd->Hi[n_blocks - 2]);
+    ghash_mul_next (gd, u8x16_reflect (d[1]), kd->Hi[n_blocks - 2]);
   if (n_blocks > 2)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[2]), kd->Hi[n_blocks - 3]);
+    ghash_mul_next (gd, u8x16_reflect (d[2]), kd->Hi[n_blocks - 3]);
   if (n_blocks > 3)
-    ghash_mul_next (gd, aesni_gcm_bswap (d[3]), kd->Hi[n_blocks - 4]);
+    ghash_mul_next (gd, u8x16_reflect (d[3]), kd->Hi[n_blocks - 4]);
   ghash_reduce (gd);
   ghash_reduce2 (gd);
   return ghash_final (gd);
@@ -539,9 +528,9 @@ aes_gcm (u8x16u * in, u8x16u * out, u8x16u * addt, u8x16u * iv, u8x16u * tag,
   u32x4 Y0;
   ghash_data_t _gd, *gd = &_gd;
 
-  _mm_prefetch (iv, _MM_HINT_T0);
-  _mm_prefetch (in, _MM_HINT_T0);
-  _mm_prefetch (in + CLIB_CACHE_LINE_BYTES, _MM_HINT_T0);
+  clib_prefetch_load (iv);
+  clib_prefetch_load (in);
+  clib_prefetch_load (in + 4);
 
   /* calculate ghash for AAD - optimized for ipsec common cases */
   if (aad_bytes == 8)
@@ -561,7 +550,7 @@ aes_gcm (u8x16u * in, u8x16u * out, u8x16u * addt, u8x16u * iv, u8x16u * tag,
   else
     T = aesni_gcm_dec (T, kd, Y0, in, out, data_bytes, aes_rounds);
 
-  _mm_prefetch (tag, _MM_HINT_T0);
+  clib_prefetch_load (tag);
 
   /* Finalize ghash - data bytes and aad bytes converted to bits */
   /* *INDENT-OFF* */
@@ -581,7 +570,7 @@ aes_gcm (u8x16u * in, u8x16u * out, u8x16u * addt, u8x16u * iv, u8x16u * tag,
   for (; i < aes_rounds; i += 1)
     r = aes_enc_round (r, kd->Ke[i]);
   r = aes_enc_last_round (r, kd->Ke[aes_rounds]);
-  T = aesni_gcm_bswap (T) ^ r;
+  T = u8x16_reflect (T) ^ r;
 
   /* tag_len 16 -> 0 */
   tag_len &= 0xf;
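
The hunk above is the standard GCM tag construction: the byte-reflected GHASH state (already folded with the AAD and data bit lengths a few lines earlier) is xored with the encrypted initial counter block, i.e.

    Tag = E_K(J_0) \oplus GHASH_H(A, C)

where J_0 is the initial counter block (Y0 in this code), A the additional data and C the ciphertext.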
@@ -679,7 +668,7 @@ aesni_gcm_key_exp (vnet_crypto_key_t * key, aes_key_size_t ks)
 
   /* pre-calculate H */
   H = aes_encrypt_block (u8x16_splat (0), kd->Ke, ks);
-  H = aesni_gcm_bswap (H);
+  H = u8x16_reflect (H);
   ghash_precompute (H, (u8x16 *) kd->Hi, 8);
   return kd;
 }
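
Key setup is unchanged in structure: the hash key is the encryption of the all-zero block, reflected into GHASH bit order, and ghash_precompute() then appears to fill kd->Hi with eight of its powers so the 4- and 8-block paths above can multiply each block by the right power:

    H = E_K(0^{128}),   Hi[i] = H^{i+1}   for i = 0..7

(the i = 7 entry pairing with the oldest of eight in-flight blocks, consistent with the kd->Hi[7]..Hi[0] usage in aesni_gcm_calc_double).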
@@ -706,6 +695,8 @@ crypto_native_aes_gcm_init_vaes (vlib_main_t * vm)
 crypto_native_aes_gcm_init_avx512 (vlib_main_t * vm)
 #elif __AVX2__
 crypto_native_aes_gcm_init_avx2 (vlib_main_t * vm)
+#elif __aarch64__
+crypto_native_aes_gcm_init_neon (vlib_main_t * vm)
 #else
 crypto_native_aes_gcm_init_sse42 (vlib_main_t * vm)
 #endif
@@ -45,6 +45,7 @@ clib_error_t *crypto_native_aes_gcm_init_sse42 (vlib_main_t * vm);
 clib_error_t *crypto_native_aes_gcm_init_avx2 (vlib_main_t * vm);
 clib_error_t *crypto_native_aes_gcm_init_avx512 (vlib_main_t * vm);
 clib_error_t *crypto_native_aes_gcm_init_vaes (vlib_main_t * vm);
+clib_error_t *crypto_native_aes_gcm_init_neon (vlib_main_t * vm);
 #endif /* __crypto_native_h__ */
 
 /*
@@ -102,9 +102,10 @@ crypto_native_init (vlib_main_t * vm)
 }
 #endif
 #if __aarch64__
-  error = crypto_native_aes_cbc_init_neon (vm);
+  if ((error = crypto_native_aes_cbc_init_neon (vm)))
+    goto error;
 
-  if (error)
+  if ((error = crypto_native_aes_gcm_init_neon (vm)))
     goto error;
 #endif
 
@@ -194,6 +194,15 @@ u8x16_word_shift_right (u8x16 x, const int n)
   return vextq_u8 (x, u8x16_splat (0), n);
 }
 
+static_always_inline u8x16
+u8x16_reflect (u8x16 v)
+{
+  u8x16 mask = {
+    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+  };
+  return (u8x16) vqtbl1q_u8 (v, mask);
+}
+
 #define CLIB_HAVE_VEC128_MSB_MASK
 
 #define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
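
u8x16_reflect() replaces the file-local aesni_gcm_bswap(): it reverses the 16 bytes of a vector, via TBL on NEON here and via PSHUFB in the SSE4.2 header below. A quick stand-alone check of the expected behaviour, written with GCC's generic __builtin_shuffle rather than either intrinsic:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint8_t v16u8 __attribute__ ((vector_size (16)));

    static inline v16u8
    reflect (v16u8 v)
    {
      v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
      return __builtin_shuffle (v, mask);
    }

    int
    main (void)
    {
      v16u8 x = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
      v16u8 y = reflect (x);
      for (int i = 0; i < 16; i++)
        printf ("%d ", y[i]);   /* prints 15 14 13 ... 0 */
      printf ("\n");
      return 0;
    }
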
@@ -635,6 +635,15 @@ u16x8_byte_swap (u16x8 v)
   return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
 }
 
+static_always_inline u8x16
+u8x16_reflect (u8x16 v)
+{
+  u8x16 mask = {
+    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+  };
+  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
+}
+
 static_always_inline u32x4
 u32x4_hadd (u32x4 v1, u32x4 v2)
 {