crypto-native: 256-bit AES CBC support

Used on Intel client CPUs which support the VAES instruction set without AVX512.

Type: improvement
Change-Id: I5f816a1ea9f89a8d298d2c0f38d8d7c06f414ba0
Signed-off-by: Damjan Marion <damarion@cisco.com>
Committed by: Benoît Ganne
Parent: 1ca681838c
Commit: adeaf16960
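The new "adl" variant targets Intel client parts (Alder Lake and later) that implement VAES/VPCLMULQDQ on 256-bit vectors but do not expose AVX-512, so the existing 512-bit "icl" code path cannot run on them. The sketch below is illustrative only and is not part of the patch: it shows how that CPU class can be identified from CPUID. Inside VPP the equivalent checks are the clib_cpu_supports_vaes () / clib_cpu_supports_avx512f () calls visible in the main.c hunk further down.

/* Illustrative sketch, not VPP code: detect VAES-without-AVX512 CPUs. */
#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 1;

  int has_vaes = (ecx >> 9) & 1;     /* CPUID.(7,0):ECX bit 9  */
  int has_avx512f = (ebx >> 16) & 1; /* CPUID.(7,0):EBX bit 16 */

  if (has_vaes && !has_avx512f)
    printf ("VAES without AVX512 -> new 'adl' variant\n");
  else if (has_vaes && has_avx512f)
    printf ("VAES with AVX512 -> existing 'icl' variant\n");
  else
    printf ("no VAES -> skx/hsw/slm fallbacks\n");
  return 0;
}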
@@ -20,6 +20,9 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   if(compiler_flag_march_icelake_client AND compiler_flag_mprefer_vector_width_512)
     list(APPEND VARIANTS "icl\;-march=icelake-client -mprefer-vector-width=512")
   endif()
+  if(compiler_flag_march_alderlake)
+    list(APPEND VARIANTS "adl\;-march=alderlake -mprefer-vector-width=256")
+  endif()
   set (COMPILE_FILES aes_cbc.c aes_gcm.c)
   set (COMPILE_OPTS -Wall -fno-common -maes)
 endif()
@@ -48,7 +48,7 @@ aes_enc_round (u8x16 a, u8x16 k)
 #endif
 }
 
-#if defined (__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__)
 static_always_inline u8x64
 aes_enc_round_x4 (u8x64 a, u8x64 k)
 {
@@ -74,6 +74,32 @@ aes_dec_last_round_x4 (u8x64 a, u8x64 k)
 }
 #endif
 
+#ifdef __VAES__
+static_always_inline u8x32
+aes_enc_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_enc_last_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_last_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k);
+}
+#endif
+
 static_always_inline u8x16
 aes_enc_last_round (u8x16 a, u8x16 k)
 {
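The new *_x2 helpers above wrap the 256-bit VAES round instructions, which process two independent 16-byte AES blocks per instruction using only AVX2-width registers. A minimal standalone sketch of the same idea follows; it is illustrative only, the function name and key-schedule handling are assumptions rather than the plugin's API. It encrypts two blocks with a pre-expanded AES-128 key, broadcasting each 128-bit round key to both lanes much like the patch does with u8x32_splat_u8x16 (). Build with something like -mavx2 -mvaes or -march=alderlake.

#include <immintrin.h>

/* Hypothetical helper, not VPP API: encrypt two independent 16-byte
   blocks in one pass using the 256-bit (VAES) AES round instructions.
   rk[] is a pre-expanded AES-128 key schedule (11 round keys). */
static inline void
aes128_encrypt_x2 (const __m128i rk[11], const unsigned char in[32],
                   unsigned char out[32])
{
  /* two plaintext blocks side by side in one 256-bit register */
  __m256i b = _mm256_loadu_si256 ((const __m256i *) in);

  /* broadcast each 128-bit round key to both lanes */
  b = _mm256_xor_si256 (b, _mm256_broadcastsi128_si256 (rk[0]));
  for (int i = 1; i < 10; i++)
    b = _mm256_aesenc_epi128 (b, _mm256_broadcastsi128_si256 (rk[i]));
  b = _mm256_aesenclast_epi128 (b, _mm256_broadcastsi128_si256 (rk[10]));

  _mm256_storeu_si256 ((__m256i *) out, b);
}

Since CBC encryption chains blocks within one stream, the 2-wide (and 4-wide) forms pay off by interleaving blocks from independent operations rather than consecutive blocks of the same buffer.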
File diff suppressed because it is too large.
@@ -26,7 +26,7 @@
 #pragma GCC optimize ("O3")
 #endif
 
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
 #define NUM_HI 32
 #else
 #define NUM_HI 8
@@ -38,7 +38,7 @@ typedef struct
   const u8x16 Hi[NUM_HI];
   /* extracted AES key */
   const u8x16 Ke[15];
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
   const u8x64 Ke4[15];
 #endif
 } aes_gcm_key_data_t;
@@ -63,7 +63,6 @@ typedef enum
 
 static const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 };
 
-#ifndef __VAES__
 static_always_inline void
 aes_gcm_enc_first_round (u8x16 * r, aes_gcm_counter_t * ctr, u8x16 k,
                          int n_blocks)
@@ -107,7 +106,6 @@ aes_gcm_enc_last_round (u8x16 * r, u8x16 * d, u8x16 const *k,
   for (int i = 0; i < n_blocks; i++)
     d[i] ^= aes_enc_last_round (r[i], k[rounds]);
 }
-#endif
 
 static_always_inline u8x16
 aes_gcm_ghash_blocks (u8x16 T, aes_gcm_key_data_t * kd,
@@ -163,11 +161,10 @@ aes_gcm_ghash (u8x16 T, aes_gcm_key_data_t * kd, u8x16u * in, u32 n_left)
   return T;
 }
 
-#ifndef __VAES__
-static_always_inline u8x16
-aes_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
-              aes_gcm_counter_t * ctr, u8x16u * inv, u8x16u * outv,
-              int rounds, int n, int last_block_bytes, aes_gcm_flags_t f)
+static_always_inline __clib_unused u8x16
+aes_gcm_calc (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d,
+              aes_gcm_counter_t *ctr, u8x16u *inv, u8x16u *outv, int rounds,
+              int n, int last_block_bytes, aes_gcm_flags_t f)
 {
   u8x16 r[n];
   ghash_data_t _gd = { }, *gd = &_gd;
@@ -258,9 +255,9 @@ aes_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   return T;
 }
 
-static_always_inline u8x16
-aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
-                     aes_gcm_counter_t * ctr, u8x16u * inv, u8x16u * outv,
+static_always_inline __clib_unused u8x16
+aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d,
+                     aes_gcm_counter_t *ctr, u8x16u *inv, u8x16u *outv,
                      int rounds, aes_gcm_flags_t f)
 {
   u8x16 r[4];
@@ -396,9 +393,9 @@ aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   return ghash_final (gd);
 }
 
-static_always_inline u8x16
-aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
-                    int n_blocks, int n_bytes)
+static_always_inline __clib_unused u8x16
+aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d, int n_blocks,
+                    int n_bytes)
 {
   ghash_data_t _gd, *gd = &_gd;
   u8x16 *Hi = (u8x16 *) kd->Hi + NUM_HI - n_blocks;
@@ -417,9 +414,8 @@ aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d,
   ghash_reduce2 (gd);
   return ghash_final (gd);
 }
-#endif
 
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
 static const u32x16 ctr_inv_1234 = {
   0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24,
 };
@@ -757,7 +753,7 @@ aes_gcm_enc (u8x16 T, aes_gcm_key_data_t * kd, aes_gcm_counter_t * ctr,
   if (n_left == 0)
     return T;
 
-#if __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
   u8x64 d4[4];
   if (n_left < 256)
     {
@@ -939,7 +935,7 @@ aes_gcm_dec (u8x16 T, aes_gcm_key_data_t * kd, aes_gcm_counter_t * ctr,
              u8x16u * inv, u8x16u * outv, u32 n_left, int rounds)
 {
   aes_gcm_flags_t f = AES_GCM_F_WITH_GHASH | AES_GCM_F_DECRYPT;
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
   u8x64 d4[4] = { };
 
   while (n_left >= 512)
@@ -1045,7 +1041,7 @@ aes_gcm (u8x16u *in, u8x16u *out, u8x16u *addt, u8 *ivp, u8x16u *tag,
   Y0.as_u64x2[0] = *(u64u *) ivp;
   Y0.as_u32x4[2] = *(u32u *) (ivp + 8);
   Y0.as_u32x4 += ctr_inv_1;
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
   ctr->Y4 = u32x16_splat_u32x4 (Y0.as_u32x4) + ctr_inv_1234;
 #else
   ctr->Y = Y0.as_u32x4 + ctr_inv_1;
@@ -1177,7 +1173,7 @@ aes_gcm_key_exp (vnet_crypto_key_t * key, aes_key_size_t ks)
   H = aes_encrypt_block (u8x16_splat (0), kd->Ke, ks);
   H = u8x16_reflect (H);
   ghash_precompute (H, (u8x16 *) kd->Hi, NUM_HI);
-#ifdef __VAES__
+#if defined(__VAES__) && defined(__AVX512F__)
   u8x64 *Ke4 = (u8x64 *) kd->Ke4;
   for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
     Ke4[i] = u8x64_splat_u8x16 (kd->Ke[i]);
@@ -1201,8 +1197,8 @@ foreach_aes_gcm_handler_type;
 #undef _
 
 clib_error_t *
-#ifdef __VAES__
-crypto_native_aes_gcm_init_icl (vlib_main_t * vm)
+#if defined(__VAES__) && defined(__AVX512F__)
+crypto_native_aes_gcm_init_icl (vlib_main_t *vm)
 #elif __AVX512F__
 crypto_native_aes_gcm_init_skx (vlib_main_t * vm)
 #elif __AVX2__
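A note on the hunks above: instead of compiling the 128-bit GCM helpers out with #ifndef __VAES__, they are now kept in every variant and tagged __clib_unused, which in the VPP headers maps to the compiler's "unused" attribute, so builds whose fast path never references them stay quiet under -Wall. A tiny standalone illustration of that attribute follows; the names are made up and it is not VPP code.

/* Illustrative only: compile with -Wall -Wunused-function. */
#define my_unused __attribute__ ((unused)) /* roughly what __clib_unused does */

static my_unused int
rarely_used_helper (int x) /* not referenced in this build: no warning */
{
  return x * 2;
}

int
main (void)
{
  return 0;
}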
@@ -29,7 +29,8 @@ typedef struct
 
 extern crypto_native_main_t crypto_native_main;
 
-#define foreach_crypto_native_march_variant _(slm) _(hsw) _(skx) _(icl) _(neon)
+#define foreach_crypto_native_march_variant                                  \
+  _ (slm) _ (hsw) _ (skx) _ (icl) _ (adl) _ (neon)
 
 #define _(v) \
   clib_error_t __clib_weak *crypto_native_aes_cbc_init_##v (vlib_main_t * vm); \
@@ -257,7 +257,7 @@ ghash_mul (u8x16 a, u8x16 b)
   return ghash_final (gd);
 }
 
-#ifdef __VPCLMULQDQ__
+#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)
 
 static const u8x64 ghash4_poly2 = {
   0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
@@ -78,6 +78,8 @@ crypto_native_init (vlib_main_t * vm)
   else if (crypto_native_aes_cbc_init_icl && clib_cpu_supports_vaes () &&
            clib_cpu_supports_avx512f ())
     error = crypto_native_aes_cbc_init_icl (vm);
+  else if (crypto_native_aes_cbc_init_adl && clib_cpu_supports_vaes ())
+    error = crypto_native_aes_cbc_init_adl (vm);
   else if (crypto_native_aes_cbc_init_skx && clib_cpu_supports_avx512f ())
     error = crypto_native_aes_cbc_init_skx (vm);
   else if (crypto_native_aes_cbc_init_hsw && clib_cpu_supports_avx2 ())
@@ -101,6 +103,8 @@ crypto_native_init (vlib_main_t * vm)
   if (crypto_native_aes_gcm_init_icl && clib_cpu_supports_vaes () &&
       clib_cpu_supports_avx512f ())
     error = crypto_native_aes_gcm_init_icl (vm);
+  else if (crypto_native_aes_gcm_init_adl && clib_cpu_supports_vaes ())
+    error = crypto_native_aes_gcm_init_adl (vm);
   else if (crypto_native_aes_gcm_init_skx && clib_cpu_supports_avx512f ())
     error = crypto_native_aes_gcm_init_skx (vm);
   else if (crypto_native_aes_gcm_init_hsw && clib_cpu_supports_avx2 ())
@@ -213,6 +213,16 @@ u32x8_hxor (u32x8 v)
   return v4[0];
 }
 
+static_always_inline u8x32
+u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
+{
+#if __AVX512F__
+  return (u8x32) _mm256_ternarylogic_epi32 ((__m256i) a, (__m256i) b,
+                                            (__m256i) c, 0x96);
+#endif
+  return a ^ b ^ c;
+}
+
 static_always_inline u16x16
 u16x16_mask_last (u16x16 v, u8 n_last)
 {
@@ -391,6 +401,12 @@ u64x4_transpose (u64x4 a[8])
   a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
 }
 
+static_always_inline u8x32
+u8x32_splat_u8x16 (u8x16 a)
+{
+  return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
+}
+
 #endif /* included_vector_avx2_h */
 
 /*
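The 0x96 immediate passed to _mm256_ternarylogic_epi32 in the new u8x32_xor3 helper is simply the 8-entry truth table of a ^ b ^ c, indexed by the bit triple (a, b, c). A throwaway check of that value (illustrative only, not part of the patch):

#include <stdio.h>

int
main (void)
{
  unsigned char imm = 0;

  /* set bit (a<<2 | b<<1 | c) whenever a ^ b ^ c is 1 */
  for (int a = 0; a < 2; a++)
    for (int b = 0; b < 2; b++)
      for (int c = 0; c < 2; c++)
        if (a ^ b ^ c)
          imm |= 1u << ((a << 2) | (b << 1) | c);

  printf ("0x%02x\n", imm); /* prints 0x96 */
  return 0;
}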