vppinfra: AES-CBC and AES-GCM refactor and optimizations

- crypto code moved to vppinfra for better testing and reuse
- added 256-bit VAES support (Intel Client CPUs)
- added AES_GMAC functions

Change-Id: I960c8e14ca0a0126703e8f1589d86f32e2a98361
Type: improvement
Signed-off-by: Damjan Marion <damarion@cisco.com>
This commit is contained in:
Damjan Marion
2023-03-15 11:42:06 +00:00
committed by Ole Trøan
parent 5527a78ed9
commit b47376f0b4
15 changed files with 3307 additions and 2293 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+7 -1
View File
@@ -128,6 +128,11 @@ set(VPPINFRA_HEADERS
clib.h
cpu.h
crc32.h
crypto/sha2.h
crypto/ghash.h
crypto/aes.h
crypto/aes_cbc.h
crypto/aes_gcm.h
dlist.h
dlmalloc.h
elf_clib.h
@@ -168,7 +173,6 @@ set(VPPINFRA_HEADERS
random_isaac.h
rbtree.h
serialize.h
sha2.h
smp.h
socket.h
sparse_vec.h
@@ -278,6 +282,8 @@ if(VPP_BUILD_VPPINFRA_TESTS)
endif(VPP_BUILD_VPPINFRA_TESTS)
set(test_files
test/aes_cbc.c
test/aes_gcm.c
test/array_mask.c
test/compress.c
test/count_equal.c
@@ -28,10 +28,6 @@ typedef enum
#define AES_KEY_ROUNDS(x) (10 + x * 2)
#define AES_KEY_BYTES(x) (16 + x * 8)
static const u8x16 byte_mask_scale = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
static_always_inline u8x16
aes_block_load (u8 * p)
{
@@ -131,43 +127,6 @@ aes_block_store (u8 * p, u8x16 r)
*(u8x16u *) p = r;
}
/* Zero all bytes of x past the first n_bytes.
   byte_mask_scale is {0..15}; u8x16_splat (n_bytes) > byte_mask_scale yields
   0xff in lanes 0..n_bytes-1 and 0x00 elsewhere. */
static_always_inline u8x16
aes_byte_mask (u8x16 x, u8 n_bytes)
{
  return x & (u8x16_splat (n_bytes) > byte_mask_scale);
}
/* Load the first n_bytes from p into a 128-bit vector, zeroing the rest.
   AVX-512 path uses a masked load so no bytes past p[n_bytes-1] are touched;
   fallback copies via memcpy into a zeroed vector.
   NOTE(review): the ASSERT allows n_bytes == 16 but the fallback's
   CLIB_ASSUME requires n_bytes < 16 — presumably callers never pass 16 on
   the non-AVX512 path; confirm. */
static_always_inline u8x16
aes_load_partial (u8x16u * p, int n_bytes)
{
  ASSERT (n_bytes <= 16);
#ifdef __AVX512F__
  __m128i zero = { };
  return (u8x16) _mm_mask_loadu_epi8 (zero, (1 << n_bytes) - 1, p);
#else
  u8x16 v = {};
  CLIB_ASSUME (n_bytes < 16);
  clib_memcpy_fast (&v, p, n_bytes);
  return v;
#endif
}
/* Store the first n_bytes of r to p; bytes past p[n_bytes-1] are not
   written. aarch64 uses a plain bounded memcpy; x86 uses a masked store
   (AVX-512) or MASKMOVDQU otherwise.
   NOTE(review): _mm_maskmoveu_si128 has a non-temporal store hint —
   presumably acceptable for these tail stores; confirm cache behaviour is
   intended. */
static_always_inline void
aes_store_partial (void *p, u8x16 r, int n_bytes)
{
#if __aarch64__
  clib_memcpy_fast (p, &r, n_bytes);
#else
#ifdef __AVX512F__
  _mm_mask_storeu_epi8 (p, (1 << n_bytes) - 1, (__m128i) r);
#else
  u8x16 mask = u8x16_splat (n_bytes) > byte_mask_scale;
  _mm_maskmoveu_si128 ((__m128i) r, (__m128i) mask, p);
#endif
#endif
}
static_always_inline u8x16
aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks)
{
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -86,7 +86,7 @@
* This allows us to improve performance by deferring reduction. For example
* to calculate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
*
* __i128 Hi[4];
* u8x16 Hi[4];
* ghash_precompute (H, Hi, 4);
*
* ghash_data_t _gd, *gd = &_gd;
@@ -151,6 +151,8 @@ gmul_hi_hi (u8x16 a, u8x16 b)
typedef struct
{
u8x16 mid, hi, lo, tmp_lo, tmp_hi;
u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
int pending;
} ghash_data_t;
@@ -172,7 +174,7 @@ ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
/* a0 * b0 */
gd->lo = gmul_lo_lo (a, b);
/* a0 * b1 ^ a1 * b0 */
gd->mid = (gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b));
gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);
/* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
there is no pending data in tmp_lo and tmp_hi */
@@ -270,12 +272,6 @@ static const u8x64 ghash4_poly2 = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};
typedef struct
{
u8x64 hi, lo, mid, tmp_lo, tmp_hi;
int pending;
} ghash4_data_t;
static_always_inline u8x64
gmul4_lo_lo (u8x64 a, u8x64 b)
{
@@ -300,18 +296,17 @@ gmul4_hi_hi (u8x64 a, u8x64 b)
return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
}
static_always_inline void
ghash4_mul_first (ghash4_data_t * gd, u8x64 a, u8x64 b)
ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
{
gd->hi = gmul4_hi_hi (a, b);
gd->lo = gmul4_lo_lo (a, b);
gd->mid = (gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b));
gd->hi4 = gmul4_hi_hi (a, b);
gd->lo4 = gmul4_lo_lo (a, b);
gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
gd->pending = 0;
}
static_always_inline void
ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
{
u8x64 hi = gmul4_hi_hi (a, b);
u8x64 lo = gmul4_lo_lo (a, b);
@@ -319,63 +314,62 @@ ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
if (gd->pending)
{
/* there is pending data from previous invocation so we can XOR */
gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, hi);
gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, lo);
gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
gd->pending = 0;
}
else
{
/* there is no pending data from previous invocation so we postpone XOR */
gd->tmp_hi = hi;
gd->tmp_lo = lo;
gd->tmp_hi4 = hi;
gd->tmp_lo4 = lo;
gd->pending = 1;
}
gd->mid = u8x64_xor3 (gd->mid, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
}
static_always_inline void
ghash4_reduce (ghash4_data_t * gd)
ghash4_reduce (ghash_data_t *gd)
{
u8x64 r;
/* Final combination:
gd->lo ^= gd->mid << 64
gd->hi ^= gd->mid >> 64 */
gd->lo4 ^= gd->mid4 << 64
gd->hi4 ^= gd->mid4 >> 64 */
u8x64 midl = u8x64_word_shift_left (gd->mid, 8);
u8x64 midr = u8x64_word_shift_right (gd->mid, 8);
u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);
if (gd->pending)
{
gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, midl);
gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, midr);
gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
}
else
{
gd->lo ^= midl;
gd->hi ^= midr;
gd->lo4 ^= midl;
gd->hi4 ^= midr;
}
r = gmul4_hi_lo (ghash4_poly2, gd->lo);
gd->lo ^= u8x64_word_shift_left (r, 8);
r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
gd->lo4 ^= u8x64_word_shift_left (r, 8);
}
static_always_inline void
ghash4_reduce2 (ghash4_data_t * gd)
ghash4_reduce2 (ghash_data_t *gd)
{
gd->tmp_lo = gmul4_lo_lo (ghash4_poly2, gd->lo);
gd->tmp_hi = gmul4_lo_hi (ghash4_poly2, gd->lo);
gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
}
static_always_inline u8x16
ghash4_final (ghash4_data_t * gd)
ghash4_final (ghash_data_t *gd)
{
u8x64 r;
u8x32 t;
r = u8x64_xor3 (gd->hi, u8x64_word_shift_right (gd->tmp_lo, 4),
u8x64_word_shift_left (gd->tmp_hi, 4));
r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
u8x64_word_shift_left (gd->tmp_hi4, 4));
/* horizontal XOR of 4 128-bit lanes */
t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
@@ -383,6 +377,117 @@ ghash4_final (ghash4_data_t * gd)
}
#endif
#if defined(__VPCLMULQDQ__)
static const u8x32 ghash2_poly2 = {
0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};
/* Per 128-bit lane: carry-less multiply of the low 64-bit halves
   (imm 0x00 selects a0 * b0 in each lane). */
static_always_inline u8x32
gmul2_lo_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
}
/* Per 128-bit lane: carry-less multiply of a's high 64-bit half with b's
   low half (imm 0x01 selects a1 * b0 in each lane). */
static_always_inline u8x32
gmul2_hi_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
}
/* Per 128-bit lane: carry-less multiply of a's low 64-bit half with b's
   high half (imm 0x10 selects a0 * b1 in each lane). */
static_always_inline u8x32
gmul2_lo_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
}
/* Per 128-bit lane: carry-less multiply of the high 64-bit halves
   (imm 0x11 selects a1 * b1 in each lane). */
static_always_inline u8x32
gmul2_hi_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
}
/* Start a 2-lane (256-bit) GHASH accumulation: compute the partial
   carry-less products of a and b in each 128-bit lane and reset the
   deferred-XOR state. */
static_always_inline void
ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  /* a1 * b1 */
  gd->hi2 = gmul2_hi_hi (a, b);
  /* a0 * b0 */
  gd->lo2 = gmul2_lo_lo (a, b);
  /* a0 * b1 ^ a1 * b0 */
  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
  gd->pending = 0;
}
/* Multiply the next 256-bit pair of blocks and accumulate the partial
   products. hi/lo folding is deferred through tmp_hi2/tmp_lo2 on every
   other call (tracked by gd->pending) to shorten the XOR dependency
   chain. */
static_always_inline void
ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
{
  /* a1 * b1 */
  u8x32 hi = gmul2_hi_hi (a, b);
  /* a0 * b0 */
  u8x32 lo = gmul2_lo_lo (a, b);
  if (gd->pending)
    {
      /* there is pending data from previous invocation so we can XOR */
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from previous invocation so we postpone XOR */
      gd->tmp_hi2 = hi;
      gd->tmp_lo2 = lo;
      gd->pending = 1;
    }
  /* a0 * b1 ^ a1 * b0 */
  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
}
/* First reduction step: fold mid2 (and any pending tmp_* products) into
   lo2/hi2, then start the polynomial reduction of the low part with
   ghash2_poly2. */
static_always_inline void
ghash2_reduce (ghash_data_t *gd)
{
  u8x32 r;

  /* Final combination:
     gd->lo2 ^= gd->mid2 << 64
     gd->hi2 ^= gd->mid2 >> 64 */

  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);

  if (gd->pending)
    {
      /* fold in products left deferred by the last ghash2_mul_next call */
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
    }
  else
    {
      gd->lo2 ^= midl;
      gd->hi2 ^= midr;
    }

  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
  gd->lo2 ^= u8x32_word_shift_left (r, 8);
}
/* Second reduction step: multiply the partially-reduced low part by the
   polynomial; the results are combined in ghash2_final. */
static_always_inline void
ghash2_reduce2 (ghash_data_t *gd)
{
  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
}
/* Final reduction step: combine hi2 with the shifted reduction products
   and collapse the two 128-bit lanes into the 128-bit GHASH result. */
static_always_inline u8x16
ghash2_final (ghash_data_t *gd)
{
  u8x32 r;

  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
		  u8x32_word_shift_left (gd->tmp_hi2, 4));

  /* horizontal XOR of 2 128-bit lanes */
  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
}
#endif
static_always_inline void
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
{
@@ -398,9 +503,7 @@ ghash_precompute (u8x16 H, u8x16 * Hi, int n)
#else
r32[3] = r32[0];
#endif
/* *INDENT-OFF* */
r32 = r32 == (u32x4) {1, 0, 0, 1};
/* *INDENT-ON* */
Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
/* calculate H^(i + 1) */
@@ -410,10 +513,3 @@ ghash_precompute (u8x16 H, u8x16 * Hi, int n)
#endif /* __ghash_h__ */
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/
+15 -18
View File
@@ -24,25 +24,21 @@ format_perfmon_bundle_default (u8 *s, va_list *args)
case 1:
return format (s, "%5.2f", (f64) d[2] / d[0]);
case 2:
if (c->n_ops > 1)
return format (s, "%8.2f", (f64) d[0] / c->n_ops);
else
return format (s, "%8u", d[0]);
return format (s, "%8u", d[0]);
case 3:
if (c->n_ops > 1)
return format (s, "%8.2f", (f64) d[2] / c->n_ops);
else
return format (s, "%8u", d[2]);
return format (s, "%8.2f", (f64) d[0] / c->n_ops);
case 4:
if (c->n_ops > 1)
return format (s, "%9.2f", (f64) d[3] / c->n_ops);
else
return format (s, "%9u", d[3]);
return format (s, "%8u", d[2]);
case 5:
if (c->n_ops > 1)
return format (s, "%10.2f", (f64) d[4] / c->n_ops);
else
return format (s, "%10u", d[4]);
return format (s, "%8.2f", (f64) d[2] / c->n_ops);
case 6:
return format (s, "%9u", d[3]);
case 7:
return format (s, "%9.2f", (f64) d[3] / c->n_ops);
case 8:
return format (s, "%10u", d[4]);
case 9:
return format (s, "%10.2f", (f64) d[4] / c->n_ops);
default:
return s;
}
@@ -59,6 +55,7 @@ CLIB_PERFMON_BUNDLE (default) = {
.config[4] = PERF_COUNT_HW_BRANCH_MISSES,
.n_events = 5,
.format_fn = format_perfmon_bundle_default,
.column_headers = CLIB_STRING_ARRAY ("Freq", "IPC", "Clks/Op", "Inst/Op",
"Brnch/Op", "BrMiss/Op"),
.column_headers = CLIB_STRING_ARRAY ("Freq", "IPC", "Clks", "Clks/Op",
"Inst", "Inst/Op", "Brnch", "Brnch/Op",
"BrMiss", "BrMiss/Op"),
};
-637
View File
File diff suppressed because it is too large Load Diff
+187
View File
@@ -0,0 +1,187 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright(c) 2021 Cisco Systems, Inc.
*/
#ifdef __AES__
#include <vppinfra/format.h>
#include <vppinfra/test/test.h>
#include <vppinfra/crypto/aes_cbc.h>
/* Known-answer test vectors for AES-CBC.
   NOTE(review): key/plaintext/ciphertext values appear to be the standard
   NIST SP 800-38A AES-CBC example vectors — confirm against the spec. */
static const u8 iv[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
};

/* four 16-byte blocks of reference plaintext */
static const u8 plaintext[] = {
  0x6B, 0xC1, 0xBE, 0xE2, 0x2E, 0x40, 0x9F, 0x96, 0xE9, 0x3D, 0x7E, 0x11, 0x73,
  0x93, 0x17, 0x2A, 0xAE, 0x2D, 0x8A, 0x57, 0x1E, 0x03, 0xAC, 0x9C, 0x9E, 0xB7,
  0x6F, 0xAC, 0x45, 0xAF, 0x8E, 0x51, 0x30, 0xC8, 0x1C, 0x46, 0xA3, 0x5C, 0xE4,
  0x11, 0xE5, 0xFB, 0xC1, 0x19, 0x1A, 0x0A, 0x52, 0xEF, 0xF6, 0x9F, 0x24, 0x45,
  0xDF, 0x4F, 0x9B, 0x17, 0xAD, 0x2B, 0x41, 0x7B, 0xE6, 0x6C, 0x37, 0x10,
};

/* AES-128 key and the expected CBC ciphertext of `plaintext` under `iv` */
static const u8 key128[] = { 0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6,
			     0xAB, 0xF7, 0x15, 0x88, 0x09, 0xCF, 0x4F, 0x3C };

/* AES-192 key */
static const u8 key192[24] = {
  0x8E, 0x73, 0xB0, 0xF7, 0xDA, 0x0E, 0x64, 0x52, 0xC8, 0x10, 0xF3, 0x2B,
  0x80, 0x90, 0x79, 0xE5, 0x62, 0xF8, 0xEA, 0xD2, 0x52, 0x2C, 0x6B, 0x7B,
};

static const u8 ciphertext128[] = {
  0x76, 0x49, 0xAB, 0xAC, 0x81, 0x19, 0xB2, 0x46, 0xCE, 0xE9, 0x8E, 0x9B, 0x12,
  0xE9, 0x19, 0x7D, 0x50, 0x86, 0xCB, 0x9B, 0x50, 0x72, 0x19, 0xEE, 0x95, 0xDB,
  0x11, 0x3A, 0x91, 0x76, 0x78, 0xB2, 0x73, 0xBE, 0xD6, 0xB8, 0xE3, 0xC1, 0x74,
  0x3B, 0x71, 0x16, 0xE6, 0x9E, 0x22, 0x22, 0x95, 0x16, 0x3F, 0xF1, 0xCA, 0xA1,
  0x68, 0x1F, 0xAC, 0x09, 0x12, 0x0E, 0xCA, 0x30, 0x75, 0x86, 0xE1, 0xA7,
};

static const u8 ciphertext192[64] = {
  0x4F, 0x02, 0x1D, 0xB2, 0x43, 0xBC, 0x63, 0x3D, 0x71, 0x78, 0x18, 0x3A, 0x9F,
  0xA0, 0x71, 0xE8, 0xB4, 0xD9, 0xAD, 0xA9, 0xAD, 0x7D, 0xED, 0xF4, 0xE5, 0xE7,
  0x38, 0x76, 0x3F, 0x69, 0x14, 0x5A, 0x57, 0x1B, 0x24, 0x20, 0x12, 0xFB, 0x7A,
  0xE0, 0x7F, 0xA9, 0xBA, 0xAC, 0x3D, 0xF1, 0x02, 0xE0, 0x08, 0xB0, 0xE2, 0x79,
  0x88, 0x59, 0x88, 0x81, 0xD9, 0x20, 0xA9, 0xE6, 0x4F, 0x56, 0x15, 0xCD,
};

/* AES-256 key */
static const u8 key256[32] = {
  0x60, 0x3D, 0xEB, 0x10, 0x15, 0xCA, 0x71, 0xBE, 0x2B, 0x73, 0xAE,
  0xF0, 0x85, 0x7D, 0x77, 0x81, 0x1F, 0x35, 0x2C, 0x07, 0x3B, 0x61,
  0x08, 0xD7, 0x2D, 0x98, 0x10, 0xA3, 0x09, 0x14, 0xDF, 0xF4,
};

static const u8 ciphertext256[64] = {
  0xF5, 0x8C, 0x4C, 0x04, 0xD6, 0xE5, 0xF1, 0xBA, 0x77, 0x9E, 0xAB, 0xFB, 0x5F,
  0x7B, 0xFB, 0xD6, 0x9C, 0xFC, 0x4E, 0x96, 0x7E, 0xDB, 0x80, 0x8D, 0x67, 0x9F,
  0x77, 0x7B, 0xC6, 0x70, 0x2C, 0x7D, 0x39, 0xF2, 0x33, 0x69, 0xA9, 0xD9, 0xBA,
  0xCF, 0xA5, 0x30, 0xE2, 0x63, 0x04, 0x23, 0x14, 0x61, 0xB2, 0xEB, 0x05, 0xE2,
  0xC3, 0x9B, 0xE9, 0xFC, 0xDA, 0x6C, 0x19, 0x07, 0x8C, 0x6A, 0x9D, 0x1B,
};
/* For each AES key size b, generate:
   - test_clib_aes<b>_cbc_encrypt: known-answer test encrypting `plaintext`
     and comparing against the reference ciphertext<b>
   - perftest_aes<b>_enc_var_sz: variable-size encrypt perf measurement
     (tp->n_ops is the buffer size in bytes)
   Fix: the failure message previously said "doesn't match plaintext" even
   though the comparison is against the expected ciphertext. */
#define _(b)                                                                  \
  static clib_error_t *test_clib_aes##b##_cbc_encrypt (clib_error_t *err)     \
  {                                                                           \
    aes_cbc_key_data_t k;                                                     \
    u8 data[512];                                                             \
    clib_aes##b##_cbc_key_expand (&k, key##b);                                \
    clib_aes##b##_cbc_encrypt (&k, plaintext, sizeof (plaintext), iv, data);  \
    if (memcmp (ciphertext##b, data, sizeof (ciphertext##b)) != 0)            \
      err = clib_error_return (                                               \
	err, "encrypted data doesn't match expected ciphertext");             \
    return err;                                                               \
  }                                                                           \
  void __test_perf_fn perftest_aes##b##_enc_var_sz (test_perf_t *tp)         \
  {                                                                           \
    u32 n = tp->n_ops;                                                        \
    aes_cbc_key_data_t *kd = test_mem_alloc (sizeof (*kd));                   \
    u8 *dst = test_mem_alloc (n + 16);                                        \
    u8 *src = test_mem_alloc_and_fill_inc_u8 (n + 16, 0, 0);                  \
    clib_aes##b##_cbc_key_expand (kd, key##b);                                \
                                                                              \
    test_perf_event_enable (tp);                                              \
    clib_aes##b##_cbc_encrypt (kd, src, n, iv, dst);                          \
    test_perf_event_disable (tp);                                             \
  }

_ (128)
_ (192)
_ (256)
#undef _
/* Registrations for the generated CBC encrypt tests. Each key size gets one
   functional test plus two perf runs of the same function at different
   buffer sizes (n_ops is the byte count fed to the encrypt call). */
REGISTER_TEST (clib_aes128_cbc_encrypt) = {
  .name = "clib_aes128_cbc_encrypt",
  .fn = test_clib_aes128_cbc_encrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes128_enc_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes128_enc_var_sz }),
};

REGISTER_TEST (clib_aes192_cbc_encrypt) = {
  .name = "clib_aes192_cbc_encrypt",
  .fn = test_clib_aes192_cbc_encrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes192_enc_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes192_enc_var_sz }),
};

REGISTER_TEST (clib_aes256_cbc_encrypt) = {
  .name = "clib_aes256_cbc_encrypt",
  .fn = test_clib_aes256_cbc_encrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes256_enc_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes256_enc_var_sz }),
};
/* For each AES key size b, generate:
   - test_clib_aes<b>_cbc_decrypt: known-answer test decrypting
     ciphertext<b> and comparing against the reference plaintext
   - perftest_aes<b>_dec_var_sz: variable-size decrypt perf measurement
     (tp->n_ops is the buffer size in bytes) */
#define _(b)                                                                  \
  static clib_error_t *test_clib_aes##b##_cbc_decrypt (clib_error_t *err)     \
  {                                                                           \
    aes_cbc_key_data_t k;                                                     \
    u8 data[512];                                                             \
    clib_aes##b##_cbc_key_expand (&k, key##b);                                \
    clib_aes##b##_cbc_decrypt (&k, ciphertext##b, sizeof (ciphertext##b), iv, \
			       data);                                         \
    if (memcmp (plaintext, data, sizeof (plaintext)) != 0)                    \
      err =                                                                   \
	clib_error_return (err, "decrypted data doesn't match plaintext");    \
    return err;                                                               \
  }                                                                           \
  void __test_perf_fn perftest_aes##b##_dec_var_sz (test_perf_t *tp)         \
  {                                                                           \
    u32 n = tp->n_ops;                                                        \
    aes_cbc_key_data_t *kd = test_mem_alloc (sizeof (*kd));                   \
    u8 *dst = test_mem_alloc (n + 16);                                        \
    u8 *src = test_mem_alloc_and_fill_inc_u8 (n + 16, 0, 0);                  \
    clib_aes##b##_cbc_key_expand (kd, key##b);                                \
                                                                              \
    test_perf_event_enable (tp);                                              \
    clib_aes##b##_cbc_decrypt (kd, src, n, iv, dst);                          \
    test_perf_event_disable (tp);                                             \
  }

_ (128)
_ (192)
_ (256)
#undef _
/* Registrations for the generated CBC decrypt tests; mirrors the encrypt
   registrations above (n_ops is the byte count fed to the decrypt call). */
REGISTER_TEST (clib_aes128_cbc_decrypt) = {
  .name = "clib_aes128_cbc_decrypt",
  .fn = test_clib_aes128_cbc_decrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes128_dec_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes128_dec_var_sz }),
};

REGISTER_TEST (clib_aes192_cbc_decrypt) = {
  .name = "clib_aes192_cbc_decrypt",
  .fn = test_clib_aes192_cbc_decrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes192_dec_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes192_dec_var_sz }),
};

REGISTER_TEST (clib_aes256_cbc_decrypt) = {
  .name = "clib_aes256_cbc_decrypt",
  .fn = test_clib_aes256_cbc_decrypt,
  .perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
			      .n_ops = 1424,
			      .fn = perftest_aes256_dec_var_sz },
			    { .name = "variable size (per byte)",
			      .n_ops = 9008,
			      .fn = perftest_aes256_dec_var_sz }),
};
#endif
File diff suppressed because it is too large Load Diff
+55
View File
@@ -223,6 +223,16 @@ u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
return a ^ b ^ c;
}
/* Reverse the byte order within each 128-bit lane of x
   (VPSHUFB shuffles bytes independently per 128-bit lane). */
static_always_inline u8x32
u8x32_reflect_u8x16 (u8x32 x)
{
  static const u8x32 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) x, (__m256i) mask);
}
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
@@ -332,6 +342,11 @@ u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
(__m256i) mask);
}
#define u8x32_word_shift_left(a, n) \
(u8x32) _mm256_bslli_epi128 ((__m256i) a, n)
#define u8x32_word_shift_right(a, n) \
(u8x32) _mm256_bsrli_epi128 ((__m256i) a, n)
#define u32x8_permute_lanes(a, b, m) \
(u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
@@ -407,6 +422,46 @@ u8x32_splat_u8x16 (u8x16 a)
return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
}
/* Broadcast the 128-bit vector a into both 128-bit lanes of the result. */
static_always_inline u32x8
u32x8_splat_u32x4 (u32x4 a)
{
  return (u32x8) _mm256_broadcastsi128_si256 ((__m128i) a);
}
/* Load the first n (0..32) bytes from data, zeroing the remaining bytes.
   Uses an AVX-512 masked load when available; otherwise composes the result
   from a full 16-byte load plus a 128-bit partial load. */
static_always_inline u8x32
u8x32_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  return u8x32_mask_load_zero (data, pow2_mask (n));
#else
  u8x32 r = {};
  if (n > 16)
    {
      r = u8x32_insert_lo (r, *(u8x16u *) data);
      r = u8x32_insert_hi (r, u8x16_load_partial (data + 16, n - 16));
    }
  else
    r = u8x32_insert_lo (r, u8x16_load_partial (data, n));
  return r;
#endif
}
/* Store the first n (0..32) bytes of r to data; bytes past data[n-1] are
   not written. Masked store when available, otherwise a full 16-byte store
   plus a 128-bit partial store. */
static_always_inline void
u8x32_store_partial (u8x32 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  u8x32_mask_store (r, data, pow2_mask (n));
#else
  if (n > 16)
    {
      *(u8x16u *) data = u8x32_extract_lo (r);
      u8x16_store_partial (u8x32_extract_hi (r), data + 16, n - 16);
    }
  else
    u8x16_store_partial (u8x32_extract_lo (r), data, n);
#endif
}
#endif /* included_vector_avx2_h */
/*
+12
View File
@@ -593,6 +593,18 @@ u64x8_transpose (u64x8 m[8])
m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}
/* Load the first n (0..64) bytes from data via AVX-512 masked load,
   zeroing the remaining bytes. */
static_always_inline u8x64
u8x64_load_partial (u8 *data, uword n)
{
  return u8x64_mask_load_zero (data, pow2_mask (n));
}
/* Store the first n (0..64) bytes of r to data via AVX-512 masked store;
   bytes past data[n-1] are not written. */
static_always_inline void
u8x64_store_partial (u8x64 r, u8 *data, uword n)
{
  u8x64_mask_store (r, data, pow2_mask (n));
}
#endif /* included_vector_avx512_h */
/*
* fd.io coding-style-patch-verification: ON
+55
View File
@@ -231,6 +231,61 @@ __asm__ ("eor3 %0.16b,%1.16b,%2.16b,%3.16b": "=w" (r): "0" (a), "w" (b), "w" (c)
return a ^ b ^ c;
}
/* Load the first n (0..16) bytes from data, zeroing the remaining bytes.
 * Odd sizes are handled with two overlapping loads of the next smaller
 * power-of-two width, shifting the high part down into place; bytes past
 * data[n-1] are never dereferenced.
 *
 * Fixes vs. previous version: the u64x2 temporary was uninitialized (the
 * vector shift read an indeterminate lane), and exact sizes 8/4/2 produced
 * a shift equal to the lane width, which is undefined behaviour in C;
 * those sizes now take a dedicated single-load path (same loads/stores,
 * same result). */
static_always_inline u8x16
u8x16_load_partial (u8 *data, uword n)
{
  u8x16 r = {};
  if (n > 8)
    {
      /* n = 9..16: overlapping 64-bit loads; shift = 0..56 bits */
      u64x2 v = {};
      v[1] = *(u64u *) (data + n - 8);
      v >>= (16 - n) * 8;
      v[0] = *(u64u *) data;
      return (u8x16) v;
    }
  else if (n == 8)
    {
      /* exact 64-bit load, no shift needed */
      u64x2 v = {};
      v[0] = *(u64u *) data;
      return (u8x16) v;
    }
  else if (n > 4)
    {
      /* n = 5..7: overlapping 32-bit loads; shift = 8..24 bits */
      u32x4 v = {};
      v[1] = *(u32u *) (data + n - 4);
      v >>= (8 - n) * 8;
      v[0] = *(u32u *) data;
      return (u8x16) v;
    }
  else if (n == 4)
    {
      u32x4 v = {};
      v[0] = *(u32u *) data;
      return (u8x16) v;
    }
  else if (n > 2)
    {
      /* n = 3: overlapping 16-bit loads; shift = 8 bits */
      u16x8 v = {};
      v[1] = *(u16u *) (data + n - 2);
      v >>= (4 - n) * 8;
      v[0] = *(u16u *) data;
      return (u8x16) v;
    }
  else if (n == 2)
    {
      u16x8 v = {};
      v[0] = *(u16u *) data;
      return (u8x16) v;
    }
  else if (n > 0)
    r[0] = data[0];
  return r;
}
/* Store the first n (0..16) bytes of r to data; bytes past data[n-1] are
 * not written. Odd sizes use two overlapping stores, the high store being
 * shifted so its low garbage bytes are overwritten by the low store.
 *
 * Fixes vs. previous version: exact sizes 8 and 4 produced a scalar shift
 * equal to the operand width (undefined behaviour in C); those sizes now
 * take a single-store path with identical memory effects. */
static_always_inline void
u8x16_store_partial (u8x16 r, u8 *data, uword n)
{
  if (n > 8)
    {
      /* n = 9..16: high store first, then low store overwrites its tail */
      *(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
      *(u64u *) data = ((u64x2) r)[0];
    }
  else if (n == 8)
    *(u64u *) data = ((u64x2) r)[0];
  else if (n > 4)
    {
      *(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
      *(u32u *) data = ((u32x4) r)[0];
    }
  else if (n == 4)
    *(u32u *) data = ((u32x4) r)[0];
  else if (n > 2)
    {
      *(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
      *(u16u *) data = ((u16x8) r)[0];
    }
  else if (n == 2)
    *(u16u *) data = ((u16x8) r)[0];
  else if (n > 0)
    data[0] = r[0];
}
#define CLIB_HAVE_VEC128_MSB_MASK
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
+62
View File
@@ -493,6 +493,68 @@ u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
return a ^ b ^ c;
}
/* Load the first n (0..16) bytes from data, zeroing the remaining bytes.
 * AVX-512 masked load when available; otherwise overlapping loads of the
 * next smaller power-of-two width. Bytes past data[n-1] are never
 * dereferenced.
 *
 * Fixes vs. previous version: the u64x2 temporary was uninitialized (the
 * vector shift read an indeterminate lane), and exact sizes 8/4/2 produced
 * a shift equal to the lane width, which is undefined behaviour in C;
 * those sizes now take a dedicated single-load path (same loads, same
 * result). The mask path is now a proper #if/#else instead of dead
 * fallthrough code. */
static_always_inline u8x16
u8x16_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  return u8x16_mask_load_zero (data, pow2_mask (n));
#else
  u8x16 r = {};
  if (n > 8)
    {
      /* n = 9..16: overlapping 64-bit loads; shift = 0..56 bits */
      u64x2 v = {};
      v[1] = *(u64u *) (data + n - 8);
      v >>= (16 - n) * 8;
      v[0] = *(u64u *) data;
      return (u8x16) v;
    }
  else if (n == 8)
    {
      /* exact 64-bit load, no shift needed */
      u64x2 v = {};
      v[0] = *(u64u *) data;
      return (u8x16) v;
    }
  else if (n > 4)
    {
      /* n = 5..7: overlapping 32-bit loads; shift = 8..24 bits */
      u32x4 v = {};
      v[1] = *(u32u *) (data + n - 4);
      v >>= (8 - n) * 8;
      v[0] = *(u32u *) data;
      return (u8x16) v;
    }
  else if (n == 4)
    {
      u32x4 v = {};
      v[0] = *(u32u *) data;
      return (u8x16) v;
    }
  else if (n > 2)
    {
      /* n = 3: overlapping 16-bit loads; shift = 8 bits */
      u16x8 v = {};
      v[1] = *(u16u *) (data + n - 2);
      v >>= (4 - n) * 8;
      v[0] = *(u16u *) data;
      return (u8x16) v;
    }
  else if (n == 2)
    {
      u16x8 v = {};
      v[0] = *(u16u *) data;
      return (u8x16) v;
    }
  else if (n > 0)
    r[0] = data[0];
  return r;
#endif
}
/* Store the first n (0..16) bytes of r to data; bytes past data[n-1] are
 * not written.
 *
 * Fixes vs. previous version:
 * - the masked-store path was guarded by CLIB_HAVE_VEC256_MASK_LOAD_STORE;
 *   this is a 128-bit store, so guard it with the VEC128 macro to match
 *   u8x16_load_partial above;
 * - exact sizes 8 and 4 produced a scalar shift equal to the operand width
 *   (undefined behaviour in C); those sizes now take a single-store path
 *   with identical memory effects. */
static_always_inline void
u8x16_store_partial (u8x16 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  u8x16_mask_store (r, data, pow2_mask (n));
#else
  if (n > 8)
    {
      /* n = 9..16: high store first, then low store overwrites its tail */
      *(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
      *(u64u *) data = ((u64x2) r)[0];
    }
  else if (n == 8)
    *(u64u *) data = ((u64x2) r)[0];
  else if (n > 4)
    {
      *(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
      *(u32u *) data = ((u32x4) r)[0];
    }
  else if (n == 4)
    *(u32u *) data = ((u32x4) r)[0];
  else if (n > 2)
    {
      *(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
      *(u16u *) data = ((u16x8) r)[0];
    }
  else if (n == 2)
    *(u16u *) data = ((u16x8) r)[0];
  else if (n > 0)
    data[0] = r[0];
#endif
}
#endif /* included_vector_sse2_h */
/*