vppinfra: AES-CBC and AES-GCM refactor and optimizations
- crypto code moved to vppinfra for better testing and reuse - added 256-bit VAES support (Intel Client CPUs) - added AES_GMAC functions Change-Id: I960c8e14ca0a0126703e8f1589d86f32e2a98361 Type: improvement Signed-off-by: Damjan Marion <damarion@cisco.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
+43
-1119
File diff suppressed because it is too large
Load Diff
@@ -128,6 +128,11 @@ set(VPPINFRA_HEADERS
|
||||
clib.h
|
||||
cpu.h
|
||||
crc32.h
|
||||
crypto/sha2.h
|
||||
crypto/ghash.h
|
||||
crypto/aes.h
|
||||
crypto/aes_cbc.h
|
||||
crypto/aes_gcm.h
|
||||
dlist.h
|
||||
dlmalloc.h
|
||||
elf_clib.h
|
||||
@@ -168,7 +173,6 @@ set(VPPINFRA_HEADERS
|
||||
random_isaac.h
|
||||
rbtree.h
|
||||
serialize.h
|
||||
sha2.h
|
||||
smp.h
|
||||
socket.h
|
||||
sparse_vec.h
|
||||
@@ -278,6 +282,8 @@ if(VPP_BUILD_VPPINFRA_TESTS)
|
||||
endif(VPP_BUILD_VPPINFRA_TESTS)
|
||||
|
||||
set(test_files
|
||||
test/aes_cbc.c
|
||||
test/aes_gcm.c
|
||||
test/array_mask.c
|
||||
test/compress.c
|
||||
test/count_equal.c
|
||||
|
||||
@@ -28,10 +28,6 @@ typedef enum
|
||||
#define AES_KEY_ROUNDS(x) (10 + x * 2)
|
||||
#define AES_KEY_BYTES(x) (16 + x * 8)
|
||||
|
||||
static const u8x16 byte_mask_scale = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
};
|
||||
|
||||
static_always_inline u8x16
|
||||
aes_block_load (u8 * p)
|
||||
{
|
||||
@@ -131,43 +127,6 @@ aes_block_store (u8 * p, u8x16 r)
|
||||
*(u8x16u *) p = r;
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
aes_byte_mask (u8x16 x, u8 n_bytes)
|
||||
{
|
||||
return x & (u8x16_splat (n_bytes) > byte_mask_scale);
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
aes_load_partial (u8x16u * p, int n_bytes)
|
||||
{
|
||||
ASSERT (n_bytes <= 16);
|
||||
#ifdef __AVX512F__
|
||||
__m128i zero = { };
|
||||
return (u8x16) _mm_mask_loadu_epi8 (zero, (1 << n_bytes) - 1, p);
|
||||
#else
|
||||
u8x16 v = {};
|
||||
CLIB_ASSUME (n_bytes < 16);
|
||||
clib_memcpy_fast (&v, p, n_bytes);
|
||||
return v;
|
||||
#endif
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
aes_store_partial (void *p, u8x16 r, int n_bytes)
|
||||
{
|
||||
#if __aarch64__
|
||||
clib_memcpy_fast (p, &r, n_bytes);
|
||||
#else
|
||||
#ifdef __AVX512F__
|
||||
_mm_mask_storeu_epi8 (p, (1 << n_bytes) - 1, (__m128i) r);
|
||||
#else
|
||||
u8x16 mask = u8x16_splat (n_bytes) > byte_mask_scale;
|
||||
_mm_maskmoveu_si128 ((__m128i) r, (__m128i) mask, p);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static_always_inline u8x16
|
||||
aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks)
|
||||
{
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -86,7 +86,7 @@
|
||||
* This allows us to improve performance by deferring reduction. For example
|
||||
* to caclulate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
|
||||
*
|
||||
* __i128 Hi[4];
|
||||
* u8x16 Hi[4];
|
||||
* ghash_precompute (H, Hi, 4);
|
||||
*
|
||||
* ghash_data_t _gd, *gd = &_gd;
|
||||
@@ -151,6 +151,8 @@ gmul_hi_hi (u8x16 a, u8x16 b)
|
||||
typedef struct
|
||||
{
|
||||
u8x16 mid, hi, lo, tmp_lo, tmp_hi;
|
||||
u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
|
||||
u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
|
||||
int pending;
|
||||
} ghash_data_t;
|
||||
|
||||
@@ -172,7 +174,7 @@ ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
|
||||
/* a0 * b0 */
|
||||
gd->lo = gmul_lo_lo (a, b);
|
||||
/* a0 * b1 ^ a1 * b0 */
|
||||
gd->mid = (gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b));
|
||||
gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);
|
||||
|
||||
/* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
|
||||
there is no pending data in tmp_lo and tmp_hi */
|
||||
@@ -270,12 +272,6 @@ static const u8x64 ghash4_poly2 = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
|
||||
};
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u8x64 hi, lo, mid, tmp_lo, tmp_hi;
|
||||
int pending;
|
||||
} ghash4_data_t;
|
||||
|
||||
static_always_inline u8x64
|
||||
gmul4_lo_lo (u8x64 a, u8x64 b)
|
||||
{
|
||||
@@ -300,18 +296,17 @@ gmul4_hi_hi (u8x64 a, u8x64 b)
|
||||
return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
|
||||
}
|
||||
|
||||
|
||||
static_always_inline void
|
||||
ghash4_mul_first (ghash4_data_t * gd, u8x64 a, u8x64 b)
|
||||
ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
|
||||
{
|
||||
gd->hi = gmul4_hi_hi (a, b);
|
||||
gd->lo = gmul4_lo_lo (a, b);
|
||||
gd->mid = (gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b));
|
||||
gd->hi4 = gmul4_hi_hi (a, b);
|
||||
gd->lo4 = gmul4_lo_lo (a, b);
|
||||
gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
|
||||
gd->pending = 0;
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
|
||||
ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
|
||||
{
|
||||
u8x64 hi = gmul4_hi_hi (a, b);
|
||||
u8x64 lo = gmul4_lo_lo (a, b);
|
||||
@@ -319,63 +314,62 @@ ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
|
||||
if (gd->pending)
|
||||
{
|
||||
/* there is peding data from previous invocation so we can XOR */
|
||||
gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, hi);
|
||||
gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, lo);
|
||||
gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
|
||||
gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
|
||||
gd->pending = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* there is no peding data from previous invocation so we postpone XOR */
|
||||
gd->tmp_hi = hi;
|
||||
gd->tmp_lo = lo;
|
||||
gd->tmp_hi4 = hi;
|
||||
gd->tmp_lo4 = lo;
|
||||
gd->pending = 1;
|
||||
}
|
||||
gd->mid = u8x64_xor3 (gd->mid, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
|
||||
gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash4_reduce (ghash4_data_t * gd)
|
||||
ghash4_reduce (ghash_data_t *gd)
|
||||
{
|
||||
u8x64 r;
|
||||
|
||||
/* Final combination:
|
||||
gd->lo ^= gd->mid << 64
|
||||
gd->hi ^= gd->mid >> 64 */
|
||||
gd->lo4 ^= gd->mid4 << 64
|
||||
gd->hi4 ^= gd->mid4 >> 64 */
|
||||
|
||||
u8x64 midl = u8x64_word_shift_left (gd->mid, 8);
|
||||
u8x64 midr = u8x64_word_shift_right (gd->mid, 8);
|
||||
u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
|
||||
u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);
|
||||
|
||||
if (gd->pending)
|
||||
{
|
||||
gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, midl);
|
||||
gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, midr);
|
||||
gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
|
||||
gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
|
||||
}
|
||||
else
|
||||
{
|
||||
gd->lo ^= midl;
|
||||
gd->hi ^= midr;
|
||||
gd->lo4 ^= midl;
|
||||
gd->hi4 ^= midr;
|
||||
}
|
||||
|
||||
r = gmul4_hi_lo (ghash4_poly2, gd->lo);
|
||||
gd->lo ^= u8x64_word_shift_left (r, 8);
|
||||
|
||||
r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
|
||||
gd->lo4 ^= u8x64_word_shift_left (r, 8);
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash4_reduce2 (ghash4_data_t * gd)
|
||||
ghash4_reduce2 (ghash_data_t *gd)
|
||||
{
|
||||
gd->tmp_lo = gmul4_lo_lo (ghash4_poly2, gd->lo);
|
||||
gd->tmp_hi = gmul4_lo_hi (ghash4_poly2, gd->lo);
|
||||
gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
|
||||
gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
ghash4_final (ghash4_data_t * gd)
|
||||
ghash4_final (ghash_data_t *gd)
|
||||
{
|
||||
u8x64 r;
|
||||
u8x32 t;
|
||||
|
||||
r = u8x64_xor3 (gd->hi, u8x64_word_shift_right (gd->tmp_lo, 4),
|
||||
u8x64_word_shift_left (gd->tmp_hi, 4));
|
||||
r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
|
||||
u8x64_word_shift_left (gd->tmp_hi4, 4));
|
||||
|
||||
/* horizontal XOR of 4 128-bit lanes */
|
||||
t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
|
||||
@@ -383,6 +377,117 @@ ghash4_final (ghash4_data_t * gd)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__VPCLMULQDQ__)
|
||||
|
||||
static const u8x32 ghash2_poly2 = {
|
||||
0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
|
||||
};
|
||||
|
||||
static_always_inline u8x32
|
||||
gmul2_lo_lo (u8x32 a, u8x32 b)
|
||||
{
|
||||
return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
|
||||
}
|
||||
|
||||
static_always_inline u8x32
|
||||
gmul2_hi_lo (u8x32 a, u8x32 b)
|
||||
{
|
||||
return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
|
||||
}
|
||||
|
||||
static_always_inline u8x32
|
||||
gmul2_lo_hi (u8x32 a, u8x32 b)
|
||||
{
|
||||
return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
|
||||
}
|
||||
|
||||
static_always_inline u8x32
|
||||
gmul2_hi_hi (u8x32 a, u8x32 b)
|
||||
{
|
||||
return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
|
||||
{
|
||||
gd->hi2 = gmul2_hi_hi (a, b);
|
||||
gd->lo2 = gmul2_lo_lo (a, b);
|
||||
gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
|
||||
gd->pending = 0;
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
|
||||
{
|
||||
u8x32 hi = gmul2_hi_hi (a, b);
|
||||
u8x32 lo = gmul2_lo_lo (a, b);
|
||||
|
||||
if (gd->pending)
|
||||
{
|
||||
/* there is peding data from previous invocation so we can XOR */
|
||||
gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
|
||||
gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
|
||||
gd->pending = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* there is no peding data from previous invocation so we postpone XOR */
|
||||
gd->tmp_hi2 = hi;
|
||||
gd->tmp_lo2 = lo;
|
||||
gd->pending = 1;
|
||||
}
|
||||
gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash2_reduce (ghash_data_t *gd)
|
||||
{
|
||||
u8x32 r;
|
||||
|
||||
/* Final combination:
|
||||
gd->lo2 ^= gd->mid2 << 64
|
||||
gd->hi2 ^= gd->mid2 >> 64 */
|
||||
|
||||
u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
|
||||
u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);
|
||||
|
||||
if (gd->pending)
|
||||
{
|
||||
gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
|
||||
gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
|
||||
}
|
||||
else
|
||||
{
|
||||
gd->lo2 ^= midl;
|
||||
gd->hi2 ^= midr;
|
||||
}
|
||||
|
||||
r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
|
||||
gd->lo2 ^= u8x32_word_shift_left (r, 8);
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
ghash2_reduce2 (ghash_data_t *gd)
|
||||
{
|
||||
gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
|
||||
gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
ghash2_final (ghash_data_t *gd)
|
||||
{
|
||||
u8x32 r;
|
||||
|
||||
r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
|
||||
u8x32_word_shift_left (gd->tmp_hi2, 4));
|
||||
|
||||
/* horizontal XOR of 2 128-bit lanes */
|
||||
return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
|
||||
}
|
||||
#endif
|
||||
|
||||
static_always_inline void
|
||||
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
|
||||
{
|
||||
@@ -398,9 +503,7 @@ ghash_precompute (u8x16 H, u8x16 * Hi, int n)
|
||||
#else
|
||||
r32[3] = r32[0];
|
||||
#endif
|
||||
/* *INDENT-OFF* */
|
||||
r32 = r32 == (u32x4) {1, 0, 0, 1};
|
||||
/* *INDENT-ON* */
|
||||
Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
|
||||
|
||||
/* calculate H^(i + 1) */
|
||||
@@ -410,10 +513,3 @@ ghash_precompute (u8x16 H, u8x16 * Hi, int n)
|
||||
|
||||
#endif /* __ghash_h__ */
|
||||
|
||||
/*
|
||||
* fd.io coding-style-patch-verification: ON
|
||||
*
|
||||
* Local Variables:
|
||||
* eval: (c-set-style "gnu")
|
||||
* End:
|
||||
*/
|
||||
@@ -24,25 +24,21 @@ format_perfmon_bundle_default (u8 *s, va_list *args)
|
||||
case 1:
|
||||
return format (s, "%5.2f", (f64) d[2] / d[0]);
|
||||
case 2:
|
||||
if (c->n_ops > 1)
|
||||
return format (s, "%8.2f", (f64) d[0] / c->n_ops);
|
||||
else
|
||||
return format (s, "%8u", d[0]);
|
||||
return format (s, "%8u", d[0]);
|
||||
case 3:
|
||||
if (c->n_ops > 1)
|
||||
return format (s, "%8.2f", (f64) d[2] / c->n_ops);
|
||||
else
|
||||
return format (s, "%8u", d[2]);
|
||||
return format (s, "%8.2f", (f64) d[0] / c->n_ops);
|
||||
case 4:
|
||||
if (c->n_ops > 1)
|
||||
return format (s, "%9.2f", (f64) d[3] / c->n_ops);
|
||||
else
|
||||
return format (s, "%9u", d[3]);
|
||||
return format (s, "%8u", d[2]);
|
||||
case 5:
|
||||
if (c->n_ops > 1)
|
||||
return format (s, "%10.2f", (f64) d[4] / c->n_ops);
|
||||
else
|
||||
return format (s, "%10u", d[4]);
|
||||
return format (s, "%8.2f", (f64) d[2] / c->n_ops);
|
||||
case 6:
|
||||
return format (s, "%9u", d[3]);
|
||||
case 7:
|
||||
return format (s, "%9.2f", (f64) d[3] / c->n_ops);
|
||||
case 8:
|
||||
return format (s, "%10u", d[4]);
|
||||
case 9:
|
||||
return format (s, "%10.2f", (f64) d[4] / c->n_ops);
|
||||
default:
|
||||
return s;
|
||||
}
|
||||
@@ -59,6 +55,7 @@ CLIB_PERFMON_BUNDLE (default) = {
|
||||
.config[4] = PERF_COUNT_HW_BRANCH_MISSES,
|
||||
.n_events = 5,
|
||||
.format_fn = format_perfmon_bundle_default,
|
||||
.column_headers = CLIB_STRING_ARRAY ("Freq", "IPC", "Clks/Op", "Inst/Op",
|
||||
"Brnch/Op", "BrMiss/Op"),
|
||||
.column_headers = CLIB_STRING_ARRAY ("Freq", "IPC", "Clks", "Clks/Op",
|
||||
"Inst", "Inst/Op", "Brnch", "Brnch/Op",
|
||||
"BrMiss", "BrMiss/Op"),
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,187 @@
|
||||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright(c) 2021 Cisco Systems, Inc.
|
||||
*/
|
||||
|
||||
#ifdef __AES__
|
||||
#include <vppinfra/format.h>
|
||||
#include <vppinfra/test/test.h>
|
||||
#include <vppinfra/crypto/aes_cbc.h>
|
||||
|
||||
static const u8 iv[] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
};
|
||||
|
||||
static const u8 plaintext[] = {
|
||||
0x6B, 0xC1, 0xBE, 0xE2, 0x2E, 0x40, 0x9F, 0x96, 0xE9, 0x3D, 0x7E, 0x11, 0x73,
|
||||
0x93, 0x17, 0x2A, 0xAE, 0x2D, 0x8A, 0x57, 0x1E, 0x03, 0xAC, 0x9C, 0x9E, 0xB7,
|
||||
0x6F, 0xAC, 0x45, 0xAF, 0x8E, 0x51, 0x30, 0xC8, 0x1C, 0x46, 0xA3, 0x5C, 0xE4,
|
||||
0x11, 0xE5, 0xFB, 0xC1, 0x19, 0x1A, 0x0A, 0x52, 0xEF, 0xF6, 0x9F, 0x24, 0x45,
|
||||
0xDF, 0x4F, 0x9B, 0x17, 0xAD, 0x2B, 0x41, 0x7B, 0xE6, 0x6C, 0x37, 0x10,
|
||||
};
|
||||
|
||||
static const u8 key128[] = { 0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6,
|
||||
0xAB, 0xF7, 0x15, 0x88, 0x09, 0xCF, 0x4F, 0x3C };
|
||||
|
||||
static const u8 key192[24] = {
|
||||
0x8E, 0x73, 0xB0, 0xF7, 0xDA, 0x0E, 0x64, 0x52, 0xC8, 0x10, 0xF3, 0x2B,
|
||||
0x80, 0x90, 0x79, 0xE5, 0x62, 0xF8, 0xEA, 0xD2, 0x52, 0x2C, 0x6B, 0x7B,
|
||||
};
|
||||
|
||||
static const u8 ciphertext128[] = {
|
||||
0x76, 0x49, 0xAB, 0xAC, 0x81, 0x19, 0xB2, 0x46, 0xCE, 0xE9, 0x8E, 0x9B, 0x12,
|
||||
0xE9, 0x19, 0x7D, 0x50, 0x86, 0xCB, 0x9B, 0x50, 0x72, 0x19, 0xEE, 0x95, 0xDB,
|
||||
0x11, 0x3A, 0x91, 0x76, 0x78, 0xB2, 0x73, 0xBE, 0xD6, 0xB8, 0xE3, 0xC1, 0x74,
|
||||
0x3B, 0x71, 0x16, 0xE6, 0x9E, 0x22, 0x22, 0x95, 0x16, 0x3F, 0xF1, 0xCA, 0xA1,
|
||||
0x68, 0x1F, 0xAC, 0x09, 0x12, 0x0E, 0xCA, 0x30, 0x75, 0x86, 0xE1, 0xA7,
|
||||
};
|
||||
|
||||
static const u8 ciphertext192[64] = {
|
||||
0x4F, 0x02, 0x1D, 0xB2, 0x43, 0xBC, 0x63, 0x3D, 0x71, 0x78, 0x18, 0x3A, 0x9F,
|
||||
0xA0, 0x71, 0xE8, 0xB4, 0xD9, 0xAD, 0xA9, 0xAD, 0x7D, 0xED, 0xF4, 0xE5, 0xE7,
|
||||
0x38, 0x76, 0x3F, 0x69, 0x14, 0x5A, 0x57, 0x1B, 0x24, 0x20, 0x12, 0xFB, 0x7A,
|
||||
0xE0, 0x7F, 0xA9, 0xBA, 0xAC, 0x3D, 0xF1, 0x02, 0xE0, 0x08, 0xB0, 0xE2, 0x79,
|
||||
0x88, 0x59, 0x88, 0x81, 0xD9, 0x20, 0xA9, 0xE6, 0x4F, 0x56, 0x15, 0xCD,
|
||||
};
|
||||
|
||||
static const u8 key256[32] = {
|
||||
0x60, 0x3D, 0xEB, 0x10, 0x15, 0xCA, 0x71, 0xBE, 0x2B, 0x73, 0xAE,
|
||||
0xF0, 0x85, 0x7D, 0x77, 0x81, 0x1F, 0x35, 0x2C, 0x07, 0x3B, 0x61,
|
||||
0x08, 0xD7, 0x2D, 0x98, 0x10, 0xA3, 0x09, 0x14, 0xDF, 0xF4,
|
||||
};
|
||||
|
||||
static const u8 ciphertext256[64] = {
|
||||
0xF5, 0x8C, 0x4C, 0x04, 0xD6, 0xE5, 0xF1, 0xBA, 0x77, 0x9E, 0xAB, 0xFB, 0x5F,
|
||||
0x7B, 0xFB, 0xD6, 0x9C, 0xFC, 0x4E, 0x96, 0x7E, 0xDB, 0x80, 0x8D, 0x67, 0x9F,
|
||||
0x77, 0x7B, 0xC6, 0x70, 0x2C, 0x7D, 0x39, 0xF2, 0x33, 0x69, 0xA9, 0xD9, 0xBA,
|
||||
0xCF, 0xA5, 0x30, 0xE2, 0x63, 0x04, 0x23, 0x14, 0x61, 0xB2, 0xEB, 0x05, 0xE2,
|
||||
0xC3, 0x9B, 0xE9, 0xFC, 0xDA, 0x6C, 0x19, 0x07, 0x8C, 0x6A, 0x9D, 0x1B,
|
||||
};
|
||||
|
||||
#define _(b) \
|
||||
static clib_error_t *test_clib_aes##b##_cbc_encrypt (clib_error_t *err) \
|
||||
{ \
|
||||
aes_cbc_key_data_t k; \
|
||||
u8 data[512]; \
|
||||
clib_aes##b##_cbc_key_expand (&k, key##b); \
|
||||
clib_aes##b##_cbc_encrypt (&k, plaintext, sizeof (plaintext), iv, data); \
|
||||
if (memcmp (ciphertext##b, data, sizeof (ciphertext##b)) != 0) \
|
||||
err = \
|
||||
clib_error_return (err, "encrypted data doesn't match plaintext"); \
|
||||
return err; \
|
||||
} \
|
||||
void __test_perf_fn perftest_aes##b##_enc_var_sz (test_perf_t *tp) \
|
||||
{ \
|
||||
u32 n = tp->n_ops; \
|
||||
aes_cbc_key_data_t *kd = test_mem_alloc (sizeof (*kd)); \
|
||||
u8 *dst = test_mem_alloc (n + 16); \
|
||||
u8 *src = test_mem_alloc_and_fill_inc_u8 (n + 16, 0, 0); \
|
||||
clib_aes##b##_cbc_key_expand (kd, key##b); \
|
||||
\
|
||||
test_perf_event_enable (tp); \
|
||||
clib_aes##b##_cbc_encrypt (kd, src, n, iv, dst); \
|
||||
test_perf_event_disable (tp); \
|
||||
}
|
||||
_ (128)
|
||||
_ (192)
|
||||
_ (256)
|
||||
#undef _
|
||||
|
||||
REGISTER_TEST (clib_aes128_cbc_encrypt) = {
|
||||
.name = "clib_aes128_cbc_encrypt",
|
||||
.fn = test_clib_aes128_cbc_encrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes128_enc_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes128_enc_var_sz }),
|
||||
};
|
||||
|
||||
REGISTER_TEST (clib_aes192_cbc_encrypt) = {
|
||||
.name = "clib_aes192_cbc_encrypt",
|
||||
.fn = test_clib_aes192_cbc_encrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes192_enc_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes192_enc_var_sz }),
|
||||
};
|
||||
|
||||
REGISTER_TEST (clib_aes256_cbc_encrypt) = {
|
||||
.name = "clib_aes256_cbc_encrypt",
|
||||
.fn = test_clib_aes256_cbc_encrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes256_enc_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes256_enc_var_sz }),
|
||||
};
|
||||
|
||||
#define _(b) \
|
||||
static clib_error_t *test_clib_aes##b##_cbc_decrypt (clib_error_t *err) \
|
||||
{ \
|
||||
aes_cbc_key_data_t k; \
|
||||
u8 data[512]; \
|
||||
clib_aes##b##_cbc_key_expand (&k, key##b); \
|
||||
clib_aes##b##_cbc_decrypt (&k, ciphertext##b, sizeof (ciphertext##b), iv, \
|
||||
data); \
|
||||
if (memcmp (plaintext, data, sizeof (plaintext)) != 0) \
|
||||
err = \
|
||||
clib_error_return (err, "decrypted data doesn't match plaintext"); \
|
||||
return err; \
|
||||
} \
|
||||
void __test_perf_fn perftest_aes##b##_dec_var_sz (test_perf_t *tp) \
|
||||
{ \
|
||||
u32 n = tp->n_ops; \
|
||||
aes_cbc_key_data_t *kd = test_mem_alloc (sizeof (*kd)); \
|
||||
u8 *dst = test_mem_alloc (n + 16); \
|
||||
u8 *src = test_mem_alloc_and_fill_inc_u8 (n + 16, 0, 0); \
|
||||
clib_aes##b##_cbc_key_expand (kd, key##b); \
|
||||
\
|
||||
test_perf_event_enable (tp); \
|
||||
clib_aes##b##_cbc_decrypt (kd, src, n, iv, dst); \
|
||||
test_perf_event_disable (tp); \
|
||||
}
|
||||
|
||||
_ (128)
|
||||
_ (192)
|
||||
_ (256)
|
||||
#undef _
|
||||
|
||||
REGISTER_TEST (clib_aes128_cbc_decrypt) = {
|
||||
.name = "clib_aes128_cbc_decrypt",
|
||||
.fn = test_clib_aes128_cbc_decrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes128_dec_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes128_dec_var_sz }),
|
||||
};
|
||||
|
||||
REGISTER_TEST (clib_aes192_cbc_decrypt) = {
|
||||
.name = "clib_aes192_cbc_decrypt",
|
||||
.fn = test_clib_aes192_cbc_decrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes192_dec_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes192_dec_var_sz }),
|
||||
};
|
||||
|
||||
REGISTER_TEST (clib_aes256_cbc_decrypt) = {
|
||||
.name = "clib_aes256_cbc_decrypt",
|
||||
.fn = test_clib_aes256_cbc_decrypt,
|
||||
.perf_tests = PERF_TESTS ({ .name = "variable size (per byte)",
|
||||
.n_ops = 1424,
|
||||
.fn = perftest_aes256_dec_var_sz },
|
||||
{ .name = "variable size (per byte)",
|
||||
.n_ops = 9008,
|
||||
.fn = perftest_aes256_dec_var_sz }),
|
||||
};
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -223,6 +223,16 @@ u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
|
||||
return a ^ b ^ c;
|
||||
}
|
||||
|
||||
static_always_inline u8x32
|
||||
u8x32_reflect_u8x16 (u8x32 x)
|
||||
{
|
||||
static const u8x32 mask = {
|
||||
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
|
||||
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
|
||||
};
|
||||
return (u8x32) _mm256_shuffle_epi8 ((__m256i) x, (__m256i) mask);
|
||||
}
|
||||
|
||||
static_always_inline u16x16
|
||||
u16x16_mask_last (u16x16 v, u8 n_last)
|
||||
{
|
||||
@@ -332,6 +342,11 @@ u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
|
||||
(__m256i) mask);
|
||||
}
|
||||
|
||||
#define u8x32_word_shift_left(a, n) \
|
||||
(u8x32) _mm256_bslli_epi128 ((__m256i) a, n)
|
||||
#define u8x32_word_shift_right(a, n) \
|
||||
(u8x32) _mm256_bsrli_epi128 ((__m256i) a, n)
|
||||
|
||||
#define u32x8_permute_lanes(a, b, m) \
|
||||
(u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
|
||||
#define u64x4_permute_lanes(a, b, m) \
|
||||
@@ -407,6 +422,46 @@ u8x32_splat_u8x16 (u8x16 a)
|
||||
return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
|
||||
}
|
||||
|
||||
static_always_inline u32x8
|
||||
u32x8_splat_u32x4 (u32x4 a)
|
||||
{
|
||||
return (u32x8) _mm256_broadcastsi128_si256 ((__m128i) a);
|
||||
}
|
||||
|
||||
static_always_inline u8x32
|
||||
u8x32_load_partial (u8 *data, uword n)
|
||||
{
|
||||
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
|
||||
return u8x32_mask_load_zero (data, pow2_mask (n));
|
||||
#else
|
||||
u8x32 r = {};
|
||||
if (n > 16)
|
||||
{
|
||||
r = u8x32_insert_lo (r, *(u8x16u *) data);
|
||||
r = u8x32_insert_hi (r, u8x16_load_partial (data + 16, n - 16));
|
||||
}
|
||||
else
|
||||
r = u8x32_insert_lo (r, u8x16_load_partial (data, n));
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
u8x32_store_partial (u8x32 r, u8 *data, uword n)
|
||||
{
|
||||
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
|
||||
u8x32_mask_store (r, data, pow2_mask (n));
|
||||
#else
|
||||
if (n > 16)
|
||||
{
|
||||
*(u8x16u *) data = u8x32_extract_lo (r);
|
||||
u8x16_store_partial (u8x32_extract_hi (r), data + 16, n - 16);
|
||||
}
|
||||
else
|
||||
u8x16_store_partial (u8x32_extract_lo (r), data, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* included_vector_avx2_h */
|
||||
|
||||
/*
|
||||
|
||||
@@ -593,6 +593,18 @@ u64x8_transpose (u64x8 m[8])
|
||||
m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
|
||||
}
|
||||
|
||||
static_always_inline u8x64
|
||||
u8x64_load_partial (u8 *data, uword n)
|
||||
{
|
||||
return u8x64_mask_load_zero (data, pow2_mask (n));
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
u8x64_store_partial (u8x64 r, u8 *data, uword n)
|
||||
{
|
||||
u8x64_mask_store (r, data, pow2_mask (n));
|
||||
}
|
||||
|
||||
#endif /* included_vector_avx512_h */
|
||||
/*
|
||||
* fd.io coding-style-patch-verification: ON
|
||||
|
||||
@@ -231,6 +231,61 @@ __asm__ ("eor3 %0.16b,%1.16b,%2.16b,%3.16b": "=w" (r): "0" (a), "w" (b), "w" (c)
|
||||
return a ^ b ^ c;
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
u8x16_load_partial (u8 *data, uword n)
|
||||
{
|
||||
u8x16 r = {};
|
||||
if (n > 7)
|
||||
{
|
||||
u64x2 r;
|
||||
r[1] = *(u64u *) (data + n - 8);
|
||||
r >>= (16 - n) * 8;
|
||||
r[0] = *(u64u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 3)
|
||||
{
|
||||
u32x4 r = {};
|
||||
r[1] = *(u32u *) (data + n - 4);
|
||||
r >>= (8 - n) * 8;
|
||||
r[0] = *(u32u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 1)
|
||||
{
|
||||
u16x8 r = {};
|
||||
r[1] = *(u16u *) (data + n - 2);
|
||||
r >>= (4 - n) * 8;
|
||||
r[0] = *(u16u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 0)
|
||||
r[0] = *data;
|
||||
return r;
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
u8x16_store_partial (u8x16 r, u8 *data, uword n)
|
||||
{
|
||||
if (n > 7)
|
||||
{
|
||||
*(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
|
||||
*(u64u *) data = ((u64x2) r)[0];
|
||||
}
|
||||
else if (n > 3)
|
||||
{
|
||||
*(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
|
||||
*(u32u *) data = ((u32x4) r)[0];
|
||||
}
|
||||
else if (n > 1)
|
||||
{
|
||||
*(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
|
||||
*(u16u *) data = ((u16x8) r)[0];
|
||||
}
|
||||
else if (n > 0)
|
||||
data[0] = r[0];
|
||||
}
|
||||
|
||||
#define CLIB_HAVE_VEC128_MSB_MASK
|
||||
|
||||
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
|
||||
|
||||
@@ -493,6 +493,68 @@ u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
|
||||
return a ^ b ^ c;
|
||||
}
|
||||
|
||||
static_always_inline u8x16
|
||||
u8x16_load_partial (u8 *data, uword n)
|
||||
{
|
||||
u8x16 r = {};
|
||||
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
|
||||
return u8x16_mask_load_zero (data, pow2_mask (n));
|
||||
#endif
|
||||
if (n > 7)
|
||||
{
|
||||
u64x2 r;
|
||||
r[1] = *(u64u *) (data + n - 8);
|
||||
r >>= (16 - n) * 8;
|
||||
r[0] = *(u64u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 3)
|
||||
{
|
||||
u32x4 r = {};
|
||||
r[1] = *(u32u *) (data + n - 4);
|
||||
r >>= (8 - n) * 8;
|
||||
r[0] = *(u32u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 1)
|
||||
{
|
||||
u16x8 r = {};
|
||||
r[1] = *(u16u *) (data + n - 2);
|
||||
r >>= (4 - n) * 8;
|
||||
r[0] = *(u16u *) data;
|
||||
return (u8x16) r;
|
||||
}
|
||||
else if (n > 0)
|
||||
r[0] = *data;
|
||||
return r;
|
||||
}
|
||||
|
||||
static_always_inline void
|
||||
u8x16_store_partial (u8x16 r, u8 *data, uword n)
|
||||
{
|
||||
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
|
||||
u8x16_mask_store (r, data, pow2_mask (n));
|
||||
#else
|
||||
if (n > 7)
|
||||
{
|
||||
*(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
|
||||
*(u64u *) data = ((u64x2) r)[0];
|
||||
}
|
||||
else if (n > 3)
|
||||
{
|
||||
*(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
|
||||
*(u32u *) data = ((u32x4) r)[0];
|
||||
}
|
||||
else if (n > 1)
|
||||
{
|
||||
*(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
|
||||
*(u16u *) data = ((u16x8) r)[0];
|
||||
}
|
||||
else if (n > 0)
|
||||
data[0] = r[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* included_vector_sse2_h */
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user