vppinfra: fix mask compare and compress OOB reads

Use mask_load_zero to avoid out-of-bounds reads in the vectorized
versions of these functions.

Type: fix
Signed-off-by: Dmitry Valter <d-valter@yandex-team.com>
Change-Id: I12bcb817ccf2db210c1c99fdfa444dc3f540035b
Author: Dmitry Valter
Date: 2024-03-06 22:43:27 +00:00
Committed by: Damjan Marion
Parent: df47a0812a
Commit: 3150250bc9

3 changed files with 297 additions and 33 deletions
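For context on the fix below: the pre-change tail handling loaded full vector registers even when fewer than 64 elements remained, so the last block could read past the end of the source buffer. A scalar model of the masked-load-zero idea used throughout this commit — an illustrative sketch, not the vppinfra API:

#include <vppinfra/clib.h> /* u32/u64 typedefs; include path assumed */

/* Lane i is read from memory only when bit i of `mask` is set,
 * otherwise it is zero-filled, so a partial tail block never
 * dereferences bytes past the valid region. */
static inline void
mask_load_zero_u32_model (u32 *lanes, const u32 *src, int n_lanes, u64 mask)
{
  for (int i = 0; i < n_lanes; i++)
    lanes[i] = (mask >> i) & 1 ? src[i] : 0;
}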

File 1 of 3: compress test code

@@ -51,18 +51,21 @@ static compress_test_t tests[] = {
static clib_error_t *
test_clib_compress_u64 (clib_error_t *err)
{
u64 src[513];
u64 dst[513];
u32 i, j;
for (i = 0; i < ARRAY_LEN (src); i++)
src[i] = i;
for (i = 0; i < ARRAY_LEN (tests); i++)
{
compress_test_t *t = tests + i;
u64 src[t->n_elts];
#ifdef CLIB_SANITIZE_ADDR
u64 dst[t->n_elts];
#else /* CLIB_SANITIZE_ADDR */
u64 dst[513];
#endif /* CLIB_SANITIZE_ADDR */
u64 *dp = dst;
u32 r;
for (j = 0; j < t->n_elts; j++)
src[j] = j;
for (j = 0; j < ARRAY_LEN (dst); j++)
dst[j] = 0xa5a5a5a5a5a5a5a5;
@@ -81,8 +84,10 @@ test_clib_compress_u64 (clib_error_t *err)
dp++;
}
#ifndef CLIB_SANITIZE_ADDR
if (dst[dp - dst + 1] != 0xa5a5a5a5a5a5a5a5)
return clib_error_return (err, "buffer overrun in testcase %u", i);
#endif /* CLIB_SANITIZE_ADDR */
if (dp - dst != r)
return clib_error_return (err, "wrong number of elts in testcase %u",
@@ -95,18 +100,21 @@ test_clib_compress_u64 (clib_error_t *err)
static clib_error_t *
test_clib_compress_u32 (clib_error_t *err)
{
u32 src[513];
u32 dst[513];
u32 i, j;
for (i = 0; i < ARRAY_LEN (src); i++)
src[i] = i;
for (i = 0; i < ARRAY_LEN (tests); i++)
{
compress_test_t *t = tests + i;
u32 src[t->n_elts];
#ifdef CLIB_SANITIZE_ADDR
u32 dst[t->n_elts];
#else /* CLIB_SANITIZE_ADDR */
u32 dst[513];
#endif /* CLIB_SANITIZE_ADDR */
u32 *dp = dst;
u32 r;
for (j = 0; j < t->n_elts; j++)
src[j] = j;
for (j = 0; j < ARRAY_LEN (dst); j++)
dst[j] = 0xa5a5a5a5;
@@ -126,8 +134,10 @@ test_clib_compress_u32 (clib_error_t *err)
dp++;
}
#ifndef CLIB_SANITIZE_ADDR
if (dst[dp - dst + 1] != 0xa5a5a5a5)
return clib_error_return (err, "buffer overrun in testcase %u", i);
#endif /* CLIB_SANITIZE_ADDR */
if (dp - dst != r)
return clib_error_return (err, "wrong number of elts in testcase %u",
@@ -140,18 +150,21 @@ test_clib_compress_u32 (clib_error_t *err)
static clib_error_t *
test_clib_compress_u16 (clib_error_t *err)
{
u16 src[513];
u16 dst[513];
u32 i, j;
for (i = 0; i < ARRAY_LEN (src); i++)
src[i] = i;
for (i = 0; i < ARRAY_LEN (tests); i++)
{
compress_test_t *t = tests + i;
u16 src[t->n_elts];
#ifdef CLIB_SANITIZE_ADDR
u16 dst[t->n_elts];
#else /* CLIB_SANITIZE_ADDR */
u16 dst[513];
#endif /* CLIB_SANITIZE_ADDR */
u16 *dp = dst;
u32 r;
for (j = 0; j < t->n_elts; j++)
src[j] = j;
for (j = 0; j < ARRAY_LEN (dst); j++)
dst[j] = 0xa5a5;
@@ -170,8 +183,10 @@ test_clib_compress_u16 (clib_error_t *err)
dp++;
}
#ifndef CLIB_SANITIZE_ADDR
if (dst[dp - dst + 1] != 0xa5a5)
return clib_error_return (err, "buffer overrun in testcase %u", i);
#endif /* CLIB_SANITIZE_ADDR */
if (dp - dst != r)
return clib_error_return (err, "wrong number of elts in testcase %u",
@@ -184,18 +199,21 @@ test_clib_compress_u16 (clib_error_t *err)
static clib_error_t *
test_clib_compress_u8 (clib_error_t *err)
{
u8 src[513];
u8 dst[513];
u32 i, j;
for (i = 0; i < ARRAY_LEN (src); i++)
src[i] = i;
for (i = 0; i < ARRAY_LEN (tests); i++)
{
compress_test_t *t = tests + i;
u8 src[t->n_elts];
#ifdef CLIB_SANITIZE_ADDR
u8 dst[t->n_elts];
#else /* CLIB_SANITIZE_ADDR */
u8 dst[513];
#endif /* CLIB_SANITIZE_ADDR */
u8 *dp = dst;
u32 r;
for (j = 0; j < t->n_elts; j++)
src[j] = j;
for (j = 0; j < ARRAY_LEN (dst); j++)
dst[j] = 0xa5;
@@ -214,8 +232,10 @@ test_clib_compress_u8 (clib_error_t *err)
dp++;
}
#ifndef CLIB_SANITIZE_ADDR
if (dst[dp - dst + 1] != 0xa5)
return clib_error_return (err, "buffer overrun in testcase %u", i);
#endif /* CLIB_SANITIZE_ADDR */
if (dp - dst != r)
return clib_error_return (err, "wrong number of elts in testcase %u",
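The canary pattern these tests rely on is worth calling out: poison the destination, run the routine, then verify that cells past the reported output count still hold the poison value. A condensed, self-contained sketch of the technique with a scalar stand-in for clib_compress_u32 (names and sizes are illustrative):

#include <string.h>
#include <vppinfra/clib.h> /* u32/u64 typedefs; include path assumed */

/* scalar stand-in for clib_compress_u32, for illustration only */
static u32
scalar_compress_u32 (u32 *dst, const u32 *src, const u64 *mask, u32 n_elts)
{
  u32 *dp = dst;
  for (u32 i = 0; i < n_elts; i++)
    if (mask[i >> 6] & (1ULL << (i & 63)))
      *dp++ = src[i];
  return dp - dst;
}

static int
check_overrun (void)
{
  u32 src[513], dst[513], n;
  u64 mask[9];
  memset (mask, 0xaa, sizeof (mask)); /* select every other element */
  for (u32 i = 0; i < 513; i++)
    src[i] = i, dst[i] = 0xa5a5a5a5; /* poison dst with the canary */
  n = scalar_compress_u32 (dst, src, mask, 513);
  for (u32 i = n; i < 513; i++)
    if (dst[i] != 0xa5a5a5a5) /* everything past the output must be intact */
      return -1;
  return 0;
}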

File 2 of 3: compress header

@@ -34,6 +34,37 @@ clib_compress_u64_x64 (u64 *dst, u64 *src, u64 mask)
return dst;
}
static_always_inline u64 *
clib_compress_u64_x64_masked (u64 *dst, u64 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS) && \
defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
u64x8u *sv = (u64x8u *) src;
for (int i = 0; i < 8; i++)
{
u64x8u s = u64x8_mask_load_zero (&sv[i], mask);
u64x8_compress_store (s, mask, dst);
dst += _popcnt32 ((u8) mask);
mask >>= 8;
}
#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \
defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
u64x4u *sv = (u64x4u *) src;
for (int i = 0; i < 16; i++)
{
u64x4u s = u64x4_mask_load_zero (&sv[i], mask);
u64x4_compress_store (s, mask, dst);
dst += _popcnt32 (((u8) mask) & 0x0f);
mask >>= 4;
}
#else
u32 i;
foreach_set_bit_index (i, mask)
dst++[0] = src[i];
#endif
return dst;
}
/** \brief Compress array of 64-bit elements into destination array based on
* mask
@@ -66,7 +97,9 @@ clib_compress_u64 (u64 *dst, u64 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
return clib_compress_u64_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
return clib_compress_u64_x64_masked (dst, src,
mask[0] & pow2_mask (n_elts)) -
dst0;
}
static_always_inline u32 *
@@ -97,6 +130,38 @@ clib_compress_u32_x64 (u32 *dst, u32 *src, u64 mask)
return dst;
}
static_always_inline u32 *
clib_compress_u32_x64_masked (u32 *dst, u32 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS) && \
defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
u32x16u *sv = (u32x16u *) src;
for (int i = 0; i < 4; i++)
{
u32x16u s = u32x16_mask_load_zero (&sv[i], mask);
u32x16_compress_store (s, mask, dst);
dst += _popcnt32 ((u16) mask);
mask >>= 16;
}
#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \
defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
u32x8u *sv = (u32x8u *) src;
for (int i = 0; i < 8; i++)
{
u32x8u s = u32x8_mask_load_zero (&sv[i], mask);
u32x8_compress_store (s, mask, dst);
dst += _popcnt32 ((u8) mask);
mask >>= 8;
}
#else
u32 i;
foreach_set_bit_index (i, mask)
dst++[0] = src[i];
#endif
return dst;
}
/** \brief Compress array of 32-bit elements into destination array based on
* mask
@@ -129,7 +194,9 @@ clib_compress_u32 (u32 *dst, u32 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
return clib_compress_u32_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
return clib_compress_u32_x64_masked (dst, src,
mask[0] & pow2_mask (n_elts)) -
dst0;
}
static_always_inline u16 *
@@ -151,6 +218,27 @@ clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask)
return dst;
}
static_always_inline u16 *
clib_compress_u16_x64_masked (u16 *dst, u16 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \
defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
u16x32u *sv = (u16x32u *) src;
for (int i = 0; i < 2; i++)
{
u16x32u s = u16x32_mask_load_zero (&sv[i], mask);
u16x32_compress_store (s, mask, dst);
dst += _popcnt32 ((u32) mask);
mask >>= 32;
}
#else
u32 i;
foreach_set_bit_index (i, mask)
dst++[0] = src[i];
#endif
return dst;
}
/** \brief Compress array of 16-bit elements into destination array based on
* mask
@@ -183,7 +271,9 @@ clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
return clib_compress_u16_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
return clib_compress_u16_x64_masked (dst, src,
mask[0] & pow2_mask (n_elts)) -
dst0;
}
static_always_inline u8 *
@@ -201,6 +291,23 @@ clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask)
return dst;
}
static_always_inline u8 *
clib_compress_u8_x64_masked (u8 *dst, u8 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \
defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
u8x64u *sv = (u8x64u *) src;
u8x64u s = u8x64_mask_load_zero (sv, mask);
u8x64_compress_store (s, mask, dst);
dst += _popcnt64 (mask);
#else
u32 i;
foreach_set_bit_index (i, mask)
dst++[0] = src[i];
#endif
return dst;
}
/** \brief Compress array of 8-bit elements into destination array based on
* mask
@@ -233,7 +340,8 @@ clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
return clib_compress_u8_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
return clib_compress_u8_x64_masked (dst, src, mask[0] & pow2_mask (n_elts)) -
dst0;
}
#endif
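Taken together, the clib_compress_u{8,16,32,64} entry points gather the elements whose mask bits are set into a dense output and return the element count. A usage sketch — the include path is an assumption and the numbers are illustrative:

#include <vppinfra/vector/compress.h> /* assumed include path */

static u32
keep_multiples_of_three (u32 *dst)
{
  u32 src[100];
  u64 mask[2] = { 0, 0 };
  for (u32 i = 0; i < 100; i++)
    {
      src[i] = i;
      if (i % 3 == 0) /* select 0, 3, 6, ..., 99 */
	mask[i >> 6] |= 1ULL << (i & 63);
    }
  /* returns 34; dst holds the selected values, densely packed */
  return clib_compress_u32 (dst, src, mask, 100);
}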

File 3 of 3: mask-compare header

@@ -8,7 +8,7 @@
#include <vppinfra/memcpy.h>
static_always_inline u64
clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts)
clib_mask_compare_u16_x64 (u16 v, u16 *a)
{
u64 mask = 0;
#if defined(CLIB_HAVE_VEC512)
@@ -46,6 +46,38 @@ clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts)
(u64) i8x16_msb_mask (i8x16_pack (v8 == av[2], v8 == av[3])) << 16 |
(u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 |
(u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48);
#else
for (int i = 0; i < 64; i++)
if (a[i] == v)
mask |= 1ULL << i;
#endif
return mask;
}
static_always_inline u64
clib_mask_compare_u16_x64_n (u16 v, u16 *a, u32 n_elts)
{
u64 mask = 0;
CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
#if defined(CLIB_HAVE_VEC512)
u16x32 v32 = u16x32_splat (v);
u16x32u *av = (u16x32u *) a;
mask = ((u64) u16x32_is_equal_mask (
u16x32_mask_load_zero (&av[0], data_mask), v32) |
(u64) u16x32_is_equal_mask (
u16x32_mask_load_zero (&av[1], data_mask >> 32), v32)
<< 32);
#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
u16x16 v16 = u16x16_splat (v);
u16x16u *av = (u16x16u *) a;
i8x32 x;
x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[0], data_mask),
v16 == u16x16_mask_load_zero (&av[1], data_mask >> 16));
mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3));
x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[2], data_mask >> 32),
v16 == u16x16_mask_load_zero (&av[3], data_mask >> 48));
mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32;
#else
for (int i = 0; i < n_elts; i++)
if (a[i] == v)
@@ -68,7 +100,7 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts)
{
while (n_elts >= 64)
{
mask++[0] = clib_mask_compare_u16_x64 (v, a, 64);
mask++[0] = clib_mask_compare_u16_x64 (v, a);
n_elts -= 64;
a += 64;
}
@@ -76,11 +108,11 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return;
mask[0] = clib_mask_compare_u16_x64 (v, a, n_elts) & pow2_mask (n_elts);
mask[0] = clib_mask_compare_u16_x64_n (v, a, n_elts) & pow2_mask (n_elts);
}
static_always_inline u64
clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts)
clib_mask_compare_u32_x64 (u32 v, u32 *a)
{
u64 mask = 0;
#if defined(CLIB_HAVE_VEC512)
@@ -130,6 +162,57 @@ clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts)
av += 4;
}
#else
for (int i = 0; i < 64; i++)
if (a[i] == v)
mask |= 1ULL << i;
#endif
return mask;
}
static_always_inline u64
clib_mask_compare_u32_x64_n (u32 v, u32 *a, u32 n_elts)
{
u64 mask = 0;
CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
#if defined(CLIB_HAVE_VEC512)
u32x16 v16 = u32x16_splat (v);
u32x16u *av = (u32x16u *) a;
mask = ((u64) u32x16_is_equal_mask (
u32x16_mask_load_zero (&av[0], data_mask), v16) |
(u64) u32x16_is_equal_mask (
u32x16_mask_load_zero (&av[1], data_mask >> 16), v16)
<< 16 |
(u64) u32x16_is_equal_mask (
u32x16_mask_load_zero (&av[2], data_mask >> 32), v16)
<< 32 |
(u64) u32x16_is_equal_mask (
u32x16_mask_load_zero (&av[3], data_mask >> 48), v16)
<< 48);
#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
u32x8 v8 = u32x8_splat (v);
u32x8u *av = (u32x8u *) a;
u32x8 m = { 0, 4, 1, 5, 2, 6, 3, 7 };
i8x32 c;
c = i8x32_pack (
i16x16_pack (
(i32x8) (v8 == u32x8_mask_load_zero (&av[0], data_mask)),
(i32x8) (v8 == u32x8_mask_load_zero (&av[1], data_mask >> 8))),
i16x16_pack (
(i32x8) (v8 == u32x8_mask_load_zero (&av[2], data_mask >> 16)),
(i32x8) (v8 == u32x8_mask_load_zero (&av[3], data_mask >> 24))));
mask = i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m));
c = i8x32_pack (
i16x16_pack (
(i32x8) (v8 == u32x8_mask_load_zero (&av[4], data_mask >> 32)),
(i32x8) (v8 == u32x8_mask_load_zero (&av[5], data_mask >> 40))),
i16x16_pack (
(i32x8) (v8 == u32x8_mask_load_zero (&av[6], data_mask >> 48)),
(i32x8) (v8 == u32x8_mask_load_zero (&av[7], data_mask >> 56))));
mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32;
#else
for (int i = 0; i < n_elts; i++)
if (a[i] == v)
@@ -152,7 +235,7 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts)
{
while (n_elts >= 64)
{
bitmap++[0] = clib_mask_compare_u32_x64 (v, a, 64);
bitmap++[0] = clib_mask_compare_u32_x64 (v, a);
n_elts -= 64;
a += 64;
}
@@ -160,11 +243,11 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return;
bitmap[0] = clib_mask_compare_u32_x64 (v, a, n_elts) & pow2_mask (n_elts);
bitmap[0] = clib_mask_compare_u32_x64_n (v, a, n_elts) & pow2_mask (n_elts);
}
static_always_inline u64
clib_mask_compare_u64_x64 (u64 v, u64 *a, u32 n_elts)
clib_mask_compare_u64_x64 (u64 v, u64 *a)
{
u64 mask = 0;
#if defined(CLIB_HAVE_VEC512)
@@ -189,6 +272,59 @@ clib_mask_compare_u64_x64 (u64 v, u64 *a, u32 n_elts)
u64 h = u8x32_msb_mask (v4 == av[i + 1]);
mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4);
}
#else
for (int i = 0; i < 64; i++)
if (a[i] == v)
mask |= 1ULL << i;
#endif
return mask;
}
static_always_inline u64
clib_mask_compare_u64_x64_n (u64 v, u64 *a, u32 n_elts)
{
u64 mask = 0;
CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
#if defined(CLIB_HAVE_VEC512)
u64x8 v8 = u64x8_splat (v);
u64x8u *av = (u64x8u *) a;
mask =
((u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[0], data_mask), v8) |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[1], data_mask >> 8),
v8)
<< 8 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[2], data_mask >> 16),
v8)
<< 16 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[3], data_mask >> 24),
v8)
<< 24 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[4], data_mask >> 32),
v8)
<< 32 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[5], data_mask >> 40),
v8)
<< 40 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[6], data_mask >> 48),
v8)
<< 48 |
(u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[7], data_mask >> 56),
v8)
<< 56);
#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) && \
defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
u64x4 v4 = u64x4_splat (v);
u64x4u *av = (u64x4u *) a;
for (int i = 0; i < 16; i += 2)
{
u64 l = u8x32_msb_mask (v4 == u64x4_mask_load_zero (&av[i], data_mask));
u64 h = u8x32_msb_mask (
v4 == u64x4_mask_load_zero (&av[i + 1], data_mask >> 4));
mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4);
data_mask >>= 8;
}
#else
for (int i = 0; i < n_elts; i++)
if (a[i] == v)
@@ -211,7 +347,7 @@ clib_mask_compare_u64 (u64 v, u64 *a, u64 *bitmap, u32 n_elts)
{
while (n_elts >= 64)
{
bitmap++[0] = clib_mask_compare_u64_x64 (v, a, 64);
bitmap++[0] = clib_mask_compare_u64_x64 (v, a);
n_elts -= 64;
a += 64;
}
@@ -219,7 +355,7 @@ clib_mask_compare_u64 (u64 v, u64 *a, u64 *bitmap, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return;
bitmap[0] = clib_mask_compare_u64_x64 (v, a, n_elts) & pow2_mask (n_elts);
bitmap[0] = clib_mask_compare_u64_x64_n (v, a, n_elts) & pow2_mask (n_elts);
}
#endif
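
From the caller's side, clib_mask_compare_u{16,32,64} emit one u64 bitmap word per 64 input elements, clearing unused tail bits. A usage sketch under the same include-path assumption; after this fix the 6-element tail is scanned without reading a[70] and beyond:

#include <vppinfra/vector/mask_compare.h> /* assumed include path */

static void
find_fives (u64 *bitmap /* at least 2 words */)
{
  u16 a[70];
  for (u32 i = 0; i < 70; i++)
    a[i] = i & 7; /* values cycle 0..7 */
  /* bit i of bitmap is set iff a[i] == 5; bits 70..127 stay zero */
  clib_mask_compare_u16 (5, a, bitmap, 70);
}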