bihash table size perf/scale improvements
Directly allocate and carve cache-line-aligned chunks of virtual memory. To a first approximation, bihash wasn't using clib_mem_free(...). We eliminate mheap object header/trailers, which improves space efficiency. We also eliminate the 4gb bihash table size limit. An 8_8 bihash w/ 100 million random entries uses 3.8 Gbytes. Change-Id: Icf925fdf99bce7d6ac407ac4edd30560b8f04808 Signed-off-by: Dave Barach <dave@barachs.net>
This commit is contained in:

committed by
Florin Coras

parent
cae7134a8c
commit
97f5af0180
@ -600,12 +600,20 @@ ip6_fib_table_show_one (ip6_fib_t *fib,
|
||||
u8 *
|
||||
format_ip6_fib_table_memory (u8 * s, va_list * args)
|
||||
{
|
||||
uword bytes_inuse;
|
||||
|
||||
bytes_inuse =
|
||||
ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash.alloc_arena_next
|
||||
- ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash.alloc_arena;
|
||||
|
||||
bytes_inuse +=
|
||||
ip6_main.ip6_table[IP6_FIB_TABLE_FWDING].ip6_hash.alloc_arena_next
|
||||
- ip6_main.ip6_table[IP6_FIB_TABLE_FWDING].ip6_hash.alloc_arena;
|
||||
|
||||
s = format(s, "%=30s %=6d %=8ld\n",
|
||||
"IPv6 unicast",
|
||||
pool_elts(ip6_main.fibs),
|
||||
mheap_bytes(ip6_main.ip6_table[IP6_FIB_TABLE_NON_FWDING].ip6_hash.mheap) +
|
||||
mheap_bytes(ip6_main.ip6_table[IP6_FIB_TABLE_FWDING].ip6_hash.mheap));
|
||||
|
||||
bytes_inuse);
|
||||
return (s);
|
||||
}
|
||||
|
||||
|
@ -15,10 +15,28 @@
|
||||
|
||||
/** @cond DOCUMENTATION_IS_IN_BIHASH_DOC_H */
|
||||
|
||||
/**
 * Carve a cache-line-aligned chunk out of the table's allocation arena.
 *
 * The request is rounded up to a whole number of cache lines, taken at
 * h->alloc_arena_next, and alloc_arena_next is advanced past it. This
 * function only ever moves alloc_arena_next forward.
 *
 * @param h       table whose arena fields (alloc_arena, alloc_arena_next,
 *                alloc_arena_size) describe the backing store
 * @param nbytes  requested size in bytes
 * @return        address of the start of the chunk
 *
 * os_out_of_memory () is called when the rounded request does not fit
 * in what is left of the arena.
 */
static inline void *BV (alloc_aligned) (BVT (clib_bihash) * h, uword nbytes)
{
  uword rv;
  uword arena_end;

  /* Round up to a whole number of cache lines */
  nbytes += CLIB_CACHE_LINE_BYTES - 1;
  nbytes &= ~(CLIB_CACHE_LINE_BYTES - 1);

  rv = h->alloc_arena_next;
  arena_end = h->alloc_arena + h->alloc_arena_size;

  /*
   * The whole chunk has to fit, not just its first byte: checking only
   * the start address would hand out a chunk that runs past the end of
   * the arena. Comparing against the space remaining (instead of
   * computing rv + nbytes) keeps the test safe from wrap-around.
   */
  if (rv >= arena_end || nbytes > arena_end - rv)
    os_out_of_memory ();	/* NOTE(review): presumably fatal - confirm */

  h->alloc_arena_next = rv + nbytes;

  return (void *) rv;
}
|
||||
|
||||
|
||||
void BV (clib_bihash_init)
|
||||
(BVT (clib_bihash) * h, char *name, u32 nbuckets, uword memory_size)
|
||||
{
|
||||
void *oldheap;
|
||||
uword bucket_size;
|
||||
int i;
|
||||
|
||||
nbuckets = 1 << (max_log2 (nbuckets));
|
||||
@ -29,19 +47,19 @@ void BV (clib_bihash_init)
|
||||
h->cache_hits = 0;
|
||||
h->cache_misses = 0;
|
||||
|
||||
h->mheap = mheap_alloc (0 /* use VM */ , memory_size);
|
||||
h->alloc_arena = (uword) clib_mem_vm_alloc (memory_size);
|
||||
h->alloc_arena_next = h->alloc_arena;
|
||||
h->alloc_arena_size = memory_size;
|
||||
|
||||
oldheap = clib_mem_set_heap (h->mheap);
|
||||
vec_validate_aligned (h->buckets, nbuckets - 1, CLIB_CACHE_LINE_BYTES);
|
||||
h->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
|
||||
CLIB_CACHE_LINE_BYTES);
|
||||
bucket_size = nbuckets * sizeof (h->buckets[0]);
|
||||
h->buckets = BV (alloc_aligned) (h, bucket_size);
|
||||
|
||||
h->writer_lock = BV (alloc_aligned) (h, CLIB_CACHE_LINE_BYTES);
|
||||
h->writer_lock[0] = 0;
|
||||
|
||||
for (i = 0; i < nbuckets; i++)
|
||||
BV (clib_bihash_reset_cache) (h->buckets + i);
|
||||
|
||||
clib_mem_set_heap (oldheap);
|
||||
|
||||
h->fmt_fn = NULL;
|
||||
}
|
||||
|
||||
@ -53,7 +71,9 @@ void BV (clib_bihash_set_kvp_format_fn) (BVT (clib_bihash) * h,
|
||||
|
||||
void BV (clib_bihash_free) (BVT (clib_bihash) * h)
|
||||
{
|
||||
mheap_free (h->mheap);
|
||||
vec_free (h->working_copies);
|
||||
vec_free (h->freelists);
|
||||
clib_mem_vm_free ((void *) (h->alloc_arena), h->alloc_arena_size);
|
||||
memset (h, 0, sizeof (*h));
|
||||
}
|
||||
|
||||
@ -62,17 +82,12 @@ BVT (clib_bihash_value) *
|
||||
BV (value_alloc) (BVT (clib_bihash) * h, u32 log2_pages)
|
||||
{
|
||||
BVT (clib_bihash_value) * rv = 0;
|
||||
void *oldheap;
|
||||
|
||||
ASSERT (h->writer_lock[0]);
|
||||
if (log2_pages >= vec_len (h->freelists) || h->freelists[log2_pages] == 0)
|
||||
{
|
||||
oldheap = clib_mem_set_heap (h->mheap);
|
||||
|
||||
vec_validate (h->freelists, log2_pages);
|
||||
rv = clib_mem_alloc_aligned ((sizeof (*rv) * (1 << log2_pages)),
|
||||
CLIB_CACHE_LINE_BYTES);
|
||||
clib_mem_set_heap (oldheap);
|
||||
vec_validate_init_empty (h->freelists, log2_pages, 0);
|
||||
rv = BV (alloc_aligned) (h, (sizeof (*rv) * (1 << log2_pages)));
|
||||
goto initialize;
|
||||
}
|
||||
rv = h->freelists[log2_pages];
|
||||
@ -106,17 +121,14 @@ BV (make_working_copy) (BVT (clib_bihash) * h, BVT (clib_bihash_bucket) * b)
|
||||
{
|
||||
BVT (clib_bihash_value) * v;
|
||||
BVT (clib_bihash_bucket) working_bucket __attribute__ ((aligned (8)));
|
||||
void *oldheap;
|
||||
BVT (clib_bihash_value) * working_copy;
|
||||
u32 thread_index = os_get_thread_index ();
|
||||
int log2_working_copy_length;
|
||||
|
||||
if (thread_index >= vec_len (h->working_copies))
|
||||
{
|
||||
oldheap = clib_mem_set_heap (h->mheap);
|
||||
vec_validate (h->working_copies, thread_index);
|
||||
vec_validate_init_empty (h->working_copy_lengths, thread_index, ~0);
|
||||
clib_mem_set_heap (oldheap);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -128,22 +140,20 @@ BV (make_working_copy) (BVT (clib_bihash) * h, BVT (clib_bihash_bucket) * b)
|
||||
log2_working_copy_length = h->working_copy_lengths[thread_index];
|
||||
|
||||
h->saved_bucket.as_u64 = b->as_u64;
|
||||
oldheap = clib_mem_set_heap (h->mheap);
|
||||
|
||||
if (b->log2_pages > log2_working_copy_length)
|
||||
{
|
||||
if (working_copy)
|
||||
clib_mem_free (working_copy);
|
||||
|
||||
working_copy = clib_mem_alloc_aligned
|
||||
(sizeof (working_copy[0]) * (1 << b->log2_pages),
|
||||
CLIB_CACHE_LINE_BYTES);
|
||||
/*
|
||||
* It's not worth the bookkeeping to free working copies
|
||||
* if (working_copy)
|
||||
* clib_mem_free (working_copy);
|
||||
*/
|
||||
working_copy = BV (alloc_aligned)
|
||||
(h, sizeof (working_copy[0]) * (1 << b->log2_pages));
|
||||
h->working_copy_lengths[thread_index] = b->log2_pages;
|
||||
h->working_copies[thread_index] = working_copy;
|
||||
}
|
||||
|
||||
clib_mem_set_heap (oldheap);
|
||||
|
||||
/* Lock the bucket... */
|
||||
while (BV (clib_bihash_lock_bucket) (b) == 0)
|
||||
;
|
||||
@ -554,6 +564,7 @@ u8 *BV (format_bihash) (u8 * s, va_list * args)
|
||||
u64 active_elements = 0;
|
||||
u64 active_buckets = 0;
|
||||
u64 linear_buckets = 0;
|
||||
u64 used_bytes;
|
||||
|
||||
s = format (s, "Hash table %s\n", h->name ? h->name : (u8 *) "(unnamed)");
|
||||
|
||||
@ -633,8 +644,13 @@ u8 *BV (format_bihash) (u8 * s, va_list * args)
|
||||
s = format (s, " %lld linear search buckets\n", linear_buckets);
|
||||
s = format (s, " %lld cache hits, %lld cache misses\n",
|
||||
h->cache_hits, h->cache_misses);
|
||||
if (h->mheap)
|
||||
s = format (s, " mheap: %U", format_mheap, h->mheap, 0 /* verbose */ );
|
||||
used_bytes = h->alloc_arena_next - h->alloc_arena;
|
||||
s = format (s,
|
||||
" arena: base %llx, next %llx\n"
|
||||
" used %lld b (%lld Mbytes) of %lld b (%lld Mbytes)\n",
|
||||
h->alloc_arena, h->alloc_arena_next,
|
||||
used_bytes, used_bytes >> 20,
|
||||
h->alloc_arena_size, h->alloc_arena_size >> 20);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -89,7 +89,14 @@ typedef struct
|
||||
u64 cache_misses;
|
||||
|
||||
BVT (clib_bihash_value) ** freelists;
|
||||
void *mheap;
|
||||
|
||||
/*
|
||||
* Backing store allocation. Since bihash manages its own
|
||||
* freelists, we simply dole out memory at alloc_arena_next.
|
||||
*/
|
||||
uword alloc_arena;
|
||||
uword alloc_arena_next;
|
||||
uword alloc_arena_size;
|
||||
|
||||
/**
|
||||
* A custom format function to print the Key and Value of bihash_key instead of default hexdump
|
||||
@ -224,7 +231,7 @@ static inline void BV (clib_bihash_unlock_bucket)
|
||||
static inline void *BV (clib_bihash_get_value) (BVT (clib_bihash) * h,
|
||||
uword offset)
|
||||
{
|
||||
u8 *hp = h->mheap;
|
||||
u8 *hp = (u8 *) h->alloc_arena;
|
||||
u8 *vp = hp + offset;
|
||||
|
||||
return (void *) vp;
|
||||
@ -235,10 +242,9 @@ static inline uword BV (clib_bihash_get_offset) (BVT (clib_bihash) * h,
|
||||
{
|
||||
u8 *hp, *vp;
|
||||
|
||||
hp = (u8 *) h->mheap;
|
||||
hp = (u8 *) h->alloc_arena;
|
||||
vp = (u8 *) v;
|
||||
|
||||
ASSERT ((vp - hp) < 0x100000000ULL);
|
||||
return vp - hp;
|
||||
}
|
||||
|
||||
|
@ -36,6 +36,7 @@ typedef struct
|
||||
int non_random_keys;
|
||||
uword *key_hash;
|
||||
u64 *keys;
|
||||
uword hash_memory_size;
|
||||
BVT (clib_bihash) hash;
|
||||
clib_time_t clib_time;
|
||||
|
||||
@ -101,8 +102,7 @@ test_bihash (test_main_t * tm)
|
||||
|
||||
h = &tm->hash;
|
||||
|
||||
BV (clib_bihash_init) (h, "test", tm->nbuckets, 3ULL << 30);
|
||||
|
||||
BV (clib_bihash_init) (h, "test", tm->nbuckets, tm->hash_memory_size);
|
||||
|
||||
for (acycle = 0; acycle < tm->ncycles; acycle++)
|
||||
{
|
||||
@ -269,10 +269,11 @@ test_bihash (test_main_t * tm)
|
||||
}
|
||||
|
||||
/* Clean up side-bet hash table and random key vector */
|
||||
for (i = 0; i < tm->nitems; i++)
|
||||
hash_unset (tm->key_hash, tm->keys[i]);
|
||||
|
||||
hash_free (tm->key_hash);
|
||||
vec_reset_length (tm->keys);
|
||||
/* Recreate hash table if we're going to need it again */
|
||||
if (acycle != (tm->ncycles - 1))
|
||||
tm->key_hash = hash_create (tm->nitems, sizeof (uword));
|
||||
}
|
||||
|
||||
fformat (stdout, "End of run, should be empty...\n");
|
||||
@ -322,6 +323,7 @@ test_bihash_main (test_main_t * tm)
|
||||
int which = 0;
|
||||
|
||||
tm->report_every_n = 1;
|
||||
tm->hash_memory_size = 4095ULL << 20;
|
||||
|
||||
while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
|
||||
{
|
||||
@ -344,6 +346,9 @@ test_bihash_main (test_main_t * tm)
|
||||
;
|
||||
else if (unformat (i, "report-every %d", &tm->report_every_n))
|
||||
;
|
||||
else if (unformat (i, "memory-size %U",
|
||||
unformat_memory_size, &tm->hash_memory_size))
|
||||
;
|
||||
else if (unformat (i, "vec64"))
|
||||
which = 1;
|
||||
else if (unformat (i, "cache"))
|
||||
@ -356,6 +361,12 @@ test_bihash_main (test_main_t * tm)
|
||||
format_unformat_error, i);
|
||||
}
|
||||
|
||||
/* Preallocate hash table, key vector */
|
||||
tm->key_hash = hash_create (tm->nitems, sizeof (uword));
|
||||
vec_validate (tm->keys, tm->nitems - 1);
|
||||
_vec_len (tm->keys) = 0;
|
||||
|
||||
|
||||
switch (which)
|
||||
{
|
||||
case 0:
|
||||
@ -385,7 +396,7 @@ main (int argc, char *argv[])
|
||||
clib_error_t *error;
|
||||
test_main_t *tm = &test_main;
|
||||
|
||||
clib_mem_init (0, 3ULL << 30);
|
||||
clib_mem_init (0, 4095ULL << 20);
|
||||
|
||||
tm->input = &i;
|
||||
tm->seed = 0xdeaddabe;
|
||||
@ -396,7 +407,6 @@ main (int argc, char *argv[])
|
||||
tm->verbose = 1;
|
||||
tm->search_iter = 1;
|
||||
tm->careful_delete_tests = 0;
|
||||
tm->key_hash = hash_create (0, sizeof (uword));
|
||||
clib_time_init (&tm->clib_time);
|
||||
|
||||
unformat_init_command_line (&i, argv);
|
||||
|
Reference in New Issue
Block a user