perfmon: top down level 1 support

Add perfmon node-level support for Top-down Microarchitecture Analysis Method (TMAM) level 1 metrics on Intel Ice Lake (ICX).

Type: improvement

Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Change-Id: I48a9a9ff6a72efc28eaf0cb11ef39fb62cebb126
This commit is contained in:
mdr78
2021-03-19 19:03:54 +00:00
committed by Damjan Marion
parent 3f923d2d46
commit 8e1384f7bf
9 changed files with 248 additions and 25 deletions

View File

@@ -30,4 +30,5 @@ add_vpp_plugin(perfmon
intel/bundle/cache_hit_miss.c
intel/bundle/branch_mispred.c
intel/bundle/power_license.c
intel/bundle/topdown_metrics.c
)

View File

@@ -128,7 +128,8 @@ show_perfmon_bundle_command_fn (vlib_main_t *vm, unformat_input_t *input,
vlib_cli_output (vm, "%U\n", format_perfmon_bundle, 0, 0);
for (int i = 0; i < vec_len (vb); i++)
vlib_cli_output (vm, "%U\n", format_perfmon_bundle, vb[i], verbose);
if (!vb[i]->cpu_supports || vb[i]->cpu_supports ())
vlib_cli_output (vm, "%U\n", format_perfmon_bundle, vb[i], verbose);
vec_free (vb);
return 0;
@@ -290,7 +291,8 @@ show_perfmon_stats_command_fn (vlib_main_t *vm, unformat_input_t *input,
n_instances = vec_len (it->instances);
vec_validate (readings, n_instances - 1);
for (int i = 0; i < n_instances; i++)
/*Only perform read() for THREAD or SYSTEM bundles*/
for (int i = 0; i < n_instances && b->type != PERFMON_BUNDLE_TYPE_NODE; i++)
{
in = vec_elt_at_index (it->instances, i);
r = vec_elt_at_index (readings, i);
@@ -340,6 +342,7 @@ show_perfmon_stats_command_fn (vlib_main_t *vm, unformat_input_t *input,
table_set_cell_align (t, col, -1, TTAA_RIGHT);
table_set_cell_fg_color (t, col, -1, TTAC_CYAN);
clib_memcpy_fast (&ns, tr->node_stats + j, sizeof (ns));
for (int j = 0; j < n_row; j++)
table_format_cell (t, col, j, "%U", b->format_fn, &ns, j);
}

View File

@@ -13,6 +13,7 @@
* limitations under the License.
*/
#include "vppinfra/string.h"
#include <vnet/vnet.h>
#include <vlibapi/api.h>
@@ -49,24 +50,32 @@ perfmon_read_pmcs (u64 *counters, int *pmc_index, u8 n_counters)
}
static_always_inline int
perfmon_calc_pmc_index (perfmon_thread_runtime_t *tr, u8 i)
perfmon_calc_mmap_offset (perfmon_thread_runtime_t *tr, u8 i)
{
return (int) (tr->mmap_pages[i]->index + tr->mmap_pages[i]->offset);
}
/* Fetch entry i of the bundle's metrics[] table, converted to int so it can
 * be stored in a pmc_index[] array. */
static_always_inline int
perfmon_metric_index (perfmon_bundle_t *b, u8 i)
{
  int idx = (int) b->metrics[i];

  return idx;
}
uword
perfmon_dispatch_wrapper (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_frame_t *frame)
perfmon_dispatch_wrapper_mmap (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_frame_t *frame)
{
perfmon_main_t *pm = &perfmon_main;
perfmon_thread_runtime_t *rt =
vec_elt_at_index (pm->thread_runtimes, vm->thread_index);
perfmon_node_stats_t *s =
vec_elt_at_index (rt->node_stats, node->node_index);
u8 n_events = rt->n_events;
int pmc_index[PERF_MAX_EVENTS];
u64 before[PERF_MAX_EVENTS];
u64 after[PERF_MAX_EVENTS];
int pmc_index[PERF_MAX_EVENTS];
uword rv;
clib_prefetch_load (s);
@@ -75,33 +84,87 @@ perfmon_dispatch_wrapper (vlib_main_t *vm, vlib_node_runtime_t *node,
{
default:
case 7:
pmc_index[6] = perfmon_calc_pmc_index (rt, 6);
pmc_index[6] = perfmon_calc_mmap_offset (rt, 6);
case 6:
pmc_index[5] = perfmon_calc_pmc_index (rt, 5);
pmc_index[5] = perfmon_calc_mmap_offset (rt, 5);
case 5:
pmc_index[4] = perfmon_calc_pmc_index (rt, 4);
pmc_index[4] = perfmon_calc_mmap_offset (rt, 4);
case 4:
pmc_index[3] = perfmon_calc_pmc_index (rt, 3);
pmc_index[3] = perfmon_calc_mmap_offset (rt, 3);
case 3:
pmc_index[2] = perfmon_calc_pmc_index (rt, 2);
pmc_index[2] = perfmon_calc_mmap_offset (rt, 2);
case 2:
pmc_index[1] = perfmon_calc_pmc_index (rt, 1);
pmc_index[1] = perfmon_calc_mmap_offset (rt, 1);
case 1:
pmc_index[0] = perfmon_calc_pmc_index (rt, 0);
pmc_index[0] = perfmon_calc_mmap_offset (rt, 0);
break;
}
perfmon_read_pmcs (before, pmc_index, n_events);
perfmon_read_pmcs (&before[0], pmc_index, n_events);
rv = node->function (vm, node, frame);
perfmon_read_pmcs (after, pmc_index, n_events);
perfmon_read_pmcs (&after[0], pmc_index, n_events);
if (rv == 0)
return rv;
s->n_calls += 1;
s->n_packets += rv;
for (int i = 0; i < n_events; i++)
s->value[i] += after[i] - before[i];
return rv;
}
/*
 * Node dispatch wrapper registered for PERFMON_OFFSET_TYPE_METRICS bundles.
 *
 * Reads the bundle's counters immediately before and after calling the
 * wrapped node function and stores both raw snapshots in the per-thread,
 * per-node stats entry: t[0] gets the "before" readings, t[1] the "after"
 * readings. No delta is accumulated here; both snapshots are overwritten on
 * every invocation.
 *
 * Returns whatever the node function returned. n_calls / n_packets are only
 * updated when that value is non-zero.
 */
uword
perfmon_dispatch_wrapper_metrics (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_frame_t *frame)
{
perfmon_main_t *pm = &perfmon_main;
perfmon_thread_runtime_t *rt =
vec_elt_at_index (pm->thread_runtimes, vm->thread_index);
perfmon_node_stats_t *s =
vec_elt_at_index (rt->node_stats, node->node_index);
u8 n_events = rt->n_events;
u64 before[PERF_MAX_EVENTS];
int pmc_index[PERF_MAX_EVENTS];
uword rv;
/* Start pulling the stats entry in early; it is written after the node runs. */
clib_prefetch_load (s);
/* Build the counter index list from the bundle's metrics[] table. The cases
 * fall through on purpose, so entering at "case N" fills indices N-1 .. 0.
 * Any n_events value without its own case (the default label) fills all
 * seven entries. */
switch (n_events)
{
default:
case 7:
pmc_index[6] = perfmon_metric_index (rt->bundle, 6);
case 6:
pmc_index[5] = perfmon_metric_index (rt->bundle, 5);
case 5:
pmc_index[4] = perfmon_metric_index (rt->bundle, 4);
case 4:
pmc_index[3] = perfmon_metric_index (rt->bundle, 3);
case 3:
pmc_index[2] = perfmon_metric_index (rt->bundle, 2);
case 2:
pmc_index[1] = perfmon_metric_index (rt->bundle, 1);
case 1:
pmc_index[0] = perfmon_metric_index (rt->bundle, 0);
break;
}
/* First snapshot, taken right before the node function runs. */
perfmon_read_pmcs (&before[0], pmc_index, n_events);
rv = node->function (vm, node, frame);
/* Publish the "before" snapshot into t[0].
 * NOTE(review): this copies sizeof (before), i.e. all PERF_MAX_EVENTS
 * slots; slots at or beyond n_events are presumably never written by
 * perfmon_read_pmcs, so their contents would be indeterminate - confirm. */
clib_memcpy_fast (&s->t[0].value[0], &before, sizeof (before));
/* Second snapshot is read straight into t[1]. */
perfmon_read_pmcs (&s->t[1].value[0], pmc_index, n_events);
/* The snapshots above are stored even when the node returned 0; only the
 * call and packet counters are skipped in that case. */
if (rv == 0)
return rv;
s->n_calls += 1;
s->n_packets += rv;
return rv;
}

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2021 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vnet/vnet.h>
#include <perfmon/perfmon.h>
#include <perfmon/intel/core.h>
/* Extract the i-th 8-bit field of m (i = 0 is the least significant byte).
 * Both arguments are fully parenthesized so expressions such as
 * GET_METRIC (x, a + b) select the intended field. */
#define GET_METRIC(m, i) (((m) >> ((i) * 8)) & 0xff)
/* Extract the i-th 32-bit field of m (i = 0 is the low 32 bits). */
#define GET_RATIO(m, i) (((m) >> ((i) * 32)) & 0xffffffff)

/* Flag bits OR-ed into the counter index stored in a bundle's metrics[]. */
#define RDPMC_FIXED_SLOTS (1 << 30) /* fixed slots */
#define RDPMC_L1_METRICS  (1 << 29) /* l1 metric counters */

/* Counter numbers combined with the flag bits above. */
#define FIXED_COUNTER_SLOTS	  3
#define METRIC_COUNTER_TOPDOWN_L1 0
/* Level-1 topdown metric selectors. Each value is the field index handed to
 * GET_METRIC () when unpacking the metrics word, so the numbering is spelled
 * out explicitly. */
typedef enum
{
  TOPDOWN_E_METRIC_RETIRING = 0,
  TOPDOWN_E_METRIC_BAD_SPEC = 1,
  TOPDOWN_E_METRIC_FE_BOUND = 2,
  TOPDOWN_E_METRIC_BE_BOUND = 3,
} topdown_lvl1_counters_t;
/* Positions of the two raw readings inside a stats snapshot's value[] array.
 * Declared as a typedef so that topdown_lvl1_metrics_t names a type rather
 * than defining an unused global variable. */
typedef enum
{
  TOPDOWN_SLOTS = 0,
  TOPDOWN_METRICS,
} topdown_lvl1_metrics_t;
/*
 * Percentage of the slots elapsed between snapshots t[0] and t[1] that is
 * attributed to level-1 metric e.
 *
 * Each snapshot holds a slot count (value[TOPDOWN_SLOTS]) and a packed
 * metrics word (value[TOPDOWN_METRICS]). The 8-bit field for e divided by
 * 0xff is used as the fraction of that snapshot's slots belonging to e. The
 * result is the change in attributed slots divided by the change in total
 * slots, multiplied by 100.
 *
 * Returns 0 when the slot count is identical in both snapshots, so callers
 * never receive the NaN/inf that dividing by a zero delta would produce.
 */
static_always_inline f32
topdown_lvl1_parse_row (perfmon_node_stats_t *ns, topdown_lvl1_counters_t e)
{
  f64 slots_t0, slots_t1;
  u64 slots_delta =
    ns->t[1].value[TOPDOWN_SLOTS] - ns->t[0].value[TOPDOWN_SLOTS];

  /* Nothing elapsed between the snapshots: no meaningful ratio exists. */
  if (slots_delta == 0)
    return 0;

  slots_t0 = ns->t[0].value[TOPDOWN_SLOTS] *
	     ((f64) GET_METRIC (ns->t[0].value[TOPDOWN_METRICS], e) / 0xff);
  slots_t1 = ns->t[1].value[TOPDOWN_SLOTS] *
	     ((f64) GET_METRIC (ns->t[1].value[TOPDOWN_METRICS], e) / 0xff);

  return ((slots_t1 - slots_t0) / slots_delta) * 100;
}
/*
 * format () callback producing one cell of the topdown level-1 table.
 *
 * Variadic arguments: a perfmon_node_stats_t pointer followed by the row
 * number (fetched as int). Row mapping:
 *   0: bad speculation + retiring
 *   1: backend bound + frontend bound
 *   2: retiring
 *   3: bad speculation
 *   4: frontend bound
 *   5: backend bound
 * For any other row nothing is appended and s is returned unchanged.
 */
static u8 *
format_topdown_lvl1 (u8 *s, va_list *args)
{
  perfmon_node_stats_t *stats = va_arg (*args, perfmon_node_stats_t *);
  u64 which = va_arg (*args, int);
  double pct;

  if (which == 0)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_BAD_SPEC) +
	  topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_RETIRING);
  else if (which == 1)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_BE_BOUND) +
	  topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_FE_BOUND);
  else if (which == 2)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_RETIRING);
  else if (which == 3)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_BAD_SPEC);
  else if (which == 4)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_FE_BOUND);
  else if (which == 5)
    pct = topdown_lvl1_parse_row (stats, TOPDOWN_E_METRIC_BE_BOUND);
  else
    return s; /* unknown row: leave the output untouched */

  return format (s, "%f", pct);
}
/* Registration of the node-level "topdown-level1" bundle. It uses two
 * events (slots and the level-1 metrics word) whose counter indices come
 * from metrics[] rather than from mmap pages, and renders six columns via
 * format_topdown_lvl1. */
PERFMON_REGISTER_BUNDLE (topdown_lvl1) = {
  /* identity */
  .name = "topdown-level1",
  .description = "Top-down Microarchitecture Analysis Level 1",
  .source = "intel-core",

  /* collection: per-node, counter indices taken from metrics[] */
  .type = PERFMON_BUNDLE_TYPE_NODE,
  .offset_type = PERFMON_OFFSET_TYPE_METRICS,
  .n_events = 2,
  .events = { INTEL_CORE_E_TOPDOWN_SLOTS, INTEL_CORE_E_TOPDOWN_L1_METRICS },
  .metrics = { RDPMC_FIXED_SLOTS | FIXED_COUNTER_SLOTS,
	       RDPMC_L1_METRICS | METRIC_COUNTER_TOPDOWN_L1 },
  .cpu_supports = clib_cpu_supports_avx512_bitalg,

  /* presentation */
  .format_fn = format_topdown_lvl1,
  .column_headers = PERFMON_STRINGS ("% NS", "% ST", "% NS.RT", "% NS.BS",
				     "% ST.FE", "% ST.BE"),
  .footer = "Not Stalled (NS),STalled (ST),\n"
	    " Retiring (RT), Bad Speculation (BS),\n"
	    " FrontEnd bound (FE), BackEnd bound (BE)",
};

View File

@@ -20,12 +20,12 @@
static perfmon_event_t events[] = {
#define _(event, umask, edge, any, inv, cmask, n, suffix, desc) \
[INTEL_CORE_E_##n##_##suffix] = { \
.type = PERF_TYPE_RAW, \
.config = PERF_INTEL_CODE (event, umask, edge, any, inv, cmask), \
.name = #n "." #suffix, \
.description = desc, \
},
[INTEL_CORE_E_##n##_##suffix] = { .type = PERF_TYPE_RAW, \
.config = PERF_INTEL_CODE ( \
event, umask, edge, any, inv, cmask), \
.name = #n "." #suffix, \
.description = desc, \
.exclude_kernel = 1 },
foreach_perf_intel_core_event
#undef _

View File

@@ -27,6 +27,10 @@
"Core cycles when the thread is not in halt state") \
_ (0x00, 0x03, 0, 0, 0, 0x00, CPU_CLK_UNHALTED, REF_TSC, \
"Reference cycles when the core is not in halt state.") \
_ (0x00, 0x04, 0, 0, 0, 0x00, TOPDOWN, SLOTS, \
"TMA slots available for an unhalted logical processor.") \
_ (0x00, 0x80, 0, 0, 0, 0x00, TOPDOWN, L1_METRICS, \
"TMA slots metrics for an unhalted logical processor.") \
_ (0x03, 0x02, 0, 0, 0, 0x00, LD_BLOCKS, STORE_FORWARD, \
"Loads blocked due to overlapping with a preceding store that cannot be" \
" forwarded.") \

View File

@@ -193,6 +193,7 @@ perfmon_set (vlib_main_t *vm, perfmon_bundle_t *b)
{
perfmon_thread_runtime_t *rt;
rt = vec_elt_at_index (pm->thread_runtimes, i);
rt->bundle = b;
rt->n_events = b->n_events;
rt->n_nodes = n_nodes;
vec_validate_aligned (rt->node_stats, n_nodes - 1,
@@ -235,11 +236,20 @@ perfmon_start (vlib_main_t *vm, perfmon_bundle_t *b)
return clib_error_return_unix (0, "ioctl(PERF_EVENT_IOC_ENABLE)");
}
}
if (pm->active_bundle->type == PERFMON_BUNDLE_TYPE_NODE)
if (b->type == PERFMON_BUNDLE_TYPE_NODE)
{
vlib_node_function_t *funcs[PERFMON_OFFSET_TYPE_MAX];
#define _(type, pfunc) funcs[type] = pfunc;
foreach_permon_offset_type
#undef _
ASSERT (funcs[b->offset_type]);
for (int i = 0; i < vlib_get_n_threads (); i++)
vlib_node_set_dispatch_wrapper (vlib_get_main_by_index (i),
perfmon_dispatch_wrapper);
funcs[b->offset_type]);
}
pm->sample_time = vlib_time_now (vm);

View File

@@ -20,6 +20,7 @@
#include <vppinfra/clib.h>
#include <vppinfra/format.h>
#include <vppinfra/error.h>
#include <vppinfra/cpu.h>
#include <vlib/vlib.h>
#define PERF_MAX_EVENTS 7 /* 3 fixed and 4 programmable */
@@ -32,6 +33,13 @@ typedef enum
PERFMON_BUNDLE_TYPE_SYSTEM,
} perfmon_bundle_type_t;
/* Selects where a bundle's counter indices come from, and thereby which
 * dispatch wrapper is installed for it (see foreach_permon_offset_type).
 * PERFMON_OFFSET_TYPE_MAX is the count of real types and is used to size
 * lookup arrays; it is not a valid type itself. */
typedef enum
{
  PERFMON_OFFSET_TYPE_MMAP = 0,
  PERFMON_OFFSET_TYPE_METRICS,
  PERFMON_OFFSET_TYPE_MAX, /* keep last */
} perfmon_offset_type_t;
typedef struct
{
u32 type_from_instance : 1;
@@ -61,7 +69,12 @@ typedef struct
} perfmon_instance_type_t;
struct perfmon_source;
vlib_node_function_t perfmon_dispatch_wrapper;
vlib_node_function_t perfmon_dispatch_wrapper_mmap;
vlib_node_function_t perfmon_dispatch_wrapper_metrics;
/* X-macro table pairing each perfmon_offset_type_t value with the node
 * dispatch wrapper used for bundles of that type. Define _(type, pfunc)
 * before expanding it and #undef _ afterwards. */
#define foreach_permon_offset_type \
_ (PERFMON_OFFSET_TYPE_MMAP, perfmon_dispatch_wrapper_mmap) \
_ (PERFMON_OFFSET_TYPE_METRICS, perfmon_dispatch_wrapper_metrics)
typedef clib_error_t *(perfmon_source_init_fn_t) (vlib_main_t *vm,
struct perfmon_source *);
@@ -78,8 +91,10 @@ typedef struct perfmon_source
} perfmon_source_t;
struct perfmon_bundle;
typedef clib_error_t *(perfmon_bundle_init_fn_t) (vlib_main_t *vm,
struct perfmon_bundle *);
typedef struct perfmon_bundle
{
char *name;
@@ -87,7 +102,9 @@ typedef struct perfmon_bundle
char *source;
char *footer;
perfmon_bundle_type_t type;
perfmon_offset_type_t offset_type;
u32 events[PERF_MAX_EVENTS];
u32 metrics[PERF_MAX_EVENTS];
u32 n_events;
perfmon_bundle_init_fn_t *init_fn;
@@ -95,6 +112,7 @@ typedef struct perfmon_bundle
char **column_headers;
char **raw_column_headers;
format_function_t *format_fn;
clib_cpu_supports_func_t cpu_supports;
/* do not set manually */
perfmon_source_t *src;
@@ -114,7 +132,14 @@ typedef struct
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
u64 n_calls;
u64 n_packets;
u64 value[PERF_MAX_EVENTS];
union
{
struct
{
u64 value[PERF_MAX_EVENTS];
} t[2];
u64 value[PERF_MAX_EVENTS * 2];
};
} perfmon_node_stats_t;
typedef struct
@@ -122,6 +147,7 @@ typedef struct
u8 n_events;
u16 n_nodes;
perfmon_node_stats_t *node_stats;
perfmon_bundle_t *bundle;
struct perf_event_mmap_page *mmap_pages[PERF_MAX_EVENTS];
} perfmon_thread_runtime_t;