perfmon: topdown level 1 and 2 for icx

Add Topdown Level 1 and 2 support for Intel Ice Lake (ICX). Topdown
support is limited to THREAD scope on Ice Lake for the moment, as NODE
support is still unreliable. Also remove the separate Topdown Level 1
bundle from Sapphire Rapids onwards: the Topdown Level 2 bundle already
reports the Level 1 metrics there, and dropping it reduces the overall
number of bundles.

Type: improvement

Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Change-Id: Iaa68b711dc8b6fb1090880b411debadb3c37f8bc
Author:       Ray Kinsella <mdr@ashroe.eu>
Date:         2022-01-27 09:55:02 +00:00
Committed by: Damjan Marion
Parent:       7e8aeb876b
Commit:       9d0c638b0f
4 changed files with 183 additions and 63 deletions

@@ -32,6 +32,7 @@ add_vpp_plugin(perfmon
intel/bundle/branch_mispred.c
intel/bundle/power_license.c
intel/bundle/topdown_metrics.c
intel/bundle/topdown_icelake.c
intel/bundle/topdown_tremont.c
intel/bundle/frontend_bound_bw.c
intel/bundle/frontend_bound_lat.c

@@ -0,0 +1,176 @@
/*
* Copyright (c) 2022 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <perfmon/perfmon.h>
#include <perfmon/intel/core.h>
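/* Ice Lake (ICL/ICX) supports AVX-512 BITALG but, unlike Tremont and
 * Sapphire Rapids onwards, does not support MOVDIR64B, so checking both
 * flags identifies Ice Lake parts. */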
static int
is_icelake ()
{
return clib_cpu_supports_avx512_bitalg () && !clib_cpu_supports_movdir64b ();
}
static perfmon_cpu_supports_t topdown_lvl2_cpu_supports_icx[] = {
{ is_icelake, PERFMON_BUNDLE_TYPE_THREAD }
};
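/* Extract one 8-bit metric fraction from a packed PERF_METRICS-style value
 * (byte i of m); not used by the perf-reading based formulas below. */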
#define GET_METRIC(m, i) (f64) (((m) >> (i * 8)) & 0xff)
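/* Reading indices for this bundle; the order must match the .events[]
 * entries in the PERFMON_REGISTER_BUNDLE below. */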
enum
{
TD_SLOTS = 0,
STALLS_MEM_ANY,
STALLS_TOTAL,
BOUND_ON_STORES,
RECOVERY_CYCLES,
UOP_DROPPING,
UOP_NOT_DELIVERED,
TD_RETIRING,
TD_BAD_SPEC,
TD_FE_BOUND,
TD_BE_BOUND,
};
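/* The PERF_METRICS MSR on Ice Lake exposes only the four Level 1 metrics
 * (retiring, bad speculation, front-end bound, back-end bound). The Level 2
 * breakdown below is therefore derived from those plus the stall/uop events
 * above, following Intel's Top-down Microarchitecture Analysis formulas. */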
static_always_inline f64
memory_bound_fraction (perfmon_reading_t *ss)
{
return (ss->value[STALLS_MEM_ANY] + ss->value[BOUND_ON_STORES]) /
(f64) (ss->value[STALLS_TOTAL] + ss->value[BOUND_ON_STORES]);
}
static_always_inline f64
perf_metrics_sum (perfmon_reading_t *ss)
{
return ss->value[TD_RETIRING] + ss->value[TD_BAD_SPEC] +
ss->value[TD_FE_BOUND] + ss->value[TD_BE_BOUND];
}
static_always_inline f64
retiring (perfmon_reading_t *ss)
{
return ss->value[TD_RETIRING] / perf_metrics_sum (ss);
}
static_always_inline f64
bad_speculation (perfmon_reading_t *ss)
{
return ss->value[TD_BAD_SPEC] / perf_metrics_sum (ss);
}
static_always_inline f64
frontend_bound (perfmon_reading_t *ss)
{
return (ss->value[TD_FE_BOUND] / perf_metrics_sum (ss)) -
(ss->value[UOP_DROPPING] / perf_metrics_sum (ss));
}
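/* The factor 5 below is the Ice Lake allocation width (issue slots per
 * cycle) used by the TMA slot accounting. */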
static_always_inline f64
backend_bound (perfmon_reading_t *ss)
{
return (ss->value[TD_BE_BOUND] / perf_metrics_sum (ss)) +
((5 * ss->value[RECOVERY_CYCLES]) / perf_metrics_sum (ss));
}
static_always_inline f64
fetch_latency (perfmon_reading_t *ss)
{
f64 r = ((5 * ss->value[UOP_NOT_DELIVERED] - ss->value[UOP_DROPPING]) /
(f64) ss->value[TD_SLOTS]);
return r;
}
static_always_inline f64
fetch_bandwidth (perfmon_reading_t *ss)
{
return clib_max (0, frontend_bound (ss) - fetch_latency (ss));
}
static_always_inline f64
memory_bound (perfmon_reading_t *ss)
{
return backend_bound (ss) * memory_bound_fraction (ss);
}
static_always_inline f64
core_bound (perfmon_reading_t *ss)
{
return backend_bound (ss) - memory_bound (ss);
}
static u8 *
format_topdown_lvl2_icx (u8 *s, va_list *args)
{
perfmon_reading_t *ss = va_arg (*args, perfmon_reading_t *);
u64 idx = va_arg (*args, int);
f64 sv = 0;
switch (idx)
{
case 0:
sv = retiring (ss);
break;
case 1:
sv = bad_speculation (ss);
break;
case 2:
sv = frontend_bound (ss);
break;
case 3:
sv = backend_bound (ss);
break;
case 4:
sv = fetch_latency (ss);
break;
case 5:
sv = fetch_bandwidth (ss);
break;
case 6:
sv = memory_bound (ss);
break;
case 7:
sv = core_bound (ss);
break;
}
s = format (s, "%f", sv * 100);
return s;
}
PERFMON_REGISTER_BUNDLE (topdown_lvl2_metric_icx) = {
.name = "topdown",
.description = "Top-down Microarchitecture Analysis Level 1 & 2",
.source = "intel-core",
.events[0] = INTEL_CORE_E_TOPDOWN_SLOTS,
.events[1] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_MEM_ANY,
.events[2] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_TOTAL,
.events[3] = INTEL_CORE_E_EXE_ACTIVITY_BOUND_ON_STORES,
.events[4] = INTEL_CORE_E_INT_MISC_RECOVERY_CYCLES,
.events[5] = INTEL_CORE_E_INT_MISC_UOP_DROPPING,
.events[6] = INTEL_CORE_E_IDQ_UOPS_NOT_DELIVERED_CORE,
.events[7] = INTEL_CORE_E_TOPDOWN_L1_RETIRING_METRIC,
.events[8] = INTEL_CORE_E_TOPDOWN_L1_BAD_SPEC_METRIC,
.events[9] = INTEL_CORE_E_TOPDOWN_L1_FE_BOUND_METRIC,
.events[10] = INTEL_CORE_E_TOPDOWN_L1_BE_BOUND_METRIC,
.n_events = 11,
.cpu_supports = topdown_lvl2_cpu_supports_icx,
.n_cpu_supports = ARRAY_LEN (topdown_lvl2_cpu_supports_icx),
.format_fn = format_topdown_lvl2_icx,
.column_headers = PERFMON_STRINGS ("% RT", "% BS", "% FE", "% BE", "% FE.FL",
"% FE.FB", "% BE.MB", "% BE.CB"),
.footer = "Retiring (RT), Bad Speculation (BS),\n"
" FrontEnd bound (FE), BackEnd bound (BE),\n"
" Fetch Latency (FL), Fetch Bandwidth (FB),\n"
" Memory Bound (MB), Core Bound (CB)",
};
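
With the ICX bundle above and the renamed Sapphire Rapids bundle both
registered as "topdown", only the variant whose cpu_supports check matches
the running CPU should be selectable, so the operator workflow is the same
on either generation. A minimal usage sketch, assuming the perfmon plugin's
usual vppctl commands (not introduced by this change):

  vpp# show perfmon bundle
  vpp# perfmon start bundle topdown
  ... generate traffic ...
  vpp# perfmon stop
  vpp# show perfmon statistics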

@@ -79,66 +79,6 @@ topdown_lvl1_rdpmc_metric (void *ps, topdown_e_t e)
return (slots_t1 / slots_delta) * 100;
}
static u8 *
format_topdown_lvl1 (u8 *s, va_list *args)
{
void *ps = va_arg (*args, void *);
u64 idx = va_arg (*args, int);
perfmon_bundle_type_t type = va_arg (*args, perfmon_bundle_type_t);
f64 sv = 0;
topdown_lvl1_parse_fn_t *parse_fn,
*parse_fns[PERFMON_BUNDLE_TYPE_MAX] = { 0, topdown_lvl1_rdpmc_metric,
topdown_lvl1_perf_reading, 0 };
parse_fn = parse_fns[type];
ASSERT (parse_fn);
switch (idx)
{
case 0:
sv =
parse_fn (ps, TOPDOWN_E_BAD_SPEC) + parse_fn (ps, TOPDOWN_E_RETIRING);
break;
case 1:
sv =
parse_fn (ps, TOPDOWN_E_BE_BOUND) + parse_fn (ps, TOPDOWN_E_FE_BOUND);
break;
default:
sv = parse_fn (ps, (topdown_e_t) idx - 2);
break;
}
s = format (s, "%f", sv);
return s;
}
static perfmon_cpu_supports_t topdown_lvl1_cpu_supports[] = {
/* Intel ICX supports papi/thread or rdpmc/node */
{ clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE_OR_THREAD }
};
PERFMON_REGISTER_BUNDLE (topdown_lvl1_metric) = {
.name = "topdown-level1",
.description = "Top-down Microarchitecture Analysis Level 1",
.source = "intel-core",
.events[0] = INTEL_CORE_E_TOPDOWN_SLOTS,
.events[1] = INTEL_CORE_E_TOPDOWN_L1_RETIRING_METRIC,
.events[2] = INTEL_CORE_E_TOPDOWN_L1_BAD_SPEC_METRIC,
.events[3] = INTEL_CORE_E_TOPDOWN_L1_FE_BOUND_METRIC,
.events[4] = INTEL_CORE_E_TOPDOWN_L1_BE_BOUND_METRIC,
.n_events = 5,
.preserve_samples = 0x1F,
.cpu_supports = topdown_lvl1_cpu_supports,
.n_cpu_supports = ARRAY_LEN (topdown_lvl1_cpu_supports),
.format_fn = format_topdown_lvl1,
.column_headers = PERFMON_STRINGS ("% NS", "% ST", "% NS.RT", "% NS.BS",
"% ST.FE", "% ST.BE"),
.footer = "Not Stalled (NS),STalled (ST),\n"
" Retiring (RT), Bad Speculation (BS),\n"
" FrontEnd bound (FE), BackEnd bound (BE)",
};
/* Convert the TopDown enum to the perf reading index */
#define TO_LVL2_PERF_IDX(e) \
({ \
@@ -245,8 +185,8 @@ static perfmon_cpu_supports_t topdown_lvl2_cpu_supports[] = {
};
PERFMON_REGISTER_BUNDLE (topdown_lvl2_metric) = {
.name = "topdown-level2",
.description = "Top-down Microarchitecture Analysis Level 2",
.name = "topdown",
.description = "Top-down Microarchitecture Analysis Level 1 & 2",
.source = "intel-core",
.events[0] = INTEL_CORE_E_TOPDOWN_SLOTS,
.events[1] = INTEL_CORE_E_TOPDOWN_L1_RETIRING_METRIC,

@@ -146,7 +146,10 @@
_ (0x83, 0x04, 0, 0, 0, 0x00, ICACHE_64B, IFTAG_STALL, \
"Cycles where a code fetch is stalled due to L1 instruction cache tag " \
"miss.") \
_ (0x9C, 0x01, 0, 0, 0, 0x00, IDQ_UOPS_NOT_DELIVERED, CORE, \
_ (0x83, 0x02, 0, 0, 0, 0x00, ICACHE_64B, IFTAG_MISS, \
"Instruction fetch tag lookups that miss in the instruction cache " \
"(L1I). Counts at 64-byte cache-line granularity.") \
_ (0x9C, 0x01, 0, 0, 0, 0x05, IDQ_UOPS_NOT_DELIVERED, CORE, \
"Uops not delivered to Resource Allocation Table (RAT) per thread when " \
"backend of the machine is not stalled") \
_ (0xA1, 0x01, 0, 0, 0, 0x00, UOPS_DISPATCHED, PORT_0, \