Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives

This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case: ``` diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index ad268ae7057..6aa1a56056e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + request.pipeline->metalrt_hair = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR); + request.pipeline->metalrt_hair_thick = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + request.pipeline->metalrt_pointcloud = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); { thread_scoped_lock lock(cache_mutex); @@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); + bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); MetalKernelPipeline *best_pipeline = nullptr; for (auto &pipeline : collection) { ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D14923
2022-05-11 14:52:49 +01:00 · 2022-05-11 14:52:49 +01:00 · 007184bcf2
commit 007184bcf2
parent 59cd616534
6 changed files with 600 additions and 591 deletions
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@ -28,7 +28,8 @@ class MetalDevice : public Device {
  id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
      nil; /* encoder used for fetching device pointers from MTLBuffers */
-  string source_used_for_compile[PSO_NUM];
+  string source[PSO_NUM];
+  string source_md5[PSO_NUM];

  KernelParamsMetal launch_params = {0};

@ -72,7 +73,6 @@ class MetalDevice : public Device {
  id<MTLBuffer> texture_bindings_3d = nil;
  std::vector<id<MTLTexture>> texture_slot_map;

-  MetalDeviceKernels kernels;
  bool use_metalrt = false;
  bool use_function_specialisation = false;

@ -110,6 +110,8 @@ class MetalDevice : public Device {

  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

+  id<MTLLibrary> compile(string const &source);
+
  /* ------------------------------------------------------------------ */
  /* low-level memory management */

--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

-  NSError *error = NULL;
+  source[PSO_GENERIC] = get_source(kernel_features);
+  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);

-  for (int i = 0; i < PSO_NUM; i++) {
-    if (mtlLibrary[i]) {
-      [mtlLibrary[i] release];
-      mtlLibrary[i] = nil;
-    }
-  }
+  MD5Hash md5;
+  md5.append(source[PSO_GENERIC]);
+  source_md5[PSO_GENERIC] = md5.get_hex();

+  metal_printf("Front-end compilation finished (generic)\n");
+
+  bool result = MetalDeviceKernels::load(this, false);
+
+  reserve_local_memory(kernel_features);
+
+  return result;
+}
+
+id<MTLLibrary> MetalDevice::compile(string const &source)
+{
  MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

  options.fastMathEnabled = YES;
  if (@available(macOS 12.0, *)) {
    options.languageVersion = MTLLanguageVersion2_4;
  }
-  else {
-    return false;
-  }

-  string metalsrc;
-
-  /* local helper: dump source to disk and return filepath */
-  auto dump_source = [&](int kernel_type) -> string {
-    string &source = source_used_for_compile[kernel_type];
-    string metalsrc = path_cache_get(path_join("kernels",
-                                               string_printf("%s.%s.metal",
-                                                             kernel_type_as_string(kernel_type),
-                                                             util_md5_string(source).c_str())));
-    path_write_text(metalsrc, source);
-    return metalsrc;
-  };
-
-  /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
-   * then compile it into a MTLLibrary */
-  auto fetch_and_compile_source = [&](int kernel_type) {
-    /* Record the source used to compile this library, for hash building later. */
-    string &source = source_used_for_compile[kernel_type];
-
-    switch (kernel_type) {
-      case PSO_GENERIC: {
-        source = get_source(kernel_features);
-        break;
-      }
-      case PSO_SPECIALISED: {
-        /* PSO_SPECIALISED derives from PSO_GENERIC */
-        string &generic_source = source_used_for_compile[PSO_GENERIC];
-        if (generic_source.empty()) {
-          generic_source = get_source(kernel_features);
-        }
-        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
-        break;
-      }
-      default:
-        assert(0);
-    }
-
-    /* create MTLLibrary (front-end compilation) */
-    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
+  NSError *error = NULL;
+  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                      options:options
                                                        error:&error];

-    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
-
-    if (!mtlLibrary[kernel_type] || do_source_dump) {
-      string metalsrc = dump_source(kernel_type);
-
-      if (!mtlLibrary[kernel_type]) {
-        NSString *err = [error localizedDescription];
-        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
-
-        return false;
-      }
-    }
-    return true;
-  };
-
-  fetch_and_compile_source(PSO_GENERIC);
-
-  if (use_function_specialisation) {
-    fetch_and_compile_source(PSO_SPECIALISED);
+  if (!mtlLibrary) {
+    NSString *err = [error localizedDescription];
+    set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
  }

-  metal_printf("Front-end compilation finished\n");
-
-  bool result = kernels.load(this, PSO_GENERIC);
-
  [options release];
-  reserve_local_memory(kernel_features);

-  return result;
+  return mtlLibrary;
 }

 void MetalDevice::reserve_local_memory(const uint kernel_features)
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@ -54,103 +54,41 @@ enum {
 const char *kernel_type_as_string(int kernel_type);

 struct MetalKernelPipeline {
-  void release()
-  {
-    if (pipeline) {
-      [pipeline release];
-      pipeline = nil;
-      if (@available(macOS 11.0, *)) {
-        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-          if (intersection_func_table[i]) {
-            [intersection_func_table[i] release];
-            intersection_func_table[i] = nil;
-          }
-        }
-      }
-    }
-    if (function) {
-      [function release];
-      function = nil;
-    }
-    if (@available(macOS 11.0, *)) {
-      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-        if (intersection_func_table[i]) {
-          [intersection_func_table[i] release];
-        }
-      }
-    }
-  }

+  void compile();
+
+  id<MTLLibrary> mtlLibrary = nil;
+  bool scene_specialized;
+  string source_md5;
+
+  bool use_metalrt;
+  bool metalrt_hair;
+  bool metalrt_hair_thick;
+  bool metalrt_pointcloud;
+
+  int threads_per_threadgroup;
+
+  DeviceKernel device_kernel;
  bool loaded = false;
+  id<MTLDevice> mtlDevice = nil;
  id<MTLFunction> function = nil;
  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
+
+  string error_str;

  API_AVAILABLE(macos(11.0))
  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
-};
-
-struct MetalKernelLoadDesc {
-  int pso_index = 0;
-  const char *function_name = nullptr;
-  int kernel_index = 0;
-  int threads_per_threadgroup = 0;
-  MTLFunctionConstantValues *constant_values = nullptr;
-  NSArray *linked_functions = nullptr;
-
-  struct IntersectorFunctions {
-    NSArray *defaults;
-    NSArray *shadow;
-    NSArray *local;
-    NSArray *operator[](int index) const
-    {
-      if (index == METALRT_TABLE_DEFAULT)
-        return defaults;
-      if (index == METALRT_TABLE_SHADOW)
-        return shadow;
-      return local;
-    }
-  } intersector_functions = {nullptr};
-};
-
-/* Metal kernel and associate occupancy information. */
-class MetalDeviceKernel {
- public:
-  ~MetalDeviceKernel();
-
-  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
-
-  void mark_loaded(int pso_index)
-  {
-    pso[pso_index].loaded = true;
-  }
-
-  int get_num_threads_per_block() const
-  {
-    return num_threads_per_block;
-  }
-  const MetalKernelPipeline &get_pso() const;
-
-  double load_duration = 0.0;
-
- private:
-  MetalKernelPipeline pso[PSO_NUM];
-
-  int num_threads_per_block = 0;
+  id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
 };

 /* Cache of Metal kernels for each DeviceKernel. */
-class MetalDeviceKernels {
- public:
-  bool load(MetalDevice *device, int kernel_type);
-  bool available(DeviceKernel kernel) const;
-  const MetalDeviceKernel &get(DeviceKernel kernel) const;
+namespace MetalDeviceKernels {

-  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+bool load(MetalDevice *device, bool scene_specialized);
+const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);

-  id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
-
-  string loaded_md5[PSO_NUM];
-};
+} /* namespace MetalDeviceKernels */

 CCL_NAMESPACE_END

--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@ -108,9 +108,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
          << work_size;

-  const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
-  const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
-
  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

  /* Determine size requirement for argument buffer. */
@ -212,6 +209,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  }
  bytes_written = globals_offsets + sizeof(KernelParamsMetal);

+  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(metal_device,
+                                                                                      kernel);
+  if (!metal_kernel_pso) {
+    metal_device->set_error(
+        string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
+    return false;
+  }
+
  /* Encode ancillaries */
  [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
  [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@ -228,14 +233,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      }

      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (metal_kernel_pso.intersection_func_table[table]) {
-          [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
-                                                              offset:globals_offsets
-                                                             atIndex:1];
+        if (metal_kernel_pso->intersection_func_table[table]) {
+          [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
+                                                               offset:globals_offsets
+                                                              atIndex:1];
          [metal_device->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
+              setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
                                   atIndex:3 + table];
-          [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
+          [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
                                          usage:MTLResourceUsageRead];
        }
        else {
@ -281,10 +286,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
    }
  }

-  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
+  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];

  /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
+  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;

  int shared_mem_bytes = 0;

@ -314,7 +319,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                           threadsPerThreadgroup:size_threads_per_threadgroup];

  [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
-    NSString *kernel_name = metal_kernel_pso.function.label;
+    NSString *kernel_name = metal_kernel_pso->function.label;

    /* Enhanced command buffer errors are only available in 11.0+ */
    if (@available(macos 11.0, *)) {
@ -547,6 +552,8 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];

+    [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
+
    /* declare usage of MTLBuffers etc */
    prepare_resources(kernel);
  }
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@ -29,10 +29,26 @@ using namespace metal::raytracing;

 /* Qualifiers */

-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device __attribute__((noinline))
+#if defined(__KERNEL_METAL_APPLE__)
+
+/* Inline everything for Apple GPUs.
+ * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
+ * at the cost of longer compile times (~4.5 minutes on M1 Max). */
+
+#  define ccl_device __attribute__((always_inline))
+#  define ccl_device_inline __attribute__((always_inline))
+#  define ccl_device_forceinline __attribute__((always_inline))
+#  define ccl_device_noinline __attribute__((always_inline))
+
+#else
+
+#  define ccl_device
+#  define ccl_device_inline ccl_device
+#  define ccl_device_forceinline ccl_device
+#  define ccl_device_noinline ccl_device __attribute__((noinline))
+
+#endif
+
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device