From 007184bcf2121296fa244871382670b0f06210c0 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 11 May 2022 14:52:49 +0100 Subject: [PATCH] Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case: ``` diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index ad268ae7057..6aa1a56056e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + request.pipeline->metalrt_hair = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR); + request.pipeline->metalrt_hair_thick = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + request.pipeline->metalrt_pointcloud = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); { thread_scoped_lock lock(cache_mutex); @@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); + bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); MetalKernelPipeline *best_pipeline = nullptr; for (auto &pipeline : collection) { ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D14923 --- intern/cycles/device/metal/device_impl.h | 6 +- intern/cycles/device/metal/device_impl.mm | 96 +-- intern/cycles/device/metal/kernel.h | 108 +-- intern/cycles/device/metal/kernel.mm | 926 ++++++++++++--------- intern/cycles/device/metal/queue.mm | 31 +- intern/cycles/kernel/device/metal/compat.h | 24 +- 6 files changed, 600 insertions(+), 591 deletions(-) diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 27c58ce6d2f..7506b9b069f 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -28,7 +28,8 @@ class MetalDevice : public Device { id mtlGeneralCommandQueue = nil; id mtlAncillaryArgEncoder = nil; /* encoder used for fetching device pointers from MTLBuffers */ - string source_used_for_compile[PSO_NUM]; + string source[PSO_NUM]; + string source_md5[PSO_NUM]; KernelParamsMetal launch_params = {0}; @@ -72,7 +73,6 @@ class MetalDevice : public Device { id texture_bindings_3d = nil; std::vector> texture_slot_map; - MetalDeviceKernels kernels; bool use_metalrt = false; bool use_function_specialisation = false; @@ -110,6 +110,8 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + id compile(string const &source); + /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index c01f51fb506..e1438a9d6e2 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - NSError *error = NULL; + source[PSO_GENERIC] = get_source(kernel_features); + mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - for (int i = 0; i < PSO_NUM; i++) { - if (mtlLibrary[i]) { - [mtlLibrary[i] release]; - mtlLibrary[i] = nil; - } - } + MD5Hash md5; + md5.append(source[PSO_GENERIC]); + source_md5[PSO_GENERIC] = md5.get_hex(); + metal_printf("Front-end compilation finished (generic)\n"); + + bool result = MetalDeviceKernels::load(this, false); + + reserve_local_memory(kernel_features); + + return result; +} + +id MetalDevice::compile(string const &source) +{ MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; } - else { - return false; - } - string metalsrc; - - /* local helper: dump source to disk and return filepath */ - auto dump_source = [&](int kernel_type) -> string { - string &source = source_used_for_compile[kernel_type]; - string metalsrc = path_cache_get(path_join("kernels", - string_printf("%s.%s.metal", - kernel_type_as_string(kernel_type), - util_md5_string(source).c_str()))); - path_write_text(metalsrc, source); - return metalsrc; - }; - - /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor, - * then compile it into a MTLLibrary */ - auto fetch_and_compile_source = [&](int kernel_type) { - /* Record the source used to compile this library, for hash building later. */ - string &source = source_used_for_compile[kernel_type]; - - switch (kernel_type) { - case PSO_GENERIC: { - source = get_source(kernel_features); - break; - } - case PSO_SPECIALISED: { - /* PSO_SPECIALISED derives from PSO_GENERIC */ - string &generic_source = source_used_for_compile[PSO_GENERIC]; - if (generic_source.empty()) { - generic_source = get_source(kernel_features); - } - source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source; - break; - } - default: - assert(0); - } - - /* create MTLLibrary (front-end compilation) */ - mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str()) + NSError *error = NULL; + id mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) options:options error:&error]; - bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr); - - if (!mtlLibrary[kernel_type] || do_source_dump) { - string metalsrc = dump_source(kernel_type); - - if (!mtlLibrary[kernel_type]) { - NSString *err = [error localizedDescription]; - set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); - - return false; - } - } - return true; - }; - - fetch_and_compile_source(PSO_GENERIC); - - if (use_function_specialisation) { - fetch_and_compile_source(PSO_SPECIALISED); + if (!mtlLibrary) { + NSString *err = [error localizedDescription]; + set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); } - metal_printf("Front-end compilation finished\n"); - - bool result = kernels.load(this, PSO_GENERIC); - [options release]; - reserve_local_memory(kernel_features); - return result; + return mtlLibrary; } void MetalDevice::reserve_local_memory(const uint kernel_features) diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index b12491d820d..69b2a686ecc 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -54,103 +54,41 @@ enum { const char *kernel_type_as_string(int kernel_type); struct MetalKernelPipeline { - void release() - { - if (pipeline) { - [pipeline release]; - pipeline = nil; - if (@available(macOS 11.0, *)) { - for (int i = 0; i < METALRT_TABLE_NUM; i++) { - if (intersection_func_table[i]) { - [intersection_func_table[i] release]; - intersection_func_table[i] = nil; - } - } - } - } - if (function) { - [function release]; - function = nil; - } - if (@available(macOS 11.0, *)) { - for (int i = 0; i < METALRT_TABLE_NUM; i++) { - if (intersection_func_table[i]) { - [intersection_func_table[i] release]; - } - } - } - } + void compile(); + + id mtlLibrary = nil; + bool scene_specialized; + string source_md5; + + bool use_metalrt; + bool metalrt_hair; + bool metalrt_hair_thick; + bool metalrt_pointcloud; + + int threads_per_threadgroup; + + DeviceKernel device_kernel; bool loaded = false; + id mtlDevice = nil; id function = nil; id pipeline = nil; + int num_threads_per_block = 0; + + string error_str; API_AVAILABLE(macos(11.0)) id intersection_func_table[METALRT_TABLE_NUM] = {nil}; -}; - -struct MetalKernelLoadDesc { - int pso_index = 0; - const char *function_name = nullptr; - int kernel_index = 0; - int threads_per_threadgroup = 0; - MTLFunctionConstantValues *constant_values = nullptr; - NSArray *linked_functions = nullptr; - - struct IntersectorFunctions { - NSArray *defaults; - NSArray *shadow; - NSArray *local; - NSArray *operator[](int index) const - { - if (index == METALRT_TABLE_DEFAULT) - return defaults; - if (index == METALRT_TABLE_SHADOW) - return shadow; - return local; - } - } intersector_functions = {nullptr}; -}; - -/* Metal kernel and associate occupancy information. */ -class MetalDeviceKernel { - public: - ~MetalDeviceKernel(); - - bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5); - - void mark_loaded(int pso_index) - { - pso[pso_index].loaded = true; - } - - int get_num_threads_per_block() const - { - return num_threads_per_block; - } - const MetalKernelPipeline &get_pso() const; - - double load_duration = 0.0; - - private: - MetalKernelPipeline pso[PSO_NUM]; - - int num_threads_per_block = 0; + id rt_intersection_function[METALRT_FUNC_NUM] = {nil}; }; /* Cache of Metal kernels for each DeviceKernel. */ -class MetalDeviceKernels { - public: - bool load(MetalDevice *device, int kernel_type); - bool available(DeviceKernel kernel) const; - const MetalDeviceKernel &get(DeviceKernel kernel) const; +namespace MetalDeviceKernels { - MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM]; +bool load(MetalDevice *device, bool scene_specialized); +const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel); - id rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}}; - - string loaded_md5[PSO_NUM]; -}; +} /* namespace MetalDeviceKernels */ CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 9555ca03c8e..fc9a8cecd75 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -9,6 +9,7 @@ # include "util/path.h" # include "util/tbb.h" # include "util/time.h" +# include "util/unique_ptr.h" CCL_NAMESPACE_BEGIN @@ -28,315 +29,272 @@ const char *kernel_type_as_string(int kernel_type) return ""; } -MetalDeviceKernel::~MetalDeviceKernel() +bool kernel_has_intersection(DeviceKernel device_kernel) { - for (int i = 0; i < PSO_NUM; i++) { - pso[i].release(); + return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || + device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE); +} + +struct ShaderCache { + ShaderCache(id _mtlDevice) : mtlDevice(_mtlDevice) + { + } + ~ShaderCache(); + + /* Get the fastest available pipeline for the specified kernel. */ + MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device); + + /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by + * device. */ + void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized); + + void wait_for_all(); + + private: + friend ShaderCache *get_shader_cache(id mtlDevice); + + void compile_thread_func(int thread_index); + + using PipelineCollection = std::vector>; + + struct PipelineRequest { + MetalKernelPipeline *pipeline = nullptr; + std::function completionHandler; + }; + + std::mutex cache_mutex; + + PipelineCollection pipelines[DEVICE_KERNEL_NUM]; + id mtlDevice; + + bool running = false; + std::condition_variable cond_var; + std::deque request_queue; + std::vector compile_threads; + std::atomic_int incomplete_requests = 0; +}; + +std::mutex g_shaderCacheMutex; +std::map, unique_ptr> g_shaderCache; + +ShaderCache *get_shader_cache(id mtlDevice) +{ + thread_scoped_lock lock(g_shaderCacheMutex); + auto it = g_shaderCache.find(mtlDevice); + if (it != g_shaderCache.end()) { + return it->second.get(); + } + + g_shaderCache[mtlDevice] = make_unique(mtlDevice); + return g_shaderCache[mtlDevice].get(); +} + +ShaderCache::~ShaderCache() +{ + metal_printf("ShaderCache shutting down with incomplete_requests = %d\n", + int(incomplete_requests)); + + running = false; + cond_var.notify_all(); + for (auto &thread : compile_threads) { + thread.join(); } } -bool MetalDeviceKernel::load(MetalDevice *device, - MetalKernelLoadDesc const &desc_in, - MD5Hash const &md5) +void ShaderCache::wait_for_all() { - __block MetalKernelLoadDesc const desc(desc_in); - if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - /* skip megakernel */ - return true; + while (incomplete_requests > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } +} - bool use_binary_archive = true; - if (device->device_vendor == METAL_GPU_APPLE) { - /* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */ - use_binary_archive = false; +void ShaderCache::compile_thread_func(int thread_index) +{ + while (1) { + + /* wait for / acquire next request */ + PipelineRequest request; + { + thread_scoped_lock lock(cache_mutex); + cond_var.wait(lock, [&] { return !running || !request_queue.empty(); }); + if (!running) { + break; + } + + if (!request_queue.empty()) { + request = request_queue.front(); + request_queue.pop_front(); + } + } + + /* service request */ + if (request.pipeline) { + request.pipeline->compile(); + incomplete_requests--; + } } +} - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { - use_binary_archive = (atoi(str) == 0); - } - - id archive = nil; - string metalbin_path; - if (use_binary_archive) { - NSProcessInfo *processInfo = [NSProcessInfo processInfo]; - string osVersion = [[processInfo operatingSystemVersionString] UTF8String]; - MD5Hash local_md5(md5); - local_md5.append(osVersion); - string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() + - to_string(desc.pso_index) + ".bin"; - metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); - path_create_directories(metalbin_path); - - if (path_exists(metalbin_path) && use_binary_archive) { - if (@available(macOS 11.0, *)) { - MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; - archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; - archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; - [archiveDesc release]; +void ShaderCache::load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + bool scene_specialized) +{ + { + /* create compiler threads on first run */ + thread_scoped_lock lock(cache_mutex); + if (compile_threads.empty()) { + running = true; + for (int i = 0; i < max_mtlcompiler_threads; i++) { + compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); } } } - NSString *entryPoint = [@(desc.function_name) copy]; + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + /* skip megakernel */ + return; + } + + if (scene_specialized) { + /* Only specialize kernels where it can make an impact. */ + if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + return; + } + } + + { + /* check whether the kernel has already been requested / cached */ + thread_scoped_lock lock(cache_mutex); + for (auto &pipeline : pipelines[device_kernel]) { + if (scene_specialized) { + if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { + /* we already requested a pipeline that is specialised for this kernel data */ + metal_printf("Specialized kernel already requested (%s)\n", + device_kernel_as_string(device_kernel)); + return; + } + } + else { + if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) { + /* we already requested a generic pipeline for this kernel */ + metal_printf("Generic kernel already requested (%s)\n", + device_kernel_as_string(device_kernel)); + return; + } + } + } + } + + incomplete_requests++; + + PipelineRequest request; + request.pipeline = new MetalKernelPipeline; + request.pipeline->scene_specialized = scene_specialized; + request.pipeline->mtlDevice = mtlDevice; + request.pipeline->source_md5 = + device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; + request.pipeline->mtlLibrary = + device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; + request.pipeline->device_kernel = device_kernel; + request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; + + /* metalrt options */ + request.pipeline->use_metalrt = device->use_metalrt; + request.pipeline->metalrt_hair = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR); + request.pipeline->metalrt_hair_thick = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + request.pipeline->metalrt_pointcloud = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + + { + thread_scoped_lock lock(cache_mutex); + pipelines[device_kernel].push_back(unique_ptr(request.pipeline)); + request_queue.push_back(request); + } + cond_var.notify_one(); +} + +MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device) +{ + thread_scoped_lock lock(cache_mutex); + auto &collection = pipelines[kernel]; + if (collection.empty()) { + return nullptr; + } + + /* metalrt options */ + bool use_metalrt = device->use_metalrt; + bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); + bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + + MetalKernelPipeline *best_pipeline = nullptr; + for (auto &pipeline : collection) { + if (!pipeline->loaded) { + /* still loading - ignore */ + continue; + } + + if (pipeline->use_metalrt != use_metalrt || pipeline->metalrt_hair != metalrt_hair || + pipeline->metalrt_hair_thick != metalrt_hair_thick || + pipeline->metalrt_pointcloud != metalrt_pointcloud) { + /* wrong combination of metalrt options */ + continue; + } + + if (pipeline->scene_specialized) { + if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { + best_pipeline = pipeline.get(); + } + } + else if (!best_pipeline) { + best_pipeline = pipeline.get(); + } + } + + return best_pipeline; +} + +void MetalKernelPipeline::compile() +{ + int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC; + + const std::string function_name = std::string("cycles_metal_") + + device_kernel_as_string(device_kernel); + + int threads_per_threadgroup = this->threads_per_threadgroup; + if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && + device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) { + /* Always use 512 for the sorting kernels */ + threads_per_threadgroup = 512; + } + + NSString *entryPoint = [@(function_name.c_str()) copy]; NSError *error = NULL; if (@available(macOS 11.0, *)) { MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; func_desc.name = entryPoint; - if (desc.constant_values) { - func_desc.constantValues = desc.constant_values; - } - pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index] - newFunctionWithDescriptor:func_desc - error:&error]; + function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error]; } + [entryPoint release]; - if (pso[desc.pso_index].function == nil) { + if (function == nil) { NSString *err = [error localizedDescription]; string errors = [err UTF8String]; - - device->set_error( - string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str())); - return false; + metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str()); + return; } - pso[desc.pso_index].function.label = [@(desc.function_name) copy]; + function.label = [entryPoint copy]; - __block MTLComputePipelineDescriptor *computePipelineStateDescriptor = - [[MTLComputePipelineDescriptor alloc] init]; - - computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable; - computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable; - computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable; - - if (@available(macos 10.14, *)) { - computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup; - } - computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; - - computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function; - if (@available(macOS 11.0, *)) { - /* Attach the additional functions to an MTLLinkedFunctions object */ - if (desc.linked_functions) { - computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init]; - computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions; - } - - computePipelineStateDescriptor.maxCallStackDepth = 1; - } - - /* Create a new Compute pipeline state object */ - MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; - - bool creating_new_archive = false; - if (@available(macOS 11.0, *)) { - if (use_binary_archive) { - if (!archive) { - MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; - archiveDesc.url = nil; - archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; - creating_new_archive = true; - - double starttime = time_dt(); - - if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor - error:&error]) { - NSString *errStr = [error localizedDescription]; - metal_printf("Failed to add PSO to archive:\n%s\n", - errStr ? [errStr UTF8String] : "nil"); - } - else { - double duration = time_dt() - starttime; - metal_printf("%2d | %-55s | %7.2fs\n", - desc.kernel_index, - device_kernel_as_string((DeviceKernel)desc.kernel_index), - duration); - - if (desc.pso_index == PSO_GENERIC) { - this->load_duration = duration; - } - } - } - computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil]; - pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss; - } - } - - double starttime = time_dt(); - - MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^( - id computePipelineState, - MTLComputePipelineReflection *reflection, - NSError *error) { - bool recreate_archive = false; - if (computePipelineState == nil && archive && !creating_new_archive) { - - assert(0); - - NSString *errStr = [error localizedDescription]; - metal_printf( - "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... " - "(error: %s)\n", - device_kernel_as_string((DeviceKernel)desc.kernel_index), - errStr ? [errStr UTF8String] : "nil"); - computePipelineState = [device->mtlDevice - newComputePipelineStateWithDescriptor:computePipelineStateDescriptor - options:MTLPipelineOptionNone - reflection:nullptr - error:&error]; - recreate_archive = true; - } - - double duration = time_dt() - starttime; - - if (computePipelineState == nil) { - NSString *errStr = [error localizedDescription]; - device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n", - device_kernel_as_string((DeviceKernel)desc.kernel_index)) + - (errStr ? [errStr UTF8String] : "nil")); - metal_printf("%2d | %-55s | %7.2fs | FAILED!\n", - desc.kernel_index, - device_kernel_as_string((DeviceKernel)desc.kernel_index), - duration); - return; - } - - pso[desc.pso_index].pipeline = computePipelineState; - num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, - computePipelineState.threadExecutionWidth); - num_threads_per_block = std::max(num_threads_per_block, - (int)computePipelineState.threadExecutionWidth); - - if (!use_binary_archive) { - metal_printf("%2d | %-55s | %7.2fs\n", - desc.kernel_index, - device_kernel_as_string((DeviceKernel)desc.kernel_index), - duration); - - if (desc.pso_index == PSO_GENERIC) { - this->load_duration = duration; - } - } - - if (@available(macOS 11.0, *)) { - if (creating_new_archive || recreate_archive) { - if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] - error:&error]) { - metal_printf("Failed to save binary archive, error:\n%s\n", - [[error localizedDescription] UTF8String]); - } - } - } - - [computePipelineStateDescriptor release]; - computePipelineStateDescriptor = nil; - - if (device->use_metalrt && desc.linked_functions) { - for (int table = 0; table < METALRT_TABLE_NUM; table++) { - if (@available(macOS 11.0, *)) { - MTLIntersectionFunctionTableDescriptor *ift_desc = - [[MTLIntersectionFunctionTableDescriptor alloc] init]; - ift_desc.functionCount = desc.intersector_functions[table].count; - - pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline - newIntersectionFunctionTableWithDescriptor:ift_desc]; - - /* Finally write the function handles into this pipeline's table */ - for (int i = 0; i < 2; i++) { - id handle = [pso[desc.pso_index].pipeline - functionHandleWithFunction:desc.intersector_functions[table][i]]; - [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i]; - } - } - } - } - - mark_loaded(desc.pso_index); - }; - - if (desc.pso_index == PSO_SPECIALISED) { - /* Asynchronous load */ - dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ - NSError *error; - id pipeline = [device->mtlDevice - newComputePipelineStateWithDescriptor:computePipelineStateDescriptor - options:pipelineOptions - reflection:nullptr - error:&error]; - completionHandler(pipeline, nullptr, error); - }); - } - else { - /* Block on load to ensure we continue with a valid kernel function */ - id pipeline = [device->mtlDevice - newComputePipelineStateWithDescriptor:computePipelineStateDescriptor - options:pipelineOptions - reflection:nullptr - error:&error]; - completionHandler(pipeline, nullptr, error); - } - - return true; -} - -const MetalKernelPipeline &MetalDeviceKernel::get_pso() const -{ - if (pso[PSO_SPECIALISED].loaded) { - return pso[PSO_SPECIALISED]; - } - - assert(pso[PSO_GENERIC].loaded); - return pso[PSO_GENERIC]; -} - -bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type) -{ - bool any_error = false; - - MD5Hash md5; - - /* Build the function constant table */ - MTLFunctionConstantValues *constant_values = nullptr; - if (kernel_type == PSO_SPECIALISED) { - constant_values = [MTLFunctionConstantValues new]; - -# define KERNEL_FILM(_type, name) \ - [constant_values setConstantValue:&data.film.name \ - type:get_MTLDataType_##_type() \ - atIndex:KernelData_film_##name]; \ - md5.append((uint8_t *)&data.film.name, sizeof(data.film.name)); - -# define KERNEL_BACKGROUND(_type, name) \ - [constant_values setConstantValue:&data.background.name \ - type:get_MTLDataType_##_type() \ - atIndex:KernelData_background_##name]; \ - md5.append((uint8_t *)&data.background.name, sizeof(data.background.name)); - -# define KERNEL_INTEGRATOR(_type, name) \ - [constant_values setConstantValue:&data.integrator.name \ - type:get_MTLDataType_##_type() \ - atIndex:KernelData_integrator_##name]; \ - md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name)); - -# define KERNEL_BVH(_type, name) \ - [constant_values setConstantValue:&data.bvh.name \ - type:get_MTLDataType_##_type() \ - atIndex:KernelData_bvh_##name]; \ - md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name)); - - /* METAL_WIP: populate constant_values based on KernelData */ - assert(0); - /* - const KernelData &data = device->launch_params.data; - # include "kernel/types/background.h" - # include "kernel/types/bvh.h" - # include "kernel/types/film.h" - # include "kernel/types/integrator.h" - */ - } - - if (device->use_metalrt) { + if (use_metalrt) { if (@available(macOS 11.0, *)) { /* create the id for each intersection function */ const char *function_names[] = { @@ -356,176 +314,316 @@ bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type) assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM); MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; - if (kernel_type == PSO_SPECIALISED) { - desc.constantValues = constant_values; - } for (int i = 0; i < METALRT_FUNC_NUM; i++) { const char *function_name = function_names[i]; desc.name = [@(function_name) copy]; NSError *error = NULL; - rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type] - newFunctionWithDescriptor:desc - error:&error]; + rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error]; - if (rt_intersection_funcs[kernel_type][i] == nil) { + if (rt_intersection_function[i] == nil) { NSString *err = [error localizedDescription]; string errors = [err UTF8String]; - device->set_error(string_printf( - "Error getting intersection function \"%s\": %s", function_name, errors.c_str())); - any_error = true; + error_str = string_printf( + "Error getting intersection function \"%s\": %s", function_name, errors.c_str()); break; } - rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy]; + rt_intersection_function[i].label = [@(function_name) copy]; } } } - md5.append(device->source_used_for_compile[kernel_type]); - string hash = md5.get_hex(); - if (loaded_md5[kernel_type] == hash) { - return true; - } + NSArray *table_functions[METALRT_TABLE_NUM] = {nil}; + NSArray *linked_functions = nil; - if (!any_error) { - NSArray *table_functions[METALRT_TABLE_NUM] = {nil}; - NSArray *function_list = nil; - - if (device->use_metalrt) { - id curve_intersect_default = nil; - id curve_intersect_shadow = nil; - id point_intersect_default = nil; - id point_intersect_shadow = nil; - if (device->kernel_features & KERNEL_FEATURE_HAIR) { - /* Add curve intersection programs. */ - if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) { - /* Slower programs for thick hair since that also slows down ribbons. - * Ideally this should not be needed. */ - curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL]; - curve_intersect_shadow = - rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW]; - } - else { - curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON]; - curve_intersect_shadow = - rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW]; - } + if (use_metalrt) { + id curve_intersect_default = nil; + id curve_intersect_shadow = nil; + id point_intersect_default = nil; + id point_intersect_shadow = nil; + if (metalrt_hair) { + /* Add curve intersection programs. */ + if (metalrt_hair_thick) { + /* Slower programs for thick hair since that also slows down ribbons. + * Ideally this should not be needed. */ + curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL]; + curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW]; } - if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) { - point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT]; - point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW]; + else { + curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON]; + curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW]; } - table_functions[METALRT_TABLE_DEFAULT] = [NSArray - arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI], - curve_intersect_default ? - curve_intersect_default : - rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX], - point_intersect_default ? - point_intersect_default : - rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX], - nil]; - table_functions[METALRT_TABLE_SHADOW] = [NSArray - arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI], - curve_intersect_shadow ? - curve_intersect_shadow : - rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX], - point_intersect_shadow ? - point_intersect_shadow : - rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX], - nil]; - table_functions[METALRT_TABLE_LOCAL] = [NSArray - arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI], - rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX], - rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX], - nil]; + } + if (metalrt_pointcloud) { + point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT]; + point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW]; + } + table_functions[METALRT_TABLE_DEFAULT] = [NSArray + arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI], + curve_intersect_default ? + curve_intersect_default : + rt_intersection_function[METALRT_FUNC_DEFAULT_BOX], + point_intersect_default ? + point_intersect_default : + rt_intersection_function[METALRT_FUNC_DEFAULT_BOX], + nil]; + table_functions[METALRT_TABLE_SHADOW] = [NSArray + arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI], + curve_intersect_shadow ? + curve_intersect_shadow : + rt_intersection_function[METALRT_FUNC_SHADOW_BOX], + point_intersect_shadow ? + point_intersect_shadow : + rt_intersection_function[METALRT_FUNC_SHADOW_BOX], + nil]; + table_functions[METALRT_TABLE_LOCAL] = [NSArray + arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI], + rt_intersection_function[METALRT_FUNC_LOCAL_BOX], + rt_intersection_function[METALRT_FUNC_LOCAL_BOX], + nil]; - NSMutableSet *unique_functions = [NSMutableSet - setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; - [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; - [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; + NSMutableSet *unique_functions = [NSMutableSet + setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; - function_list = [[NSArray arrayWithArray:[unique_functions allObjects]] + if (kernel_has_intersection(device_kernel)) { + linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]] sortedArrayUsingComparator:^NSComparisonResult(id f1, id f2) { return [f1.label compare:f2.label]; }]; + } + unique_functions = nil; + } - unique_functions = nil; + MTLComputePipelineDescriptor *computePipelineStateDescriptor = + [[MTLComputePipelineDescriptor alloc] init]; + + computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable; + computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable; + computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable; + + if (@available(macos 10.14, *)) { + computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup; + } + computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; + + computePipelineStateDescriptor.computeFunction = function; + + if (@available(macOS 11.0, *)) { + /* Attach the additional functions to an MTLLinkedFunctions object */ + if (linked_functions) { + computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init]; + computePipelineStateDescriptor.linkedFunctions.functions = linked_functions; + } + computePipelineStateDescriptor.maxCallStackDepth = 1; + if (use_metalrt) { + computePipelineStateDescriptor.maxCallStackDepth = 8; + } + } + + MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; + + bool use_binary_archive = true; + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + use_binary_archive = (atoi(str) == 0); + } + + id archive = nil; + string metalbin_path; + string metalbin_name; + if (use_binary_archive) { + NSProcessInfo *processInfo = [NSProcessInfo processInfo]; + string osVersion = [[processInfo operatingSystemVersionString] UTF8String]; + MD5Hash local_md5; + local_md5.append(source_md5); + local_md5.append(osVersion); + local_md5.append((uint8_t *)&this->threads_per_threadgroup, + sizeof(this->threads_per_threadgroup)); + + string options; + if (use_metalrt && kernel_has_intersection(device_kernel)) { + /* incorporate any MetalRT specialisations into the archive name */ + options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d", + metalrt_hair ? 1 : 0, + metalrt_hair_thick ? 1 : 0, + metalrt_pointcloud ? 1 : 0); } - metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n", - kernel_type_as_string(kernel_type)); + /* Replace non-alphanumerical characters with underscores. */ + string device_name = [mtlDevice.name UTF8String]; + for (char &c : device_name) { + if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) { + c = '_'; + } + } - tbb::task_arena local_arena(max_mtlcompiler_threads); - local_arena.execute([&]() { - parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) { - /* skip megakernel */ - if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; - } + metalbin_name = device_name; + metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel)); + metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type)); + metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin"); - /* Only specialize kernels where it can make an impact. */ - if (kernel_type == PSO_SPECIALISED) { - if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; - } - } + metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); + path_create_directories(metalbin_path); - MetalDeviceKernel &kernel = kernels_[i]; - - const std::string function_name = std::string("cycles_metal_") + - device_kernel_as_string((DeviceKernel)i); - int threads_per_threadgroup = device->max_threads_per_threadgroup; - if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) { - /* Always use 512 for the sorting kernels */ - threads_per_threadgroup = 512; - } - - NSArray *kernel_function_list = nil; - - if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || - i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || - i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || - i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { - kernel_function_list = function_list; - } - - MetalKernelLoadDesc desc; - desc.pso_index = kernel_type; - desc.kernel_index = i; - desc.linked_functions = kernel_function_list; - desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT]; - desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW]; - desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL]; - desc.constant_values = constant_values; - desc.threads_per_threadgroup = threads_per_threadgroup; - desc.function_name = function_name.c_str(); - - bool success = kernel.load(device, desc, md5); - - any_error |= !success; - }); - }); + if (path_exists(metalbin_path) && use_binary_archive) { + if (@available(macOS 11.0, *)) { + MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; + archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; + archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + [archiveDesc release]; + } + } } - bool loaded = !any_error; - if (loaded) { - loaded_md5[kernel_type] = hash; + __block bool creating_new_archive = false; + if (@available(macOS 11.0, *)) { + if (use_binary_archive) { + if (!archive) { + MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; + archiveDesc.url = nil; + archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + creating_new_archive = true; + } + computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil]; + pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss; + } + } + + double starttime = time_dt(); + + MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^( + id computePipelineState, + MTLComputePipelineReflection *reflection, + NSError *error) { + bool recreate_archive = false; + if (computePipelineState == nil && archive) { + NSString *errStr = [error localizedDescription]; + metal_printf( + "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... " + "(error: %s)\n", + device_kernel_as_string((DeviceKernel)device_kernel), + errStr ? [errStr UTF8String] : "nil"); + computePipelineState = [mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:MTLPipelineOptionNone + reflection:nullptr + error:&error]; + recreate_archive = true; + } + + double duration = time_dt() - starttime; + + if (computePipelineState == nil) { + NSString *errStr = [error localizedDescription]; + error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n", + device_kernel_as_string((DeviceKernel)device_kernel)); + error_str += (errStr ? [errStr UTF8String] : "nil"); + metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n", + kernel_type_as_string(pso_type), + device_kernel, + device_kernel_as_string((DeviceKernel)device_kernel), + duration); + return; + } + + int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, + computePipelineState.threadExecutionWidth); + num_threads_per_block = std::max(num_threads_per_block, + (int)computePipelineState.threadExecutionWidth); + this->pipeline = computePipelineState; + this->num_threads_per_block = num_threads_per_block; + + if (@available(macOS 11.0, *)) { + if (creating_new_archive || recreate_archive) { + if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] + error:&error]) { + metal_printf("Failed to save binary archive, error:\n%s\n", + [[error localizedDescription] UTF8String]); + } + } + } + }; + + /* Block on load to ensure we continue with a valid kernel function */ + if (creating_new_archive) { + starttime = time_dt(); + NSError *error; + if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor + error:&error]) { + NSString *errStr = [error localizedDescription]; + metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil"); + } + } + id pipeline = [mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:pipelineOptions + reflection:nullptr + error:&error]; + completionHandler(pipeline, nullptr, error); + + this->loaded = true; + [computePipelineStateDescriptor release]; + computePipelineStateDescriptor = nil; + + if (use_metalrt && linked_functions) { + for (int table = 0; table < METALRT_TABLE_NUM; table++) { + if (@available(macOS 11.0, *)) { + MTLIntersectionFunctionTableDescriptor *ift_desc = + [[MTLIntersectionFunctionTableDescriptor alloc] init]; + ift_desc.functionCount = table_functions[table].count; + intersection_func_table[table] = [this->pipeline + newIntersectionFunctionTableWithDescriptor:ift_desc]; + + /* Finally write the function handles into this pipeline's table */ + for (int i = 0; i < 2; i++) { + id handle = [pipeline + functionHandleWithFunction:table_functions[table][i]]; + [intersection_func_table[table] setFunction:handle atIndex:i]; + } + } + } + } + + double duration = time_dt() - starttime; + + if (!use_binary_archive) { + metal_printf("%16s | %2d | %-55s | %7.2fs\n", + kernel_type_as_string(pso_type), + int(device_kernel), + device_kernel_as_string(device_kernel), + duration); + } + else { + metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n", + kernel_type_as_string(pso_type), + device_kernel, + device_kernel_as_string((DeviceKernel)device_kernel), + duration, + creating_new_archive ? " new" : "load", + metalbin_name.c_str()); } - return loaded; } -const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const +bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized) { - return kernels_[(int)kernel]; + auto shader_cache = get_shader_cache(device->mtlDevice); + for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { + shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized); + } + + if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) { + shader_cache->wait_for_all(); + } + return true; } -bool MetalDeviceKernels::available(DeviceKernel kernel) const +const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device, + DeviceKernel kernel) { - return kernels_[(int)kernel].get_pso().function != nil; + return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 1686ab95ffa..ec10e091b25 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -108,9 +108,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " << work_size; - const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel); - const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso(); - id mtlComputeCommandEncoder = get_compute_encoder(kernel); /* Determine size requirement for argument buffer. */ @@ -212,6 +209,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } bytes_written = globals_offsets + sizeof(KernelParamsMetal); + const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(metal_device, + kernel); + if (!metal_kernel_pso) { + metal_device->set_error( + string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel))); + return false; + } + /* Encode ancillaries */ [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets]; [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d @@ -228,14 +233,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } for (int table = 0; table < METALRT_TABLE_NUM; table++) { - if (metal_kernel_pso.intersection_func_table[table]) { - [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer - offset:globals_offsets - atIndex:1]; + if (metal_kernel_pso->intersection_func_table[table]) { + [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer + offset:globals_offsets + atIndex:1]; [metal_device->mtlAncillaryArgEncoder - setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table] + setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table] atIndex:3 + table]; - [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table] + [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table] usage:MTLResourceUsageRead]; } else { @@ -281,10 +286,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } } - [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline]; + [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline]; /* Compute kernel launch parameters. */ - const int num_threads_per_block = metal_kernel.get_num_threads_per_block(); + const int num_threads_per_block = metal_kernel_pso->num_threads_per_block; int shared_mem_bytes = 0; @@ -314,7 +319,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, threadsPerThreadgroup:size_threads_per_threadgroup]; [mtlCommandBuffer addCompletedHandler:^(id command_buffer) { - NSString *kernel_name = metal_kernel_pso.function.label; + NSString *kernel_name = metal_kernel_pso->function.label; /* Enhanced command buffer errors are only available in 11.0+ */ if (@available(macos 11.0, *)) { @@ -547,6 +552,8 @@ id MetalDeviceQueue::get_compute_encoder(DeviceKernel computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial]; + [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))]; + /* declare usage of MTLBuffers etc */ prepare_resources(kernel); } diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 4e309f16c08..0ed52074a90 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,10 +29,26 @@ using namespace metal::raytracing; /* Qualifiers */ -#define ccl_device -#define ccl_device_inline ccl_device -#define ccl_device_forceinline ccl_device -#define ccl_device_noinline ccl_device __attribute__((noinline)) +#if defined(__KERNEL_METAL_APPLE__) + +/* Inline everything for Apple GPUs. + * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface + * at the cost of longer compile times (~4.5 minutes on M1 Max). */ + +# define ccl_device __attribute__((always_inline)) +# define ccl_device_inline __attribute__((always_inline)) +# define ccl_device_forceinline __attribute__((always_inline)) +# define ccl_device_noinline __attribute__((always_inline)) + +#else + +# define ccl_device +# define ccl_device_inline ccl_device +# define ccl_device_forceinline ccl_device +# define ccl_device_noinline ccl_device __attribute__((noinline)) + +#endif + #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global device