Cycles: Remove AMD and Intel GPU support from Metal backend

This is being done because, as new features were added to Cycles,
these GPUs experienced significant performance regressions and bugs,
all stemming from bugs in the Metal GPU driver/compiler. The only
reasonable workaround was to disable parts of the Cycles code on
these GPUs to avoid those driver/compiler bugs.
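
As a concrete example of such gating, the kernel headers carried
vendor-specific blocks like the condensed sketch below (based on the
kernel feature-defines hunk removed further down in this diff; shown
here for illustration only):

    #ifdef __KERNEL_METAL_AMD__
    /* Disabled to avoid Metal compiler crashes and performance
     * issues on AMD GPUs. */
    #  undef __LIGHT_TREE__
    #  undef __MNEE__
    #endif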

This resulted in more development time being spent maintaining these
platforms, while they still could not reach feature parity with the
other GPU backends.

It has been decided that this development time is better spent on
platforms that are still actively supported by their hardware and
software vendors, and so AMD and Intel GPU support is being removed
from the Metal backend for Cycles.
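
With this change, Metal device enumeration simply keeps Apple-named
GPUs and no longer consults a vendor enum. A minimal standalone
sketch of the new check (hypothetical helper name; the real code in
MetalInfo::get_usable_devices, shown in the diff below, reads the
name via [device.name UTF8String]):

    #include <cstring>

    /* Sketch only: a device is considered usable when its name
     * contains "Apple" and neither "Intel" nor "AMD". */
    static bool is_usable_metal_device(const char *device_name)
    {
      return strstr(device_name, "Apple") != nullptr &&
             strstr(device_name, "Intel") == nullptr &&
             strstr(device_name, "AMD") == nullptr;
    }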

Pull Request: https://projects.blender.org/blender/blender/pulls/123551
Author:     Alaska
Committer:  Sergey Sharybin
Date:       2024-06-26 17:16:20 +02:00
Commit:     c8340cf754 (parent: eb37bace96)

15 changed files with 116 additions and 316 deletions

@@ -1769,12 +1769,9 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.label(text=rpt_(" %s or newer") % driver_version, icon='BLANK1', translate=False)
col.label(text=rpt_(" - oneAPI Level-Zero Loader"), icon='BLANK1', translate=False)
elif device_type == 'METAL':
silicon_mac_version = "12.2"
amd_mac_version = "12.3"
col.label(text=rpt_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
mac_version = "12.2"
col.label(text=rpt_("Requires Apple Silicon with macOS %s or newer") % mac_version,
icon='BLANK1', translate=False)
col.label(text=rpt_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
translate=False)
return
for device in devices:
@@ -1816,21 +1813,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
if compute_device_type == 'METAL':
import platform
import re
is_navi_2 = False
for device in devices:
if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
is_navi_2 = True
break
# MetalRT only works on Apple Silicon and Navi2.
is_arm64 = platform.machine() == 'arm64'
if is_arm64 or (is_navi_2 and has_rt_api_support['METAL']):
# MetalRT only works on Apple Silicon.
if (platform.machine() == 'arm64'):
col = layout.column()
col.use_property_split = True
# Kernel specialization is only supported on Apple Silicon
if is_arm64:
col.prop(self, "kernel_optimization_level")
col.prop(self, "kernel_optimization_level")
if has_rt_api_support['METAL']:
col.prop(self, "metalrt")

@@ -388,7 +388,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.num = 0;
info.has_nanovdb = true;
info.has_light_tree = true;
info.has_mnee = true;
info.has_osl = true;
info.has_guiding = true;
@@ -438,7 +437,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
/* Accumulate device info. */
info.has_nanovdb &= device.has_nanovdb;
info.has_light_tree &= device.has_light_tree;
info.has_mnee &= device.has_mnee;
info.has_osl &= device.has_osl;
info.has_guiding &= device.has_guiding;

@@ -83,7 +83,6 @@ class DeviceInfo {
int num;
bool display_device; /* GPU is used as a display device. */
bool has_nanovdb; /* Support NanoVDB volumes. */
bool has_light_tree; /* Support light tree. */
bool has_mnee; /* Support MNEE. */
bool has_osl; /* Support Open Shading Language. */
bool has_guiding; /* Support path guiding. */
@@ -107,7 +106,6 @@ class DeviceInfo {
cpu_threads = 0;
display_device = false;
has_nanovdb = false;
has_light_tree = true;
has_mnee = true;
has_osl = false;
has_guiding = false;

@@ -158,7 +158,6 @@ void device_hip_info(vector<DeviceInfo> &devices)
info.num = num;
info.has_nanovdb = true;
info.has_light_tree = true;
info.has_mnee = true;
info.has_gpu_queue = true;

@@ -69,16 +69,13 @@ void device_metal_info(vector<DeviceInfo> &devices)
}
# endif
MetalGPUVendor vendor = MetalInfo::get_device_vendor(device);
info.has_nanovdb = vendor == METAL_GPU_APPLE;
info.has_light_tree = vendor != METAL_GPU_AMD;
info.has_nanovdb = true;
/* MNEE caused "Compute function exceeds available temporary registers" in macOS < 13 due to a
* bug in spill buffer allocation sizing. */
info.has_mnee = false;
if (@available(macos 13.0, *)) {
info.has_mnee = vendor != METAL_GPU_AMD;
info.has_mnee = true;
}
info.use_hardware_raytracing = false;
@@ -86,13 +83,11 @@ void device_metal_info(vector<DeviceInfo> &devices)
/* MetalRT now uses features exposed in Xcode versions corresponding to macOS 14+, so don't
* expose it in builds from older Xcode versions. */
# if defined(MAC_OS_VERSION_14_0)
if (vendor != METAL_GPU_INTEL) {
if (@available(macos 14.0, *)) {
info.use_hardware_raytracing = device.supportsRaytracing;
if (@available(macos 14.0, *)) {
info.use_hardware_raytracing = device.supportsRaytracing;
/* Use hardware raytracing for faster rendering on architectures that support it. */
info.use_metalrt_by_default = (MetalInfo::get_apple_gpu_architecture(device) >= APPLE_M3);
}
/* Use hardware raytracing for faster rendering on architectures that support it. */
info.use_metalrt_by_default = (MetalInfo::get_apple_gpu_architecture(device) >= APPLE_M3);
}
# endif

@@ -45,8 +45,6 @@ class MetalDevice : public Device {
nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
/*---------------------------------------------------*/
MetalGPUVendor device_vendor;
uint kernel_features;
bool using_nanovdb = false;
MTLResourceOptions default_storage_mode;

@@ -80,34 +80,19 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
auto usable_devices = MetalInfo::get_usable_devices();
assert(mtlDevId < usable_devices.size());
mtlDevice = usable_devices[mtlDevId];
device_vendor = MetalInfo::get_device_vendor(mtlDevice);
assert(device_vendor != METAL_GPU_UNKNOWN);
metal_printf("Creating new Cycles Metal device: %s\n", info.description.c_str());
/* determine default storage mode based on whether UMA is supported */
default_storage_mode = MTLResourceStorageModeManaged;
/* We only support Apple Silicon which hasUnifiedMemory support. But leave this check here
* just in case a future GPU comes out that doesn't. */
if ([mtlDevice hasUnifiedMemory]) {
default_storage_mode = MTLResourceStorageModeShared;
}
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL: {
max_threads_per_threadgroup = 64;
break;
}
case METAL_GPU_AMD: {
max_threads_per_threadgroup = 128;
break;
}
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
break;
}
}
max_threads_per_threadgroup = 512;
use_metalrt = info.use_hardware_raytracing;
if (auto metalrt = getenv("CYCLES_METALRT")) {
@@ -118,20 +103,18 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
if (device_vendor == METAL_GPU_APPLE) {
/* Set kernel_specialization_level based on user preferences. */
switch (info.kernel_optimization_level) {
case KERNEL_OPTIMIZATION_LEVEL_OFF:
kernel_specialization_level = PSO_GENERIC;
break;
default:
case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
case KERNEL_OPTIMIZATION_LEVEL_FULL:
kernel_specialization_level = PSO_SPECIALIZED_SHADE;
break;
}
/* Set kernel_specialization_level based on user preferences. */
switch (info.kernel_optimization_level) {
case KERNEL_OPTIMIZATION_LEVEL_OFF:
kernel_specialization_level = PSO_GENERIC;
break;
default:
case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
case KERNEL_OPTIMIZATION_LEVEL_FULL:
kernel_specialization_level = PSO_SPECIALIZED_SHADE;
break;
}
if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
@@ -351,41 +334,18 @@ string MetalDevice::preprocess_source(MetalPipelineType pso_type,
global_defines += "#define WITH_CYCLES_DEBUG\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
global_defines += "#define __KERNEL_METAL_AMD__\n";
/* The increased amount of BSDF code leads to a big performance regression
* on AMD. There is currently no workaround to fix this general. Instead
* disable Principled Hair and patch evaluation. */
if (kernel_features & KERNEL_FEATURE_NODE_PRINCIPLED_HAIR) {
global_defines += "#define WITH_PRINCIPLED_HAIR\n";
}
if (kernel_features & KERNEL_FEATURE_PATCH_EVALUATION) {
global_defines += "#define WITH_PATCH_EVAL\n";
}
break;
case METAL_GPU_APPLE:
global_defines += "#define __KERNEL_METAL_APPLE__\n";
if (@available(macos 14.0, *)) {
/* Use Program Scope Global Built-ins, when available. */
global_defines += "#define __METAL_GLOBAL_BUILTINS__\n";
}
# ifdef WITH_NANOVDB
/* Compiling in NanoVDB results in a marginal drop in render performance,
* so disable it for specialized PSOs when no textures are using it. */
if ((pso_type == PSO_GENERIC || using_nanovdb) && DebugFlags().metal.use_nanovdb) {
global_defines += "#define WITH_NANOVDB\n";
}
# endif
break;
global_defines += "#define __KERNEL_METAL_APPLE__\n";
if (@available(macos 14.0, *)) {
/* Use Program Scope Global Built-ins, when available. */
global_defines += "#define __METAL_GLOBAL_BUILTINS__\n";
}
# ifdef WITH_NANOVDB
/* Compiling in NanoVDB results in a marginal drop in render performance,
* so disable it for specialized PSOs when no textures are using it. */
if ((pso_type == PSO_GENERIC || using_nanovdb) && DebugFlags().metal.use_nanovdb) {
global_defines += "#define WITH_NANOVDB\n";
}
# endif
NSProcessInfo *processInfo = [NSProcessInfo processInfo];
NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
@@ -543,7 +503,6 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
id<MTLDevice> mtlDevice;
string source;
MetalGPUVendor device_vendor;
/* Safely gather any state required for the MSL->AIR compilation. */
{
@@ -566,7 +525,6 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
}
mtlDevice = instance->mtlDevice;
device_vendor = instance->device_vendor;
source = instance->source[pso_type];
}
@@ -575,14 +533,6 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
if (device_vendor == METAL_GPU_INTEL) {
[options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
}
}
# endif
options.fastMathEnabled = YES;
if (@available(macos 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
@@ -1158,8 +1108,7 @@ void MetalDevice::tex_alloc(device_texture &mem)
}
}
MTLStorageMode storage_mode = MTLStorageModeManaged;
/* Intel GPUs don't support MTLStorageModeShared for MTLTextures. */
if ([mtlDevice hasUnifiedMemory] && device_vendor != METAL_GPU_INTEL) {
if ([mtlDevice hasUnifiedMemory]) {
storage_mode = MTLStorageModeShared;
}

@@ -38,52 +38,50 @@ struct ShaderCache {
// TODO: Look into tuning for DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT and
// DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT.
if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
default:
case APPLE_M3:
/* Peak occupancy is achieved through Dynamic Caching on M3 GPUs. */
for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
occupancy_tuning[i] = {64, 64};
}
break;
case APPLE_M2_BIG:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
break;
case APPLE_M2:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
break;
case APPLE_M1:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
break;
}
switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
default:
case APPLE_M3:
/* Peak occupancy is achieved through Dynamic Caching on M3 GPUs. */
for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
occupancy_tuning[i] = {64, 64};
}
break;
case APPLE_M2_BIG:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
break;
case APPLE_M2:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
break;
case APPLE_M1:
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
break;
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
@@ -397,13 +395,6 @@ bool MetalKernelPipeline::should_use_binary_archive() const
return false;
}
}
else {
/* Workaround for issues using Binary Archives on non-Apple Silicon systems. */
MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
if (gpu_vendor != METAL_GPU_APPLE) {
return false;
}
}
if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
/* Binary linked functions aren't supported in binary archives. */

@@ -285,33 +285,24 @@ int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
return result;
}
result = 1048576;
if (metal_device_->device_vendor == METAL_GPU_APPLE) {
result *= 4;
result = 4194304;
/* Increasing the state count doesn't notably benefit M1-family systems. */
if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
size_t system_ram = system_physical_ram();
size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
/* Increasing the state count doesn't notably benefit M1-family systems. */
if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
size_t system_ram = system_physical_ram();
size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
/* Determine whether we can double the state count, and leave enough GPU-available memory
* (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
* keep dispatch sizes high and minimize work submission overheads. */
size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
size_t total_state_size = result * state_size;
if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
result *= 2;
metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
}
/* Determine whether we can double the state count, and leave enough GPU-available memory
* (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
* keep dispatch sizes high and minimize work submission overheads. */
size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
size_t total_state_size = result * state_size;
if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
result *= 2;
metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
}
}
else if (metal_device_->device_vendor == METAL_GPU_AMD) {
/* METAL_WIP */
/* TODO: compute automatically. */
/* TODO: must have at least num_threads_per_block. */
result *= 2;
}
return result;
}
@@ -323,7 +314,7 @@ int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
int MetalDeviceQueue::num_sort_partition_elements() const
{
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
return MetalInfo::optimal_sort_partition_elements();
}
bool MetalDeviceQueue::supports_local_atomic_sort() const

@@ -19,18 +19,7 @@
CCL_NAMESPACE_BEGIN
enum MetalGPUVendor {
METAL_GPU_UNKNOWN = 0,
METAL_GPU_APPLE = 1,
METAL_GPU_AMD = 2,
METAL_GPU_INTEL = 3,
};
enum AppleGPUArchitecture {
/* NOT_APPLE_GPU represents AMD/Intel GPUs. This should remained at the start of this enum to
* ensure that AMD/Intel GPUs don't accidentally get Apple Silicon only features enabled when
* using comparison operators. */
NOT_APPLE_GPU,
APPLE_M1,
APPLE_M2,
APPLE_M2_BIG,
@@ -44,9 +33,8 @@ enum AppleGPUArchitecture {
struct MetalInfo {
static vector<id<MTLDevice>> const &get_usable_devices();
static int get_apple_gpu_core_count(id<MTLDevice> device);
static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
static int optimal_sort_partition_elements(id<MTLDevice> device);
static int optimal_sort_partition_elements();
static string get_device_name(id<MTLDevice> device);
};

@@ -21,11 +21,11 @@ CCL_NAMESPACE_BEGIN
string MetalInfo::get_device_name(id<MTLDevice> device)
{
string device_name = [device.name UTF8String];
if (get_device_vendor(device) == METAL_GPU_APPLE) {
/* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */
int gpu_core_count = get_apple_gpu_core_count(device);
device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count);
}
/* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */
int gpu_core_count = get_apple_gpu_core_count(device);
device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count);
return device_name;
}
@@ -49,10 +49,6 @@ int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device)
AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
{
if (MetalInfo::get_device_vendor(device) != METAL_GPU_APPLE) {
return NOT_APPLE_GPU;
}
const char *device_name = [device.name UTF8String];
if (strstr(device_name, "M1")) {
return APPLE_M1;
@@ -66,28 +62,7 @@ AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
return APPLE_UNKNOWN;
}
MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
{
const char *device_name = [device.name UTF8String];
if (strstr(device_name, "Intel")) {
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
return METAL_GPU_APPLE;
}
return METAL_GPU_UNKNOWN;
}
int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
int MetalInfo::optimal_sort_partition_elements()
{
if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
return atoi(str);
@@ -96,10 +71,8 @@ int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
/* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
* sorting each partition by material. Partitioning into chunks of 65536 elements results in an
* overall render time speedup of up to 15%. */
if (get_device_vendor(device) == METAL_GPU_APPLE) {
return 65536;
}
return 0;
return 65536;
}
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
@@ -111,36 +84,20 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
MetalGPUVendor vendor = get_device_vendor(device);
bool usable = false;
if (@available(macos 12.2, *)) {
usable |= (vendor == METAL_GPU_APPLE);
}
if (@available(macos 12.3, *)) {
usable |= (vendor == METAL_GPU_AMD);
}
# if defined(MAC_OS_VERSION_13_0)
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
const char *device_name_char = [device.name UTF8String];
if (!(strstr(device_name_char, "Intel") || strstr(device_name_char, "AMD")) &&
strstr(device_name_char, "Apple"))
{
/* TODO: Implement a better way to identify device vendor instead of relying on name. */
usable = true;
}
}
# endif
if (usable) {
metal_printf("- %s\n", device_name.c_str());

@@ -40,49 +40,20 @@ class MetalKernelContext {
return 0;
}
#ifdef __KERNEL_METAL_INTEL__
template<typename TextureType, typename CoordsType>
inline __attribute__((__always_inline__))
auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array,
const uint tid, const uint sid,
CoordsType coords) const
{
switch(sid) {
default:
case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords);
case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords);
case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords);
case 3: return texture_array[tid].tex.sample(sampler(address::mirrored_repeat, filter::nearest), coords);
case 4: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords);
case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords);
case 6: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords);
case 7: return texture_array[tid].tex.sample(sampler(address::mirrored_repeat, filter::linear), coords);
}
}
#endif
// texture2d
template<>
inline __attribute__((__always_inline__))
float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
#else
return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y));
#endif
}
template<>
inline __attribute__((__always_inline__))
float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
#else
return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x;
#endif
}
// texture3d
@@ -91,22 +62,14 @@ class MetalKernelContext {
float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
const uint tid(tex);
const uint sid(tex >> 32);
#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
#else
return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z));
#endif
}
template<>
inline __attribute__((__always_inline__))
float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
const uint tid(tex);
const uint sid(tex >> 32);
#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
#else
return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x;
#endif
}
# include "kernel/device/gpu/image.h"

@@ -216,20 +216,6 @@ CCL_NAMESPACE_BEGIN
# undef __MNEE__
#endif
#if defined(__KERNEL_METAL_AMD__)
/* Disabled due to internal compiler perf issue and enable light tree on Metal/AMD. */
# undef __LIGHT_TREE__
/* Disabled due to compiler crash on Metal/AMD. */
# undef __MNEE__
/* Disable due to performance regression on Metal/AMD. */
# ifndef WITH_PRINCIPLED_HAIR
# undef __PRINCIPLED_HAIR__
# endif
# ifndef WITH_PATCH_EVAL
# undef __PATCH_EVAL__
# endif
#endif
/* Scene-based selective features compilation. */
#ifdef __KERNEL_FEATURES__

@@ -1119,7 +1119,7 @@ void LightManager::device_update_background(Device *device,
dscene->light_background_conditional_cdf.copy_to_device();
}
void LightManager::device_update_lights(Device *device, DeviceScene *dscene, Scene *scene)
void LightManager::device_update_lights(DeviceScene *dscene, Scene *scene)
{
/* Counts lights in the scene. */
size_t num_lights = 0;
@@ -1153,8 +1153,7 @@ void LightManager::device_update_lights(Device *device, DeviceScene *dscene, Sce
/* Update integrator settings. */
KernelIntegrator *kintegrator = &dscene->data.integrator;
kintegrator->use_light_tree = scene->integrator->get_use_light_tree() &&
device->info.has_light_tree;
kintegrator->use_light_tree = scene->integrator->get_use_light_tree();
kintegrator->num_lights = num_lights;
kintegrator->num_distant_lights = num_distant_lights;
kintegrator->num_background_lights = num_background_lights;
@@ -1429,7 +1428,7 @@ void LightManager::device_update(Device *device,
device_free(device, dscene, need_update_background);
device_update_lights(device, dscene, scene);
device_update_lights(dscene, scene);
if (progress.get_cancel()) {
return;
}

@@ -142,7 +142,7 @@ class LightManager {
*/
void test_enabled_lights(Scene *scene);
void device_update_lights(Device *device, DeviceScene *dscene, Scene *scene);
void device_update_lights(DeviceScene *dscene, Scene *scene);
void device_update_distribution(Device *device,
DeviceScene *dscene,
Scene *scene,