diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aa8bffe08f..acfab6ffc60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -501,12 +501,14 @@ endif() if(NOT APPLE) option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF) option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF) + option(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION "Switch target of oneAPI implementation from SYCL devices to Host Task (single thread on CPU). This option is only for debugging purposes." OFF) # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html # acm-g10 is the target for the first Intel Arc Alchemist GPUs. set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "acm-g10" CACHE STRING "oneAPI Intel GPU architectures to build binaries for") set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for") + mark_as_advanced(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION) mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES) mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS) endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index c7e95d44d9b..8ec15c6f304 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -163,6 +163,9 @@ if(WITH_CYCLES_DEVICE_METAL) endif() if(WITH_CYCLES_DEVICE_ONEAPI) + if(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION) + add_definitions(-DWITH_ONEAPI_SYCL_HOST_TASK) + endif() if(WITH_CYCLES_ONEAPI_BINARIES) set(cycles_kernel_oneapi_lib_suffix "_aot") else() diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index bf8de8b5a12..edffd9525b1 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -429,7 +429,12 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_ queue->get_device().get_info(); sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); (void)usm_type; - assert(usm_type == sycl::usm::alloc::device || +# ifndef WITH_ONEAPI_SYCL_HOST_TASK + const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device; +# else + const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host; +# endif + assert(usm_type == main_memory_type || (usm_type == sycl::usm::alloc::host && (allow_host || device_type == sycl::info::device_type::cpu)) || usm_type == sycl::usm::alloc::unknown); @@ -478,7 +483,11 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); +# ifndef WITH_ONEAPI_SYCL_HOST_TASK return sycl::malloc_device(memory_size, *queue); +# else + return sycl::malloc_host(memory_size, *queue); +# endif } void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr) @@ -736,7 +745,11 @@ char *OneapiDevice::device_capabilities() const std::vector &oneapi_devices = available_devices(); for (const sycl::device &device : oneapi_devices) { +# ifndef WITH_ONEAPI_SYCL_HOST_TASK const std::string &name = device.get_info(); +# else + const std::string &name = "SYCL Host Task (Debug)"; +# endif capabilities << std::string("\t") << name << "\n"; # define WRITE_ATTR(attribute_name, attribute_variable) \ @@ -813,7 +826,11 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p for (sycl::device &device : devices) { const std::string &platform_name = device.get_platform().get_info(); +# ifndef WITH_ONEAPI_SYCL_HOST_TASK std::string name = device.get_info(); +# else + std::string name = "SYCL Host Task (Debug)"; +# endif std::string id = "ONEAPI_" + platform_name + "_" + name; if (device.has(sycl::aspect::ext_intel_pci_address)) { id.append("_" + device.get_info()); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a7a6c0a6007..5ba1b683d6b 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -752,6 +752,10 @@ if(WITH_CYCLES_DEVICE_ONEAPI) ${SYCL_CPP_FLAGS} ) + if (WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION) + list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_TASK) + endif() + # Set defaults for spir64 and spir64_gen options if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") @@ -763,7 +767,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI) string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ") string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ") - if(WITH_CYCLES_ONEAPI_BINARIES) + # Host execution won't use GPU binaries, no need to compile them. + if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION) # AoT binaries aren't currently reused when calling sycl::build. list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD) # Iterate over all targest and their options diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 1d47211604b..c876c35465c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -30,6 +30,16 @@ void gpu_parallel_active_index_array_impl(const uint num_states, ccl_global int *ccl_restrict num_indices, IsActiveOp is_active_op) { +# ifdef WITH_ONEAPI_SYCL_HOST_TASK + int write_index = 0; + for (int state_index = 0; state_index < num_states; state_index++) { + if (is_active_op(state_index)) + indices[write_index++] = state_index; + } + *num_indices = write_index; + return; +# endif /* WITH_ONEAPI_SYCL_HOST_TASK */ + const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>(); const uint blocksize = item_id.get_local_range(0); diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index b83512180d7..0691c01b3b5 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -56,7 +56,8 @@ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) #define ccl_gpu_kernel_threads(block_num_threads) -#define ccl_gpu_kernel_signature(name, ...) \ +#ifndef WITH_ONEAPI_SYCL_HOST_TASK +# define ccl_gpu_kernel_signature(name, ...) \ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ size_t kernel_global_size, \ size_t kernel_local_size, \ @@ -67,9 +68,37 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ [=](sycl::nd_item<1> item) { -#define ccl_gpu_kernel_postfix \ +# define ccl_gpu_kernel_postfix \ }); \ } +#else +/* Additional anonymous lambda is required to handle all "return" statements in the kernel code */ +# define ccl_gpu_kernel_signature(name, ...) \ +void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) { \ + (kg); \ + (kernel_local_size); \ + cgh.host_task( \ + [=]() {\ + for (size_t gid = (size_t)0; gid < kernel_global_size; gid++) { \ + kg->nd_item_local_id_0 = 0; \ + kg->nd_item_local_range_0 = 1; \ + kg->nd_item_group_id_0 = gid; \ + kg->nd_item_group_range_0 = kernel_global_size; \ + kg->nd_item_global_id_0 = gid; \ + kg->nd_item_global_range_0 = kernel_global_size; \ + auto kernel = [=]() { + +# define ccl_gpu_kernel_postfix \ + }; \ + kernel(); \ + } \ + }); \ +} +#endif #define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x @@ -83,23 +112,40 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) /* GPU thread, block, grid size and index */ -#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) -#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) -#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) -#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) -#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) -#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) -#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) +#ifndef WITH_ONEAPI_SYCL_HOST_TASK +# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) /* GPU warp synchronization */ -#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() -#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) -#ifdef __SYCL_DEVICE_ONLY__ - #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +# define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() +# define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) +# ifdef __SYCL_DEVICE_ONLY__ +# define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +# else +# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +# endif #else - #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) +# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) +# define ccl_gpu_block_idx_x (kg->nd_item_group_id_0) +# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) +# define ccl_gpu_warp_size (1) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) +# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) + +# define ccl_gpu_syncthreads() +# define ccl_gpu_local_syncthreads() +# define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) #endif /* Debug defines */ diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h index 116620eb725..87932deb2f0 100644 --- a/intern/cycles/kernel/device/oneapi/globals.h +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -23,6 +23,15 @@ typedef struct KernelGlobalsGPU { #undef KERNEL_DATA_ARRAY IntegratorStateGPU *integrator_state; const KernelData *__data; + +#ifdef WITH_ONEAPI_SYCL_HOST_TASK + size_t nd_item_local_id_0; + size_t nd_item_local_range_0; + size_t nd_item_group_id_0; + size_t nd_item_group_range_0; + size_t nd_item_global_id_0; + size_t nd_item_global_range_0; +#endif } KernelGlobalsGPU; typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 525ae288f0c..56c1e7ca47c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -230,6 +230,12 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ global_size = groups_count * local_size; + +# ifdef WITH_ONEAPI_SYCL_HOST_TASK + /* Path array implementation is serial in case of SYCL Host Task execution. */ + global_size = 1; + local_size = 1; +# endif } /* Let the compiler throw an error if there are any kernels missing in this implementation. */