diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index f43ccffe461..6ebc359fdb3 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -24,6 +24,7 @@
 
 #include "util_cuda.h"
 #include "util_debug.h"
+#include "util_foreach.h"
 #include "util_math.h"
 #include "util_opencl.h"
 #include "util_opengl.h"
@@ -41,7 +42,39 @@ DeviceTask::DeviceTask(Type type_)
 {
 }
 
-void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
+/* Split the task into pieces of at most max_size elements each: displace
+ * elements for DISPLACE tasks, pixels (in whole rows of width w) otherwise. */
+void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size)
+{
+	int num;
+
+	/* guard against a zero or negative limit, which would divide by zero below */
+	max_size = max(1, max_size);
+
+	if(type == DISPLACE) {
+		num = (displace_w + max_size - 1)/max_size;
+	}
+	else {
+		/* convert the pixel limit into a number of rows; w may be 0 for an empty task */
+		max_size = max(1, max_size/max(1, w));
+		num = (h + max_size - 1)/max_size;
+	}
+
+	split(tasks, num);
+}
+
+/* Split into at most num tasks and push each of them onto the queue. */
+void DeviceTask::split(ThreadQueue<DeviceTask>& queue, int num)
+{
+	list<DeviceTask> tasks;
+	split(tasks, num);
+
+	foreach(DeviceTask& task, tasks)
+		queue.push(task);
+}
+
+/* Split into at most num tasks and append them to the list. */
+void DeviceTask::split(list<DeviceTask>& tasks, int num)
 {
 	if(type == DISPLACE) {
 		num = min(displace_w, num);
@@ -55,7 +88,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
 			task.displace_x = tx;
 			task.displace_w = tw;
 
-			tasks.push(task);
+			tasks.push_back(task);
 		}
 	}
 	else {
@@ -70,7 +103,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
 			task.y = ty;
 			task.h = th;
 
-			tasks.push(task);
+			tasks.push_back(task);
 		}
 	}
 }
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index be6a3f144ed..a6a81e7b326 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -23,6 +23,7 @@
 
 #include "device_memory.h"
 
+#include "util_list.h"
 #include "util_string.h"
 #include "util_thread.h"
 #include "util_types.h"
@@ -67,7 +68,10 @@ public:
 	int displace_x, displace_w;
 
 	DeviceTask(Type type = PATH_TRACE);
+
+	void split(list<DeviceTask>& tasks, int num);
 	void split(ThreadQueue<DeviceTask>& tasks, int num);
+	void split_max_size(list<DeviceTask>& tasks, int max_size);
 };
 
 /* Device */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 3a1d3032d6e..6014dd0fdb7 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -25,6 +25,7 @@
 #include "device.h"
 #include "device_intern.h"
 
+#include "util_foreach.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
@@ -52,6 +53,7 @@ public:
 	map mem_map;
 	device_ptr null_mem;
 	bool device_initialized;
+	string platform_name;
 
 	const char *opencl_error_string(cl_int err)
 	{
@@ -175,6 +177,12 @@ public:
 		if(opencl_error(ciErr))
 			return;
 
+		/* remember the platform name for platform specific workarounds,
+		 * it stays empty when the query fails */
+		char name[256];
+		if(clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL) == CL_SUCCESS)
+			platform_name = name;
+
 		cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
 		if(opencl_error(ciErr))
 			return;
@@ -191,7 +199,7 @@ public:
 	{
 		char version[256];
 
-		int major, minor, req_major = 1, req_minor = 0;
+		int major, minor, req_major = 1, req_minor = 1;
 
 		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
 
@@ -277,14 +285,12 @@ public:
 	{
 		string build_options = " -cl-fast-relaxed-math ";
 
-		/* Full Shading only on NVIDIA cards at the moment */
-		char vendor[256];
-
-		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(vendor), &vendor, NULL);
-		string name = vendor;
-
-		if(name == "NVIDIA CUDA")
-			build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ ";
+		/* full shading only on NVIDIA cards at the moment */
+		if(platform_name == "NVIDIA CUDA")
+			build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
+		/* on Apple the kernel is built with float4 in place of float3 */
+		if(platform_name == "Apple")
+			build_options += " -D__CL_NO_FLOAT3__ ";
 
 		return build_options;
 	}
@@ -657,12 +663,23 @@ public:
 		opencl_assert(clFinish(cqCommandQueue));
 	}
 
-	void task_add(DeviceTask& task)
+	/* Run a task, splitting it into smaller pieces first on platforms that need it. */
+	void task_add(DeviceTask& maintask)
 	{
-		if(task.type == DeviceTask::TONEMAP)
-			tonemap(task);
-		else if(task.type == DeviceTask::PATH_TRACE)
-			path_trace(task);
+		list<DeviceTask> tasks;
+
+		/* arbitrary limit to work around apple ATI opencl issue */
+		if(platform_name == "Apple")
+			maintask.split_max_size(tasks, 76800);
+		else
+			tasks.push_back(maintask);
+
+		foreach(DeviceTask& task, tasks) {
+			if(task.type == DeviceTask::TONEMAP)
+				tonemap(task);
+			else if(task.type == DeviceTask::PATH_TRACE)
+				path_trace(task);
+		}
 	}
 
 	void task_wait()
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index e17544bf7af..939a74660a1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -143,7 +143,7 @@ endif()
 #set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
 #add_custom_command(
 #	OUTPUT ${KERNEL_PREPROCESSED}
-#	COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DWITH_OPENCL -o ${KERNEL_PREPROCESSED}
+#	COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
 #	DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 5515966807b..9fbd8566ecd 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -25,12 +25,21 @@
 /* no namespaces in opencl */
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
-#define WITH_OPENCL
+
+#ifdef __CL_NO_FLOAT3__
+#define float3 float4
+#endif
+
+#ifdef __CL_NOINLINE__
+#define __noinline __attribute__((noinline))
+#else
+#define __noinline
+#endif
 
 /* in opencl all functions are device functions, so leave this empty */
 #define __device
-#define __device_inline
-#define __device_noinline
+#define __device_inline __device
+#define __device_noinline __device __noinline
 
 /* no assert in opencl */
 #define kernel_assert(cond)
@@ -68,7 +77,12 @@ __device float kernel_tex_interp_(__global float *data, int width, float x)
 #endif
 
 #define make_float2(x, y) ((float2)(x, y))
+#ifdef __CL_NO_FLOAT3__
+/* float3 is float4 in this configuration, the fourth component is set to zero */
+#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
+#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
+#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))