Cycles: some tweaks for Apple OpenCL with ATI cards, to get it working up to
the level of ambient occlusion rendering; shaders still fail.

Fixes found with much help from Jens and Dalai.
2011-12-20 17:36:56 +00:00 · 2011-12-20 17:36:56 +00:00 · 690de79580
commit 690de79580
parent 738fdc7b6f
5 changed files with 78 additions and 21 deletions
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@ -24,6 +24,7 @@

 #include "util_cuda.h"
 #include "util_debug.h"
+#include "util_foreach.h"
 #include "util_math.h"
 #include "util_opencl.h"
 #include "util_opengl.h"
@ -41,7 +42,31 @@ DeviceTask::DeviceTask(Type type_)
 {
 }

-void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
+void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size)
+{
+	int num;
+
+	if(type == DISPLACE) {
+		num = (displace_w + max_size - 1)/max_size;
+	}
+	else {
+		max_size = max(1, max_size/w);
+		num = (h + max_size - 1)/max_size;
+	}
+
+	split(tasks, num);
+}
+
+void DeviceTask::split(ThreadQueue<DeviceTask>& queue, int num)
+{
+	list<DeviceTask> tasks;
+	split(tasks, num);
+
+	foreach(DeviceTask& task, tasks)
+		queue.push(task);
+}
+
+void DeviceTask::split(list<DeviceTask>& tasks, int num)
 {
 	if(type == DISPLACE) {
 		num = min(displace_w, num);
@ -55,7 +80,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
 			task.displace_x = tx;
 			task.displace_w = tw;

-			tasks.push(task);
+			tasks.push_back(task);
 		}
 	}
 	else {
@ -70,7 +95,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
 			task.y = ty;
 			task.h = th;

-			tasks.push(task);
+			tasks.push_back(task);
 		}
 	}
 }
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -23,6 +23,7 @@

 #include "device_memory.h"

+#include "util_list.h"
 #include "util_string.h"
 #include "util_thread.h"
 #include "util_types.h"
@ -67,7 +68,10 @@ public:
 	int displace_x, displace_w;

 	DeviceTask(Type type = PATH_TRACE);
+
+	void split(list<DeviceTask>& tasks, int num);
 	void split(ThreadQueue<DeviceTask>& tasks, int num);
+	void split_max_size(list<DeviceTask>& tasks, int max_size);
 };

 /* Device */
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@ -25,6 +25,7 @@
 #include "device.h"
 #include "device_intern.h"

+#include "util_foreach.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
@ -52,6 +53,7 @@ public:
 	map<string, device_memory*> mem_map;
 	device_ptr null_mem;
 	bool device_initialized;
+	string platform_name;

 	const char *opencl_error_string(cl_int err)
 	{
@ -175,6 +177,10 @@ public:
 		if(opencl_error(ciErr))
 			return;

+		char name[256];
+		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+		platform_name = name;
+
 		cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
 		if(opencl_error(ciErr))
 			return;
@ -191,7 +197,7 @@ public:
 	{
 		char version[256];

-		int major, minor, req_major = 1, req_minor = 0;
+		int major, minor, req_major = 1, req_minor = 1;

 		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);

@ -277,14 +283,11 @@ public:
 	{
 		string build_options = " -cl-fast-relaxed-math ";
 		
-		/* Full Shading only on NVIDIA cards at the moment */
-		char vendor[256];
-
-		clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(vendor), &vendor, NULL);
-		string name = vendor;
-		
-		if(name == "NVIDIA CUDA")
-			build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ ";
+		/* full shading only on NVIDIA cards at the moment */
+		if(platform_name == "NVIDIA CUDA")
+			build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
+		if(platform_name == "Apple")
+			build_options += " -D__CL_NO_FLOAT3__ ";

 		return build_options;
 	}
@ -657,12 +660,24 @@ public:
 		opencl_assert(clFinish(cqCommandQueue));
 	}

-	void task_add(DeviceTask& task)
+	void task_add(DeviceTask& maintask)
 	{
-		if(task.type == DeviceTask::TONEMAP)
-			tonemap(task);
-		else if(task.type == DeviceTask::PATH_TRACE)
-			path_trace(task);
+		list<DeviceTask> tasks;
+
+		/* arbitrary limit to work around apple ATI opencl issue */
+		if(platform_name == "Apple")
+			maintask.split_max_size(tasks, 76800);
+		else
+			tasks.push_back(maintask);
+
+		DeviceTask task;
+
+		foreach(DeviceTask& task, tasks) {
+			if(task.type == DeviceTask::TONEMAP)
+				tonemap(task);
+			else if(task.type == DeviceTask::PATH_TRACE)
+				path_trace(task);
+		}
 	}

 	void task_wait()
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -143,7 +143,7 @@ endif()
 #set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
 #add_custom_command(
 #	OUTPUT ${KERNEL_PREPROCESSED}
-#	COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DWITH_OPENCL -o ${KERNEL_PREPROCESSED}
+#	COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
 #	DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@ -25,12 +25,21 @@
 /* no namespaces in opencl */
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
-#define WITH_OPENCL
+
+#ifdef __CL_NO_FLOAT3__
+#define float3 float4
+#endif
+
+#ifdef __CL_NOINLINE__
+#define __noinline __attribute__((noinline))
+#else
+#define __noinline
+#endif

 /* in opencl all functions are device functions, so leave this empty */
 #define __device
-#define __device_inline
-#define __device_noinline
+#define __device_inline __device
+#define __device_noinline  __device __noinline

 /* no assert in opencl */
 #define kernel_assert(cond)
@ -68,7 +77,11 @@ __device float kernel_tex_interp_(__global float *data, int width, float x)
 #endif

 #define make_float2(x, y) ((float2)(x, y))
+#ifdef __CL_NO_FLOAT3__
+#define make_float3(x, y, z) ((float4)(x, y, z, 0.0))
+#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
+#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))