blender/intern/cycles/device/opencl/opencl.h

/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPENCL

#include "device.h"

#include "util_map.h"
#include "util_param.h"
#include "util_string.h"

#include "clew.h"

CCL_NAMESPACE_BEGIN

/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
#  undef clEnqueueNDRangeKernel
#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
	clFinish(a); \
	CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
	clFinish(a);

#  undef clEnqueueWriteBuffer
#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
	clFinish(a); \
	CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
	clFinish(a);

#  undef clEnqueueReadBuffer
#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
	clFinish(a); \
	CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
	clFinish(a);
#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */

#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))

struct OpenCLPlatformDevice {
	OpenCLPlatformDevice(cl_platform_id platform_id,
	                     const string& platform_name,
	                     cl_device_id device_id,
	                     cl_device_type device_type,
	                     const string& device_name,
	                     const string& hardware_id)
	  : platform_id(platform_id),
	    platform_name(platform_name),
	    device_id(device_id),
	    device_type(device_type),
	    device_name(device_name),
	    hardware_id(hardware_id) {}
	cl_platform_id platform_id;
	string platform_name;
	cl_device_id device_id;
	cl_device_type device_type;
	string device_name;
	string hardware_id;
};

/* Contains all static OpenCL helper functions. */
class OpenCLInfo
{
public:
	static cl_device_type device_type();
	static bool use_debug();
	static bool kernel_use_advanced_shading(const string& platform_name);
	static bool kernel_use_split(const string& platform_name,
	                             const cl_device_type device_type);
	static bool device_supported(const string& platform_name,
	                             const cl_device_id device_id);
	static bool platform_version_check(cl_platform_id platform,
	                                   string *error = NULL);
	static bool device_version_check(cl_device_id device,
	                                 string *error = NULL);
	static string get_hardware_id(string platform_name,
	                              cl_device_id device_id);
	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
	                               bool force_all = false);
	static bool use_single_program();

	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */

	/* Platform information. */
	static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
	static cl_uint get_num_platforms();

	static bool get_platforms(vector<cl_platform_id> *platform_ids,
	                          cl_int *error = NULL);
	static vector<cl_platform_id> get_platforms();

	static bool get_platform_name(cl_platform_id platform_id,
	                              string *platform_name);
	static string get_platform_name(cl_platform_id platform_id);

	static bool get_num_platform_devices(cl_platform_id platform_id,
	                                     cl_device_type device_type,
	                                     cl_uint *num_devices,
	                                     cl_int *error = NULL);
	static cl_uint get_num_platform_devices(cl_platform_id platform_id,
	                                        cl_device_type device_type);

	static bool get_platform_devices(cl_platform_id platform_id,
	                                 cl_device_type device_type,
	                                 vector<cl_device_id> *device_ids,
	                                 cl_int* error = NULL);
	static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
	                                                 cl_device_type device_type);

	/* Device information. */
	static bool get_device_name(cl_device_id device_id,
	                            string *device_name,
	                            cl_int* error = NULL);

	static string get_device_name(cl_device_id device_id);

	static bool get_device_type(cl_device_id device_id,
	                            cl_device_type *device_type,
	                            cl_int* error = NULL);
	static cl_device_type get_device_type(cl_device_id device_id);

	/* Get somewhat more readable device name.
	 * Main difference is AMD OpenCL here which only gives code name
	 * for the regular device name. This will give more sane device
	 * name using some extensions.
	 */
	static string get_readable_device_name(cl_device_id device_id);
};

/* Thread safe cache for contexts and programs.
 */
class OpenCLCache
{
	struct Slot
	{
		struct ProgramEntry
		{
			ProgramEntry();
			ProgramEntry(const ProgramEntry& rhs);
			~ProgramEntry();
			cl_program program;
			thread_mutex *mutex;
		};

		Slot();
		Slot(const Slot& rhs);
		~Slot();

		thread_mutex *context_mutex;
		cl_context context;
		typedef map<ustring, ProgramEntry> EntryMap;
		EntryMap programs;

	};

	/* key is combination of platform ID and device ID */
	typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;

	/* map of Slot objects */
	typedef map<PlatformDevicePair, Slot> CacheMap;
	CacheMap cache;

	/* MD5 hash of the kernel source. */
	string kernel_md5;

	thread_mutex cache_lock;
	thread_mutex kernel_md5_lock;

	/* lazy instantiate */
	static OpenCLCache& global_instance();

public:

	enum ProgramName {
		OCL_DEV_BASE_PROGRAM,
		OCL_DEV_MEGAKERNEL_PROGRAM,
	};

	/* Lookup context in the cache. If this returns NULL, slot_locker
	 * will be holding a lock for the cache. slot_locker should refer to a
	 * default constructed thread_scoped_lock. */
	static cl_context get_context(cl_platform_id platform,
	                              cl_device_id device,
	                              thread_scoped_lock& slot_locker);
	/* Same as above. */
	static cl_program get_program(cl_platform_id platform,
	                              cl_device_id device,
	                              ustring key,
	                              thread_scoped_lock& slot_locker);

	/* Store context in the cache. You MUST have tried to get the item before storing to it. */
	static void store_context(cl_platform_id platform,
	                          cl_device_id device,
	                          cl_context context,
	                          thread_scoped_lock& slot_locker);
	/* Same as above. */
	static void store_program(cl_platform_id platform,
	                          cl_device_id device,
	                          cl_program program,
	                          ustring key,
	                          thread_scoped_lock& slot_locker);

	static string get_kernel_md5();
};

#define opencl_assert(stmt) \
	{ \
		cl_int err = stmt; \
		\
		if(err != CL_SUCCESS) { \
			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
		} \
	} (void)0

class OpenCLDeviceBase : public Device
{
public:
	DedicatedTaskPool task_pool;
	cl_context cxContext;
	cl_command_queue cqCommandQueue;
	cl_platform_id cpPlatform;
	cl_device_id cdDevice;
	cl_int ciErr;

	class OpenCLProgram {
	public:
		OpenCLProgram() : loaded(false), device(NULL) {}
		OpenCLProgram(OpenCLDeviceBase *device,
		              string program_name,
		              string kernel_name,
		              string kernel_build_options,
		              bool use_stdout = true);
		~OpenCLProgram();

		void add_kernel(ustring name);
		void load();

		bool is_loaded()    { return loaded; }
		string get_log()    { return log; }
		void report_error();

		cl_kernel operator()();
		cl_kernel operator()(ustring name);

		void release();

	private:
		bool build_kernel(const string *debug_src);
		bool compile_kernel(const string *debug_src);
		bool load_binary(const string& clbin, const string *debug_src = NULL);
		bool save_binary(const string& clbin);

		void add_log(string msg, bool is_debug);
		void add_error(string msg);

		bool loaded;
		cl_program program;
		OpenCLDeviceBase *device;

		/* Used for the OpenCLCache key. */
		string program_name;

		string kernel_file, kernel_build_options, device_md5;

		bool use_stdout;
		string log, error_msg;
		string compile_output;

		map<ustring, cl_kernel> kernels;
	};

	OpenCLProgram base_program;

	typedef map<string, device_vector<uchar>*> ConstMemMap;
	typedef map<string, device_ptr> MemMap;

	ConstMemMap const_mem_map;
	MemMap mem_map;
	device_ptr null_mem;

	bool device_initialized;
	string platform_name;
	string device_name;

	bool opencl_error(cl_int err);
	void opencl_error(const string& message);
	void opencl_assert_err(cl_int err, const char* where);

	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_);
	~OpenCLDeviceBase();

	static void CL_CALLBACK context_notify_callback(const char *err_info,
		const void * /*private_info*/, size_t /*cb*/, void *user_data);

	bool opencl_version_check();

	string device_md5_hash(string kernel_custom_build_options = "");
	bool load_kernels(const DeviceRequestedFeatures& requested_features);

	/* Has to be implemented by the real device classes.
	 * The base device will then load all these programs. */
	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
	                          vector<OpenCLProgram*> &programs) = 0;

	void mem_alloc(const char *name, device_memory& mem, MemoryType type);
	void mem_copy_to(device_memory& mem);
	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
	void mem_zero(device_memory& mem);
	void mem_free(device_memory& mem);
	void const_copy_to(const char *name, void *host, size_t size);
	void tex_alloc(const char *name,
	               device_memory& mem,
	               InterpolationType /*interpolation*/,
	               ExtensionType /*extension*/);
	void tex_free(device_memory& mem);

	size_t global_size_round_up(int group_size, int global_size);
	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);

	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
	void shader(DeviceTask& task);

	class OpenCLDeviceTask : public DeviceTask {
	public:
		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&OpenCLDeviceBase::thread_run,
			                    device,
			                    this);
		}
	};

	int get_split_task_count(DeviceTask& /*task*/)
	{
		return 1;
	}

	void task_add(DeviceTask& task)
	{
		task_pool.push(new OpenCLDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}

	virtual void thread_run(DeviceTask * /*task*/) = 0;

protected:
	string kernel_build_options(const string *debug_src = NULL);

	class ArgumentWrapper {
	public:
		ArgumentWrapper() : size(0), pointer(NULL)
		{
		}

		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
		                                           pointer((void*)(&argument.device_pointer))
		{
		}

		template<typename T>
		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
		                                              pointer((void*)(&argument.device_pointer))
		{
		}

		template<typename T>
		ArgumentWrapper(T& argument) : size(sizeof(argument)),
		                               pointer(&argument)
		{
		}

		ArgumentWrapper(int argument) : size(sizeof(int)),
		                                int_value(argument),
		                                pointer(&int_value)
		{
		}

		ArgumentWrapper(float argument) : size(sizeof(float)),
		                                  float_value(argument),
		                                  pointer(&float_value)
		{
		}

		size_t size;
		int int_value;
		float float_value;
		void *pointer;
	};

	/* TODO(sergey): In the future we can use variadic templates, once
	 * C++0x is allowed. Should allow to clean this up a bit.
	 */
	int kernel_set_args(cl_kernel kernel,
	                    int start_argument_index,
	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg33 = ArgumentWrapper());

	void release_kernel_safe(cl_kernel kernel);
	void release_mem_object_safe(cl_mem mem);
	void release_program_safe(cl_program program);

	/* ** Those guys are for workign around some compiler-specific bugs ** */

	virtual cl_program load_cached_kernel(
	        ustring key,
	        thread_scoped_lock& cache_locker);

	virtual void store_cached_kernel(
	        cl_program program,
	        ustring key,
	        thread_scoped_lock& cache_locker);

	virtual string build_options_for_base_program(
	        const DeviceRequestedFeatures& /*requested_features*/);
};

Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background);

CCL_NAMESPACE_END

#endif