diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index b19f5e22769..48d1c18555a 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -746,7 +746,12 @@ public: CUdeviceptr d_output = cuda_device_ptr(task.shader_output); /* get kernel function */ - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + } /* do tasks in smaller chunks, so we can cancel it */ const int shader_chunk_size = 65536; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 694ec9db036..f841daba124 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -321,6 +321,7 @@ public: cl_kernel ckFilmConvertByteKernel; cl_kernel ckFilmConvertHalfFloatKernel; cl_kernel ckShaderKernel; + cl_kernel ckBakeKernel; cl_int ciErr; typedef map*> ConstMemMap; @@ -443,6 +444,7 @@ public: ckFilmConvertByteKernel = NULL; ckFilmConvertHalfFloatKernel = NULL; ckShaderKernel = NULL; + ckBakeKernel = NULL; null_mem = 0; device_initialized = false; @@ -791,6 +793,10 @@ public: if(opencl_error(ciErr)) return false; + ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr); + if(opencl_error(ciErr)) + return false; + return true; } @@ -1054,19 +1060,26 @@ public: /* sample arguments */ cl_uint narg = 0; - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output)); + cl_kernel kernel; + + if(task.shader_eval_type >= SHADER_EVAL_BAKE) + kernel = ckBakeKernel; + else + kernel = ckShaderKernel; + + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output)); #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckShaderKernel, &narg, #name); + set_kernel_arg_mem(kernel, &narg, #name); #include "kernel_textures.h" - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); - enqueue_kernel(ckShaderKernel, task.shader_w, 1); + enqueue_kernel(kernel, task.shader_w, 1); } void thread_run(DeviceTask *task) diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl index 1dc0793a7bc..d7d3438036e 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernel.cl @@ -131,3 +131,28 @@ __kernel void kernel_ocl_shader( kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x); } +__kernel void kernel_ocl_bake( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel_textures.h" + + int type, int sx, int sw) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel_textures.h" + + int x = sx + get_global_id(0); + + if(x < sx + sw) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x); +} + diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index a0b6b8e13d0..a535659b3b1 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -122,7 +122,10 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index b9b41f755ba..bb20819f6fc 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -153,5 +153,14 @@ kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x); } +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x); +} + #endif diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp index c572fcd3df6..7d354de16d2 100644 --- a/intern/cycles/kernel/kernel_avx.cpp +++ b/intern/cycles/kernel/kernel_avx.cpp @@ -69,7 +69,10 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 6e735517763..c3ae2b6a54e 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -356,11 +356,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i) { - if(type >= SHADER_EVAL_BAKE) { - kernel_bake_evaluate(kg, input, output, type, i); - return; - } - ShaderData sd; uint4 in = input[i]; float3 out; diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 455cac046a3..3b5faea2994 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -66,7 +66,10 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 29aca52890e..3b18b164ffd 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -68,7 +68,10 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp index 0ece67e6d2b..a3731d790f4 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernel_sse41.cpp @@ -69,7 +69,10 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } CCL_NAMESPACE_END