forked from bartvdbraak/blender
Fix T40379: world MIS causing too much CUDA memory usage.
The kernel for baking the world texture was the same as the one used for baking. Now that's separate which allows the kernel to reserve much less memory.
This commit is contained in:
parent
bc9e66f083
commit
69c7522b24
@ -746,7 +746,12 @@ public:
|
||||
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
|
||||
|
||||
/* get kernel function */
|
||||
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
|
||||
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
|
||||
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
|
||||
}
|
||||
else {
|
||||
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
|
||||
}
|
||||
|
||||
/* do tasks in smaller chunks, so we can cancel it */
|
||||
const int shader_chunk_size = 65536;
|
||||
|
@ -321,6 +321,7 @@ public:
|
||||
cl_kernel ckFilmConvertByteKernel;
|
||||
cl_kernel ckFilmConvertHalfFloatKernel;
|
||||
cl_kernel ckShaderKernel;
|
||||
cl_kernel ckBakeKernel;
|
||||
cl_int ciErr;
|
||||
|
||||
typedef map<string, device_vector<uchar>*> ConstMemMap;
|
||||
@ -443,6 +444,7 @@ public:
|
||||
ckFilmConvertByteKernel = NULL;
|
||||
ckFilmConvertHalfFloatKernel = NULL;
|
||||
ckShaderKernel = NULL;
|
||||
ckBakeKernel = NULL;
|
||||
null_mem = 0;
|
||||
device_initialized = false;
|
||||
|
||||
@ -791,6 +793,10 @@ public:
|
||||
if(opencl_error(ciErr))
|
||||
return false;
|
||||
|
||||
ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
|
||||
if(opencl_error(ciErr))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1054,19 +1060,26 @@ public:
|
||||
/* sample arguments */
|
||||
cl_uint narg = 0;
|
||||
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data));
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input));
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output));
|
||||
cl_kernel kernel;
|
||||
|
||||
if(task.shader_eval_type >= SHADER_EVAL_BAKE)
|
||||
kernel = ckBakeKernel;
|
||||
else
|
||||
kernel = ckShaderKernel;
|
||||
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
set_kernel_arg_mem(ckShaderKernel, &narg, #name);
|
||||
set_kernel_arg_mem(kernel, &narg, #name);
|
||||
#include "kernel_textures.h"
|
||||
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
|
||||
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
|
||||
opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
|
||||
|
||||
enqueue_kernel(ckShaderKernel, task.shader_w, 1);
|
||||
enqueue_kernel(kernel, task.shader_w, 1);
|
||||
}
|
||||
|
||||
void thread_run(DeviceTask *task)
|
||||
|
@ -131,3 +131,28 @@ __kernel void kernel_ocl_shader(
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_bake(
|
||||
ccl_constant KernelData *data,
|
||||
ccl_global uint4 *input,
|
||||
ccl_global float4 *output,
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
ccl_global type *name,
|
||||
#include "kernel_textures.h"
|
||||
|
||||
int type, int sx, int sw)
|
||||
{
|
||||
KernelGlobals kglobals, *kg = &kglobals;
|
||||
|
||||
kg->data = data;
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
kg->name = name;
|
||||
#include "kernel_textures.h"
|
||||
|
||||
int x = sx + get_global_id(0);
|
||||
|
||||
if(x < sx + sw)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x);
|
||||
}
|
||||
|
||||
|
@ -122,7 +122,10 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
|
||||
|
||||
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
|
||||
{
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
if(type >= SHADER_EVAL_BAKE)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
else
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -153,5 +153,14 @@ kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
|
||||
kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx)
|
||||
{
|
||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -69,7 +69,10 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
|
||||
|
||||
void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
|
||||
{
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
if(type >= SHADER_EVAL_BAKE)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
else
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -356,11 +356,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
|
||||
|
||||
ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
|
||||
{
|
||||
if(type >= SHADER_EVAL_BAKE) {
|
||||
kernel_bake_evaluate(kg, input, output, type, i);
|
||||
return;
|
||||
}
|
||||
|
||||
ShaderData sd;
|
||||
uint4 in = input[i];
|
||||
float3 out;
|
||||
|
@ -66,7 +66,10 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
|
||||
|
||||
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
|
||||
{
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
if(type >= SHADER_EVAL_BAKE)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
else
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -68,7 +68,10 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
|
||||
|
||||
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
|
||||
{
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
if(type >= SHADER_EVAL_BAKE)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
else
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -69,7 +69,10 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
|
||||
|
||||
void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
|
||||
{
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
if(type >= SHADER_EVAL_BAKE)
|
||||
kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
else
|
||||
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
Loading…
Reference in New Issue
Block a user