Code cleanup: style, require ; for cuda_assert, opencl_assert

This commit is contained in:
Campbell Barton 2014-05-04 03:49:56 +10:00
parent 95d885b3f4
commit 1618329b00
3 changed files with 137 additions and 127 deletions

@ -177,8 +177,14 @@ static void display_info(Progress& progress)
interactive = options.interactive? "On":"Off"; interactive = options.interactive? "On":"Off";
str = string_printf("%s Time: %.2f Latency: %.4f Sample: %d Average: %.4f Interactive: %s", str = string_printf(
status.c_str(), total_time, latency, sample, sample_time, interactive.c_str()); "%s"
" Time: %.2f"
" Latency: %.4f"
" Sample: %d"
" Average: %.4f"
" Interactive: %s",
status.c_str(), total_time, latency, sample, sample_time, interactive.c_str());
view_display_info(str.c_str()); view_display_info(str.c_str());

@ -139,7 +139,7 @@ public:
/*cuda_abort();*/ \ /*cuda_abort();*/ \
cuda_error_documentation(); \ cuda_error_documentation(); \
} \ } \
} } (void)0
bool cuda_error_(CUresult result, const string& stmt) bool cuda_error_(CUresult result, const string& stmt)
{ {
@ -166,7 +166,7 @@ public:
void cuda_push_context() void cuda_push_context()
{ {
cuda_assert(cuCtxSetCurrent(cuContext)) cuda_assert(cuCtxSetCurrent(cuContext));
} }
void cuda_pop_context() void cuda_pop_context()
@ -174,7 +174,7 @@ public:
cuda_assert(cuCtxSetCurrent(NULL)); cuda_assert(cuCtxSetCurrent(NULL));
} }
CUDADevice(DeviceInfo& info, Stats &stats, bool background_) CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_) : Device(info, stats, background_)
{ {
first_error = true; first_error = true;
@ -212,8 +212,8 @@ public:
if(cuda_error_(result, "cuCtxCreate")) if(cuda_error_(result, "cuCtxCreate"))
return; return;
cuda_assert(cuStreamCreate(&cuStream, 0)) cuda_assert(cuStreamCreate(&cuStream, 0));
cuda_assert(cuEventCreate(&tileDone, 0x1)) cuda_assert(cuEventCreate(&tileDone, 0x1));
int major, minor; int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId); cuDeviceComputeCapability(&major, &minor, cuDevId);
@ -231,9 +231,9 @@ public:
{ {
task_pool.stop(); task_pool.stop();
cuda_assert(cuEventDestroy(tileDone)) cuda_assert(cuEventDestroy(tileDone));
cuda_assert(cuStreamDestroy(cuStream)) cuda_assert(cuStreamDestroy(cuStream));
cuda_assert(cuCtxDestroy(cuContext)) cuda_assert(cuCtxDestroy(cuContext));
} }
bool support_device(bool experimental, bool branched) bool support_device(bool experimental, bool branched)
@ -376,7 +376,7 @@ public:
cuda_push_context(); cuda_push_context();
CUdeviceptr device_pointer; CUdeviceptr device_pointer;
size_t size = mem.memory_size(); size_t size = mem.memory_size();
cuda_assert(cuMemAlloc(&device_pointer, size)) cuda_assert(cuMemAlloc(&device_pointer, size));
mem.device_pointer = (device_ptr)device_pointer; mem.device_pointer = (device_ptr)device_pointer;
stats.mem_alloc(size); stats.mem_alloc(size);
cuda_pop_context(); cuda_pop_context();
@ -386,7 +386,7 @@ public:
{ {
cuda_push_context(); cuda_push_context();
if(mem.device_pointer) if(mem.device_pointer)
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())) cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
cuda_pop_context(); cuda_pop_context();
} }
@ -398,7 +398,7 @@ public:
cuda_push_context(); cuda_push_context();
if(mem.device_pointer) { if(mem.device_pointer) {
cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
(CUdeviceptr)((uchar*)mem.device_pointer + offset), size)) (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
} }
else { else {
memset((char*)mem.data_pointer + offset, 0, size); memset((char*)mem.data_pointer + offset, 0, size);
@ -412,7 +412,7 @@ public:
cuda_push_context(); cuda_push_context();
if(mem.device_pointer) if(mem.device_pointer)
cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())) cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
cuda_pop_context(); cuda_pop_context();
} }
@ -420,7 +420,7 @@ public:
{ {
if(mem.device_pointer) { if(mem.device_pointer) {
cuda_push_context(); cuda_push_context();
cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))) cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
cuda_pop_context(); cuda_pop_context();
mem.device_pointer = 0; mem.device_pointer = 0;
@ -435,9 +435,9 @@ public:
size_t bytes; size_t bytes;
cuda_push_context(); cuda_push_context();
cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)) cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
//assert(bytes == size); //assert(bytes == size);
cuda_assert(cuMemcpyHtoD(mem, host, size)) cuda_assert(cuMemcpyHtoD(mem, host, size));
cuda_pop_context(); cuda_pop_context();
} }
@ -464,7 +464,7 @@ public:
CUtexref texref = NULL; CUtexref texref = NULL;
cuda_push_context(); cuda_push_context();
cuda_assert(cuModuleGetTexRef(&texref, cuModule, name)) cuda_assert(cuModuleGetTexRef(&texref, cuModule, name));
if(!texref) { if(!texref) {
cuda_pop_context(); cuda_pop_context();
@ -480,7 +480,7 @@ public:
desc.Format = format; desc.Format = format;
desc.NumChannels = mem.data_elements; desc.NumChannels = mem.data_elements;
cuda_assert(cuArrayCreate(&handle, &desc)) cuda_assert(cuArrayCreate(&handle, &desc));
if(!handle) { if(!handle) {
cuda_pop_context(); cuda_pop_context();
@ -498,23 +498,23 @@ public:
param.WidthInBytes = param.srcPitch; param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height; param.Height = mem.data_height;
cuda_assert(cuMemcpy2D(&param)) cuda_assert(cuMemcpy2D(&param));
} }
else else
cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)) cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)) cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
if(interpolation == INTERPOLATION_CLOSEST) { if(interpolation == INTERPOLATION_CLOSEST) {
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)) cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
} }
else if (interpolation == INTERPOLATION_LINEAR){ else if (interpolation == INTERPOLATION_LINEAR){
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)) cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
} }
else {/* CUBIC and SMART are unsupported for CUDA */ else {/* CUBIC and SMART are unsupported for CUDA */
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)) cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
} }
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)) cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
mem.device_pointer = (device_ptr)handle; mem.device_pointer = (device_ptr)handle;
@ -528,20 +528,20 @@ public:
cuda_push_context(); cuda_push_context();
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)) cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)) cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)) cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
} }
if(periodic) { if(periodic) {
cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP)) cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP)) cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
} }
else { else {
cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP)) cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP)) cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
} }
cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)) cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
cuda_pop_context(); cuda_pop_context();
} }
@ -554,17 +554,17 @@ public:
CUdeviceptr cumem; CUdeviceptr cumem;
size_t cubytes; size_t cubytes;
cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name)) cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name));
if(cubytes == 8) { if(cubytes == 8) {
/* 64 bit device pointer */ /* 64 bit device pointer */
uint64_t ptr = mem.device_pointer; uint64_t ptr = mem.device_pointer;
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)) cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
} }
else { else {
/* 32 bit device pointer */ /* 32 bit device pointer */
uint32_t ptr = (uint32_t)mem.device_pointer; uint32_t ptr = (uint32_t)mem.device_pointer;
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)) cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
} }
cuda_pop_context(); cuda_pop_context();
@ -605,10 +605,12 @@ public:
CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
/* get kernel function */ /* get kernel function */
if(branched && support_device(true, branched)) if(branched && support_device(true, branched)) {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")) cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
else }
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")) else {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
}
if(have_error()) if(have_error())
return; return;
@ -616,43 +618,43 @@ public:
/* pass in parameters */ /* pass in parameters */
int offset = 0; int offset = 0;
cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer))) cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
offset += sizeof(d_buffer); offset += sizeof(d_buffer);
cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state))) cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
offset += sizeof(d_rng_state); offset += sizeof(d_rng_state);
offset = align_up(offset, __alignof(sample)); offset = align_up(offset, __alignof(sample));
cuda_assert(cuParamSeti(cuPathTrace, offset, sample)) cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
offset += sizeof(sample); offset += sizeof(sample);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
offset += sizeof(rtile.x); offset += sizeof(rtile.x);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
offset += sizeof(rtile.y); offset += sizeof(rtile.y);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
offset += sizeof(rtile.w); offset += sizeof(rtile.w);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
offset += sizeof(rtile.h); offset += sizeof(rtile.h);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
offset += sizeof(rtile.offset); offset += sizeof(rtile.offset);
cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride)) cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
offset += sizeof(rtile.stride); offset += sizeof(rtile.stride);
cuda_assert(cuParamSetSize(cuPathTrace, offset)) cuda_assert(cuParamSetSize(cuPathTrace, offset));
/* launch kernel */ /* launch kernel */
int threads_per_block; int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)) cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
/*int num_registers; /*int num_registers;
cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)) cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
printf("threads_per_block %d\n", threads_per_block); printf("threads_per_block %d\n", threads_per_block);
printf("num_registers %d\n", num_registers);*/ printf("num_registers %d\n", num_registers);*/
@ -662,16 +664,16 @@ public:
int xblocks = (rtile.w + xthreads - 1)/xthreads; int xblocks = (rtile.w + xthreads - 1)/xthreads;
int yblocks = (rtile.h + ythreads - 1)/ythreads; int yblocks = (rtile.h + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
if(info.display_device) { if(info.display_device) {
/* don't use async for device used for display, locks up UI too much */ /* don't use async for device used for display, locks up UI too much */
cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));
cuda_assert(cuCtxSynchronize()) cuda_assert(cuCtxSynchronize());
} }
else { else {
cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream)) cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream));
} }
cuda_pop_context(); cuda_pop_context();
@ -689,58 +691,60 @@ public:
CUdeviceptr d_buffer = cuda_device_ptr(buffer); CUdeviceptr d_buffer = cuda_device_ptr(buffer);
/* get kernel function */ /* get kernel function */
if(rgba_half) if(rgba_half) {
cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")) cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
else }
cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")) else {
cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
}
/* pass in parameters */ /* pass in parameters */
int offset = 0; int offset = 0;
cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba))) cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
offset += sizeof(d_rgba); offset += sizeof(d_rgba);
cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer))) cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
offset += sizeof(d_buffer); offset += sizeof(d_buffer);
float sample_scale = 1.0f/(task.sample + 1); float sample_scale = 1.0f/(task.sample + 1);
offset = align_up(offset, __alignof(sample_scale)); offset = align_up(offset, __alignof(sample_scale));
cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale)) cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale));
offset += sizeof(sample_scale); offset += sizeof(sample_scale);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
offset += sizeof(task.x); offset += sizeof(task.x);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
offset += sizeof(task.y); offset += sizeof(task.y);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
offset += sizeof(task.w); offset += sizeof(task.w);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
offset += sizeof(task.h); offset += sizeof(task.h);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
offset += sizeof(task.offset); offset += sizeof(task.offset);
cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride)) cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
offset += sizeof(task.stride); offset += sizeof(task.stride);
cuda_assert(cuParamSetSize(cuFilmConvert, offset)) cuda_assert(cuParamSetSize(cuFilmConvert, offset));
/* launch kernel */ /* launch kernel */
int threads_per_block; int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)) cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
int xthreads = (int)sqrt((float)threads_per_block); int xthreads = (int)sqrt((float)threads_per_block);
int ythreads = (int)sqrt((float)threads_per_block); int ythreads = (int)sqrt((float)threads_per_block);
int xblocks = (task.w + xthreads - 1)/xthreads; int xblocks = (task.w + xthreads - 1)/xthreads;
int yblocks = (task.h + ythreads - 1)/ythreads; int yblocks = (task.h + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1)) cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks)) cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));
unmap_pixels((rgba_byte)? rgba_byte: rgba_half); unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
@ -759,7 +763,7 @@ public:
CUdeviceptr d_output = cuda_device_ptr(task.shader_output); CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
/* get kernel function */ /* get kernel function */
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")) cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
/* do tasks in smaller chunks, so we can cancel it */ /* do tasks in smaller chunks, so we can cancel it */
const int shader_chunk_size = 65536; const int shader_chunk_size = 65536;
@ -773,35 +777,35 @@ public:
/* pass in parameters */ /* pass in parameters */
int offset = 0; int offset = 0;
cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input))) cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
offset += sizeof(d_input); offset += sizeof(d_input);
cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output))) cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
offset += sizeof(d_output); offset += sizeof(d_output);
int shader_eval_type = task.shader_eval_type; int shader_eval_type = task.shader_eval_type;
offset = align_up(offset, __alignof(shader_eval_type)); offset = align_up(offset, __alignof(shader_eval_type));
cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type)) cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
offset += sizeof(task.shader_eval_type); offset += sizeof(task.shader_eval_type);
cuda_assert(cuParamSeti(cuShader, offset, shader_x)) cuda_assert(cuParamSeti(cuShader, offset, shader_x));
offset += sizeof(shader_x); offset += sizeof(shader_x);
cuda_assert(cuParamSetSize(cuShader, offset)) cuda_assert(cuParamSetSize(cuShader, offset));
/* launch kernel */ /* launch kernel */
int threads_per_block; int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)) cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
int shader_w = min(shader_chunk_size, end - shader_x); int shader_w = min(shader_chunk_size, end - shader_x);
int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1)) cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
cuda_assert(cuLaunchGrid(cuShader, xblocks, 1)) cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
cuda_assert(cuCtxSynchronize()) cuda_assert(cuCtxSynchronize());
} }
cuda_pop_context(); cuda_pop_context();
@ -814,8 +818,8 @@ public:
CUdeviceptr buffer; CUdeviceptr buffer;
size_t bytes; size_t bytes;
cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)) cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)) cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
return buffer; return buffer;
} }
@ -828,7 +832,7 @@ public:
if(!background) { if(!background) {
PixelMem pmem = pixel_mem_map[mem]; PixelMem pmem = pixel_mem_map[mem];
cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)) cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
} }
} }
@ -917,7 +921,7 @@ public:
cuda_push_context(); cuda_push_context();
cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)) cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
glDeleteBuffers(1, &pmem.cuPBO); glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId); glDeleteTextures(1, &pmem.cuTexId);
@ -1038,8 +1042,8 @@ public:
if(!info.display_device && sample == sync_sample) { if(!info.display_device && sample == sync_sample) {
cuda_push_context(); cuda_push_context();
cuda_assert(cuEventRecord(tileDone, cuStream)) cuda_assert(cuEventRecord(tileDone, cuStream));
cuda_assert(cuEventSynchronize(tileDone)) cuda_assert(cuEventSynchronize(tileDone));
/* Do some time keeping to find out if we need to sync less */ /* Do some time keeping to find out if we need to sync less */
boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time()); boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
@ -1065,7 +1069,7 @@ public:
shader(*task); shader(*task);
cuda_push_context(); cuda_push_context();
cuda_assert(cuCtxSynchronize()) cuda_assert(cuCtxSynchronize());
cuda_pop_context(); cuda_pop_context();
} }
} }
@ -1086,7 +1090,7 @@ public:
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
cuda_push_context(); cuda_push_context();
cuda_assert(cuCtxSynchronize()) cuda_assert(cuCtxSynchronize());
cuda_pop_context(); cuda_pop_context();
} }
else { else {

@ -416,7 +416,7 @@ public:
error_msg = message; \ error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \ fprintf(stderr, "%s\n", message.c_str()); \
} \ } \
} } (void)0
void opencl_assert_err(cl_int err, const char* where) void opencl_assert_err(cl_int err, const char* where)
{ {
@ -846,7 +846,7 @@ public:
{ {
/* this is blocking */ /* this is blocking */
size_t size = mem.memory_size(); size_t size = mem.memory_size();
opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL)) opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL));
} }
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
@ -854,7 +854,7 @@ public:
size_t offset = elem*y*w; size_t offset = elem*y*w;
size_t size = elem*w*h; size_t size = elem*w*h;
opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL)) opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL));
} }
void mem_zero(device_memory& mem) void mem_zero(device_memory& mem)
@ -868,7 +868,7 @@ public:
void mem_free(device_memory& mem) void mem_free(device_memory& mem)
{ {
if(mem.device_pointer) { if(mem.device_pointer) {
opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))) opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
mem.device_pointer = 0; mem.device_pointer = 0;
stats.mem_free(mem.memory_size()); stats.mem_free(mem.memory_size());
@ -944,8 +944,8 @@ public:
size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)}; size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
/* run kernel */ /* run kernel */
opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)) opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
opencl_assert(clFlush(cqCommandQueue)) opencl_assert(clFlush(cqCommandQueue));
} }
void path_trace(RenderTile& rtile, int sample) void path_trace(RenderTile& rtile, int sample)
@ -965,21 +965,21 @@ public:
/* sample arguments */ /* sample arguments */
cl_uint narg = 0; cl_uint narg = 0;
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
#define KERNEL_TEX(type, ttype, name) \ #define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckPathTraceKernel, &narg, #name); set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
#include "kernel_textures.h" #include "kernel_textures.h"
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride)) opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
enqueue_kernel(ckPathTraceKernel, d_w, d_h); enqueue_kernel(ckPathTraceKernel, d_w, d_h);
} }
@ -1020,21 +1020,21 @@ public:
cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel; cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
#define KERNEL_TEX(type, ttype, name) \ #define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name); set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
#include "kernel_textures.h" #include "kernel_textures.h"
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride)) opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
@ -1054,17 +1054,17 @@ public:
/* sample arguments */ /* sample arguments */
cl_uint narg = 0; cl_uint narg = 0;
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data));
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input));
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output));
#define KERNEL_TEX(type, ttype, name) \ #define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckShaderKernel, &narg, #name); set_kernel_arg_mem(ckShaderKernel, &narg, #name);
#include "kernel_textures.h" #include "kernel_textures.h"
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)) opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
enqueue_kernel(ckShaderKernel, task.shader_w, 1); enqueue_kernel(ckShaderKernel, task.shader_w, 1);
} }