CUDA: use streams and async kernel launches to avoid busy-waiting

This is my first stab at this, and it is based on this IRC conversation:

<mib2berlin> brecht: this is meaning as reminder only, I know you have other things to do > http://openvidia.sourceforge.net/index.php/Optimization_Notes#avoiding_busy_waits
<brecht> mib2berlin: thanks, bookmarked

Only tested on Ubuntu 14.04 / CUDA 5.0, but I'll do some more testing tomorrow.

I am also unsure about the placement and the lifetime of the stream and the event, but creating and destroying them seems to incur a non-trivial cost.

Reviewers: brecht

Reviewed By: brecht

CC: mib2berlin, dingto

Differential Revision: https://developer.blender.org/D262
This commit is contained in:
Martijn Berger 2014-01-28 18:40:08 +01:00
parent abf18033f3
commit 84f9587540

@ -41,6 +41,8 @@ public:
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
CUstream cuStream;
CUevent tileDone;
map<device_ptr, bool> tex_interp_map; map<device_ptr, bool> tex_interp_map;
int cuDevId; int cuDevId;
int cuDevArchitecture; int cuDevArchitecture;
@ -207,6 +209,9 @@ public:
if(cuda_error_(result, "cuCtxCreate")) if(cuda_error_(result, "cuCtxCreate"))
return; return;
cuda_assert(cuStreamCreate(&cuStream, 0))
cuda_assert(cuEventCreate(&tileDone, 0x1))
int major, minor; int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId); cuDeviceComputeCapability(&major, &minor, cuDevId);
cuDevArchitecture = major*100 + minor*10; cuDevArchitecture = major*100 + minor*10;
@ -223,6 +228,8 @@ public:
{ {
task_pool.stop(); task_pool.stop();
cuda_assert(cuEventDestroy(tileDone))
cuda_assert(cuStreamDestroy(cuStream))
cuda_assert(cuCtxDestroy(cuContext)) cuda_assert(cuCtxDestroy(cuContext))
} }
@ -645,9 +652,10 @@ public:
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream))
cuda_assert(cuCtxSynchronize()) cuda_assert(cuEventRecord(tileDone, cuStream ))
cuda_assert(cuEventSynchronize(tileDone))
cuda_pop_context(); cuda_pop_context();
} }