diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt index b6b7651ef..a4a5af272 100644 --- a/vtkm/cont/cuda/internal/CMakeLists.txt +++ b/vtkm/cont/cuda/internal/CMakeLists.txt @@ -43,4 +43,5 @@ target_sources(vtkm_cont PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu ${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu ${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu + ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu ) diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h index b0c780cb8..f70d82dcc 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h @@ -31,6 +31,8 @@ // Here are the actual implementation of the algorithms. #include +#include + #include namespace vtkm @@ -257,24 +259,24 @@ class DeviceTaskTypes { public: template - static vtkm::exec::internal::TaskSingular MakeTask( + static vtkm::exec::cuda::internal::TaskStrided1D MakeTask( WorkletType& worklet, InvocationType& invocation, vtkm::Id, vtkm::Id globalIndexOffset = 0) { - using Task = vtkm::exec::internal::TaskSingular; + using Task = vtkm::exec::cuda::internal::TaskStrided1D; return Task(worklet, invocation, globalIndexOffset); } template - static vtkm::exec::internal::TaskSingular MakeTask( + static vtkm::exec::cuda::internal::TaskStrided3D MakeTask( WorkletType& worklet, InvocationType& invocation, vtkm::Id3, vtkm::Id globalIndexOffset = 0) { - using Task = vtkm::exec::internal::TaskSingular; + using Task = vtkm::exec::cuda::internal::TaskStrided3D; return Task(worklet, invocation, globalIndexOffset); } }; diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu new file mode 100644 index 000000000..5344f480b --- /dev/null +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu @@ -0,0 +1,153 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================ + +#include + +#include +#include + +namespace vtkm +{ +namespace cont +{ +namespace cuda +{ +namespace internal +{ + +VTKM_CONT_EXPORT int getNumSMs(int dId) +{ + //check + static bool lookupBuilt = false; + static std::vector numSMs; + + if (!lookupBuilt) + { + //lock the mutex + static std::mutex built_mutex; + std::lock_guard lock(built_mutex); + + //iterate over all devices + int numberOfSMs = 0; + int count = 0; + VTKM_CUDA_CALL(cudaGetDeviceCount(&count)); + numSMs.reserve(static_cast(count)); + for (int deviceId = 0; deviceId < count; ++deviceId) + { //get the number of sm's per deviceId + VTKM_CUDA_CALL( + cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId)); + numSMs.push_back(numberOfSMs); + } + lookupBuilt = true; + } + return numSMs[dId]; +} + +// we use cuda pinned memory to reduce the amount of synchronization +// and mem copies between the host and device. +template <> +char* DeviceAdapterAlgorithmThrust::GetPinnedErrorArray( + vtkm::Id& arraySize, + char** hostPointer) +{ + const vtkm::Id ERROR_ARRAY_SIZE = 1024; + static bool errorArrayInit = false; + static char* hostPtr = nullptr; + static char* devicePtr = nullptr; + if (!errorArrayInit) + { + VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped)); + VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0)); + errorArrayInit = true; + } + //set the size of the array + arraySize = ERROR_ARRAY_SIZE; + + //specify the host pointer to the memory + *hostPointer = hostPtr; + (void)hostPointer; + return devicePtr; +} + +template <> +char* DeviceAdapterAlgorithmThrust::SetupErrorBuffer( + vtkm::exec::cuda::internal::TaskStrided& functor) +{ + //since the memory is pinned we can access it safely on the host + //without a memcpy + vtkm::Id errorArraySize = 0; + char* hostErrorPtr = nullptr; + char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr); + + //clear the first character which means that we don't contain an error + hostErrorPtr[0] = '\0'; + + vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize); + functor.SetErrorMessageBuffer(errorMessage); + + return hostErrorPtr; +} + +template <> +void DeviceAdapterAlgorithmThrust::GetGridsAndBlocks( + int& grids, + int& blocks, + vtkm::Id size) +{ + (void)size; + int deviceId; + VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda + grids = 32 * getNumSMs(deviceId); + blocks = 128; +} + +template <> +void DeviceAdapterAlgorithmThrust::GetGridsAndBlocks( + int& grids, + dim3& blocks, + const dim3& size) +{ + int deviceId; + VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda + grids = 32 * getNumSMs(deviceId); + + if (size.x == 0) + { //grids that have no x dimension + blocks.x = 1; + blocks.y = 16; + blocks.z = 8; + } + else if (size.x > 128) + { + blocks.x = 64; + blocks.y = 2; + blocks.z = 1; + } + else + { //for really small grids + blocks.x = 8; + blocks.y = 4; + blocks.z = 4; + } +} +} +} +} +} diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h index 4838bd2c3..64851eba9 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h @@ -21,6 +21,8 @@ #ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h #define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h +#include + #include #include #include @@ 
-34,15 +36,16 @@ #include #include +#include #include #include -#include #include -// #define ANALYZE_VTKM_SCHEDULER_1D -// #define ANALYZE_VTKM_SCHEDULER_3D -#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D) + +// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D +// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D +#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D) #include #endif @@ -62,7 +65,6 @@ VTKM_THIRDPARTY_PRE_INCLUDE #include VTKM_THIRDPARTY_POST_INCLUDE -#include namespace vtkm { @@ -82,73 +84,20 @@ namespace internal #pragma GCC diagnostic ignored "-Wunused-parameter" #endif -static __global__ void DetermineProperXGridSize(vtkm::UInt32 desired_size, - vtkm::UInt32* actual_size) +template +__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size) { - //used only to see if we can launch kernels with a x grid size that - //matches the max of the graphics card, or are we having to fall back - //to SM_2 grid sizes - if (blockIdx.x != 0) - { - return; - } -#if __CUDA_ARCH__ <= 200 - const vtkm::UInt32 maxXGridSizeForSM2 = 65535; - *actual_size = maxXGridSizeForSM2; -#else - *actual_size = desired_size; -#endif + //see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/ + //for why our inc is grid-stride + const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x; + const vtkm::Id inc = blockDim.x * gridDim.x; + task(start, size, inc); } -template -__global__ void Schedule1DIndexKernel(FunctorType functor, - vtkm::Id numberOfKernelsInvoked, - vtkm::Id length) -{ - //Note a cuda launch can only handle at most 2B iterations of a kernel - //because it holds all of the indexes inside UInt32, so for use to - //handle datasets larger than 2B, we need to execute multiple kernels - const vtkm::Id index = - numberOfKernelsInvoked + static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (index < length) - { - functor(index); - } -} - -template -__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size) -{ - const vtkm::Id3 index(blockIdx.x * blockDim.x + threadIdx.x, - blockIdx.y * blockDim.y + threadIdx.y, - blockIdx.z * blockDim.z + threadIdx.z); - if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z) - { - return; - } - functor(index); -} - -#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D) - -// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd -// for an unknown reason -template -__global__ void Schedule1DIndexKernel2(FunctorType functor, - vtkm::Id numberOfKernelsInvoked, - vtkm::Id length) -{ - vtkm::Id index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const vtkm::Id inc = static_cast(blockDim.x * gridDim.x); - for (; index < length; index += inc) - { - functor(index); - } -} - -template -__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size) +template +__global__ void TaskStrided3DLaunch(TaskType task, dim3 size) { + //This is the 3D version of executing in a grid-stride manner const dim3 start(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y, blockIdx.z * blockDim.z + threadIdx.z); @@ -158,17 +107,11 @@ __global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size) { for (uint j = start.y; j < size.y; j += inc.y) { - vtkm::Id3 index(start.x, j, k); - for (vtkm::Id i = start.x; i < size.x; i += inc.x) - { - index[0] = i; - functor(index); - } + task(start.x, size.x, inc.x, j, k); } } } -#endif template __global__ void 
SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op) @@ -180,16 +123,6 @@ __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_ #pragma GCC diagnostic pop #endif -inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d) -{ - gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1) - : (rangeMax.x / blockSize3d.x); - gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1) - : (rangeMax.y / blockSize3d.y); - gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1) - : (rangeMax.z / blockSize3d.z); -} - /// This class can be subclassed to implement the DeviceAdapterAlgorithm for a /// device that uses thrust as its implementation. The subclass should pass in /// the correct device adapter tag as the template parameter. @@ -1142,89 +1075,24 @@ public: ::thrust::equal_to(), binary_functor); } -// Because of some funny code conversions in nvcc, kernels for devices have to -// be public. -#ifndef VTKM_CUDA -private: -#endif // we use cuda pinned memory to reduce the amount of synchronization // and mem copies between the host and device. - VTKM_CONT - static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer) - { - const vtkm::Id ERROR_ARRAY_SIZE = 1024; - static bool errorArrayInit = false; - static char* hostPtr = nullptr; - static char* devicePtr = nullptr; - if (!errorArrayInit) - { - VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped)); - VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0)); - errorArrayInit = true; - } - //set the size of the array - arraySize = ERROR_ARRAY_SIZE; + VTKM_CONT_EXPORT + static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer); - //specify the host pointer to the memory - *hostPointer = hostPtr; - (void)hostPointer; - return devicePtr; - } + VTKM_CONT_EXPORT + static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor); - // we query cuda for the max blocks per grid for 1D scheduling - // and cache the values in static variables - VTKM_CONT - static vtkm::Vec GetMaxGridOfThreadBlocks() - { - static std::atomic gridQueryInit(false); - static vtkm::Vec maxGridSize; - // NOTE: The following code may still be executed by multiple threads - // but it should not cause any correctness issues. - if (!gridQueryInit) - { - int currDevice; - VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda + VTKM_CONT_EXPORT + static void GetGridsAndBlocks(int& grid, int& blocks, vtkm::Id size); - cudaDeviceProp properties; - VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice)); - maxGridSize[0] = static_cast(properties.maxGridSize[0]); - maxGridSize[1] = static_cast(properties.maxGridSize[1]); - maxGridSize[2] = static_cast(properties.maxGridSize[2]); - - //Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids - //in the X direction, it is dependent on the code being compiled for SM3+. - //If not, it falls back to SM_2 limitation of 65535 being the largest grid - //size. - //Now since SM architecture is only available inside kernels we have to - //invoke one to see what the actual limit is for our device. 
So that is - //what we are going to do next, and than we will store that result - - vtkm::UInt32* dev_actual_size; - VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32))); - DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size); - VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0], - dev_actual_size, - sizeof(vtkm::UInt32), - cudaMemcpyDeviceToHost, - cudaStreamPerThread)); - VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread)); - gridQueryInit = true; - VTKM_CUDA_CALL(cudaFree(dev_actual_size)); - } - return maxGridSize; - } + VTKM_CONT_EXPORT + static void GetGridsAndBlocks(int& grid, dim3& blocks, const dim3& size); public: - template - VTKM_CONT static void ScheduleTask(TaskType task, RangeType size) - { //for now defer to the schedule api, we need to do a significant - //amount of build infrastructure work to implement type erasure tasks - //for cuda - Schedule(task, size); - } - - template - VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances) + template + static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D& functor, + vtkm::Id numInstances) { VTKM_ASSERT(numInstances >= 0); if (numInstances < 1) @@ -1232,45 +1100,12 @@ public: // No instances means nothing to run. Just return. return; } + char* hostErrorPtr = SetupErrorBuffer(functor); - //since the memory is pinned we can access it safely on the host - //without a memcpy - vtkm::Id errorArraySize = 0; - char* hostErrorPtr = nullptr; - char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr); + int grids, blocks; + GetGridsAndBlocks(grids, blocks, numInstances); - //clear the first character which means that we don't contain an error - hostErrorPtr[0] = '\0'; - - vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize); - - functor.SetErrorMessageBuffer(errorMessage); - - const vtkm::Id blockSizeAsId = 128; - const vtkm::UInt32 blockSize = 128; - const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0]; - const vtkm::UInt32 totalBlocks = - static_cast((numInstances + blockSizeAsId - 1) / blockSizeAsId); - - //Note a cuda launch can only handle at most 2B iterations of a kernel - //because it holds all of the indexes inside UInt32, so for use to - //handle datasets larger than 2B, we need to execute multiple kernels - if (totalBlocks < maxblocksPerLaunch) - { - Schedule1DIndexKernel<<>>( - functor, vtkm::Id(0), numInstances); - } - else - { - const vtkm::Id numberOfKernelsToRun = - blockSizeAsId * static_cast(maxblocksPerLaunch); - for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances; - numberOfKernelsInvoked += numberOfKernelsToRun) - { - Schedule1DIndexKernel<<>>( - functor, numberOfKernelsInvoked, numInstances); - } - } + TaskStrided1DLaunch<<>>(functor, numInstances); //sync so that we can check the results of the call. 
//In the future I want move this before the schedule call, and throwing @@ -1284,13 +1119,14 @@ public: throw vtkm::cont::ErrorExecution(hostErrorPtr); } -#ifdef ANALYZE_VTKM_SCHEDULER_1D - compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize); +#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D + parameter_sweep_1d_schedule(functor, numInstances); #endif } - template - VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax) + template + static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D& functor, + vtkm::Id3 rangeMax) { VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0)); if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1)) @@ -1298,46 +1134,17 @@ public: // No instances means nothing to run. Just return. return; } - - //since the memory is pinned we can access it safely on the host - //without a memcpy - vtkm::Id errorArraySize = 0; - char* hostErrorPtr = nullptr; - char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr); - - //clear the first character which means that we don't contain an error - hostErrorPtr[0] = '\0'; - - vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize); - - functor.SetErrorMessageBuffer(errorMessage); + char* hostErrorPtr = SetupErrorBuffer(functor); const dim3 ranges(static_cast(rangeMax[0]), static_cast(rangeMax[1]), static_cast(rangeMax[2])); - //currently we presume that 3d scheduling access patterns prefer accessing - //memory in the X direction. Also should be good for thin in the Z axis - //algorithms. - dim3 blockSize3d(64, 2, 1); - //In general we need more information as this doesn't work well when - //executing on the points, and need to fetch all cells used, as the z - //width is not fat enough + int grids; + dim3 blocks; + GetGridsAndBlocks(grids, blocks, ranges); - //handle the simple use case of 'bad' datasets which are thin in X - //but larger in the other directions, allowing us decent performance with - //that use case. - if (rangeMax[0] <= 128 && (rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2])) - { - blockSize3d = dim3(16, 4, 4); - } - - - dim3 gridSize3d; - compute_block_size(ranges, blockSize3d, gridSize3d); - - Schedule3DIndexKernel<<>>(functor, - ranges); + TaskStrided3DLaunch<<>>(functor, ranges); //sync so that we can check the results of the call. 
//In the future I want move this before the schedule call, and throwing @@ -1351,23 +1158,25 @@ public: throw vtkm::cont::ErrorExecution(hostErrorPtr); } -#ifdef ANALYZE_VTKM_SCHEDULER_1D - compare_1d_dynamic_block_picker(functor, - rangeMax[0] * rangeMax[1] * rangeMax[2], - gridSize3d.x * gridSize3d.y * gridSize3d.z, - blockSize3d.x * blockSize3d.y * blockSize3d.z); -#endif - -#ifdef ANALYZE_VTKM_SCHEDULER_3D - //requires the errormessage buffer be set - compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d); -#endif - #ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D parameter_sweep_3d_schedule(functor, rangeMax); #endif } + template + VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances) + { + vtkm::exec::cuda::internal::TaskStrided1D kernel(functor); + ScheduleTask(kernel, numInstances); + } + + template + VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax) + { + vtkm::exec::cuda::internal::TaskStrided3D kernel(functor); + ScheduleTask(kernel, rangeMax); + } + template VTKM_CONT static void Sort(vtkm::cont::ArrayHandle& values) { diff --git a/vtkm/cont/cuda/internal/TaskTuner.h b/vtkm/cont/cuda/internal/TaskTuner.h index 134c9d514..29fdd3d74 100644 --- a/vtkm/cont/cuda/internal/TaskTuner.h +++ b/vtkm/cont/cuda/internal/TaskTuner.h @@ -41,293 +41,167 @@ namespace cuda namespace internal { -template -__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id); -template -__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id); -template -__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size); -template -__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size); +int getNumSMs(int dId); -void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d); +template +__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id); +template +__global__ void TaskStrided3DLaunch(TaskType task, dim3 size); - -template -__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size) +struct PerfRecord1d { - const vtkm::Id start = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const vtkm::Id inc = static_cast(blockDim.x * gridDim.x); - for (vtkm::Id i = start; i < size; i += inc) - { - task(i); - } -} - -class PerfRecord -{ -public: - PerfRecord(float elapsedT, dim3 block) + PerfRecord1d(float elapsedT, int g, int b) : elapsedTime(elapsedT) - , blockSize(block) + , grid(g) + , block(b) { } - bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; } + bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; } float elapsedTime; - dim3 blockSize; + int grid; + int block; }; -template -static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy) +inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record) { - int blockSize; // The launch configurator returned block size - int minGridSize; // The minimum grid size needed to achieve the - // maximum occupancy for a full device launch - int gridSize; // The actual grid size needed, based on number of SM's - int device; // device to run on - int numSMs; // number of SMs on the active device - - cudaGetDevice(&device); - cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device); - - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, Schedule1DIndexKernel2, 0, 0); - - - blockSize /= (numSMs * 2); - // Round up according to array size - // gridSize = (size + blockSize - 1) / blockSize; - 
gridSize = 32 * numSMs; - // std::cout << "numSMs: " << numSMs << std::endl; - - // calculate theoretical occupancy - int maxActiveBlocks; - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &maxActiveBlocks, Schedule1DIndexKernel2, blockSize, 0); - - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - - grids = gridSize; - blocks = blockSize; - occupancy = (maxActiveBlocks * blockSize / props.warpSize) / - (float)(props.maxThreadsPerMultiProcessor / props.warpSize); + os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block + << ">>> required: " << record.elapsedTime << "\n"; + return os; } -template -static void compare_1d_dynamic_block_picker(Functor functor, - vtkm::Id size, - const vtkm::Id& currentGridSize, - const vtkm::Id& currentBlockSize) + +struct PerfRecord3d { - const std::type_info& ti = typeid(functor); - std::cout << "fixed 1d block size performance " << ti.name() << std::endl; + PerfRecord3d(float elapsedT, int g, dim3 b) + : elapsedTime(elapsedT) + , grid(g) + , block(b) { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - Schedule1DIndexKernel<<>>( - functor, vtkm::Id(0), size); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - std::cout << "Schedule1DIndexKernel size: " << size << std::endl; - std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize - << " required: " << elapsedTimeMilliseconds << std::endl; } - std::cout << "dynamic 1d block size performance " << ti.name() << std::endl; - { + bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; } - int grids, blocks; - float occupancy; - BlockSizeGuesser(size, grids, blocks, occupancy); + float elapsedTime; + int grid; + dim3 block; +}; - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); +inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record) +{ - - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - Schedule1DIndexKernel2<<>>( - functor, vtkm::Id(0), size); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl; - std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks - << " required: " << elapsedTimeMilliseconds << std::endl; - } - std::cout << std::endl; + os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y + << "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n"; + return os; } -template -static void compare_3d_dynamic_block_picker(Functor functor, - vtkm::Id3 ranges, - const dim3& gridSize3d, - const dim3& blockSize3d) + +template +static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances) { - const std::type_info& ti = typeid(functor); - std::cout << "fixed 3d block size performance " << ti.name() << std::endl; + 
std::vector results; + constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; + constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128, + 256, 512, 1024, 2048, 4096, 8192 }; + + int deviceId; + VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda + + for (vtkm::UInt32 g = 0; g < 12; g++) { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); + int grids = gridIndexTable[g] * getNumSMs(deviceId); + for (vtkm::UInt32 b = 0; b < 12; b++) + { + int blocks = blockIndexTable[b]; - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - Schedule3DIndexKernel<<>>(functor, - ranges); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); + cudaEvent_t start, stop; + VTKM_CUDA_CALL(cudaEventCreate(&start)); + VTKM_CUDA_CALL(cudaEventCreate(&stop)); - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); + TaskStrided1DLaunch<<>>(task, numInstances); - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); + VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - // std::cout << "Schedule3DIndexKernel size: " << size << std::endl; - // std::cout << "GridSize of: " << currentGridSize - // << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl; + VTKM_CUDA_CALL(cudaEventSynchronize(stop)); + float elapsedTimeMilliseconds; + VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); + + VTKM_CUDA_CALL(cudaEventDestroy(start)); + VTKM_CUDA_CALL(cudaEventDestroy(stop)); + + results.emplace_back(elapsedTimeMilliseconds, grids, blocks); + } } - std::cout << "dynamic 3d block size performance " << ti.name() << std::endl; + std::sort(results.begin(), results.end()); + for (auto&& i : results) { - - // int grids, blocks; - // float occupancy; - // BlockSizeGuesser(size, grids, blocks, occupancy); - - // cudaEvent_t start, stop; - // VTKM_CUDA_CALL(cudaEventCreate(&start)); - // VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - - // VTKM_CUDA_CALL(cudaEventRecord(start, 0)); - // Schedule3DIndexKernel2<<>>(functor, vtkm::Id(0), size); - // VTKM_CUDA_CALL(cudaEventRecord(stop, 0)); - - // VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - // float elapsedTimeMilliseconds; - // VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - // VTKM_CUDA_CALL(cudaEventDestroy(start)); - // VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - // std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl; - // std::cout << "GridSize of: " << grids - // << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl; + std::cout << i << std::endl; } - std::cout << std::endl; } -template -static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax) +template +static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax) { const dim3 ranges(static_cast(rangeMax[0]), static_cast(rangeMax[1]), static_cast(rangeMax[2])); - std::vector results; - vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 }; + std::vector results; - for (vtkm::UInt32 i = 0; i < 16; i++) + constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; + constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, + 28, 30, 32, 64, 128, 256, 512, 
1024 }; + + int deviceId; + for (vtkm::UInt32 g = 0; g < 12; g++) { - for (vtkm::UInt32 j = 0; j < 16; j++) + int grids = gridIndexTable[g] * getNumSMs(deviceId); + for (vtkm::UInt32 i = 0; i < 16; i++) { - for (vtkm::UInt32 k = 0; k < 16; k++) + for (vtkm::UInt32 j = 0; j < 16; j++) { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]); - dim3 gridSize3d; - - if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 || - (blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64) + for (vtkm::UInt32 k = 0; k < 16; k++) { - //cuda can't handle more than 1024 threads per block - //so don't try if we compute higher than that + cudaEvent_t start, stop; + VTKM_CUDA_CALL(cudaEventCreate(&start)); + VTKM_CUDA_CALL(cudaEventCreate(&stop)); - //also don't try stupidly low numbers + dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]); - //cuda can't handle more than 64 threads in the z direction - continue; + if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 || + blocks.z >= 64) + { + //cuda can't handle more than 1024 threads per block + //so don't try if we compute higher than that + + //also don't try stupidly low numbers + + //cuda can't handle more than 64 threads in the z direction + continue; + } + + VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); + TaskStrided3DLaunch<<>>(task, ranges); + VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); + + VTKM_CUDA_CALL(cudaEventSynchronize(stop)); + float elapsedTimeMilliseconds; + VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); + + VTKM_CUDA_CALL(cudaEventDestroy(start)); + VTKM_CUDA_CALL(cudaEventDestroy(stop)); + + results.emplace_back(elapsedTimeMilliseconds, grids, blocks); } - - compute_block_size(ranges, blockSize3d, gridSize3d); - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - Schedule3DIndexKernel<<>>(functor, - ranges); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - PerfRecord record(elapsedTimeMilliseconds, blockSize3d); - results.push_back(record); } } } std::sort(results.begin(), results.end()); - const vtkm::Int64 size = static_cast(results.size()); - for (vtkm::Int64 i = 1; i <= size; i++) + for (auto&& i : results) { - vtkm::UInt64 index = static_cast(size - i); - vtkm::UInt32 x = results[index].blockSize.x; - vtkm::UInt32 y = results[index].blockSize.y; - vtkm::UInt32 z = results[index].blockSize.z; - float t = results[index].elapsedTime; - - std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl; - } - - std::cout << "fixed 3d block size performance " << std::endl; - { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - dim3 blockSize3d(64, 2, 1); - dim3 gridSize3d; - - compute_block_size(ranges, blockSize3d, gridSize3d); - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - Schedule3DIndexKernel<<>>(functor, - ranges); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - 
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z - << " required: " << elapsedTimeMilliseconds << std::endl; - std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z - << " required: " << elapsedTimeMilliseconds << std::endl; + std::cout << i << std::endl; } } } diff --git a/vtkm/exec/cuda/internal/CMakeLists.txt b/vtkm/exec/cuda/internal/CMakeLists.txt index e3e2dd010..5d8ca750c 100644 --- a/vtkm/exec/cuda/internal/CMakeLists.txt +++ b/vtkm/exec/cuda/internal/CMakeLists.txt @@ -22,8 +22,9 @@ set(headers ArrayPortalFromThrust.h ExecutionPolicy.h IteratorFromArrayPortal.h - WrappedOperators.h + TaskStrided.h ThrustPatches.h + WrappedOperators.h ) #----------------------------------------------------------------------------- diff --git a/vtkm/exec/cuda/internal/TaskStrided.h b/vtkm/exec/cuda/internal/TaskStrided.h new file mode 100644 index 000000000..f7d820485 --- /dev/null +++ b/vtkm/exec/cuda/internal/TaskStrided.h @@ -0,0 +1,207 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================ +#ifndef vtk_m_exec_cuda_internal_TaskStrided_h +#define vtk_m_exec_cuda_internal_TaskStrided_h + +#include + +#include + +//Todo: rename this header to TaskInvokeWorkletDetail.h +#include + +namespace vtkm +{ +namespace exec +{ +namespace cuda +{ +namespace internal +{ + +template +void TaskStridedSetErrorBuffer(void* w, const vtkm::exec::internal::ErrorMessageBuffer& buffer) +{ + using WorkletType = typename std::remove_cv::type; + WorkletType* const worklet = static_cast(w); + worklet->SetErrorMessageBuffer(buffer); +} + +class TaskStrided : public vtkm::exec::TaskBase +{ +public: + void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer& buffer) + { + (void)buffer; + this->SetErrorBufferFunction(this->WPtr, buffer); + } + +protected: + void* WPtr = nullptr; + + using SetErrorBufferSignature = void (*)(void*, const vtkm::exec::internal::ErrorMessageBuffer&); + SetErrorBufferSignature SetErrorBufferFunction = nullptr; +}; + +template +class TaskStrided1D : public TaskStrided +{ +public: + TaskStrided1D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0) + : TaskStrided() + , Worklet(worklet) + , Invocation(invocation) + , GlobalIndexOffset(globalIndexOffset) + { + this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer; + //Bind the Worklet to void* + this->WPtr = (void*)&this->Worklet; + } + + VTKM_EXEC + void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const + { + for (vtkm::Id index = start; index < end; index += inc) + { + //Todo: rename this function to DoTaskInvokeWorklet + vtkm::exec::internal::detail::DoWorkletInvokeFunctor( + this->Worklet, + this->Invocation, + this->Worklet.GetThreadIndices(index, + this->Invocation.OutputToInputMap, + this->Invocation.VisitArray, + this->Invocation.GetInputDomain(), + this->GlobalIndexOffset)); + } + } + +private: + typename std::remove_const::type Worklet; + // This is held by by value so that when we transfer the invocation object + // over to CUDA it gets properly copied to the device. While we want to + // hold by reference to reduce the number of copies, it is not possible + // currently. 
+ const IType Invocation; + const vtkm::Id GlobalIndexOffset; +}; + +template +class TaskStrided1D : public TaskStrided +{ +public: + TaskStrided1D(WType& worklet) + : TaskStrided() + , Worklet(worklet) + { + this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer; + //Bind the Worklet to void* + this->WPtr = (void*)&this->Worklet; + } + + VTKM_EXEC + void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const + { + for (vtkm::Id index = start; index < end; index += inc) + { + this->Worklet(index); + } + } + +private: + typename std::remove_const::type Worklet; +}; + +template +class TaskStrided3D : public TaskStrided +{ +public: + TaskStrided3D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0) + : TaskStrided() + , Worklet(worklet) + , Invocation(invocation) + , GlobalIndexOffset(globalIndexOffset) + { + this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer; + //Bind the Worklet to void* + this->WPtr = (void*)&this->Worklet; + } + + VTKM_EXEC + void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const + { + vtkm::Id3 index(start, j, k); + for (vtkm::Id i = start; i < end; i += inc) + { + index[0] = i; + //Todo: rename this function to DoTaskInvokeWorklet + vtkm::exec::internal::detail::DoWorkletInvokeFunctor( + this->Worklet, + this->Invocation, + this->Worklet.GetThreadIndices(index, + this->Invocation.OutputToInputMap, + this->Invocation.VisitArray, + this->Invocation.GetInputDomain(), + this->GlobalIndexOffset)); + } + } + +private: + typename std::remove_const::type Worklet; + // This is held by by value so that when we transfer the invocation object + // over to CUDA it gets properly copied to the device. While we want to + // hold by reference to reduce the number of copies, it is not possible + // currently. 
+ const IType Invocation; + const vtkm::Id GlobalIndexOffset; +}; + +template +class TaskStrided3D : public TaskStrided +{ +public: + TaskStrided3D(WType& worklet) + : TaskStrided() + , Worklet(worklet) + { + this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer; + //Bind the Worklet to void* + this->WPtr = (void*)&this->Worklet; + } + + VTKM_EXEC + void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const + { + vtkm::Id3 index(start, j, k); + for (vtkm::Id i = start; i < end; i += inc) + { + index[0] = i; + this->Worklet(index); + } + } + +private: + typename std::remove_const::type Worklet; +}; +} +} +} +} // vtkm::exec::cuda::internal + +#endif //vtk_m_exec_cuda_internal_TaskStrided_h diff --git a/vtkm/exec/cuda/internal/testing/CMakeLists.txt b/vtkm/exec/cuda/internal/testing/CMakeLists.txt index 7702f6a42..84be197bb 100644 --- a/vtkm/exec/cuda/internal/testing/CMakeLists.txt +++ b/vtkm/exec/cuda/internal/testing/CMakeLists.txt @@ -22,6 +22,6 @@ set(unit_tests UnitTestTextureMemorySupport.cu - UnitTestTaskSingularCuda.cu + UnitTestTaskStrided.cu ) vtkm_unit_tests(SOURCES ${unit_tests}) diff --git a/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu b/vtkm/exec/cuda/internal/testing/UnitTestTaskStrided.cu similarity index 93% rename from vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu rename to vtkm/exec/cuda/internal/testing/UnitTestTaskStrided.cu index f0fd9de82..1e962165c 100644 --- a/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu +++ b/vtkm/exec/cuda/internal/testing/UnitTestTaskStrided.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include @@ -165,14 +165,14 @@ using InvocationType2 = vtkm::internal::Invocation; template -static __global__ void ScheduleTaskSingular(TaskType task, vtkm::Id start, vtkm::Id end) +static __global__ void ScheduleTaskStrided(TaskType task, vtkm::Id start, vtkm::Id end) { const vtkm::Id index = blockIdx.x * blockDim.x + threadIdx.x; - + const vtkm::Id inc = blockDim.x * gridDim.x; if (index >= start && index < end) { - task(index); + task(index, end, inc); } } @@ -269,14 +269,13 @@ void TestNormalFunctorInvoke() TestExecObject(output.PrepareForOutput(3, DeviceAdapter()))); std::cout << " Try void return." 
<< std::endl; - using TaskSingular1 = vtkm::exec::internal::TaskSingular; TestWorkletProxy worklet; InvocationType1 invocation1(execObjects); using TaskTypes = typename vtkm::cont::DeviceTaskTypes; auto task1 = TaskTypes::MakeTask(worklet, invocation1, vtkm::Id()); - ScheduleTaskSingular<<<32, 256>>>(task1, 1, 2); + ScheduleTaskStrided<<<32, 256>>>(task1, 1, 2); cudaDeviceSynchronize(); input.SyncControlArray(); output.SyncControlArray(); @@ -291,13 +290,12 @@ void TestNormalFunctorInvoke() TestExecObject(input.PrepareForInPlace(DeviceAdapter())), TestExecObject(output.PrepareForOutput(3, DeviceAdapter()))); - using TaskSingular2 = vtkm::exec::internal::TaskSingular; InvocationType2 invocation2(execObjects); using TaskTypes = typename vtkm::cont::DeviceTaskTypes; auto task2 = TaskTypes::MakeTask(worklet, invocation2, vtkm::Id()); - ScheduleTaskSingular<<<32, 256>>>(task2, 2, 3); + ScheduleTaskStrided<<<32, 256>>>(task2, 2, 3); cudaDeviceSynchronize(); input.SyncControlArray(); output.SyncControlArray(); @@ -323,7 +321,8 @@ void TestErrorFunctorInvoke() TestExecObject(input.PrepareForInPlace(DeviceAdapter())), TestExecObject(output.PrepareForInPlace(DeviceAdapter()))); - using TaskSingular1 = vtkm::exec::internal::TaskSingular; + using TaskStrided1 = + vtkm::exec::cuda::internal::TaskStrided1D; TestWorkletErrorProxy worklet; InvocationType1 invocation(execObjects); @@ -340,7 +339,7 @@ void TestErrorFunctorInvoke() vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize); task.SetErrorMessageBuffer(errorMessage); - ScheduleTaskSingular<<<32, 256>>>(task, 1, 2); + ScheduleTaskStrided<<<32, 256>>>(task, 1, 2); cudaDeviceSynchronize(); VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly."); @@ -348,7 +347,7 @@ void TestErrorFunctorInvoke() } template -void TestTaskSingular() +void TestTaskStrided() { TestNormalFunctorInvoke(); TestErrorFunctorInvoke(); @@ -356,7 +355,7 @@ void TestTaskSingular() } // anonymous namespace -int UnitTestTaskSingularCuda(int, char* []) +int UnitTestTaskStrided(int, char* []) { - return vtkm::cont::testing::Testing::Run(TestTaskSingular); + return vtkm::cont::testing::Testing::Run(TestTaskStrided); } diff --git a/vtkm/exec/internal/WorkletInvokeFunctorDetail.h b/vtkm/exec/internal/WorkletInvokeFunctorDetail.h index 75c7972f4..8533d8b24 100644 --- a/vtkm/exec/internal/WorkletInvokeFunctorDetail.h +++ b/vtkm/exec/internal/WorkletInvokeFunctorDetail.h @@ -28,9 +28,11 @@ #ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h #define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h -#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \ - !defined(VTKM_TEST_HEADER_BUILD) -#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h +#if !defined(vtk_m_exec_internal_TaskSingular_h) && \ + !defined(vtk_m_exec_internal_TaskTiling_h) && \ + !defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \ + !defined(VTKM_TEST_HEADER_BUILD) +#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, TaskStrided.h #endif #include diff --git a/vtkm/exec/internal/WorkletInvokeFunctorDetail.h.in b/vtkm/exec/internal/WorkletInvokeFunctorDetail.h.in index babcaec86..09b625868 100644 --- a/vtkm/exec/internal/WorkletInvokeFunctorDetail.h.in +++ b/vtkm/exec/internal/WorkletInvokeFunctorDetail.h.in @@ -40,9 +40,11 @@ $# Ignore the following comment. It is meant for the generated file. 
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h #define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h -#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \\ - !defined(VTKM_TEST_HEADER_BUILD) -#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h +#if !defined(vtk_m_exec_internal_TaskSingular_h) && \\ + !defined(vtk_m_exec_internal_TaskTiling_h) && \\ + !defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \\ + !defined(VTKM_TEST_HEADER_BUILD) +#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, TaskStrided.h #endif #include diff --git a/vtkm/worklet/wavelets/WaveletTransforms.h b/vtkm/worklet/wavelets/WaveletTransforms.h index 308d54fca..c5eeb6a26 100644 --- a/vtkm/worklet/wavelets/WaveletTransforms.h +++ b/vtkm/worklet/wavelets/WaveletTransforms.h @@ -3007,42 +3007,49 @@ public: template VTKM_EXEC void operator()(const InputPortalType& coeffs, OutputPortalType& sigOut, - const vtkm::Id& workIndex) const + vtkm::Id workIndex) const { - if (workIndex < cALen2) // valid calculation region + if (workIndex >= cALen2) // valid calculation region { - vtkm::Id xi; // coeff indices - vtkm::Id k1, k2; // indices for low and high filter - VAL sum = 0.0; - - if (workIndex % 2 != 0) - { - k1 = this->filterLen - 2; - k2 = this->filterLen - 1; - } - else - { - k1 = this->filterLen - 1; - k2 = this->filterLen - 2; - } - - xi = (workIndex + 1) / 2; - while (k1 > -1) // k1 >= 0 - { - sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi++)); - k1 -= 2; - } - - xi = workIndex / 2; - while (k2 > -1) // k2 >= 0 - { - sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(this->cALenExtended + xi++)); - k2 -= 2; - } - - sigOut.Set(workIndex, static_cast(sum)); + return; } + vtkm::Id xi1 = (workIndex + 1) / 2; // coeff indices + vtkm::Id xi2 = this->cALenExtended + ((workIndex) / 2); // coeff indices + VAL sum = 0.0; + + const bool odd = workIndex % 2 != 0; + if (odd) + { + vtkm::Id k1 = this->filterLen - 2; + vtkm::Id k2 = this->filterLen - 1; + for (; k1 >= 0; k1 -= 2, k2 -= 2) + { + sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++)); + sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++)); + } + if (k2 >= 0) + { + sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++)); + } + } + else //even + { + vtkm::Id k1 = this->filterLen - 1; + vtkm::Id k2 = this->filterLen - 2; + for (; k2 >= 0; k1 -= 2, k2 -= 2) + { + sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++)); + sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++)); + } + if (k1 >= 0) + { + sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++)); + } + } + + sigOut.Set(workIndex, static_cast(sum)); } + #undef MAKEVAL #undef VAL
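
The patch above replaces the per-index Schedule1DIndexKernel/Schedule3DIndexKernel launches with grid-stride TaskStrided kernels whose grid size is derived from the SM count. Below is a minimal, standalone sketch of that pattern, assuming nothing from VTK-m: the names StridedFunctor and strided1DLaunch are hypothetical stand-ins for TaskStrided1D and TaskStrided1DLaunch, and the 32-blocks-per-SM / 128-threads-per-block choice only mirrors what GetGridsAndBlocks does in this diff. Each thread walks the index range with a stride of blockDim.x * gridDim.x, so a single launch covers any problem size without the multiple-kernel fallback the old Schedule path needed.

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for a worklet: doubles each element in place.
struct StridedFunctor
{
  double* data;
  __device__ void operator()(long long start, long long size, long long inc) const
  {
    for (long long i = start; i < size; i += inc)
    {
      data[i] = 2.0 * data[i];
    }
  }
};

// Grid-stride launcher: every thread computes its starting index and the
// stride, then the functor loops over its share of the range.
template <typename TaskType>
__global__ void strided1DLaunch(TaskType task, long long size)
{
  const long long start = blockIdx.x * blockDim.x + threadIdx.x;
  const long long inc = static_cast<long long>(blockDim.x) * gridDim.x;
  task(start, size, inc);
}

int main()
{
  const long long n = 1 << 20;
  double* d = nullptr;
  cudaMallocManaged(&d, n * sizeof(double));
  for (long long i = 0; i < n; ++i) { d[i] = 1.0; }

  // Mirror GetGridsAndBlocks: a fixed multiple of the SM count, 128 threads per block.
  int device = 0, numSMs = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
  const int grids = 32 * numSMs;
  const int blocks = 128;

  StridedFunctor f{ d };
  strided1DLaunch<<<grids, blocks>>>(f, n);
  cudaDeviceSynchronize();

  printf("d[0] = %f, d[n-1] = %f\n", d[0], d[n - 1]);
  cudaFree(d);
  return 0;
}

Launching a fixed, occupancy-friendly number of blocks rather than one block per chunk of 128 elements keeps the launch configuration independent of the data size, which is what lets 64-bit vtkm::Id indexing handle ranges larger than a 2^31-thread grid in a single kernel call.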
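
The pinned error array in DeviceAdapterAlgorithmThrust.cu relies on mapped page-locked memory so the host can read an error string written by a kernel without an explicit memcpy. The following small sketch is likewise illustrative only: reportError is a made-up kernel, and the 1024-byte size simply matches the ERROR_ARRAY_SIZE used by GetPinnedErrorArray.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void reportError(char* errorBuffer, int bufferSize)
{
  if (blockIdx.x == 0 && threadIdx.x == 0)
  {
    const char msg[] = "example error raised on device";
    int i = 0;
    for (; i < bufferSize - 1 && msg[i] != '\0'; ++i)
    {
      errorBuffer[i] = msg[i];
    }
    errorBuffer[i] = '\0';
  }
}

int main()
{
  const int size = 1024;
  char* hostPtr = nullptr;
  char* devicePtr = nullptr;

  // Page-locked, mapped allocation: host and device alias the same memory.
  cudaHostAlloc((void**)&hostPtr, size, cudaHostAllocMapped);
  cudaHostGetDevicePointer((void**)&devicePtr, hostPtr, 0);

  hostPtr[0] = '\0'; // empty string means "no error"
  reportError<<<1, 32>>>(devicePtr, size);
  cudaDeviceSynchronize();

  if (hostPtr[0] != '\0')
  {
    printf("device reported: %s\n", hostPtr);
  }
  cudaFreeHost(hostPtr);
  return 0;
}

Because the allocation is mapped, hostPtr and devicePtr refer to the same physical pages; the only synchronization needed before reading the message on the host is the stream or device sync that the schedulers above already perform after each launch.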