Move VTK-m Cuda backend over to a grid-stride iteration pattern.

This simplifies the host-side logic for determining grid and block
sizes, and reduces the library size by moving some of that logic into
functions compiled into the library.
Robert Maynard 2018-04-27 14:19:15 -04:00
parent 5c5fb020a8
commit b56894dd09
12 changed files with 596 additions and 539 deletions
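For reference, the grid-stride pattern this change adopts (see the NVIDIA blog post linked from the new kernels) looks roughly like the sketch below; the kernel and variable names are illustrative only and are not part of the commit.

// Minimal grid-stride sketch (illustrative names, not VTK-m code).
// Each thread starts at its global index and advances by the total number
// of threads in the grid, so any grid/block configuration covers the range.
__global__ void AddOneKernel(float* data, int n)
{
  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int inc = blockDim.x * gridDim.x;
  for (int i = start; i < n; i += inc)
  {
    data[i] += 1.0f;
  }
}
// Host side: the launch no longer has to be sized to cover n exactly,
// which is the "easier host side logic" mentioned above, e.g.
//   AddOneKernel<<<32 * numSMs, 128>>>(devData, n);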

@ -43,4 +43,5 @@ target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
)

@ -31,6 +31,8 @@
// Here are the actual implementation of the algorithms.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <cuda.h>
namespace vtkm
@ -257,24 +259,24 @@ class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
{
public:
template <typename WorkletType, typename InvocationType>
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
template <typename WorkletType, typename InvocationType>
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id3,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
};

@ -0,0 +1,153 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <atomic>
#include <mutex>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
VTKM_CONT_EXPORT int getNumSMs(int dId)
{
//check if the per-device SM lookup table has already been built
static bool lookupBuilt = false;
static std::vector<int> numSMs;
if (!lookupBuilt)
{
//lock the mutex
static std::mutex built_mutex;
std::lock_guard<std::mutex> lock(built_mutex);
//iterate over all devices
int numberOfSMs = 0;
int count = 0;
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
numSMs.reserve(static_cast<std::size_t>(count));
for (int deviceId = 0; deviceId < count; ++deviceId)
{ //get the number of sm's per deviceId
VTKM_CUDA_CALL(
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
numSMs.push_back(numberOfSMs);
}
lookupBuilt = true;
}
return numSMs[dId];
}
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
vtkm::Id& arraySize,
char** hostPointer)
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = nullptr;
static char* devicePtr = nullptr;
if (!errorArrayInit)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void)hostPointer;
return devicePtr;
}
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character, which signals that no error is present
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
return hostErrorPtr;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
int& grids,
int& blocks,
vtkm::Id size)
{
(void)size;
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
blocks = 128;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
int& grids,
dim3& blocks,
const dim3& size)
{
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
if (size.x == 0)
{ //grids that have no x dimension
blocks.x = 1;
blocks.y = 16;
blocks.z = 8;
}
else if (size.x > 128)
{
blocks.x = 64;
blocks.y = 2;
blocks.z = 1;
}
else
{ //for really small grids
blocks.x = 8;
blocks.y = 4;
blocks.z = 4;
}
}
}
}
}
}

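A rough standalone illustration of the heuristic above (hypothetical kernel name; the real path goes through ScheduleTask in the header below): every 1D launch uses a fixed block size of 128 and 32 blocks per SM, so a device with, say, 80 SMs always gets 32 * 80 = 2560 blocks, independent of how many instances are scheduled.

// Standalone sketch of the 1D launch heuristic used above.
int deviceId = 0;
int numSMs = 0;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
const int grids = 32 * numSMs; // e.g. 2560 blocks on an 80-SM device
const int blocks = 128;        // fixed; the grid-stride loop absorbs any data size
// SomeGridStrideKernel<<<grids, blocks, 0, cudaStreamPerThread>>>(..., numInstances);
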
@ -21,6 +21,8 @@
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/Types.h>
#include <vtkm/UnaryPredicates.h>
@ -34,15 +36,16 @@
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
#include <vtkm/exec/internal/TaskSingular.h>
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
// #define ANALYZE_VTKM_SCHEDULER_1D
// #define ANALYZE_VTKM_SCHEDULER_3D
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
#include <vtkm/cont/cuda/internal/TaskTuner.h>
#endif
@ -62,7 +65,6 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <atomic>
namespace vtkm
{
@ -82,73 +84,20 @@ namespace internal
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
static __global__ void DetermineProperXGridSize(vtkm::UInt32 desired_size,
vtkm::UInt32* actual_size)
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
{
//used only to see if we can launch kernels with a x grid size that
//matches the max of the graphics card, or are we having to fall back
//to SM_2 grid sizes
if (blockIdx.x != 0)
{
return;
}
#if __CUDA_ARCH__ <= 200
const vtkm::UInt32 maxXGridSizeForSM2 = 65535;
*actual_size = maxXGridSizeForSM2;
#else
*actual_size = desired_size;
#endif
//see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
//for why our inc is grid-stride
const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
const vtkm::Id inc = blockDim.x * gridDim.x;
task(start, size, inc);
}
template <class FunctorType>
__global__ void Schedule1DIndexKernel(FunctorType functor,
vtkm::Id numberOfKernelsInvoked,
vtkm::Id length)
{
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
const vtkm::Id index =
numberOfKernelsInvoked + static_cast<vtkm::Id>(blockDim.x * blockIdx.x + threadIdx.x);
if (index < length)
{
functor(index);
}
}
template <class FunctorType>
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
{
const vtkm::Id3 index(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);
if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z)
{
return;
}
functor(index);
}
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
// for an unknown reason
template <class FunctorType>
__global__ void Schedule1DIndexKernel2(FunctorType functor,
vtkm::Id numberOfKernelsInvoked,
vtkm::Id length)
{
vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
for (; index < length; index += inc)
{
functor(index);
}
}
template <class FunctorType>
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size)
{
//This is the 3D version of executing in a grid-stride manner
const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);
@ -158,17 +107,11 @@ __global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
{
for (uint j = start.y; j < size.y; j += inc.y)
{
vtkm::Id3 index(start.x, j, k);
for (vtkm::Id i = start.x; i < size.x; i += inc.x)
{
index[0] = i;
functor(index);
}
task(start.x, size.x, inc.x, j, k);
}
}
}
#endif
template <typename T, typename BinaryOperationType>
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
@ -180,16 +123,6 @@ __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_
#pragma GCC diagnostic pop
#endif
inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d)
{
gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1)
: (rangeMax.x / blockSize3d.x);
gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1)
: (rangeMax.y / blockSize3d.y);
gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1)
: (rangeMax.z / blockSize3d.z);
}
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
/// device that uses thrust as its implementation. The subclass should pass in
/// the correct device adapter tag as the template parameter.
@ -1142,89 +1075,24 @@ public:
::thrust::equal_to<T>(),
binary_functor);
}
// Because of some funny code conversions in nvcc, kernels for devices have to
// be public.
#ifndef VTKM_CUDA
private:
#endif
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
VTKM_CONT
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer)
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = nullptr;
static char* devicePtr = nullptr;
if (!errorArrayInit)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
VTKM_CONT_EXPORT
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void)hostPointer;
return devicePtr;
}
VTKM_CONT_EXPORT
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
// we query cuda for the max blocks per grid for 1D scheduling
// and cache the values in static variables
VTKM_CONT
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
{
static std::atomic<bool> gridQueryInit(false);
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
// NOTE: The following code may still be executed by multiple threads
// but it should not cause any correctness issues.
if (!gridQueryInit)
{
int currDevice;
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
VTKM_CONT_EXPORT
static void GetGridsAndBlocks(int& grid, int& blocks, vtkm::Id size);
cudaDeviceProp properties;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
//Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids
//in the X direction, it is dependent on the code being compiled for SM3+.
//If not, it falls back to SM_2 limitation of 65535 being the largest grid
//size.
//Now since SM architecture is only available inside kernels we have to
//invoke one to see what the actual limit is for our device. So that is
//what we are going to do next, and than we will store that result
vtkm::UInt32* dev_actual_size;
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
gridQueryInit = true;
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
}
return maxGridSize;
}
VTKM_CONT_EXPORT
static void GetGridsAndBlocks(int& grid, dim3& blocks, const dim3& size);
public:
template <class TaskType, typename RangeType>
VTKM_CONT static void ScheduleTask(TaskType task, RangeType size)
{ //for now defer to the schedule api, we need to do a significant
//amount of build infrastructure work to implement type erasure tasks
//for cuda
Schedule(task, size);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
template <typename WType, typename IType>
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
vtkm::Id numInstances)
{
VTKM_ASSERT(numInstances >= 0);
if (numInstances < 1)
@ -1232,45 +1100,12 @@ public:
// No instances means nothing to run. Just return.
return;
}
char* hostErrorPtr = SetupErrorBuffer(functor);
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
int grids, blocks;
GetGridsAndBlocks(grids, blocks, numInstances);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
const vtkm::Id blockSizeAsId = 128;
const vtkm::UInt32 blockSize = 128;
const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0];
const vtkm::UInt32 totalBlocks =
static_cast<vtkm::UInt32>((numInstances + blockSizeAsId - 1) / blockSizeAsId);
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
if (totalBlocks < maxblocksPerLaunch)
{
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), numInstances);
}
else
{
const vtkm::Id numberOfKernelsToRun =
blockSizeAsId * static_cast<vtkm::Id>(maxblocksPerLaunch);
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
numberOfKernelsInvoked += numberOfKernelsToRun)
{
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
functor, numberOfKernelsInvoked, numInstances);
}
}
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, numInstances);
//sync so that we can check the results of the call.
//In the future I want to move this before the schedule call, and throwing
@ -1284,13 +1119,14 @@ public:
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef ANALYZE_VTKM_SCHEDULER_1D
compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize);
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
parameter_sweep_1d_schedule(functor, numInstances);
#endif
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
template <typename WType, typename IType>
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
vtkm::Id3 rangeMax)
{
VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
@ -1298,46 +1134,17 @@ public:
// No instances means nothing to run. Just return.
return;
}
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
char* hostErrorPtr = SetupErrorBuffer(functor);
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
//currently we presume that 3d scheduling access patterns prefer accessing
//memory in the X direction. Also should be good for thin in the Z axis
//algorithms.
dim3 blockSize3d(64, 2, 1);
//In general we need more information as this doesn't work well when
//executing on the points, and need to fetch all cells used, as the z
//width is not fat enough
int grids;
dim3 blocks;
GetGridsAndBlocks(grids, blocks, ranges);
//handle the simple use case of 'bad' datasets which are thin in X
//but larger in the other directions, allowing us decent performance with
//that use case.
if (rangeMax[0] <= 128 && (rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2]))
{
blockSize3d = dim3(16, 4, 4);
}
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
//sync so that we can check the results of the call.
//In the future I want to move this before the schedule call, and throwing
@ -1351,23 +1158,25 @@ public:
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef ANALYZE_VTKM_SCHEDULER_1D
compare_1d_dynamic_block_picker(functor,
rangeMax[0] * rangeMax[1] * rangeMax[2],
gridSize3d.x * gridSize3d.y * gridSize3d.z,
blockSize3d.x * blockSize3d.y * blockSize3d.z);
#endif
#ifdef ANALYZE_VTKM_SCHEDULER_3D
//requires the errormessage buffer be set
compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d);
#endif
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
parameter_sweep_3d_schedule(functor, rangeMax);
#endif
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
{
vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
ScheduleTask(kernel, numInstances);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
{
vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
ScheduleTask(kernel, rangeMax);
}
template <typename T, class Storage>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
{

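To tie this hunk together: a plain functor handed to Schedule is wrapped in the vtkm::internal::NullType specialization of TaskStrided1D/3D (defined in TaskStrided.h below), so worklet tasks and raw functors share the same TaskStrided*Launch kernels. Roughly, with a hypothetical functor:

// Hypothetical functor, for illustration only.
struct DoubleValues
{
  VTKM_EXEC void operator()(vtkm::Id index) const { /* ... */ }
};
// Scheduling DoubleValues{} over numValues now expands to roughly:
//   TaskStrided1D<DoubleValues, vtkm::internal::NullType> kernel(functor);
//   ScheduleTask(kernel, numValues);
// which launches:
//   TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(kernel, numValues);
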
@ -41,293 +41,167 @@ namespace cuda
namespace internal
{
template <class FunctorType>
__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
template <class FunctorType>
__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
template <class FunctorType>
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
template <class FunctorType>
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
int getNumSMs(int dId);
void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
template <typename Task>
__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
struct PerfRecord1d
{
const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
for (vtkm::Id i = start; i < size; i += inc)
{
task(i);
}
}
class PerfRecord
{
public:
PerfRecord(float elapsedT, dim3 block)
PerfRecord1d(float elapsedT, int g, int b)
: elapsedTime(elapsedT)
, blockSize(block)
, grid(g)
, block(b)
{
}
bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
float elapsedTime;
dim3 blockSize;
int grid;
int block;
};
template <typename Task>
static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on number of SM's
int device; // device to run on
int numSMs; // number of SMs on the active device
cudaGetDevice(&device);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, Schedule1DIndexKernel2<Task>, 0, 0);
blockSize /= (numSMs * 2);
// Round up according to array size
// gridSize = (size + blockSize - 1) / blockSize;
gridSize = 32 * numSMs;
// std::cout << "numSMs: " << numSMs << std::endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxActiveBlocks, Schedule1DIndexKernel2<Task>, blockSize, 0);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
grids = gridSize;
blocks = blockSize;
occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
<< ">>> required: " << record.elapsedTime << "\n";
return os;
}
template <class Functor>
static void compare_1d_dynamic_block_picker(Functor functor,
vtkm::Id size,
const vtkm::Id& currentGridSize,
const vtkm::Id& currentBlockSize)
struct PerfRecord3d
{
const std::type_info& ti = typeid(functor);
std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
PerfRecord3d(float elapsedT, int g, dim3 b)
: elapsedTime(elapsedT)
, grid(g)
, block(b)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
<< " required: " << elapsedTimeMilliseconds << std::endl;
}
std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
{
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
int grids, blocks;
float occupancy;
BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
float elapsedTime;
int grid;
dim3 block;
};
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
{
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
<< " required: " << elapsedTimeMilliseconds << std::endl;
}
std::cout << std::endl;
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
return os;
}
template <class Functor>
static void compare_3d_dynamic_block_picker(Functor functor,
vtkm::Id3 ranges,
const dim3& gridSize3d,
const dim3& blockSize3d)
template <typename TaskT>
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
{
const std::type_info& ti = typeid(functor);
std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
std::vector<PerfRecord1d> results;
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192 };
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
for (vtkm::UInt32 g = 0; g < 12; g++)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
int grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 b = 0; b < 12; b++)
{
int blocks = blockIndexTable[b];
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
// std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
// std::cout << "GridSize of: " << currentGridSize
// << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
}
std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
std::sort(results.begin(), results.end());
for (auto&& i : results)
{
// int grids, blocks;
// float occupancy;
// BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
// cudaEvent_t start, stop;
// VTKM_CUDA_CALL(cudaEventCreate(&start));
// VTKM_CUDA_CALL(cudaEventCreate(&stop));
// VTKM_CUDA_CALL(cudaEventRecord(start, 0));
// Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
// VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
// VTKM_CUDA_CALL(cudaEventSynchronize(stop));
// float elapsedTimeMilliseconds;
// VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
// VTKM_CUDA_CALL(cudaEventDestroy(start));
// VTKM_CUDA_CALL(cudaEventDestroy(stop));
// std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
// std::cout << "GridSize of: " << grids
// << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << i << std::endl;
}
std::cout << std::endl;
}
template <class Functor>
static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
template <typename TaskT>
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
{
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
std::vector<PerfRecord> results;
vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
std::vector<PerfRecord3d> results;
for (vtkm::UInt32 i = 0; i < 16; i++)
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
28, 30, 32, 64, 128, 256, 512, 1024 };
int deviceId;
for (vtkm::UInt32 g = 0; g < 12; g++)
{
for (vtkm::UInt32 j = 0; j < 16; j++)
int grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 i = 0; i < 16; i++)
{
for (vtkm::UInt32 k = 0; k < 16; k++)
for (vtkm::UInt32 j = 0; j < 16; j++)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
dim3 gridSize3d;
if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
for (vtkm::UInt32 k = 0; k < 16; k++)
{
//cuda can't handle more than 1024 threads per block
//so don't try if we compute higher than that
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
//also don't try stupidly low numbers
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
//cuda can't handle more than 64 threads in the z direction
continue;
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
blocks.z >= 64)
{
//cuda can't handle more than 1024 threads per block
//so don't try if we compute higher than that
//also don't try stupidly low numbers
//cuda can't handle more than 64 threads in the z direction
continue;
}
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
results.push_back(record);
}
}
}
std::sort(results.begin(), results.end());
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
for (vtkm::Int64 i = 1; i <= size; i++)
for (auto&& i : results)
{
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
vtkm::UInt32 x = results[index].blockSize.x;
vtkm::UInt32 y = results[index].blockSize.y;
vtkm::UInt32 z = results[index].blockSize.z;
float t = results[index].elapsedTime;
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
}
std::cout << "fixed 3d block size performance " << std::endl;
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(64, 2, 1);
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
<< " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
<< " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << i << std::endl;
}
}
}

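Usage note (from the surrounding hunks): these sweeps are only compiled in when the PARAMETER_SWEEP_VTKM_SCHEDULER_1D / PARAMETER_SWEEP_VTKM_SCHEDULER_3D defines in DeviceAdapterAlgorithmThrust.h are uncommented; ScheduleTask then runs the corresponding sweep after the regular launch and prints one PerfRecord line per grid/block combination, sorted fastest first.
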
@ -22,8 +22,9 @@ set(headers
ArrayPortalFromThrust.h
ExecutionPolicy.h
IteratorFromArrayPortal.h
WrappedOperators.h
TaskStrided.h
ThrustPatches.h
WrappedOperators.h
)
#-----------------------------------------------------------------------------

@ -0,0 +1,207 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_exec_cuda_internal_TaskStrided_h
#define vtk_m_exec_cuda_internal_TaskStrided_h
#include <vtkm/exec/TaskBase.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
//Todo: rename this header to TaskInvokeWorkletDetail.h
#include <vtkm/exec/internal/WorkletInvokeFunctorDetail.h>
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
template <typename WType>
void TaskStridedSetErrorBuffer(void* w, const vtkm::exec::internal::ErrorMessageBuffer& buffer)
{
using WorkletType = typename std::remove_cv<WType>::type;
WorkletType* const worklet = static_cast<WorkletType*>(w);
worklet->SetErrorMessageBuffer(buffer);
}
class TaskStrided : public vtkm::exec::TaskBase
{
public:
void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer& buffer)
{
(void)buffer;
this->SetErrorBufferFunction(this->WPtr, buffer);
}
protected:
void* WPtr = nullptr;
using SetErrorBufferSignature = void (*)(void*, const vtkm::exec::internal::ErrorMessageBuffer&);
SetErrorBufferSignature SetErrorBufferFunction = nullptr;
};
template <typename WType, typename IType>
class TaskStrided1D : public TaskStrided
{
public:
TaskStrided1D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
: TaskStrided()
, Worklet(worklet)
, Invocation(invocation)
, GlobalIndexOffset(globalIndexOffset)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
{
for (vtkm::Id index = start; index < end; index += inc)
{
//Todo: rename this function to DoTaskInvokeWorklet
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
this->Worklet,
this->Invocation,
this->Worklet.GetThreadIndices(index,
this->Invocation.OutputToInputMap,
this->Invocation.VisitArray,
this->Invocation.GetInputDomain(),
this->GlobalIndexOffset));
}
}
private:
typename std::remove_const<WType>::type Worklet;
// This is held by value so that when we transfer the invocation object
// over to CUDA it gets properly copied to the device. While we would
// prefer to hold it by reference to reduce the number of copies, that is
// not currently possible.
const IType Invocation;
const vtkm::Id GlobalIndexOffset;
};
template <typename WType>
class TaskStrided1D<WType, vtkm::internal::NullType> : public TaskStrided
{
public:
TaskStrided1D(WType& worklet)
: TaskStrided()
, Worklet(worklet)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
{
for (vtkm::Id index = start; index < end; index += inc)
{
this->Worklet(index);
}
}
private:
typename std::remove_const<WType>::type Worklet;
};
template <typename WType, typename IType>
class TaskStrided3D : public TaskStrided
{
public:
TaskStrided3D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
: TaskStrided()
, Worklet(worklet)
, Invocation(invocation)
, GlobalIndexOffset(globalIndexOffset)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
{
vtkm::Id3 index(start, j, k);
for (vtkm::Id i = start; i < end; i += inc)
{
index[0] = i;
//Todo: rename this function to DoTaskInvokeWorklet
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
this->Worklet,
this->Invocation,
this->Worklet.GetThreadIndices(index,
this->Invocation.OutputToInputMap,
this->Invocation.VisitArray,
this->Invocation.GetInputDomain(),
this->GlobalIndexOffset));
}
}
private:
typename std::remove_const<WType>::type Worklet;
// This is held by value so that when we transfer the invocation object
// over to CUDA it gets properly copied to the device. While we would
// prefer to hold it by reference to reduce the number of copies, that is
// not currently possible.
const IType Invocation;
const vtkm::Id GlobalIndexOffset;
};
template <typename WType>
class TaskStrided3D<WType, vtkm::internal::NullType> : public TaskStrided
{
public:
TaskStrided3D(WType& worklet)
: TaskStrided()
, Worklet(worklet)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
{
vtkm::Id3 index(start, j, k);
for (vtkm::Id i = start; i < end; i += inc)
{
index[0] = i;
this->Worklet(index);
}
}
private:
typename std::remove_const<WType>::type Worklet;
};
}
}
}
} // vtkm::exec::cuda::internal
#endif //vtk_m_exec_cuda_internal_TaskStrided_h

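The TaskStrided base class above hides the concrete worklet type behind a void* plus a function pointer instantiated for that type, which is what lets the non-templated SetupErrorBuffer in DeviceAdapterAlgorithmThrust.cu install the error buffer without knowing the worklet. A stripped-down sketch of the same idea (illustrative names, not the actual VTK-m classes):

// Stripped-down illustration of the error-buffer type erasure used by TaskStrided.
// Assumes WorkletT provides SetErrorMessageBuffer(const Buffer&).
struct Buffer
{
  char* Memory;
  int Size;
};

template <typename WorkletT>
void SetBufferThunk(void* w, const Buffer& buffer)
{
  // The concrete worklet type is recovered here, so the base never needs it.
  static_cast<WorkletT*>(w)->SetErrorMessageBuffer(buffer);
}

class TaskBaseSketch
{
public:
  void SetErrorMessageBuffer(const Buffer& buffer) { this->Fn(this->WPtr, buffer); }

protected:
  void* WPtr = nullptr;
  void (*Fn)(void*, const Buffer&) = nullptr;
};

template <typename WorkletT>
class TaskSketch : public TaskBaseSketch
{
public:
  TaskSketch(const WorkletT& worklet)
    : Worklet(worklet)
  {
    this->Fn = &SetBufferThunk<WorkletT>;
    this->WPtr = &this->Worklet;
  }

private:
  WorkletT Worklet;
};
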
@ -22,6 +22,6 @@
set(unit_tests
UnitTestTextureMemorySupport.cu
UnitTestTaskSingularCuda.cu
UnitTestTaskStrided.cu
)
vtkm_unit_tests(SOURCES ${unit_tests})

@ -23,7 +23,7 @@
#include <vtkm/exec/FunctorBase.h>
#include <vtkm/exec/arg/BasicArg.h>
#include <vtkm/exec/arg/ThreadIndicesBasic.h>
#include <vtkm/exec/internal/TaskSingular.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/StaticAssert.h>
@ -165,14 +165,14 @@ using InvocationType2 = vtkm::internal::Invocation<ExecutionParameterInterface,
MyVisitArrayPortal>;
template <typename TaskType>
static __global__ void ScheduleTaskSingular(TaskType task, vtkm::Id start, vtkm::Id end)
static __global__ void ScheduleTaskStrided(TaskType task, vtkm::Id start, vtkm::Id end)
{
const vtkm::Id index = blockIdx.x * blockDim.x + threadIdx.x;
const vtkm::Id inc = blockDim.x * gridDim.x;
if (index >= start && index < end)
{
task(index);
task(index, end, inc);
}
}
@ -269,14 +269,13 @@ void TestNormalFunctorInvoke()
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
std::cout << " Try void return." << std::endl;
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType1>;
TestWorkletProxy worklet;
InvocationType1 invocation1(execObjects);
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
auto task1 = TaskTypes::MakeTask(worklet, invocation1, vtkm::Id());
ScheduleTaskSingular<decltype(task1)><<<32, 256>>>(task1, 1, 2);
ScheduleTaskStrided<decltype(task1)><<<32, 256>>>(task1, 1, 2);
cudaDeviceSynchronize();
input.SyncControlArray();
output.SyncControlArray();
@ -291,13 +290,12 @@ void TestNormalFunctorInvoke()
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
using TaskSingular2 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType2>;
InvocationType2 invocation2(execObjects);
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
auto task2 = TaskTypes::MakeTask(worklet, invocation2, vtkm::Id());
ScheduleTaskSingular<decltype(task2)><<<32, 256>>>(task2, 2, 3);
ScheduleTaskStrided<decltype(task2)><<<32, 256>>>(task2, 2, 3);
cudaDeviceSynchronize();
input.SyncControlArray();
output.SyncControlArray();
@ -323,7 +321,8 @@ void TestErrorFunctorInvoke()
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
TestExecObject(output.PrepareForInPlace(DeviceAdapter())));
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletErrorProxy, InvocationType1>;
using TaskStrided1 =
vtkm::exec::cuda::internal::TaskStrided1D<TestWorkletErrorProxy, InvocationType1>;
TestWorkletErrorProxy worklet;
InvocationType1 invocation(execObjects);
@ -340,7 +339,7 @@ void TestErrorFunctorInvoke()
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
task.SetErrorMessageBuffer(errorMessage);
ScheduleTaskSingular<decltype(task)><<<32, 256>>>(task, 1, 2);
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
cudaDeviceSynchronize();
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
@ -348,7 +347,7 @@ void TestErrorFunctorInvoke()
}
template <typename DeviceAdapter>
void TestTaskSingular()
void TestTaskStrided()
{
TestNormalFunctorInvoke<DeviceAdapter>();
TestErrorFunctorInvoke<DeviceAdapter>();
@ -356,7 +355,7 @@ void TestTaskSingular()
} // anonymous namespace
int UnitTestTaskSingularCuda(int, char* [])
int UnitTestTaskStrided(int, char* [])
{
return vtkm::cont::testing::Testing::Run(TestTaskSingular<vtkm::cont::DeviceAdapterTagCuda>);
return vtkm::cont::testing::Testing::Run(TestTaskStrided<vtkm::cont::DeviceAdapterTagCuda>);
}

@ -28,9 +28,11 @@
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \
!defined(vtk_m_exec_internal_TaskTiling_h) && \
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, or TaskStrided.h
#endif
#include <vtkm/internal/FunctionInterface.h>

@ -40,9 +40,11 @@ $# Ignore the following comment. It is meant for the generated file.
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \\
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \\
!defined(vtk_m_exec_internal_TaskTiling_h) && \\
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \\
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, or TaskStrided.h
#endif
#include <vtkm/internal/FunctionInterface.h>

@ -3007,42 +3007,49 @@ public:
template <typename InputPortalType, typename OutputPortalType>
VTKM_EXEC void operator()(const InputPortalType& coeffs,
OutputPortalType& sigOut,
const vtkm::Id& workIndex) const
vtkm::Id workIndex) const
{
if (workIndex < cALen2) // valid calculation region
if (workIndex >= cALen2) // outside the valid calculation region
{
vtkm::Id xi; // coeff indices
vtkm::Id k1, k2; // indices for low and high filter
VAL sum = 0.0;
if (workIndex % 2 != 0)
{
k1 = this->filterLen - 2;
k2 = this->filterLen - 1;
}
else
{
k1 = this->filterLen - 1;
k2 = this->filterLen - 2;
}
xi = (workIndex + 1) / 2;
while (k1 > -1) // k1 >= 0
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi++));
k1 -= 2;
}
xi = workIndex / 2;
while (k2 > -1) // k2 >= 0
{
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(this->cALenExtended + xi++));
k2 -= 2;
}
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
return;
}
vtkm::Id xi1 = (workIndex + 1) / 2; // coeff indices
vtkm::Id xi2 = this->cALenExtended + ((workIndex) / 2); // coeff indices
VAL sum = 0.0;
const bool odd = workIndex % 2 != 0;
if (odd)
{
vtkm::Id k1 = this->filterLen - 2;
vtkm::Id k2 = this->filterLen - 1;
for (; k1 >= 0; k1 -= 2, k2 -= 2)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
if (k2 >= 0)
{
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
}
else //even
{
vtkm::Id k1 = this->filterLen - 1;
vtkm::Id k2 = this->filterLen - 2;
for (; k2 >= 0; k1 -= 2, k2 -= 2)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
if (k1 >= 0)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
}
}
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
}
#undef MAKEVAL
#undef VAL
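A note on this last hunk (my reading, not part of the commit message): the two separate while loops over k1 and k2 have been fused into a single loop plus a tail term, and the enclosing if has become an early return. For example, with filterLen = 4 and an odd workIndex, both versions accumulate lowFilter(2) and lowFilter(0) against coeffs starting at (workIndex + 1) / 2, and highFilter(3) and highFilter(1) against coeffs starting at cALenExtended + workIndex / 2; the trailing if after each fused loop only fires when filterLen is odd and one filter has a leftover tap.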