Merge topic 'cuda_task_strided'
b56894dd Move VTK-m Cuda backend over to a grid-stride iteration pattern. Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Kenneth Moreland <kmorel@sandia.gov> Merge-request: !1171
This commit is contained in:
commit
e32eebeae9
@ -43,4 +43,5 @@ target_sources(vtkm_cont PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
|
||||
)
|
||||
|
@ -31,6 +31,8 @@
|
||||
// Here are the actual implementation of the algorithms.
|
||||
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
|
||||
|
||||
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
namespace vtkm
|
||||
@ -257,24 +259,24 @@ class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
|
||||
{
|
||||
public:
|
||||
template <typename WorkletType, typename InvocationType>
|
||||
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
|
||||
static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType> MakeTask(
|
||||
WorkletType& worklet,
|
||||
InvocationType& invocation,
|
||||
vtkm::Id,
|
||||
vtkm::Id globalIndexOffset = 0)
|
||||
{
|
||||
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
|
||||
using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
|
||||
return Task(worklet, invocation, globalIndexOffset);
|
||||
}
|
||||
|
||||
template <typename WorkletType, typename InvocationType>
|
||||
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
|
||||
static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType> MakeTask(
|
||||
WorkletType& worklet,
|
||||
InvocationType& invocation,
|
||||
vtkm::Id3,
|
||||
vtkm::Id globalIndexOffset = 0)
|
||||
{
|
||||
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
|
||||
using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
|
||||
return Task(worklet, invocation, globalIndexOffset);
|
||||
}
|
||||
};
|
||||
|
153
vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
Normal file
153
vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
Normal file
@ -0,0 +1,153 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
|
||||
// Copyright 2014 UT-Battelle, LLC.
|
||||
// Copyright 2014 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
VTKM_CONT_EXPORT int getNumSMs(int dId)
|
||||
{
|
||||
//check
|
||||
static bool lookupBuilt = false;
|
||||
static std::vector<int> numSMs;
|
||||
|
||||
if (!lookupBuilt)
|
||||
{
|
||||
//lock the mutex
|
||||
static std::mutex built_mutex;
|
||||
std::lock_guard<std::mutex> lock(built_mutex);
|
||||
|
||||
//iterate over all devices
|
||||
int numberOfSMs = 0;
|
||||
int count = 0;
|
||||
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
|
||||
numSMs.reserve(static_cast<std::size_t>(count));
|
||||
for (int deviceId = 0; deviceId < count; ++deviceId)
|
||||
{ //get the number of sm's per deviceId
|
||||
VTKM_CUDA_CALL(
|
||||
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
|
||||
numSMs.push_back(numberOfSMs);
|
||||
}
|
||||
lookupBuilt = true;
|
||||
}
|
||||
return numSMs[dId];
|
||||
}
|
||||
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
template <>
|
||||
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
|
||||
vtkm::Id& arraySize,
|
||||
char** hostPointer)
|
||||
{
|
||||
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static bool errorArrayInit = false;
|
||||
static char* hostPtr = nullptr;
|
||||
static char* devicePtr = nullptr;
|
||||
if (!errorArrayInit)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
|
||||
errorArrayInit = true;
|
||||
}
|
||||
//set the size of the array
|
||||
arraySize = ERROR_ARRAY_SIZE;
|
||||
|
||||
//specify the host pointer to the memory
|
||||
*hostPointer = hostPtr;
|
||||
(void)hostPointer;
|
||||
return devicePtr;
|
||||
}
|
||||
|
||||
template <>
|
||||
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
|
||||
vtkm::exec::cuda::internal::TaskStrided& functor)
|
||||
{
|
||||
//since the memory is pinned we can access it safely on the host
|
||||
//without a memcpy
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
//clear the first character which means that we don't contain an error
|
||||
hostErrorPtr[0] = '\0';
|
||||
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
return hostErrorPtr;
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
int& grids,
|
||||
int& blocks,
|
||||
vtkm::Id size)
|
||||
{
|
||||
(void)size;
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
grids = 32 * getNumSMs(deviceId);
|
||||
blocks = 128;
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
int& grids,
|
||||
dim3& blocks,
|
||||
const dim3& size)
|
||||
{
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
grids = 32 * getNumSMs(deviceId);
|
||||
|
||||
if (size.x == 0)
|
||||
{ //grids that have no x dimension
|
||||
blocks.x = 1;
|
||||
blocks.y = 16;
|
||||
blocks.z = 8;
|
||||
}
|
||||
else if (size.x > 128)
|
||||
{
|
||||
blocks.x = 64;
|
||||
blocks.y = 2;
|
||||
blocks.z = 1;
|
||||
}
|
||||
else
|
||||
{ //for really small grids
|
||||
blocks.x = 8;
|
||||
blocks.y = 4;
|
||||
blocks.z = 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -21,6 +21,8 @@
|
||||
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
|
||||
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
|
||||
|
||||
#include <vtkm/cont/vtkm_cont_export.h>
|
||||
|
||||
#include <vtkm/TypeTraits.h>
|
||||
#include <vtkm/Types.h>
|
||||
#include <vtkm/UnaryPredicates.h>
|
||||
@ -34,15 +36,16 @@
|
||||
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
|
||||
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
|
||||
|
||||
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
||||
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
|
||||
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
||||
#include <vtkm/exec/internal/TaskSingular.h>
|
||||
|
||||
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
|
||||
|
||||
// #define ANALYZE_VTKM_SCHEDULER_1D
|
||||
// #define ANALYZE_VTKM_SCHEDULER_3D
|
||||
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
|
||||
|
||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
|
||||
#include <vtkm/cont/cuda/internal/TaskTuner.h>
|
||||
#endif
|
||||
|
||||
@ -62,7 +65,6 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
|
||||
VTKM_THIRDPARTY_POST_INCLUDE
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
@ -82,73 +84,20 @@ namespace internal
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
#endif
|
||||
|
||||
static __global__ void DetermineProperXGridSize(vtkm::UInt32 desired_size,
|
||||
vtkm::UInt32* actual_size)
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
|
||||
{
|
||||
//used only to see if we can launch kernels with a x grid size that
|
||||
//matches the max of the graphics card, or are we having to fall back
|
||||
//to SM_2 grid sizes
|
||||
if (blockIdx.x != 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
#if __CUDA_ARCH__ <= 200
|
||||
const vtkm::UInt32 maxXGridSizeForSM2 = 65535;
|
||||
*actual_size = maxXGridSizeForSM2;
|
||||
#else
|
||||
*actual_size = desired_size;
|
||||
#endif
|
||||
//see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
|
||||
//for why our inc is grid-stride
|
||||
const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const vtkm::Id inc = blockDim.x * gridDim.x;
|
||||
task(start, size, inc);
|
||||
}
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel(FunctorType functor,
|
||||
vtkm::Id numberOfKernelsInvoked,
|
||||
vtkm::Id length)
|
||||
{
|
||||
//Note a cuda launch can only handle at most 2B iterations of a kernel
|
||||
//because it holds all of the indexes inside UInt32, so for use to
|
||||
//handle datasets larger than 2B, we need to execute multiple kernels
|
||||
const vtkm::Id index =
|
||||
numberOfKernelsInvoked + static_cast<vtkm::Id>(blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (index < length)
|
||||
{
|
||||
functor(index);
|
||||
}
|
||||
}
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
|
||||
{
|
||||
const vtkm::Id3 index(blockIdx.x * blockDim.x + threadIdx.x,
|
||||
blockIdx.y * blockDim.y + threadIdx.y,
|
||||
blockIdx.z * blockDim.z + threadIdx.z);
|
||||
if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z)
|
||||
{
|
||||
return;
|
||||
}
|
||||
functor(index);
|
||||
}
|
||||
|
||||
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
|
||||
|
||||
// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
|
||||
// for an unknown reason
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel2(FunctorType functor,
|
||||
vtkm::Id numberOfKernelsInvoked,
|
||||
vtkm::Id length)
|
||||
{
|
||||
vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
|
||||
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
|
||||
for (; index < length; index += inc)
|
||||
{
|
||||
functor(index);
|
||||
}
|
||||
}
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size)
|
||||
{
|
||||
//This is the 3D version of executing in a grid-stride manner
|
||||
const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
|
||||
blockIdx.y * blockDim.y + threadIdx.y,
|
||||
blockIdx.z * blockDim.z + threadIdx.z);
|
||||
@ -158,17 +107,11 @@ __global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
|
||||
{
|
||||
for (uint j = start.y; j < size.y; j += inc.y)
|
||||
{
|
||||
vtkm::Id3 index(start.x, j, k);
|
||||
for (vtkm::Id i = start.x; i < size.x; i += inc.x)
|
||||
{
|
||||
index[0] = i;
|
||||
functor(index);
|
||||
}
|
||||
task(start.x, size.x, inc.x, j, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T, typename BinaryOperationType>
|
||||
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
|
||||
@ -180,16 +123,6 @@ __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d)
|
||||
{
|
||||
gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1)
|
||||
: (rangeMax.x / blockSize3d.x);
|
||||
gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1)
|
||||
: (rangeMax.y / blockSize3d.y);
|
||||
gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1)
|
||||
: (rangeMax.z / blockSize3d.z);
|
||||
}
|
||||
|
||||
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
|
||||
/// device that uses thrust as its implementation. The subclass should pass in
|
||||
/// the correct device adapter tag as the template parameter.
|
||||
@ -1142,89 +1075,24 @@ public:
|
||||
::thrust::equal_to<T>(),
|
||||
binary_functor);
|
||||
}
|
||||
// Because of some funny code conversions in nvcc, kernels for devices have to
|
||||
// be public.
|
||||
#ifndef VTKM_CUDA
|
||||
private:
|
||||
#endif
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
VTKM_CONT
|
||||
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer)
|
||||
{
|
||||
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static bool errorArrayInit = false;
|
||||
static char* hostPtr = nullptr;
|
||||
static char* devicePtr = nullptr;
|
||||
if (!errorArrayInit)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
|
||||
errorArrayInit = true;
|
||||
}
|
||||
//set the size of the array
|
||||
arraySize = ERROR_ARRAY_SIZE;
|
||||
VTKM_CONT_EXPORT
|
||||
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
|
||||
|
||||
//specify the host pointer to the memory
|
||||
*hostPointer = hostPtr;
|
||||
(void)hostPointer;
|
||||
return devicePtr;
|
||||
}
|
||||
VTKM_CONT_EXPORT
|
||||
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
|
||||
// we query cuda for the max blocks per grid for 1D scheduling
|
||||
// and cache the values in static variables
|
||||
VTKM_CONT
|
||||
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
|
||||
{
|
||||
static std::atomic<bool> gridQueryInit(false);
|
||||
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
|
||||
// NOTE: The following code may still be executed by multiple threads
|
||||
// but it should not cause any correctness issues.
|
||||
if (!gridQueryInit)
|
||||
{
|
||||
int currDevice;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(int& grid, int& blocks, vtkm::Id size);
|
||||
|
||||
cudaDeviceProp properties;
|
||||
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
|
||||
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
|
||||
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
|
||||
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
|
||||
|
||||
//Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids
|
||||
//in the X direction, it is dependent on the code being compiled for SM3+.
|
||||
//If not, it falls back to SM_2 limitation of 65535 being the largest grid
|
||||
//size.
|
||||
//Now since SM architecture is only available inside kernels we have to
|
||||
//invoke one to see what the actual limit is for our device. So that is
|
||||
//what we are going to do next, and than we will store that result
|
||||
|
||||
vtkm::UInt32* dev_actual_size;
|
||||
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
|
||||
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
|
||||
dev_actual_size,
|
||||
sizeof(vtkm::UInt32),
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
gridQueryInit = true;
|
||||
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
|
||||
}
|
||||
return maxGridSize;
|
||||
}
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(int& grid, dim3& blocks, const dim3& size);
|
||||
|
||||
public:
|
||||
template <class TaskType, typename RangeType>
|
||||
VTKM_CONT static void ScheduleTask(TaskType task, RangeType size)
|
||||
{ //for now defer to the schedule api, we need to do a significant
|
||||
//amount of build infrastructure work to implement type erasure tasks
|
||||
//for cuda
|
||||
Schedule(task, size);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
|
||||
template <typename WType, typename IType>
|
||||
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
|
||||
vtkm::Id numInstances)
|
||||
{
|
||||
VTKM_ASSERT(numInstances >= 0);
|
||||
if (numInstances < 1)
|
||||
@ -1232,45 +1100,12 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
//since the memory is pinned we can access it safely on the host
|
||||
//without a memcpy
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
int grids, blocks;
|
||||
GetGridsAndBlocks(grids, blocks, numInstances);
|
||||
|
||||
//clear the first character which means that we don't contain an error
|
||||
hostErrorPtr[0] = '\0';
|
||||
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
const vtkm::Id blockSizeAsId = 128;
|
||||
const vtkm::UInt32 blockSize = 128;
|
||||
const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0];
|
||||
const vtkm::UInt32 totalBlocks =
|
||||
static_cast<vtkm::UInt32>((numInstances + blockSizeAsId - 1) / blockSizeAsId);
|
||||
|
||||
//Note a cuda launch can only handle at most 2B iterations of a kernel
|
||||
//because it holds all of the indexes inside UInt32, so for use to
|
||||
//handle datasets larger than 2B, we need to execute multiple kernels
|
||||
if (totalBlocks < maxblocksPerLaunch)
|
||||
{
|
||||
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), numInstances);
|
||||
}
|
||||
else
|
||||
{
|
||||
const vtkm::Id numberOfKernelsToRun =
|
||||
blockSizeAsId * static_cast<vtkm::Id>(maxblocksPerLaunch);
|
||||
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
|
||||
numberOfKernelsInvoked += numberOfKernelsToRun)
|
||||
{
|
||||
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, numberOfKernelsInvoked, numInstances);
|
||||
}
|
||||
}
|
||||
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, numInstances);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
@ -1284,13 +1119,14 @@ public:
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_1D
|
||||
compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize);
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
parameter_sweep_1d_schedule(functor, numInstances);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
|
||||
template <typename WType, typename IType>
|
||||
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
|
||||
vtkm::Id3 rangeMax)
|
||||
{
|
||||
VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
|
||||
if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
|
||||
@ -1298,46 +1134,17 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
|
||||
//since the memory is pinned we can access it safely on the host
|
||||
//without a memcpy
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
//clear the first character which means that we don't contain an error
|
||||
hostErrorPtr[0] = '\0';
|
||||
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
|
||||
//currently we presume that 3d scheduling access patterns prefer accessing
|
||||
//memory in the X direction. Also should be good for thin in the Z axis
|
||||
//algorithms.
|
||||
dim3 blockSize3d(64, 2, 1);
|
||||
//In general we need more information as this doesn't work well when
|
||||
//executing on the points, and need to fetch all cells used, as the z
|
||||
//width is not fat enough
|
||||
int grids;
|
||||
dim3 blocks;
|
||||
GetGridsAndBlocks(grids, blocks, ranges);
|
||||
|
||||
//handle the simple use case of 'bad' datasets which are thin in X
|
||||
//but larger in the other directions, allowing us decent performance with
|
||||
//that use case.
|
||||
if (rangeMax[0] <= 128 && (rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2]))
|
||||
{
|
||||
blockSize3d = dim3(16, 4, 4);
|
||||
}
|
||||
|
||||
|
||||
dim3 gridSize3d;
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
@ -1351,23 +1158,25 @@ public:
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_1D
|
||||
compare_1d_dynamic_block_picker(functor,
|
||||
rangeMax[0] * rangeMax[1] * rangeMax[2],
|
||||
gridSize3d.x * gridSize3d.y * gridSize3d.z,
|
||||
blockSize3d.x * blockSize3d.y * blockSize3d.z);
|
||||
#endif
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_3D
|
||||
//requires the errormessage buffer be set
|
||||
compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d);
|
||||
#endif
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
|
||||
{
|
||||
vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
|
||||
ScheduleTask(kernel, numInstances);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
|
||||
{
|
||||
vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
|
||||
ScheduleTask(kernel, rangeMax);
|
||||
}
|
||||
|
||||
template <typename T, class Storage>
|
||||
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
|
||||
{
|
||||
|
@ -41,293 +41,167 @@ namespace cuda
|
||||
namespace internal
|
||||
{
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
|
||||
int getNumSMs(int dId);
|
||||
|
||||
void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
|
||||
|
||||
|
||||
template <typename Task>
|
||||
__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
|
||||
struct PerfRecord1d
|
||||
{
|
||||
const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
|
||||
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
|
||||
for (vtkm::Id i = start; i < size; i += inc)
|
||||
{
|
||||
task(i);
|
||||
}
|
||||
}
|
||||
|
||||
class PerfRecord
|
||||
{
|
||||
public:
|
||||
PerfRecord(float elapsedT, dim3 block)
|
||||
PerfRecord1d(float elapsedT, int g, int b)
|
||||
: elapsedTime(elapsedT)
|
||||
, blockSize(block)
|
||||
, grid(g)
|
||||
, block(b)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
|
||||
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
float elapsedTime;
|
||||
dim3 blockSize;
|
||||
int grid;
|
||||
int block;
|
||||
};
|
||||
|
||||
template <typename Task>
|
||||
static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
|
||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
|
||||
{
|
||||
int blockSize; // The launch configurator returned block size
|
||||
int minGridSize; // The minimum grid size needed to achieve the
|
||||
// maximum occupancy for a full device launch
|
||||
int gridSize; // The actual grid size needed, based on number of SM's
|
||||
int device; // device to run on
|
||||
int numSMs; // number of SMs on the active device
|
||||
|
||||
cudaGetDevice(&device);
|
||||
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
|
||||
|
||||
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, Schedule1DIndexKernel2<Task>, 0, 0);
|
||||
|
||||
|
||||
blockSize /= (numSMs * 2);
|
||||
// Round up according to array size
|
||||
// gridSize = (size + blockSize - 1) / blockSize;
|
||||
gridSize = 32 * numSMs;
|
||||
// std::cout << "numSMs: " << numSMs << std::endl;
|
||||
|
||||
// calculate theoretical occupancy
|
||||
int maxActiveBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&maxActiveBlocks, Schedule1DIndexKernel2<Task>, blockSize, 0);
|
||||
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device);
|
||||
|
||||
grids = gridSize;
|
||||
blocks = blockSize;
|
||||
occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
|
||||
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
|
||||
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
|
||||
<< ">>> required: " << record.elapsedTime << "\n";
|
||||
return os;
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void compare_1d_dynamic_block_picker(Functor functor,
|
||||
vtkm::Id size,
|
||||
const vtkm::Id& currentGridSize,
|
||||
const vtkm::Id& currentBlockSize)
|
||||
|
||||
struct PerfRecord3d
|
||||
{
|
||||
const std::type_info& ti = typeid(functor);
|
||||
std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
|
||||
PerfRecord3d(float elapsedT, int g, dim3 b)
|
||||
: elapsedTime(elapsedT)
|
||||
, grid(g)
|
||||
, block(b)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
|
||||
std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
|
||||
{
|
||||
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
int grids, blocks;
|
||||
float occupancy;
|
||||
BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
|
||||
float elapsedTime;
|
||||
int grid;
|
||||
dim3 block;
|
||||
};
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
|
||||
{
|
||||
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
|
||||
std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
|
||||
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
|
||||
return os;
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void compare_3d_dynamic_block_picker(Functor functor,
|
||||
vtkm::Id3 ranges,
|
||||
const dim3& gridSize3d,
|
||||
const dim3& blockSize3d)
|
||||
|
||||
template <typename TaskT>
|
||||
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
|
||||
{
|
||||
const std::type_info& ti = typeid(functor);
|
||||
std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
|
||||
std::vector<PerfRecord1d> results;
|
||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
||||
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
|
||||
256, 512, 1024, 2048, 4096, 8192 };
|
||||
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
|
||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
int grids = gridIndexTable[g] * getNumSMs(deviceId);
|
||||
for (vtkm::UInt32 b = 0; b < 12; b++)
|
||||
{
|
||||
int blocks = blockIndexTable[b];
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
// std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
|
||||
// std::cout << "GridSize of: " << currentGridSize
|
||||
// << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
|
||||
std::sort(results.begin(), results.end());
|
||||
for (auto&& i : results)
|
||||
{
|
||||
|
||||
// int grids, blocks;
|
||||
// float occupancy;
|
||||
// BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
|
||||
|
||||
// cudaEvent_t start, stop;
|
||||
// VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
// VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
// Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
|
||||
// VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
// float elapsedTimeMilliseconds;
|
||||
// VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
// VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
// std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
|
||||
// std::cout << "GridSize of: " << grids
|
||||
// << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
|
||||
template <typename TaskT>
|
||||
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
|
||||
{
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
std::vector<PerfRecord> results;
|
||||
vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
|
||||
std::vector<PerfRecord3d> results;
|
||||
|
||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
||||
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
|
||||
28, 30, 32, 64, 128, 256, 512, 1024 };
|
||||
|
||||
int deviceId;
|
||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
||||
{
|
||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
||||
int grids = gridIndexTable[g] * getNumSMs(deviceId);
|
||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
||||
{
|
||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
|
||||
dim3 gridSize3d;
|
||||
|
||||
if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
|
||||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
|
||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
||||
{
|
||||
//cuda can't handle more than 1024 threads per block
|
||||
//so don't try if we compute higher than that
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
//also don't try stupidly low numbers
|
||||
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
|
||||
|
||||
//cuda can't handle more than 64 threads in the z direction
|
||||
continue;
|
||||
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
|
||||
blocks.z >= 64)
|
||||
{
|
||||
//cuda can't handle more than 1024 threads per block
|
||||
//so don't try if we compute higher than that
|
||||
|
||||
//also don't try stupidly low numbers
|
||||
|
||||
//cuda can't handle more than 64 threads in the z direction
|
||||
continue;
|
||||
}
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
||||
}
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
|
||||
results.push_back(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
|
||||
for (vtkm::Int64 i = 1; i <= size; i++)
|
||||
for (auto&& i : results)
|
||||
{
|
||||
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
|
||||
vtkm::UInt32 x = results[index].blockSize.x;
|
||||
vtkm::UInt32 y = results[index].blockSize.y;
|
||||
vtkm::UInt32 z = results[index].blockSize.z;
|
||||
float t = results[index].elapsedTime;
|
||||
|
||||
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "fixed 3d block size performance " << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(64, 2, 1);
|
||||
dim3 gridSize3d;
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,8 +22,9 @@ set(headers
|
||||
ArrayPortalFromThrust.h
|
||||
ExecutionPolicy.h
|
||||
IteratorFromArrayPortal.h
|
||||
WrappedOperators.h
|
||||
TaskStrided.h
|
||||
ThrustPatches.h
|
||||
WrappedOperators.h
|
||||
)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
207
vtkm/exec/cuda/internal/TaskStrided.h
Normal file
207
vtkm/exec/cuda/internal/TaskStrided.h
Normal file
@ -0,0 +1,207 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
|
||||
// Copyright 2014 UT-Battelle, LLC.
|
||||
// Copyright 2014 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
#ifndef vtk_m_exec_cuda_internal_TaskStrided_h
|
||||
#define vtk_m_exec_cuda_internal_TaskStrided_h
|
||||
|
||||
#include <vtkm/exec/TaskBase.h>
|
||||
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
|
||||
//Todo: rename this header to TaskInvokeWorkletDetail.h
|
||||
#include <vtkm/exec/internal/WorkletInvokeFunctorDetail.h>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace exec
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
template <typename WType>
|
||||
void TaskStridedSetErrorBuffer(void* w, const vtkm::exec::internal::ErrorMessageBuffer& buffer)
|
||||
{
|
||||
using WorkletType = typename std::remove_cv<WType>::type;
|
||||
WorkletType* const worklet = static_cast<WorkletType*>(w);
|
||||
worklet->SetErrorMessageBuffer(buffer);
|
||||
}
|
||||
|
||||
class TaskStrided : public vtkm::exec::TaskBase
|
||||
{
|
||||
public:
|
||||
void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer& buffer)
|
||||
{
|
||||
(void)buffer;
|
||||
this->SetErrorBufferFunction(this->WPtr, buffer);
|
||||
}
|
||||
|
||||
protected:
|
||||
void* WPtr = nullptr;
|
||||
|
||||
using SetErrorBufferSignature = void (*)(void*, const vtkm::exec::internal::ErrorMessageBuffer&);
|
||||
SetErrorBufferSignature SetErrorBufferFunction = nullptr;
|
||||
};
|
||||
|
||||
template <typename WType, typename IType>
|
||||
class TaskStrided1D : public TaskStrided
|
||||
{
|
||||
public:
|
||||
TaskStrided1D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
|
||||
: TaskStrided()
|
||||
, Worklet(worklet)
|
||||
, Invocation(invocation)
|
||||
, GlobalIndexOffset(globalIndexOffset)
|
||||
{
|
||||
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
|
||||
//Bind the Worklet to void*
|
||||
this->WPtr = (void*)&this->Worklet;
|
||||
}
|
||||
|
||||
VTKM_EXEC
|
||||
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
|
||||
{
|
||||
for (vtkm::Id index = start; index < end; index += inc)
|
||||
{
|
||||
//Todo: rename this function to DoTaskInvokeWorklet
|
||||
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
|
||||
this->Worklet,
|
||||
this->Invocation,
|
||||
this->Worklet.GetThreadIndices(index,
|
||||
this->Invocation.OutputToInputMap,
|
||||
this->Invocation.VisitArray,
|
||||
this->Invocation.GetInputDomain(),
|
||||
this->GlobalIndexOffset));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typename std::remove_const<WType>::type Worklet;
|
||||
// This is held by by value so that when we transfer the invocation object
|
||||
// over to CUDA it gets properly copied to the device. While we want to
|
||||
// hold by reference to reduce the number of copies, it is not possible
|
||||
// currently.
|
||||
const IType Invocation;
|
||||
const vtkm::Id GlobalIndexOffset;
|
||||
};
|
||||
|
||||
template <typename WType>
|
||||
class TaskStrided1D<WType, vtkm::internal::NullType> : public TaskStrided
|
||||
{
|
||||
public:
|
||||
TaskStrided1D(WType& worklet)
|
||||
: TaskStrided()
|
||||
, Worklet(worklet)
|
||||
{
|
||||
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
|
||||
//Bind the Worklet to void*
|
||||
this->WPtr = (void*)&this->Worklet;
|
||||
}
|
||||
|
||||
VTKM_EXEC
|
||||
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
|
||||
{
|
||||
for (vtkm::Id index = start; index < end; index += inc)
|
||||
{
|
||||
this->Worklet(index);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typename std::remove_const<WType>::type Worklet;
|
||||
};
|
||||
|
||||
template <typename WType, typename IType>
|
||||
class TaskStrided3D : public TaskStrided
|
||||
{
|
||||
public:
|
||||
TaskStrided3D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
|
||||
: TaskStrided()
|
||||
, Worklet(worklet)
|
||||
, Invocation(invocation)
|
||||
, GlobalIndexOffset(globalIndexOffset)
|
||||
{
|
||||
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
|
||||
//Bind the Worklet to void*
|
||||
this->WPtr = (void*)&this->Worklet;
|
||||
}
|
||||
|
||||
VTKM_EXEC
|
||||
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
|
||||
{
|
||||
vtkm::Id3 index(start, j, k);
|
||||
for (vtkm::Id i = start; i < end; i += inc)
|
||||
{
|
||||
index[0] = i;
|
||||
//Todo: rename this function to DoTaskInvokeWorklet
|
||||
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
|
||||
this->Worklet,
|
||||
this->Invocation,
|
||||
this->Worklet.GetThreadIndices(index,
|
||||
this->Invocation.OutputToInputMap,
|
||||
this->Invocation.VisitArray,
|
||||
this->Invocation.GetInputDomain(),
|
||||
this->GlobalIndexOffset));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typename std::remove_const<WType>::type Worklet;
|
||||
// This is held by by value so that when we transfer the invocation object
|
||||
// over to CUDA it gets properly copied to the device. While we want to
|
||||
// hold by reference to reduce the number of copies, it is not possible
|
||||
// currently.
|
||||
const IType Invocation;
|
||||
const vtkm::Id GlobalIndexOffset;
|
||||
};
|
||||
|
||||
template <typename WType>
|
||||
class TaskStrided3D<WType, vtkm::internal::NullType> : public TaskStrided
|
||||
{
|
||||
public:
|
||||
TaskStrided3D(WType& worklet)
|
||||
: TaskStrided()
|
||||
, Worklet(worklet)
|
||||
{
|
||||
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
|
||||
//Bind the Worklet to void*
|
||||
this->WPtr = (void*)&this->Worklet;
|
||||
}
|
||||
|
||||
VTKM_EXEC
|
||||
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
|
||||
{
|
||||
vtkm::Id3 index(start, j, k);
|
||||
for (vtkm::Id i = start; i < end; i += inc)
|
||||
{
|
||||
index[0] = i;
|
||||
this->Worklet(index);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
typename std::remove_const<WType>::type Worklet;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
} // vtkm::exec::cuda::internal
|
||||
|
||||
#endif //vtk_m_exec_cuda_internal_TaskStrided_h
|
@ -22,6 +22,6 @@
|
||||
|
||||
set(unit_tests
|
||||
UnitTestTextureMemorySupport.cu
|
||||
UnitTestTaskSingularCuda.cu
|
||||
UnitTestTaskStrided.cu
|
||||
)
|
||||
vtkm_unit_tests(SOURCES ${unit_tests})
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include <vtkm/exec/FunctorBase.h>
|
||||
#include <vtkm/exec/arg/BasicArg.h>
|
||||
#include <vtkm/exec/arg/ThreadIndicesBasic.h>
|
||||
#include <vtkm/exec/internal/TaskSingular.h>
|
||||
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
||||
|
||||
#include <vtkm/StaticAssert.h>
|
||||
|
||||
@ -165,14 +165,14 @@ using InvocationType2 = vtkm::internal::Invocation<ExecutionParameterInterface,
|
||||
MyVisitArrayPortal>;
|
||||
|
||||
template <typename TaskType>
|
||||
static __global__ void ScheduleTaskSingular(TaskType task, vtkm::Id start, vtkm::Id end)
|
||||
static __global__ void ScheduleTaskStrided(TaskType task, vtkm::Id start, vtkm::Id end)
|
||||
{
|
||||
|
||||
const vtkm::Id index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
const vtkm::Id inc = blockDim.x * gridDim.x;
|
||||
if (index >= start && index < end)
|
||||
{
|
||||
task(index);
|
||||
task(index, end, inc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -269,14 +269,13 @@ void TestNormalFunctorInvoke()
|
||||
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
|
||||
|
||||
std::cout << " Try void return." << std::endl;
|
||||
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType1>;
|
||||
TestWorkletProxy worklet;
|
||||
InvocationType1 invocation1(execObjects);
|
||||
|
||||
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
|
||||
auto task1 = TaskTypes::MakeTask(worklet, invocation1, vtkm::Id());
|
||||
|
||||
ScheduleTaskSingular<decltype(task1)><<<32, 256>>>(task1, 1, 2);
|
||||
ScheduleTaskStrided<decltype(task1)><<<32, 256>>>(task1, 1, 2);
|
||||
cudaDeviceSynchronize();
|
||||
input.SyncControlArray();
|
||||
output.SyncControlArray();
|
||||
@ -291,13 +290,12 @@ void TestNormalFunctorInvoke()
|
||||
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
|
||||
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
|
||||
|
||||
using TaskSingular2 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType2>;
|
||||
InvocationType2 invocation2(execObjects);
|
||||
|
||||
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
|
||||
auto task2 = TaskTypes::MakeTask(worklet, invocation2, vtkm::Id());
|
||||
|
||||
ScheduleTaskSingular<decltype(task2)><<<32, 256>>>(task2, 2, 3);
|
||||
ScheduleTaskStrided<decltype(task2)><<<32, 256>>>(task2, 2, 3);
|
||||
cudaDeviceSynchronize();
|
||||
input.SyncControlArray();
|
||||
output.SyncControlArray();
|
||||
@ -323,7 +321,8 @@ void TestErrorFunctorInvoke()
|
||||
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
|
||||
TestExecObject(output.PrepareForInPlace(DeviceAdapter())));
|
||||
|
||||
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletErrorProxy, InvocationType1>;
|
||||
using TaskStrided1 =
|
||||
vtkm::exec::cuda::internal::TaskStrided1D<TestWorkletErrorProxy, InvocationType1>;
|
||||
TestWorkletErrorProxy worklet;
|
||||
InvocationType1 invocation(execObjects);
|
||||
|
||||
@ -340,7 +339,7 @@ void TestErrorFunctorInvoke()
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
task.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
ScheduleTaskSingular<decltype(task)><<<32, 256>>>(task, 1, 2);
|
||||
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
|
||||
@ -348,7 +347,7 @@ void TestErrorFunctorInvoke()
|
||||
}
|
||||
|
||||
template <typename DeviceAdapter>
|
||||
void TestTaskSingular()
|
||||
void TestTaskStrided()
|
||||
{
|
||||
TestNormalFunctorInvoke<DeviceAdapter>();
|
||||
TestErrorFunctorInvoke<DeviceAdapter>();
|
||||
@ -356,7 +355,7 @@ void TestTaskSingular()
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
int UnitTestTaskSingularCuda(int, char* [])
|
||||
int UnitTestTaskStrided(int, char* [])
|
||||
{
|
||||
return vtkm::cont::testing::Testing::Run(TestTaskSingular<vtkm::cont::DeviceAdapterTagCuda>);
|
||||
return vtkm::cont::testing::Testing::Run(TestTaskStrided<vtkm::cont::DeviceAdapterTagCuda>);
|
||||
}
|
@ -28,9 +28,11 @@
|
||||
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
|
||||
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
|
||||
|
||||
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \
|
||||
!defined(VTKM_TEST_HEADER_BUILD)
|
||||
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
|
||||
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \
|
||||
!defined(vtk_m_exec_internal_TaskTiling_h) && \
|
||||
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \
|
||||
!defined(VTKM_TEST_HEADER_BUILD)
|
||||
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, TaskStrided.h
|
||||
#endif
|
||||
|
||||
#include <vtkm/internal/FunctionInterface.h>
|
||||
|
@ -40,9 +40,11 @@ $# Ignore the following comment. It is meant for the generated file.
|
||||
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
|
||||
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
|
||||
|
||||
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \\
|
||||
!defined(VTKM_TEST_HEADER_BUILD)
|
||||
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
|
||||
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \\
|
||||
!defined(vtk_m_exec_internal_TaskTiling_h) && \\
|
||||
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \\
|
||||
!defined(VTKM_TEST_HEADER_BUILD)
|
||||
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, TaskStrided.h
|
||||
#endif
|
||||
|
||||
#include <vtkm/internal/FunctionInterface.h>
|
||||
|
@ -3007,42 +3007,49 @@ public:
|
||||
template <typename InputPortalType, typename OutputPortalType>
|
||||
VTKM_EXEC void operator()(const InputPortalType& coeffs,
|
||||
OutputPortalType& sigOut,
|
||||
const vtkm::Id& workIndex) const
|
||||
vtkm::Id workIndex) const
|
||||
{
|
||||
if (workIndex < cALen2) // valid calculation region
|
||||
if (workIndex >= cALen2) // valid calculation region
|
||||
{
|
||||
vtkm::Id xi; // coeff indices
|
||||
vtkm::Id k1, k2; // indices for low and high filter
|
||||
VAL sum = 0.0;
|
||||
|
||||
if (workIndex % 2 != 0)
|
||||
{
|
||||
k1 = this->filterLen - 2;
|
||||
k2 = this->filterLen - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
k1 = this->filterLen - 1;
|
||||
k2 = this->filterLen - 2;
|
||||
}
|
||||
|
||||
xi = (workIndex + 1) / 2;
|
||||
while (k1 > -1) // k1 >= 0
|
||||
{
|
||||
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi++));
|
||||
k1 -= 2;
|
||||
}
|
||||
|
||||
xi = workIndex / 2;
|
||||
while (k2 > -1) // k2 >= 0
|
||||
{
|
||||
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(this->cALenExtended + xi++));
|
||||
k2 -= 2;
|
||||
}
|
||||
|
||||
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
|
||||
return;
|
||||
}
|
||||
vtkm::Id xi1 = (workIndex + 1) / 2; // coeff indices
|
||||
vtkm::Id xi2 = this->cALenExtended + ((workIndex) / 2); // coeff indices
|
||||
VAL sum = 0.0;
|
||||
|
||||
const bool odd = workIndex % 2 != 0;
|
||||
if (odd)
|
||||
{
|
||||
vtkm::Id k1 = this->filterLen - 2;
|
||||
vtkm::Id k2 = this->filterLen - 1;
|
||||
for (; k1 >= 0; k1 -= 2, k2 -= 2)
|
||||
{
|
||||
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
|
||||
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
|
||||
}
|
||||
if (k2 >= 0)
|
||||
{
|
||||
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
|
||||
}
|
||||
}
|
||||
else //even
|
||||
{
|
||||
vtkm::Id k1 = this->filterLen - 1;
|
||||
vtkm::Id k2 = this->filterLen - 2;
|
||||
for (; k2 >= 0; k1 -= 2, k2 -= 2)
|
||||
{
|
||||
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
|
||||
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
|
||||
}
|
||||
if (k1 >= 0)
|
||||
{
|
||||
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
|
||||
}
|
||||
}
|
||||
|
||||
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
|
||||
}
|
||||
|
||||
#undef MAKEVAL
|
||||
#undef VAL
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user