Solve reduce by key bugs with cuda 7.5 + maxwell hardware.

The concern is that every architecture, not only Maxwell, now performs a device synchronization after reduce_by_key when built with CUDA 7.5.
This isn't a super serious concern, but it is a downside.
This commit is contained in:
Robert Maynard 2016-05-12 09:46:05 -04:00
parent c2dcd1df2f
commit e5c3f9c42d
2 changed files with 56 additions and 1 deletions

@ -400,9 +400,10 @@ private:
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
BinaryFunctor> bop(binary_functor);
try
{
result_iterators = ::thrust::reduce_by_key(thrust::cuda::par,
result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
IteratorBegin(keys),
IteratorEnd(keys),
IteratorBegin(values),

@ -167,4 +167,58 @@ __host__ __device__
}
// reduce_by_key overload for the vtkm_cuda_policy execution policy, taken when
// the keys are addressed through thrust::system::cuda::pointer<T>.
//
// The `exec` argument is never read in the body; the work is forwarded to
// thrust::reduce_by_key using thrust::cuda::par.
//
// When compiled by nvcc 7.5.x the key pointers are unwrapped to raw pointers
// with get() and the call is followed by cudaDeviceSynchronize() before the
// result is returned. Any other compiler version forwards the arguments
// unchanged and adds no explicit synchronization.
//
// Returns the pair of output iterators produced by thrust::reduce_by_key.
template<typename T,
         typename InputIterator2,
         typename OutputIterator1,
         typename OutputIterator2,
         typename BinaryPredicate,
         typename BinaryFunction>
__host__
::thrust::pair<OutputIterator1,OutputIterator2>
reduce_by_key(const vtkm_cuda_policy &exec,
              thrust::system::cuda::pointer<T> keys_first,
              thrust::system::cuda::pointer<T> keys_last,
              InputIterator2 values_first,
              OutputIterator1 keys_output,
              OutputIterator2 values_output,
              BinaryPredicate binary_pred,
              BinaryFunction binary_op)
{
// The 7.5.x range is tested with __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__
// rather than __CUDACC_VER__: from CUDA 9.0 onwards nvcc turns any use of
// __CUDACC_VER__ into a compile error, which would break this header on
// newer toolkits even though they never need the workaround.
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && \
    (__CUDACC_VER_MAJOR__ == 7) && (__CUDACC_VER_MINOR__ >= 5)
  ::thrust::pair<OutputIterator1,OutputIterator2> result =
      thrust::reduce_by_key(thrust::cuda::par,
                            keys_first.get(),
                            keys_last.get(),
                            values_first,
                            keys_output,
                            values_output,
                            binary_pred,
                            binary_op);
  // Block the host until the device is idle before handing back the result.
  // NOTE(review): the cudaError_t returned here is discarded.
  cudaDeviceSynchronize();
  return result;
#else
  return thrust::reduce_by_key(thrust::cuda::par,
                               keys_first,
                               keys_last,
                               values_first,
                               keys_output,
                               values_output,
                               binary_pred,
                               binary_op);
#endif
}
// reduce_by_key overload for the vtkm_cuda_policy execution policy that
// accepts arbitrary key iterators.
//
// The `exec` argument is never read in the body; the work is forwarded to
// thrust::reduce_by_key using thrust::cuda::par with every other argument
// passed through unchanged.
//
// When compiled by nvcc 7.5.x the call is followed by cudaDeviceSynchronize()
// before the result is returned. Any other compiler version returns the
// Thrust result directly and adds no explicit synchronization.
//
// Returns the pair of output iterators produced by thrust::reduce_by_key.
template<typename InputIterator1,
         typename InputIterator2,
         typename OutputIterator1,
         typename OutputIterator2,
         typename BinaryPredicate,
         typename BinaryFunction>
__host__
::thrust::pair<OutputIterator1,OutputIterator2>
reduce_by_key(const vtkm_cuda_policy &exec,
              InputIterator1 keys_first,
              InputIterator1 keys_last,
              InputIterator2 values_first,
              OutputIterator1 keys_output,
              OutputIterator2 values_output,
              BinaryPredicate binary_pred,
              BinaryFunction binary_op)
{
// The 7.5.x range is tested with __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__
// rather than __CUDACC_VER__: from CUDA 9.0 onwards nvcc turns any use of
// __CUDACC_VER__ into a compile error, which would break this header on
// newer toolkits even though they never need the workaround.
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && \
    (__CUDACC_VER_MAJOR__ == 7) && (__CUDACC_VER_MINOR__ >= 5)
  ::thrust::pair<OutputIterator1,OutputIterator2> result =
      thrust::reduce_by_key(thrust::cuda::par,
                            keys_first,
                            keys_last,
                            values_first,
                            keys_output,
                            values_output,
                            binary_pred,
                            binary_op);
  // Block the host until the device is idle before handing back the result.
  // NOTE(review): the cudaError_t returned here is discarded.
  cudaDeviceSynchronize();
  return result;
#else
  return thrust::reduce_by_key(thrust::cuda::par,
                               keys_first,
                               keys_last,
                               values_first,
                               keys_output,
                               values_output,
                               binary_pred,
                               binary_op);
#endif
}
#endif