ThrustPatches now only applies fixes for relevant CUDA versions

Rather than always patching Thrust functions, we now apply each patch
only for the Thrust versions that need it.
This commit is contained in:
Robert Maynard 2019-04-10 11:13:50 -04:00
parent 20d6201a98
commit f05940aaca

@@ -24,108 +24,17 @@
#ifdef VTKM_ENABLE_CUDA
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
// Needed so we can conditionally include components
#include <thrust/version.h>
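//THRUST_VERSION encodes a release as major * 100000 + minor * 100 + subminor,
//so 100900 below corresponds to Thrust 1.9.0 and 100903 to Thrust 1.9.3.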
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T destructive_accumulate_n(
  ConcurrentGroup& g,
  RandomAccessIterator first,
  Size n,
  T init,
  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  //Seed each thread with its input element
  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }
  g.wait();

  //Hillis-Steele style inclusive scan: combine values 'offset' apart,
  //doubling the stride each round until it spans the whole group
  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }
    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }
    g.wait();
  }

  //Fold init with the last scanned element to form the group-wide result
  T result = binary_op(init, first[n - 1]);
  g.wait();

  return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif
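For context, here is a minimal sketch (not part of the commit) of the kind of
call the inclusive_scan patch above protects. The TakeFirst functor is
hypothetical, chosen only because it is non-commutative, which is the case
Thrust 1.8.0 - 1.8.2 mishandled:

#include <thrust/device_vector.h>
#include <thrust/scan.h>

//Hypothetical non-commutative operator: op(a, b) != op(b, a)
struct TakeFirst
{
  __host__ __device__ float operator()(float a, float) const { return a; }
};

void example()
{
  thrust::device_vector<float> v(8, 1.0f);
  //With an unpatched Thrust 1.8.0 - 1.8.2, this accumulation path could
  //produce wrong results because the operator is not commutative.
  thrust::inclusive_scan(v.begin(), v.end(), v.begin(), TakeFirst{});
}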
#if THRUST_VERSION >= 100900
//So for thrust 1.9.0+ (CUDA 9.X+) the aligned_reinterpret_cast has a bug
//where it is not marked as __host__ __device__. To fix this we add a new
//overload for void* with the correct markup (which is the overload everyone
//calls).
namespace thrust
{
namespace detail
{
//just in case somebody also has this fix for primitive types
template <typename T, typename U>
T aligned_reinterpret_cast(U u);
@@ -265,16 +174,18 @@ ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float64);
#undef ALIGN_RE_PAIR
}
}
#endif //THRUST_VERSION >= 100900
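For reference, a sketch of the shape of one overload the patch adds; the
actual commit stamps these out with the ALIGN_RE_* macros elided in the hunk
above, so the exact body here is assumed:

namespace thrust
{
namespace detail
{
//Sketch only: a void* overload carrying the __host__ __device__ markup that
//the stock aligned_reinterpret_cast lacks
template <typename T>
inline __host__ __device__ T aligned_reinterpret_cast(void* u)
{
  return reinterpret_cast<T>(u);
}
}
}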
#if THRUST_VERSION >= 100903
//So for thrust 1.9.3+ (CUDA 10.1+) the stateless_resource_allocator has a bug
//where it is not marked as __host__ __device__ and __thrust_exec_check_disable__.
//To fix this we add a new partial specialization on cuda::memory_resource
//with the correct markup (which is the specialization everyone uses).
//See: https://github.com/thrust/thrust/issues/972
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
VTKM_THIRDPARTY_POST_INCLUDE
namespace thrust
{
namespace mr
@@ -329,5 +240,103 @@ public:
};
}
}
#endif //THRUST_VERSION >= 100903
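For reference, a sketch (type names assumed from Thrust 1.9.3) of the
allocator the partial specialization above targets:

#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>

//Sketch only: the patched partial specialization is what this alias picks up
using PatchedAllocator =
  thrust::mr::stateless_resource_allocator<float,
                                           thrust::system::cuda::memory_resource>;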
#if THRUST_VERSION < 100900
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T destructive_accumulate_n(
  ConcurrentGroup& g,
  RandomAccessIterator first,
  Size n,
  T init,
  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  //Seed each thread with its input element
  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }
  g.wait();

  //Hillis-Steele style inclusive scan: combine values 'offset' apart,
  //doubling the stride each round until it spans the whole group
  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }
    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }
    g.wait();
  }

  //Fold init with the last scanned element to form the group-wide result
  T result = binary_op(init, first[n - 1]);
  g.wait();

  return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900
#endif //CUDA enabled
#endif //vtk_m_exec_cuda_internal_ThrustPatches_h