ThrustPatches now only applies fixes for relevant CUDA versions

Rather than always patching Thrust functions, we now apply each patch
only for the Thrust versions that need it.
This commit is contained in:
Robert Maynard 2019-04-10 11:13:50 -04:00
parent 20d6201a98
commit f05940aaca

@@ -24,108 +24,17 @@
#ifdef VTKM_ENABLE_CUDA
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
// Needed so we can conditionally include components
#include <thrust/version.h>
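//THRUST_VERSION encodes a release as major * 100000 + minor * 100 + subminor,
//so 100900 below corresponds to Thrust 1.9.0 and 100903 to Thrust 1.9.3.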
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T destructive_accumulate_n(
  ConcurrentGroup& g,
  RandomAccessIterator first,
  Size n,
  T init,
  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  //Seed each thread with its input element
  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }
  g.wait();

  //Hillis-Steele style inclusive scan: combine values 'offset' apart,
  //doubling the stride each round until it spans the whole group
  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }
    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }
    g.wait();
  }

  //Fold init with the last scanned element to form the group-wide result
  T result = binary_op(init, first[n - 1]);
  g.wait();

  return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif
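For context, here is a minimal sketch (not part of the commit) of the kind of
call the inclusive_scan patch above protects. The TakeFirst functor is
hypothetical, chosen only because it is non-commutative, which is the case
Thrust 1.8.0 - 1.8.2 mishandled:

#include <thrust/device_vector.h>
#include <thrust/scan.h>

//Hypothetical non-commutative operator: op(a, b) != op(b, a)
struct TakeFirst
{
  __host__ __device__ float operator()(float a, float) const { return a; }
};

void example()
{
  thrust::device_vector<float> v(8, 1.0f);
  //With an unpatched Thrust 1.8.0 - 1.8.2, this accumulation path could
  //produce wrong results because the operator is not commutative.
  thrust::inclusive_scan(v.begin(), v.end(), v.begin(), TakeFirst{});
}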
#if THRUST_VERSION >= 100900
//So for thrust 1.9.0+ (CUDA 9.X+) the aligned_reinterpret_cast has a bug
//where it is not marked as __host__ __device__. To fix this we add a new
//overload for void* with the correct markup (which is the overload everyone
//calls).
namespace thrust
{
namespace detail
{
//just in case somebody also has this fix for primitive types
template <typename T, typename U>
T aligned_reinterpret_cast(U u);
@@ -265,16 +174,18 @@ ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float64);
#undef ALIGN_RE_PAIR
}
}
#endif //THRUST_VERSION >= 100900
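For reference, a sketch of the shape of one overload the patch adds; the
actual commit stamps these out with the ALIGN_RE_* macros elided in the hunk
above, so the exact body here is assumed:

namespace thrust
{
namespace detail
{
//Sketch only: a void* overload carrying the __host__ __device__ markup that
//the stock aligned_reinterpret_cast lacks
template <typename T>
inline __host__ __device__ T aligned_reinterpret_cast(void* u)
{
  return reinterpret_cast<T>(u);
}
}
}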
#if THRUST_VERSION >= 100903
//So for thrust 1.9.3+ (CUDA 10.1+) the stateless_resource_allocator has a bug
//where it is not marked as __host__ __device__ and __thrust_exec_check_disable__.
//To fix this we add a new partial specialization on cuda::memory_resource
//with the correct markup (which is the specialization everyone uses).
//See: https://github.com/thrust/thrust/issues/972
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
VTKM_THIRDPARTY_POST_INCLUDE
namespace thrust
{
namespace mr
@@ -329,5 +240,103 @@ public:
};
}
}
#endif //THRUST_VERSION >= 100903
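For reference, a sketch (type names assumed from Thrust 1.9.3) of the
allocator the partial specialization above targets:

#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>

//Sketch only: the patched partial specialization is what this alias picks up
using PatchedAllocator =
  thrust::mr::stateless_resource_allocator<float,
                                           thrust::system::cuda::memory_resource>;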
#if THRUST_VERSION < 100900
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T destructive_accumulate_n(
  ConcurrentGroup& g,
  RandomAccessIterator first,
  Size n,
  T init,
  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  //Seed each thread with its input element
  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }
  g.wait();

  //Hillis-Steele style inclusive scan: combine values 'offset' apart,
  //doubling the stride each round until it spans the whole group
  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }
    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }
    g.wait();
  }

  //Fold init with the last scanned element to form the group-wide result
  T result = binary_op(init, first[n - 1]);
  g.wait();

  return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900
#endif //CUDA enabled
#endif //vtk_m_exec_cuda_internal_ThrustPatches_h