Merge branch 'use_ldg_cuda_load'

Robert Maynard 2015-03-11 10:47:57 -04:00
commit e4bd0132e6
4 changed files with 54 additions and 691 deletions

@@ -44,30 +44,12 @@
#endif // gcc && !CUDA
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromTexture.h>
#include <boost/utility/enable_if.hpp>
namespace vtkm {
namespace cont {
namespace cuda {
namespace internal {
template<typename T> struct UseTexturePortal {typedef boost::false_type type;};
//Currently disabled as we are still tracking down issues with texture
//memory. The major issue is that in testing it is slower than classic arrays.
#ifdef VTKM_USE_TEXTURE_MEM
template<> struct UseTexturePortal<vtkm::Int8> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::UInt8> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::Int16> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::UInt16> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::Int32> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::UInt32> {typedef boost::true_type type; };
template<> struct UseTexturePortal<vtkm::Float32> {typedef boost::true_type type; };
#endif
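//A hedged illustration (not in this commit) of how the trait gates the
//texture path at compile time: only with VTKM_USE_TEXTURE_MEM defined do
//the specializations above report true; everything else stays false_type.
//  BOOST_STATIC_ASSERT(( UseTexturePortal<vtkm::Float32>::type::value ));  // holds only with VTKM_USE_TEXTURE_MEM
//  BOOST_STATIC_ASSERT(( !UseTexturePortal<vtkm::Float64>::type::value )); // Float64 is never texture-backed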
/// \c ArrayManagerExecutionThrustDevice provides an implementation for a \c
/// ArrayManagerExecution class for a thrust device adapter that is designed
/// for the cuda backend which has separate memory spaces for host and device. This
@@ -77,15 +59,9 @@ template<> struct UseTexturePortal<vtkm::Float32> {typedef boost::true_type type
/// This array manager should only be used with the cuda device adapter,
/// since in the future it will take advantage of texture memory and
/// the unique memory access patterns of cuda systems.
-template<typename T, class StorageTag, typename Enable= void>
+template<typename T, class StorageTag>
class ArrayManagerExecutionThrustDevice
{
//We need a way to detect that we are running on Fermi or older hardware
//and disable the usage of the texture iterator. The __CUDA_ARCH__ define
//only exists in device code, so it can't be used here. I expect we will
//have to devise some form of try-compile with CUDA or offer this as an
//advanced CMake option. A runtime switch may also be possible (see the
//sketch after this class).
public:
typedef T ValueType;
@@ -217,164 +193,6 @@ private:
};
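//A hedged sketch (not part of this commit) of the runtime switch the
//comment above speculates about: query the device's compute capability
//through the CUDA runtime API and only enable texture/read-only loads on
//sm_35 or newer. The helper name DeviceSupportsTextureLoads is
//hypothetical; it assumes the CUDA runtime headers are available, as they
//are in this file.
static bool DeviceSupportsTextureLoads()
{
  int device = 0;
  cudaDeviceProp properties;
  if (cudaGetDevice(&device) != cudaSuccess ||
      cudaGetDeviceProperties(&properties, device) != cudaSuccess)
    {
    return false;
    }
  //__ldg and read-only cache loads require compute capability 3.5
  return (properties.major > 3) ||
         (properties.major == 3 && properties.minor >= 5);
}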
/// This is a specialization that is used to enable texture memory iterators
template<typename T, class StorageTag>
class ArrayManagerExecutionThrustDevice<T, StorageTag,
typename ::boost::enable_if< typename UseTexturePortal<T>::type >::type >
{
//We need a way to detect that we are running on Fermi or older hardware
//and disable the usage of the texture iterator. The __CUDA_ARCH__ define
//only exists in device code, so it can't be used here. I expect we will
//have to devise some form of try-compile with CUDA or offer this as an
//advanced CMake option. A runtime switch may also be possible.
public:
typedef T ValueType;
typedef vtkm::cont::internal::Storage<ValueType, StorageTag> ContainerType;
typedef vtkm::exec::cuda::internal::ArrayPortalFromThrust< T > PortalType;
typedef ::vtkm::exec::cuda::internal::DaxTexObjInputIterator<T> TextureIteratorType;
typedef ::vtkm::exec::cuda::internal::ConstArrayPortalFromTexture< TextureIteratorType > PortalConstType;
VTKM_CONT_EXPORT ArrayManagerExecutionThrustDevice():
NumberOfValues(0),
ArrayBegin(),
ArrayEnd(),
HaveTextureBound(false),
InputArrayIterator()
{
}
~ArrayManagerExecutionThrustDevice()
{
this->ReleaseResources();
}
/// Returns the size of the array.
///
VTKM_CONT_EXPORT vtkm::Id GetNumberOfValues() const {
return this->NumberOfValues;
}
/// Allocates the appropriate size of the array and copies the given data
/// into the array.
///
template<class PortalControl>
VTKM_CONT_EXPORT void LoadDataForInput(PortalControl arrayPortal)
{
//don't bind to the texture yet, as we could have allocated the array
//on a previous call to AllocateArrayForOutput and now are directly
//calling GetPortalConst
try
{
this->NumberOfValues = arrayPortal.GetNumberOfValues();
this->ArrayBegin = ::thrust::system::cuda::malloc<T>( static_cast<std::size_t>(this->NumberOfValues) );
this->ArrayEnd = this->ArrayBegin + this->NumberOfValues;
::thrust::copy(arrayPortal.GetRawIterator(),
arrayPortal.GetRawIterator() + this->NumberOfValues,
this->ArrayBegin);
}
catch (const std::bad_alloc &error)
{
throw vtkm::cont::ErrorControlOutOfMemory(error.what());
}
}
/// Allocates the appropriate size of the array and copies the given data
/// into the array.
///
template<class PortalControl>
VTKM_CONT_EXPORT void LoadDataForInPlace(PortalControl arrayPortal)
{
this->LoadDataForInput(arrayPortal);
}
/// Allocates the array to the given size.
///
VTKM_CONT_EXPORT void AllocateArrayForOutput(
ContainerType &vtkmNotUsed(container),
vtkm::Id numberOfValues)
{
if(this->NumberOfValues > 0)
{
::thrust::system::cuda::free( this->ArrayBegin );
}
this->NumberOfValues = numberOfValues;
this->ArrayBegin = ::thrust::system::cuda::malloc<T>( this->NumberOfValues );
this->ArrayEnd = this->ArrayBegin + numberOfValues;
}
/// Allocates enough space in \c controlArray and copies the data in the
/// device vector into it.
///
VTKM_CONT_EXPORT void RetrieveOutputData(ContainerType &controlArray) const
{
controlArray.Allocate(this->NumberOfValues);
::thrust::copy(this->ArrayBegin,
this->ArrayEnd,
controlArray.GetPortal().GetRawIterator());
}
/// Resizes the device vector.
///
VTKM_CONT_EXPORT void Shrink(vtkm::Id numberOfValues)
{
// The operation will succeed even if this assertion fails, but this
// is still supposed to be a precondition to Shrink.
VTKM_ASSERT_CONT(numberOfValues <= this->NumberOfValues);
this->NumberOfValues = numberOfValues;
this->ArrayEnd = this->ArrayBegin + this->NumberOfValues;
}
VTKM_CONT_EXPORT PortalType GetPortal()
{
return PortalType(this->ArrayBegin, this->ArrayEnd);
}
VTKM_CONT_EXPORT PortalConstType GetPortalConst() const
{
if(!this->HaveTextureBound)
{
this->HaveTextureBound = true;
this->InputArrayIterator.BindTexture(ArrayBegin,this->NumberOfValues);
}
//if we have a texture iterator bound use that
return PortalConstType(this->InputArrayIterator, this->NumberOfValues);
}
/// Frees all memory.
///
VTKM_CONT_EXPORT void ReleaseResources() {
if(this->HaveTextureBound)
{
this->HaveTextureBound = false;
this->InputArrayIterator.UnbindTexture();
}
::thrust::system::cuda::free( this->ArrayBegin );
this->ArrayBegin = ::thrust::system::cuda::pointer<ValueType>();
this->ArrayEnd = ::thrust::system::cuda::pointer<ValueType>();
}
private:
// Not implemented
ArrayManagerExecutionThrustDevice(
ArrayManagerExecutionThrustDevice<T, StorageTag> &);
void operator=(
ArrayManagerExecutionThrustDevice<T, StorageTag> &);
vtkm::Id NumberOfValues;
::thrust::system::cuda::pointer<ValueType> ArrayBegin;
::thrust::system::cuda::pointer<ValueType> ArrayEnd;
mutable bool HaveTextureBound;
mutable TextureIteratorType InputArrayIterator;
};
}
}
}

@@ -24,7 +24,6 @@
#include <vtkm/internal/ExportMacros.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromTexture.h>
// Disable GCC warnings we check vtkm for but Thrust does not.
#if defined(__GNUC__) && !defined(VTKM_CUDA)
@@ -59,7 +58,6 @@ namespace detail {
// Tags to specify what type of thrust iterator to use.
struct ThrustIteratorTransformTag { };
struct ThrustIteratorDevicePtrTag { };
struct ThrustIteratorDeviceTextureTag { };
// Traits to help classify what thrust iterators will be used.
template<class IteratorType>
@@ -149,13 +147,6 @@ struct IteratorTraits
typedef typename IteratorChooser<PortalType, Tag>::Type IteratorType;
};
template<typename T>
struct IteratorTraits< vtkm::exec::cuda::internal::ConstArrayPortalFromTexture< T > >
{
typedef vtkm::exec::cuda::internal::ConstArrayPortalFromTexture< T > PortalType;
typedef ThrustIteratorDeviceTextureTag Tag;
typedef typename PortalType::IteratorType IteratorType;
};
template<typename T>
VTKM_CONT_EXPORT static
@@ -190,14 +181,6 @@ MakeIteratorBegin(PortalType portal, detail::ThrustIteratorDevicePtrTag)
return MakeDevicePtr(portal.GetIteratorBegin());
}
template<class PortalType>
VTKM_CONT_EXPORT static
typename IteratorTraits<PortalType>::IteratorType
MakeIteratorBegin(PortalType portal, detail::ThrustIteratorDeviceTextureTag)
{
return portal.GetIteratorBegin();
}
} // namespace detail

@@ -1,490 +0,0 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 Sandia Corporation.
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014. Los Alamos National Security
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef vtk_m_exec_cuda_internal_ArrayPortalFromTexture_h
#define vtk_m_exec_cuda_internal_ArrayPortalFromTexture_h
#include <vtkm/Types.h>
#include <iterator>
// Disable GCC warnings we check vtkm for but Thrust does not.
#if defined(__GNUC__) && !defined(VTKM_CUDA)
#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 6)
#pragma GCC diagnostic push
#endif // gcc version >= 4.6
#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 2)
#pragma GCC diagnostic ignored "-Wshadow"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif // gcc version >= 4.2
#endif // gcc && !CUDA
#include <thrust/system/cuda/memory.h>
#include <thrust/iterator/iterator_facade.h>
#if defined(__GNUC__) && !defined(VTKM_CUDA)
#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 6)
#pragma GCC diagnostic pop
#endif // gcc version >= 4.6
#endif // gcc && !CUDA
namespace
{
/**
* \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
*/
template <bool IF, typename ThenType, typename ElseType>
struct If
{
/// Conditional type result
typedef ThenType Type; // true
};
template <typename ThenType, typename ElseType>
struct If<false, ThenType, ElseType>
{
typedef ElseType Type; // false
};
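// e.g. If<true, int, float>::Type is int, while If<false, int, float>::Type
// is float; this is the compile-time selector used by UnitWord below.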
/******************************************************************************
* Size and alignment
******************************************************************************/
/// Structure alignment
template <typename T>
struct AlignBytes
{
struct Pad
{
T val;
char byte;
};
enum
{
/// The alignment of T in bytes
ALIGN_BYTES = sizeof(Pad) - sizeof(T)
};
};
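// e.g. for T = double the padded struct is {double val; char byte;};
// sizeof(Pad) is typically 16, so ALIGN_BYTES == 16 - sizeof(double) == 8.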
// Specializations where host C++ compilers (e.g., Windows) may disagree with device C++ compilers (EDG)
template <> struct AlignBytes<short4> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<ushort4> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<int2> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<uint2> { enum { ALIGN_BYTES = 8 }; };
#ifdef _WIN32
template <> struct AlignBytes<long2> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<ulong2> { enum { ALIGN_BYTES = 8 }; };
#endif
template <> struct AlignBytes<long long> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<unsigned long long> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<float2> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<double> { enum { ALIGN_BYTES = 8 }; };
template <> struct AlignBytes<int4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<uint4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<float4> { enum { ALIGN_BYTES = 16 }; };
#ifndef _WIN32
template <> struct AlignBytes<long2> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<ulong2> { enum { ALIGN_BYTES = 16 }; };
#endif
template <> struct AlignBytes<long4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<ulong4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<longlong2> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<ulonglong2> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<double2> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<longlong4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<ulonglong4> { enum { ALIGN_BYTES = 16 }; };
template <> struct AlignBytes<double4> { enum { ALIGN_BYTES = 16 }; };
/// Unit-words of data movement
template <typename T>
struct UnitWord
{
enum {
ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
};
template <typename Unit>
struct IsMultiple
{
enum {
UNIT_ALIGN_BYTES = AlignBytes<Unit>::ALIGN_BYTES,
IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
};
};
/// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<IsMultiple<int>::IS_MULTIPLE,
unsigned int,
typename If<IsMultiple<short>::IS_MULTIPLE,
unsigned short,
unsigned char>::Type>::Type ShuffleWord;
/// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
unsigned long long,
ShuffleWord>::Type VolatileWord;
/// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
ulonglong2,
VolatileWord>::Type DeviceWord;
/// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
uint4,
typename If<IsMultiple<int2>::IS_MULTIPLE,
uint2,
ShuffleWord>::Type>::Type TextureWord;
};
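// Worked example (assumed platform sizes): for a 12-byte, 4-byte-aligned
// element type, the int4 (16B) and int2 (8B) multiples fail the
// divisibility test, so TextureWord falls through to ShuffleWord, which
// resolves to unsigned int.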
}
namespace vtkm {
namespace exec {
namespace cuda {
namespace internal {
template <
typename T,
typename Offset = ptrdiff_t>
class DaxTexObjInputIterator
{
public:
// Required iterator traits
typedef DaxTexObjInputIterator self_type; ///< My own type
typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another
typedef T value_type; ///< The type of the element the iterator can point to
typedef T* pointer; ///< The type of a pointer to an element the iterator can point to
typedef T reference; ///< The type of a reference to an element the iterator can point to
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
typedef typename ::thrust::detail::iterator_facade_category<
::thrust::device_system_tag,
::thrust::random_access_traversal_tag,
value_type,
reference
>::type iterator_category; ///< The iterator category
private:
// Largest texture word we can use in device
typedef typename UnitWord<T>::TextureWord TextureWord;
// Number of texture words per T
enum { TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) };
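// e.g. T = double2 maps to TextureWord = uint4 (both 16 bytes), giving
// TEXTURE_MULTIPLE == 1; T = double4 (32 bytes) needs two uint4 fetches,
// so TEXTURE_MULTIPLE == 2.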
private:
const T* ptr;
difference_type tex_offset;
cudaTextureObject_t tex_obj;
public:
/// Constructor
__host__ __device__ __forceinline__ DaxTexObjInputIterator()
:
ptr(NULL),
tex_offset(0),
tex_obj(0)
{}
/// Use this iterator to bind \p ptr to a texture object
cudaError_t BindTexture(
const ::thrust::system::cuda::pointer<T> ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
size_t numElements, ///< Number of elements in the range
size_t tex_offset = 0) ///< Offset (in items) from \p ptr denoting the position of the iterator
{
this->ptr = ptr.get();
this->tex_offset = tex_offset;
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<TextureWord>();
cudaResourceDesc res_desc;
cudaTextureDesc tex_desc;
memset(&res_desc, 0, sizeof(cudaResourceDesc));
memset(&tex_desc, 0, sizeof(cudaTextureDesc));
res_desc.resType = cudaResourceTypeLinear;
res_desc.res.linear.devPtr = (void*)ptr.get();
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = numElements * sizeof(T);
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
}
/// Unbind this iterator from its texture object
cudaError_t UnbindTexture()
{
return cudaDestroyTextureObject(tex_obj);
}
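// Hypothetical usage sketch (not from this commit) of the bind/unbind
// lifecycle, assuming data is a thrust CUDA allocation of 1024 floats:
//   DaxTexObjInputIterator<float> iter;
//   iter.BindTexture(data, 1024);   // create the texture object over data
//   ... launch kernels that read through iter ...
//   iter.UnbindTexture();           // destroy the texture object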
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
tex_offset++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
tex_offset++;
return *this;
}
/// Postfix decrement
__host__ __device__ __forceinline__ self_type operator--(int)
{
self_type retval = *this;
tex_offset--;
return retval;
}
/// Prefix decrement
__host__ __device__ __forceinline__ self_type operator--()
{
tex_offset--;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ reference operator*() const
{
#ifndef __CUDA_ARCH__
// Simply dereference the pointer on the host
return ptr[tex_offset];
#else
// Move array of uninitialized words, then alias and assign to return value
TextureWord words[TEXTURE_MULTIPLE];
#pragma unroll
for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
{
words[i] = tex1Dfetch<TextureWord>(
tex_obj,
(tex_offset * TEXTURE_MULTIPLE) + i);
}
// Load from words
return *reinterpret_cast<T*>(words);
#endif
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval;
retval.ptr = ptr;
retval.tex_obj = tex_obj;
retval.tex_offset = tex_offset + n;
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
tex_offset += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval;
retval.ptr = ptr;
retval.tex_obj = tex_obj;
retval.tex_offset = tex_offset - n;
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
tex_offset -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return tex_offset - other.tex_offset;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ reference operator[](Distance n) const
{
return *(*this + n);
}
/// Structure dereference
__host__ __device__ __forceinline__ pointer operator->()
{
return &(*(*this));
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
{
return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
{
return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
}
/// less than
__host__ __device__ __forceinline__ bool operator<(const self_type& rhs) const
{
return (tex_offset < rhs.tex_offset);
}
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
return os;
}
};
template<class TextureIterator>
class ConstArrayPortalFromTexture
{
public:
typedef typename TextureIterator::value_type ValueType;
typedef TextureIterator IteratorType;
VTKM_EXEC_CONT_EXPORT ConstArrayPortalFromTexture() { }
VTKM_CONT_EXPORT
ConstArrayPortalFromTexture(IteratorType begin, ptrdiff_t size)
: Length(size),
BeginIterator(begin),
EndIterator(begin+size)
{ }
/// Copy constructor for any other ConstArrayPortalFromTexture with an iterator
/// type that can be copied to this iterator type. This allows us to do any
/// type casting that the iterators do (like the non-const to const cast).
///
template<typename OtherIteratorT>
VTKM_EXEC_CONT_EXPORT
ConstArrayPortalFromTexture(const ConstArrayPortalFromTexture<OtherIteratorT> &src)
: Length(src.Length),
BeginIterator(src.BeginIterator),
EndIterator(src.EndIterator)
{ }
template<typename OtherIteratorT>
VTKM_EXEC_CONT_EXPORT
ConstArrayPortalFromTexture<IteratorType> &operator=(
const ConstArrayPortalFromTexture<OtherIteratorT> &src)
{
this->Length = src.Length;
this->BeginIterator = src.BeginIterator;
this->EndIterator = src.EndIterator;
return *this;
}
VTKM_EXEC_CONT_EXPORT
vtkm::Id GetNumberOfValues() const {
return static_cast<vtkm::Id>(this->Length);
}
VTKM_EXEC_EXPORT
ValueType Get(vtkm::Id index) const {
return *this->IteratorAt(index);
}
VTKM_EXEC_EXPORT
void Set(vtkm::Id index, ValueType value) const {
*this->IteratorAt(index) = value;
}
VTKM_CONT_EXPORT
IteratorType GetIteratorBegin() const { return this->BeginIterator; }
VTKM_CONT_EXPORT
IteratorType GetIteratorEnd() const { return this->EndIterator; }
private:
ptrdiff_t Length;
IteratorType BeginIterator;
IteratorType EndIterator;
VTKM_EXEC_EXPORT
IteratorType IteratorAt(vtkm::Id index) const {
// Not using std::advance because it cannot be used in CUDA device code.
return (this->BeginIterator + index);
}
};
}
}
}
} // namespace vtkm::exec::cuda::internal
#endif //vtk_m_exec_cuda_internal_ArrayPortalFromTexture_h

@@ -42,11 +42,63 @@
#pragma GCC diagnostic pop
#endif // gcc version >= 4.6
#endif // gcc && !CUDA
#include <boost/utility/enable_if.hpp>
namespace vtkm {
namespace exec {
namespace cuda {
namespace internal {
template<typename T> struct UseTextureLoad {typedef boost::false_type type;};
template<> struct UseTextureLoad<vtkm::Int8*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::UInt8*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Int16*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::UInt16*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Int32*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::UInt32*> {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::Int32,2>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::UInt32,2>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::Int32,4>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::UInt32,4>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Float32* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Float64* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::Float32,2>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::Float32,4>* > {typedef boost::true_type type; };
template<> struct UseTextureLoad<vtkm::Vec<vtkm::Float64,2>* > {typedef boost::true_type type; };
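//Note (not from this commit): the list above roughly mirrors the scalar
//and vector types for which CUDA provides __ldg overloads; any other
//element type falls back to the generic load below.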
//this T type is not one that is valid to be loaded through texture memory
template<typename T, typename Enable = void>
struct load_through_texture
{
VTKM_EXEC_EXPORT
static T get(const thrust::system::cuda::pointer<T> data)
{
return *(data.get());
}
};
//this T type is valid to be loaded through texture memory
template<typename T>
struct load_through_texture<T, typename ::boost::enable_if< typename UseTextureLoad<T*>::type >::type >
{
VTKM_EXEC_EXPORT
static T get(const thrust::system::cuda::pointer<T> data)
{
//only load through the read-only texture cache if we have sm_35 support
#if __CUDA_ARCH__ >= 350
return __ldg(data.get());
#else
return *(data.get());
#endif
}
};
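//A minimal self-contained sketch (not part of this commit) of the same
//technique outside the portal machinery: route read-only loads through
//__ldg on sm_35+ devices and fall back to a plain dereference elsewhere.
//The names ReadOnlyLoad and CopyKernel are hypothetical.
__device__ __forceinline__ float ReadOnlyLoad(const float* ptr)
{
#if __CUDA_ARCH__ >= 350
  return __ldg(ptr);   // served by the read-only (texture) data cache
#else
  return *ptr;         // classic global-memory load
#endif
}

__global__ void CopyKernel(const float* in, float* out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { out[i] = ReadOnlyLoad(in + i); }
}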
class ArrayPortalFromThrustBase {};
/// This templated implementation of an ArrayPortal allows you to adapt a pair
@@ -168,7 +220,7 @@ public:
VTKM_EXEC_EXPORT
ValueType Get(vtkm::Id index) const {
-return *this->IteratorAt(index);
+return vtkm::exec::cuda::internal::load_through_texture<ValueType>::get( this->IteratorAt(index) );
}
VTKM_EXEC_EXPORT