diff --git a/CMakeLists.txt b/CMakeLists.txt index 0320bb839..3f2c0e01c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,18 +108,6 @@ option(VTKm_USE_DOUBLE_PRECISION ) option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON) -if (VTKm_ENABLE_CUDA) - option(VTKm_USE_UNIFIED_MEMORY - "Use CUDA unified memory" - OFF - ) -endif (VTKm_ENABLE_CUDA) - -if (VTKm_USE_UNIFIED_MEMORY) - set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}") -endif() - - option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON) set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) diff --git a/examples/unified_memory/UnifiedMemory.cu b/examples/unified_memory/UnifiedMemory.cu index 989a9e59d..d3c696678 100644 --- a/examples/unified_memory/UnifiedMemory.cu +++ b/examples/unified_memory/UnifiedMemory.cu @@ -30,6 +30,7 @@ #include #include #include +#include namespace { @@ -92,8 +93,8 @@ vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims) vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f }; vtkm::cont::ArrayHandle fieldArray; - vtkm::cont::ArrayHandleCounting vertexCountImplicitArray(0, 1, vdims[0] * vdims[1] * - vdims[2]); + vtkm::cont::ArrayHandleCounting vertexCountImplicitArray( + 0, 1, vdims[0] * vdims[1] * vdims[2]); vtkm::worklet::DispatcherMapField tangleFieldDispatcher( TangleField(vdims, mins, maxs)); tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray); @@ -153,37 +154,39 @@ int main(int argc, char* argv[]) typedef vtkm::cont::DeviceAdapterAlgorithm DeviceAlgorithms; vtkm::worklet::SineWorklet sineWorklet; -#ifdef VTKM_USE_UNIFIED_MEMORY - std::cout << "Testing with unified memory" << std::endl; + bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory(); - vtkm::worklet::DispatcherMapField dispatcher(sineWorklet); + if (usingManagedMemory) + { + std::cout << "Testing with unified memory" << std::endl; - vtkm::cont::Timer<> timer; + vtkm::worklet::DispatcherMapField dispatcher(sineWorklet); - dispatcher.Invoke(input, 
output); - std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; + vtkm::cont::Timer<> timer; - vtkm::Float64 elapsedTime = timer.GetElapsedTime(); - std::cout << "Time: " << elapsedTime << std::endl; + dispatcher.Invoke(input, output); + std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; -#else + vtkm::Float64 elapsedTime = timer.GetElapsedTime(); + std::cout << "Time: " << elapsedTime << std::endl; + } + else + { + vtkm::worklet::DispatcherStreamingMapField dispatcher(sineWorklet); + vtkm::Id NBlocks = N / (1024 * 1024 * 1024); + NBlocks *= 2; + dispatcher.SetNumberOfBlocks(NBlocks); + std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" + << std::endl; - vtkm::worklet::DispatcherStreamingMapField dispatcher(sineWorklet); - vtkm::Id NBlocks = N / (1024 * 1024 * 1024); - NBlocks *= 2; - dispatcher.SetNumberOfBlocks(NBlocks); - std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" - << std::endl; + vtkm::cont::Timer<> timer; - vtkm::cont::Timer<> timer; + dispatcher.Invoke(input, output); + std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; - dispatcher.Invoke(input, output); - std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; - - vtkm::Float64 elapsedTime = timer.GetElapsedTime(); - std::cout << "Time: " << elapsedTime << std::endl; - -#endif + vtkm::Float64 elapsedTime = timer.GetElapsedTime(); + std::cout << "Time: " << elapsedTime << std::endl; + } int dim = 128; if (argc > 2) diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu index 7c04584c0..f2d2120f8 100644 --- a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu +++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu @@ -21,6 +21,9 @@ #define 
vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_cu #include +#include + +using vtkm::cont::cuda::internal::CudaAllocator; namespace vtkm { @@ -64,18 +67,8 @@ void ExecutionArrayInterfaceBasic::Allocate(TypelessExecut // Attempt to allocate: try { - char* tmp; -#ifdef VTKM_USE_UNIFIED_MEMORY - int dev; - VTKM_CUDA_CALL(cudaGetDevice(&dev)); - VTKM_CUDA_CALL(cudaMallocManaged(&tmp, static_cast(numBytes))); - VTKM_CUDA_CALL(cudaMemAdvise(tmp, numBytes, cudaMemAdviseSetPreferredLocation, dev)); - VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, numBytes, dev, 0)); - VTKM_CUDA_CALL(cudaStreamSynchronize(0)); -#else - VTKM_CUDA_CALL(cudaMalloc(&tmp, static_cast(numBytes))); -#endif - + // Cast to char* so that the pointer math below will work. + char* tmp = static_cast(CudaAllocator::Allocate(static_cast(numBytes))); execArray.Array = tmp; execArray.ArrayEnd = tmp + numBytes; execArray.ArrayCapacity = tmp + numBytes; @@ -93,7 +86,7 @@ void ExecutionArrayInterfaceBasic::Free( { if (execArray.Array != nullptr) { - VTKM_CUDA_CALL(cudaFree(execArray.Array)); + CudaAllocator::Free(execArray.Array); execArray.Array = nullptr; execArray.ArrayEnd = nullptr; execArray.ArrayCapacity = nullptr; diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h index 547465c6c..8798ad260 100644 --- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h +++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h @@ -35,6 +35,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE VTKM_THIRDPARTY_POST_INCLUDE #include +#include #include #include @@ -172,17 +173,8 @@ public: // Attempt to allocate: try { - ValueType* tmp; -#ifdef VTKM_USE_UNIFIED_MEMORY - int dev; - VTKM_CUDA_CALL(cudaGetDevice(&dev)); - VTKM_CUDA_CALL(cudaMallocManaged(&tmp, bufferSize)); - VTKM_CUDA_CALL(cudaMemAdvise(tmp, bufferSize, cudaMemAdviseSetPreferredLocation, dev)); - VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, bufferSize, dev, 0)); - 
VTKM_CUDA_CALL(cudaStreamSynchronize(0)); -#else - VTKM_CUDA_CALL(cudaMalloc(&tmp, bufferSize)); -#endif + ValueType* tmp = + static_cast(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize)); this->Begin = PointerType(tmp); } catch (const std::exception& error) @@ -214,9 +206,6 @@ public: storage->Allocate(this->GetNumberOfValues()); try { -#ifdef VTKM_USE_UNIFIED_MEMORY - cudaDeviceSynchronize(); -#endif ::thrust::copy( this->Begin, this->End, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal())); } @@ -254,7 +243,7 @@ public: { if (this->Begin.get() != nullptr) { - VTKM_CUDA_CALL(cudaFree(this->Begin.get())); + vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin.get()); this->Begin = PointerType(static_cast(nullptr)); this->End = PointerType(static_cast(nullptr)); this->Capacity = PointerType(static_cast(nullptr)); diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt index 944ca3d5e..fdeca3a2a 100644 --- a/vtkm/cont/cuda/internal/CMakeLists.txt +++ b/vtkm/cont/cuda/internal/CMakeLists.txt @@ -21,6 +21,7 @@ set(headers ArrayManagerExecutionCuda.h ArrayManagerExecutionThrustDevice.h + CudaAllocator.h DeviceAdapterAlgorithmCuda.h DeviceAdapterAlgorithmThrust.h DeviceAdapterTagCuda.h @@ -30,6 +31,11 @@ set(headers VirtualObjectTransferCuda.h ) +set(sources + ArrayManagerExecutionCuda.cu + CudaAllocator.cu + ) + vtkm_declare_headers(CUDA ${headers} TESTABLE ${VTKm_ENABLE_CUDA}) #----------------------------------------------------------------------------- @@ -49,7 +55,7 @@ if (VTKm_ENABLE_CUDA) list(APPEND compile_options -Xcompiler=-fPIC) endif() - cuda_compile(vtkm_cont_cuda_object_files ArrayManagerExecutionCuda.cu + cuda_compile(vtkm_cont_cuda_object_files ${sources} OPTIONS "${compile_options}") #Setup the dependency chain for the custom object build so that diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu new file mode 100644 index 
000000000..6785bd280 --- /dev/null +++ b/vtkm/cont/cuda/internal/CudaAllocator.cu @@ -0,0 +1,164 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2017 Sandia Corporation. +// Copyright 2017 UT-Battelle, LLC. +// Copyright 2017 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#include +#include + +#include + +// These static vars are in an anon namespace to work around MSVC linker issues. +namespace +{ +// Has CudaAllocator::Initialize been called? +static bool IsInitialized = false; + +// True if all devices support concurrent pageable managed memory. 
+static bool ManagedMemorySupported = false; +} + +namespace vtkm +{ +namespace cont +{ +namespace cuda +{ +namespace internal +{ + +bool CudaAllocator::UsingManagedMemory() +{ + CudaAllocator::Initialize(); + return ManagedMemorySupported; +} + +void* CudaAllocator::Allocate(std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + void* ptr = nullptr; + if (ManagedMemorySupported) + { + VTKM_CUDA_CALL(cudaMallocManaged(&ptr, numBytes)); + } + else + { + VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); + } + + return ptr; +} + +void CudaAllocator::Free(void* ptr) +{ + CudaAllocator::Initialize(); + + VTKM_CUDA_CALL(cudaFree(ptr)); +} + +void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + // TODO these hints need to be benchmarked and adjusted once we start + // sharing the pointers between cont/exec + VTKM_CUDA_CALL( + cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0)); + } +} + +void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev)); + 
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::Initialize() +{ + if (!IsInitialized) + { + int numDevices; + VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices)); + + if (numDevices == 0) + { + ManagedMemorySupported = false; + IsInitialized = true; + return; + } + + // Check all devices, use the feature set supported by all + bool managed = true; + cudaDeviceProp prop; + for (int i = 0; i < numDevices && managed; ++i) + { + VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i)); + // We check for concurrentManagedAccess, as devices with only the + // managedAccess property have extra synchronization requirements. + managed = managed && prop.concurrentManagedAccess; + } + + ManagedMemorySupported = managed; + IsInitialized = true; + } +} +} +} +} +} // end namespace vtkm::cont::cuda::internal diff --git a/vtkm/cont/cuda/internal/CudaAllocator.h b/vtkm/cont/cuda/internal/CudaAllocator.h new file mode 100644 index 000000000..8a3d738f3 --- /dev/null +++ b/vtkm/cont/cuda/internal/CudaAllocator.h @@ -0,0 +1,59 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2017 Sandia Corporation. +// Copyright 2017 UT-Battelle, LLC. 
+// Copyright 2017 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#ifndef vtk_m_cont_cuda_internal_CudaAllocator_h +#define vtk_m_cont_cuda_internal_CudaAllocator_h + +#include +#include + +#include + +namespace vtkm +{ +namespace cont +{ +namespace cuda +{ +namespace internal +{ + +struct VTKM_CONT_EXPORT CudaAllocator +{ + static VTKM_CONT bool UsingManagedMemory(); + + static VTKM_CONT void* Allocate(std::size_t numBytes); + static VTKM_CONT void Free(void* ptr); + + static VTKM_CONT void PrepareForControl(const void* ptr, std::size_t numBytes); + + static VTKM_CONT void PrepareForInput(const void* ptr, std::size_t numBytes); + static VTKM_CONT void PrepareForOutput(const void* ptr, std::size_t numBytes); + static VTKM_CONT void PrepareForInPlace(const void* ptr, std::size_t numBytes); + +private: + static VTKM_CONT void Initialize(); +}; +} +} +} +} // end namespace vtkm::cont::cuda::internal + +#endif // vtk_m_cont_cuda_internal_CudaAllocator_h