diff --git a/CMake/UseVTKmCUDA.cmake b/CMake/UseVTKmCUDA.cmake
index 219886150..715df3128 100644
--- a/CMake/UseVTKmCUDA.cmake
+++ b/CMake/UseVTKmCUDA.cmake
@@ -51,6 +51,90 @@ if (VTKm_Base_FOUND)
     endif ()
   endif ()
 
+  if(VTKm_CUDA_FOUND)
+  #---------------------------------------------------------------------------
+  # Setup build flags for CUDA
+  #---------------------------------------------------------------------------
+  # Populates CUDA_NVCC_FLAGS with the best set of flags to compile for a
+  # given GPU architecture. The majority of developers should leave the
+  # option at the default of 'native' which uses system introspection to
+  # determine the smallest number of virtual and real architectures it
+  # should target.
+  #
+  # The option of 'all' is provided for people generating libraries that
+  # will be deployed to any number of machines, it will compile all CUDA code
+  # for all major virtual architectures, guaranteeing that the code will run
+  # anywhere.
+  #
+  #
+  # 1 - native
+  #   - Uses system introspection to determine compile flags
+  #   - Falls back to the 'fermi' flags when introspection fails
+  # 2 - fermi
+  #   - Uses: --generate-code arch=compute_20,code=compute_20
+  # 3 - kepler
+  #   - Uses: --generate-code arch=compute_30,code=compute_30
+  #   - Uses: --generate-code arch=compute_35,code=compute_35
+  # 4 - maxwell
+  #   - Uses: --generate-code arch=compute_50,code=compute_50
+  #   - Uses: --generate-code arch=compute_52,code=compute_52
+  # 5 - all
+  #   - Uses: --generate-code arch=compute_20,code=compute_20
+  #   - Uses: --generate-code arch=compute_30,code=compute_30
+  #   - Uses: --generate-code arch=compute_35,code=compute_35
+  #   - Uses: --generate-code arch=compute_50,code=compute_50
+  #   - Uses: --generate-code arch=compute_52,code=compute_52
+  #
+
+    #specify the property
+    set(VTKm_CUDA_Architecture "native" CACHE STRING "Which GPU Architecture(s) to compile for")
+    set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell all)
+
+    #detect what the property is set to
+    if(VTKm_CUDA_Architecture STREQUAL "native")
+      #run execute_process to do auto_detection
+      execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${CMAKE_CURRENT_LIST_DIR}/VTKmDetectCUDAVersion.cxx"
+                      RESULT_VARIABLE ran_properly
+                      OUTPUT_VARIABLE run_output)
+
+      #find the position of the "--generate-code" output. With some compilers such as
+      #msvc we get compile output plus run output. So we need to strip out just the
+      #run output. A position of -1 means the marker was never printed, which
+      #is treated the same as a failed run.
+      set(position -1)
+      if(ran_properly EQUAL 0)
+        string(FIND "${run_output}" "--generate-code" position)
+      endif()
+
+      if(NOT position EQUAL -1)
+        string(SUBSTRING "${run_output}" ${position} -1 run_output)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${run_output}")
+      else()
+        message(STATUS "Unable to run \"${CUDA_NVCC_EXECUTABLE}\" to autodetect GPU architecture. "
+                       "Falling back to fermi, please manually specify if you want something else.")
+        set(VTKm_CUDA_Architecture "fermi")
+      endif()
+    endif()
+
+    #native detection can fail and redirect to "fermi", so the explicit
+    #architectures are handled in their own chain after the native check
+    if(VTKm_CUDA_Architecture STREQUAL "fermi")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20")
+    elseif(VTKm_CUDA_Architecture STREQUAL "kepler")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35")
+    elseif(VTKm_CUDA_Architecture STREQUAL "maxwell")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52")
+    elseif(VTKm_CUDA_Architecture STREQUAL "all")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50")
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52")
+    endif()
+  endif()
+
   #---------------------------------------------------------------------------
   # Find Thrust library.
   #---------------------------------------------------------------------------
diff --git a/CMake/VTKmDetectCUDAVersion.cxx b/CMake/VTKmDetectCUDAVersion.cxx
new file mode 100644
index 000000000..0c95a5d69
--- /dev/null
+++ b/CMake/VTKmDetectCUDAVersion.cxx
@@ -0,0 +1,104 @@
+//=============================================================================
+//
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 Sandia Corporation.
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+//  the U.S. Government retains certain rights in this software.
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//
+//=============================================================================
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+
+//Prints the nvcc "--generate-code" flags needed to target the CUDA devices
+//found on this machine to standard output, each followed by a space.
+//Returns 0 when at least one flag was printed and 1 otherwise, so that the
+//caller can fall back to a default architecture.
+int main()
+{
+  //known compute capabilities (major*10 + minor) mapped to the virtual
+  //architecture they are compiled with
+  std::map< int, std::string > arch_to_compute;
+  arch_to_compute[11] = "compute_11";
+  arch_to_compute[12] = "compute_12";
+  arch_to_compute[13] = "compute_13";
+  arch_to_compute[20] = "compute_20";
+  arch_to_compute[21] = "compute_20";
+  arch_to_compute[30] = "compute_30";
+  arch_to_compute[32] = "compute_32";
+  arch_to_compute[35] = "compute_35";
+  arch_to_compute[37] = "compute_37";
+  arch_to_compute[50] = "compute_50";
+  arch_to_compute[52] = "compute_52";
+  arch_to_compute[53] = "compute_53";
+
+  //initialize the count so a failed query cannot leave it uninitialized
+  int nDevices = 0;
+  if(cudaGetDeviceCount(&nDevices) != cudaSuccess || nDevices <= 0)
+  { //return failure if the query failed or no cuda devices were found
+    return 1;
+  }
+
+  //iterate over the devices outputting a string that would be the compile
+  //flags needed to target all gpu's on this machine.
+  std::set< std::string > emitted_flags;
+  for (int device = 0; device < nDevices; ++device)
+  {
+    cudaDeviceProp prop;
+    if(cudaGetDeviceProperties(&prop, device) != cudaSuccess)
+    { //skip any device whose properties cannot be queried
+      continue;
+    }
+
+    //convert 2.1 to 21, 3.5 to 35, etc
+    const int arch = (prop.major * 10) + prop.minor;
+
+    //look up the closest known architecture that is not newer than the
+    //device
+    std::map< int, std::string >::const_iterator match =
+      arch_to_compute.upper_bound(arch);
+    if(match == arch_to_compute.begin())
+    { //the device is older than every architecture listed above
+      continue;
+    }
+    --match;
+
+    std::ostringstream flag;
+    flag << "--generate-code arch=" << match->second << ",code=";
+    if(match->first == arch)
+    { //exact match, so compile for the real architecture of the device
+      flag << "sm_" << arch;
+    }
+    else
+    { //no exact match, so compile to a virtual arch instead of a known sm
+      flag << match->second;
+    }
+
+    //if we have multiple gpu's that need the same flag only output it once
+    //instead of adding the same compile options multiple times
+    if(emitted_flags.insert(flag.str()).second)
+    {
+      std::cout << flag.str() << " ";
+    }
+  }
+
+  //report failure when nothing was printed so the caller can fall back
+  return emitted_flags.empty() ? 1 : 0;
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8b9b84ae..ab88f453e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ install(
 install(
   FILES
     ${VTKm_SOURCE_DIR}/CMake/VTKmCompilerOptimizations.cmake
+    ${VTKm_SOURCE_DIR}/CMake/VTKmDetectCUDAVersion.cxx
   DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
   )