diff --git a/CMakeLists.txt b/CMakeLists.txt index 0320bb839..3f2c0e01c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,18 +108,6 @@ option(VTKm_USE_DOUBLE_PRECISION ) option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON) -if (VTKm_ENABLE_CUDA) - option(VTKm_USE_UNIFIED_MEMORY - "Use CUDA unified memory" - OFF - ) -endif (VTKm_ENABLE_CUDA) - -if (VTKm_USE_UNIFIED_MEMORY) - set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}") -endif() - - option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON) set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) diff --git a/examples/unified_memory/UnifiedMemory.cu b/examples/unified_memory/UnifiedMemory.cu index 989a9e59d..d3c696678 100644 --- a/examples/unified_memory/UnifiedMemory.cu +++ b/examples/unified_memory/UnifiedMemory.cu @@ -30,6 +30,7 @@ #include #include #include +#include namespace { @@ -92,8 +93,8 @@ vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims) vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f }; vtkm::cont::ArrayHandle fieldArray; - vtkm::cont::ArrayHandleCounting vertexCountImplicitArray(0, 1, vdims[0] * vdims[1] * - vdims[2]); + vtkm::cont::ArrayHandleCounting vertexCountImplicitArray( + 0, 1, vdims[0] * vdims[1] * vdims[2]); vtkm::worklet::DispatcherMapField tangleFieldDispatcher( TangleField(vdims, mins, maxs)); tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray); @@ -153,37 +154,39 @@ int main(int argc, char* argv[]) typedef vtkm::cont::DeviceAdapterAlgorithm DeviceAlgorithms; vtkm::worklet::SineWorklet sineWorklet; -#ifdef VTKM_USE_UNIFIED_MEMORY - std::cout << "Testing with unified memory" << std::endl; + bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory(); - vtkm::worklet::DispatcherMapField dispatcher(sineWorklet); + if (usingManagedMemory) + { + std::cout << "Testing with unified memory" << std::endl; - vtkm::cont::Timer<> timer; + vtkm::worklet::DispatcherMapField dispatcher(sineWorklet); - dispatcher.Invoke(input, 
output); - std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; + vtkm::cont::Timer<> timer; - vtkm::Float64 elapsedTime = timer.GetElapsedTime(); - std::cout << "Time: " << elapsedTime << std::endl; + dispatcher.Invoke(input, output); + std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; -#else + vtkm::Float64 elapsedTime = timer.GetElapsedTime(); + std::cout << "Time: " << elapsedTime << std::endl; + } + else + { + vtkm::worklet::DispatcherStreamingMapField dispatcher(sineWorklet); + vtkm::Id NBlocks = N / (1024 * 1024 * 1024); + NBlocks *= 2; + dispatcher.SetNumberOfBlocks(NBlocks); + std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" + << std::endl; - vtkm::worklet::DispatcherStreamingMapField dispatcher(sineWorklet); - vtkm::Id NBlocks = N / (1024 * 1024 * 1024); - NBlocks *= 2; - dispatcher.SetNumberOfBlocks(NBlocks); - std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" - << std::endl; + vtkm::cont::Timer<> timer; - vtkm::cont::Timer<> timer; + dispatcher.Invoke(input, output); + std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; - dispatcher.Invoke(input, output); - std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl; - - vtkm::Float64 elapsedTime = timer.GetElapsedTime(); - std::cout << "Time: " << elapsedTime << std::endl; - -#endif + vtkm::Float64 elapsedTime = timer.GetElapsedTime(); + std::cout << "Time: " << elapsedTime << std::endl; + } int dim = 128; if (argc > 2) diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu index 7c04584c0..f2d2120f8 100644 --- a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu +++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu @@ -21,6 +21,9 @@ #define 
vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_cu #include +#include + +using vtkm::cont::cuda::internal::CudaAllocator; namespace vtkm { @@ -64,18 +67,8 @@ void ExecutionArrayInterfaceBasic::Allocate(TypelessExecut // Attempt to allocate: try { - char* tmp; -#ifdef VTKM_USE_UNIFIED_MEMORY - int dev; - VTKM_CUDA_CALL(cudaGetDevice(&dev)); - VTKM_CUDA_CALL(cudaMallocManaged(&tmp, static_cast(numBytes))); - VTKM_CUDA_CALL(cudaMemAdvise(tmp, numBytes, cudaMemAdviseSetPreferredLocation, dev)); - VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, numBytes, dev, 0)); - VTKM_CUDA_CALL(cudaStreamSynchronize(0)); -#else - VTKM_CUDA_CALL(cudaMalloc(&tmp, static_cast(numBytes))); -#endif - + // Cast to char* so that the pointer math below will work. + char* tmp = static_cast(CudaAllocator::Allocate(static_cast(numBytes))); execArray.Array = tmp; execArray.ArrayEnd = tmp + numBytes; execArray.ArrayCapacity = tmp + numBytes; @@ -93,7 +86,7 @@ void ExecutionArrayInterfaceBasic::Free( { if (execArray.Array != nullptr) { - VTKM_CUDA_CALL(cudaFree(execArray.Array)); + CudaAllocator::Free(execArray.Array); execArray.Array = nullptr; execArray.ArrayEnd = nullptr; execArray.ArrayCapacity = nullptr; diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h index 547465c6c..8798ad260 100644 --- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h +++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h @@ -35,6 +35,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE VTKM_THIRDPARTY_POST_INCLUDE #include +#include #include #include @@ -172,17 +173,8 @@ public: // Attempt to allocate: try { - ValueType* tmp; -#ifdef VTKM_USE_UNIFIED_MEMORY - int dev; - VTKM_CUDA_CALL(cudaGetDevice(&dev)); - VTKM_CUDA_CALL(cudaMallocManaged(&tmp, bufferSize)); - VTKM_CUDA_CALL(cudaMemAdvise(tmp, bufferSize, cudaMemAdviseSetPreferredLocation, dev)); - VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, bufferSize, dev, 0)); - 
VTKM_CUDA_CALL(cudaStreamSynchronize(0)); -#else - VTKM_CUDA_CALL(cudaMalloc(&tmp, bufferSize)); -#endif + ValueType* tmp = + static_cast(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize)); this->Begin = PointerType(tmp); } catch (const std::exception& error) @@ -214,9 +206,6 @@ public: storage->Allocate(this->GetNumberOfValues()); try { -#ifdef VTKM_USE_UNIFIED_MEMORY - cudaDeviceSynchronize(); -#endif ::thrust::copy( this->Begin, this->End, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal())); } @@ -254,7 +243,7 @@ public: { if (this->Begin.get() != nullptr) { - VTKM_CUDA_CALL(cudaFree(this->Begin.get())); + vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin.get()); this->Begin = PointerType(static_cast(nullptr)); this->End = PointerType(static_cast(nullptr)); this->Capacity = PointerType(static_cast(nullptr)); diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt index 944ca3d5e..fdeca3a2a 100644 --- a/vtkm/cont/cuda/internal/CMakeLists.txt +++ b/vtkm/cont/cuda/internal/CMakeLists.txt @@ -21,6 +21,7 @@ set(headers ArrayManagerExecutionCuda.h ArrayManagerExecutionThrustDevice.h + CudaAllocator.h DeviceAdapterAlgorithmCuda.h DeviceAdapterAlgorithmThrust.h DeviceAdapterTagCuda.h @@ -30,6 +31,11 @@ set(headers VirtualObjectTransferCuda.h ) +set(sources + ArrayManagerExecutionCuda.cu + CudaAllocator.cu + ) + vtkm_declare_headers(CUDA ${headers} TESTABLE ${VTKm_ENABLE_CUDA}) #----------------------------------------------------------------------------- @@ -49,7 +55,7 @@ if (VTKm_ENABLE_CUDA) list(APPEND compile_options -Xcompiler=-fPIC) endif() - cuda_compile(vtkm_cont_cuda_object_files ArrayManagerExecutionCuda.cu + cuda_compile(vtkm_cont_cuda_object_files ${sources} OPTIONS "${compile_options}") #Setup the dependency chain for the custom object build so that diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu new file mode 100644 index 
000000000..6785bd280 --- /dev/null +++ b/vtkm/cont/cuda/internal/CudaAllocator.cu @@ -0,0 +1,164 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2017 Sandia Corporation. +// Copyright 2017 UT-Battelle, LLC. +// Copyright 2017 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#include +#include + +#include + +// These static vars are in an anon namespace to work around MSVC linker issues. +namespace +{ +// Has CudaAllocator::Initialize been called? +static bool IsInitialized = false; + +// True if all devices support concurrent pageable managed memory. 
+static bool ManagedMemorySupported = false; +} + +namespace vtkm +{ +namespace cont +{ +namespace cuda +{ +namespace internal +{ + +bool CudaAllocator::UsingManagedMemory() +{ + CudaAllocator::Initialize(); + return ManagedMemorySupported; +} + +void* CudaAllocator::Allocate(std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + void* ptr = nullptr; + if (ManagedMemorySupported) + { + VTKM_CUDA_CALL(cudaMallocManaged(&ptr, numBytes)); + } + else + { + VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); + } + + return ptr; +} + +void CudaAllocator::Free(void* ptr) +{ + CudaAllocator::Initialize(); + + VTKM_CUDA_CALL(cudaFree(ptr)); +} + +void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + // TODO these hints need to be benchmarked and adjusted once we start + // sharing the pointers between cont/exec + VTKM_CUDA_CALL( + cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0)); + } +} + +void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev)); + 
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes) +{ + CudaAllocator::Initialize(); + + if (ManagedMemorySupported) + { + int dev; + VTKM_CUDA_CALL(cudaGetDevice(&dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev)); + VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev)); + VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0)); + } +} + +void CudaAllocator::Initialize() +{ + if (!IsInitialized) + { + int numDevices; + VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices)); + + if (numDevices == 0) + { + ManagedMemorySupported = false; + IsInitialized = true; + return; + } + + // Check all devices, use the feature set supported by all + bool managed = true; + cudaDeviceProp prop; + for (int i = 0; i < numDevices && managed; ++i) + { + VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i)); + // We check for concurrentManagedAccess, as devices with only the + // managedAccess property have extra synchronization requirements. + managed = managed && prop.concurrentManagedAccess; + } + + ManagedMemorySupported = managed; + IsInitialized = true; + } +} +} +} +} +} // end namespace vtkm::cont::cuda::internal diff --git a/vtkm/cont/cuda/internal/CudaAllocator.h b/vtkm/cont/cuda/internal/CudaAllocator.h new file mode 100644 index 000000000..8a3d738f3 --- /dev/null +++ b/vtkm/cont/cuda/internal/CudaAllocator.h @@ -0,0 +1,59 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2017 Sandia Corporation. +// Copyright 2017 UT-Battelle, LLC. 
+// Copyright 2017 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#ifndef vtk_m_cont_cuda_internal_CudaAllocator_h +#define vtk_m_cont_cuda_internal_CudaAllocator_h + +#include +#include + +#include + +namespace vtkm +{ +namespace cont +{ +namespace cuda +{ +namespace internal +{ + +struct VTKM_CONT_EXPORT CudaAllocator +{ + static VTKM_CONT bool UsingManagedMemory(); + + static VTKM_CONT void* Allocate(std::size_t numBytes); + static VTKM_CONT void Free(void* ptr); + + static VTKM_CONT void PrepareForControl(const void* ptr, std::size_t numBytes); + + static VTKM_CONT void PrepareForInput(const void* ptr, std::size_t numBytes); + static VTKM_CONT void PrepareForOutput(const void* ptr, std::size_t numBytes); + static VTKM_CONT void PrepareForInPlace(const void* ptr, std::size_t numBytes); + +private: + static VTKM_CONT void Initialize(); +}; +} +} +} +} // end namespace vtkm::cont::cuda::internal + +#endif // vtk_m_cont_cuda_internal_CudaAllocator_h