First support for unified memory

This commit is contained in:
Christopher Sewell 2017-01-18 11:43:49 -07:00
parent 503bebda54
commit 82c40a6374
5 changed files with 372 additions and 9 deletions

@ -104,6 +104,18 @@ option(VTKm_USE_DOUBLE_PRECISION
)
option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
if (VTKm_ENABLE_CUDA)
option(VTKm_USE_UNIFIED_MEMORY
"Use CUDA unified memory"
OFF
)
endif (VTKm_ENABLE_CUDA)
if (VTKm_USE_UNIFIED_MEMORY)
set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
endif()
option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})

@ -24,7 +24,6 @@
set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})
add_subdirectory(clipping)
add_subdirectory(contour_tree)
add_subdirectory(demo)
add_subdirectory(dynamic_dispatcher)
add_subdirectory(hello_world)
@ -35,4 +34,5 @@ add_subdirectory(tetrahedra)
if(VTKm_ENABLE_RENDERING)
add_subdirectory(rendering)
endif()
add_subdirectory(unified_memory)

@ -0,0 +1,36 @@
##=============================================================================
##
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2015 Sandia Corporation.
## Copyright 2015 UT-Battelle, LLC.
## Copyright 2015 Los Alamos National Security.
##
## Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
## the U.S. Government retains certain rights in this software.
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##
##=============================================================================
#Find the VTK-m package
find_package(VTKm REQUIRED QUIET
OPTIONAL_COMPONENTS Serial CUDA TBB OpenGL GLUT
)
if(VTKm_CUDA_FOUND)
vtkm_disable_troublesome_thrust_warnings()
# Cuda compiles do not respect target_include_directories
cuda_include_directories(${VTKm_INCLUDE_DIRS})
cuda_add_executable(UnifiedMemory_CUDA UnifiedMemory.cu)
target_link_libraries(UnifiedMemory_CUDA ${VTKm_LIBRARIES})
target_compile_options(UnifiedMemory_CUDA PRIVATE ${VTKm_COMPILE_OPTIONS})
endif()

@ -0,0 +1,223 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 Sandia Corporation.
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
#include <vtkm/cont/ArrayHandleStreaming.h>
#include <vtkm/worklet/DispatcherStreamingMapField.h>
#include <vtkm/filter/MarchingCubes.h>
#include <vtkm/worklet/DispatcherMapField.h>
#include <vtkm/Math.h>
#include <vtkm/cont/ArrayHandleCounting.h>
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
namespace {
// Define the tangle field for the input data
class TangleField : public vtkm::worklet::WorkletMapField
{
public:
typedef void ControlSignature(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
typedef void ExecutionSignature(_1, _2);
typedef _1 InputDomain;
const vtkm::Id xdim, ydim, zdim;
const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
const vtkm::Id cellsPerLayer;
VTKM_CONT
TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3]) : xdim(dims[0]), ydim(dims[1]), zdim(dims[2]),
xmin(mins[0]), ymin(mins[1]), zmin(mins[2]), xmax(maxs[0]), ymax(maxs[1]), zmax(maxs[2]), cellsPerLayer((xdim) * (ydim)) { };
VTKM_EXEC
void operator()(const vtkm::Id &vertexId, vtkm::Float32 &v) const
{
const vtkm::Id x = vertexId % (xdim);
const vtkm::Id y = (vertexId / (xdim)) % (ydim);
const vtkm::Id z = vertexId / cellsPerLayer;
const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim-1);
const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(xdim-1);
const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(xdim-1);
const vtkm::Float32 xx = 3.0f*(xmin+(xmax-xmin)*(fx));
const vtkm::Float32 yy = 3.0f*(ymin+(ymax-ymin)*(fy));
const vtkm::Float32 zz = 3.0f*(zmin+(zmax-zmin)*(fz));
v = (xx*xx*xx*xx - 5.0f*xx*xx + yy*yy*yy*yy - 5.0f*yy*yy + zz*zz*zz*zz - 5.0f*zz*zz + 11.8f) * 0.2f + 0.5f;
}
};
// Construct an input data set using the tangle field worklet
vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
{
vtkm::cont::DataSet dataSet;
const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);
vtkm::Float32 mins[3] = {-1.0f, -1.0f, -1.0f};
vtkm::Float32 maxs[3] = {1.0f, 1.0f, 1.0f};
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0]*vdims[1]*vdims[2]);
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(TangleField(vdims, mins, maxs));
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
vtkm::Vec<vtkm::FloatDefault,3> origin(0.0f, 0.0f, 0.0f);
vtkm::Vec<vtkm::FloatDefault,3> spacing(
1.0f/static_cast<vtkm::FloatDefault>(dims[0]),
1.0f/static_cast<vtkm::FloatDefault>(dims[2]),
1.0f/static_cast<vtkm::FloatDefault>(dims[1]));
vtkm::cont::ArrayHandleUniformPointCoordinates
coordinates(vdims, origin, spacing);
dataSet.AddCoordinateSystem(
vtkm::cont::CoordinateSystem("coordinates", coordinates));
dataSet.AddField(vtkm::cont::Field("nodevar", vtkm::cont::Field::ASSOC_POINTS, fieldArray));
static const vtkm::IdComponent ndim = 3;
vtkm::cont::CellSetStructured<ndim> cellSet("cells");
cellSet.SetPointDimensions(vdims);
dataSet.AddCellSet(cellSet);
return dataSet;
}
}
namespace vtkm
{
namespace worklet
{
class SineWorklet : public vtkm::worklet::WorkletMapField
{
public:
typedef void ControlSignature(FieldIn<>, FieldOut<>);
typedef _2 ExecutionSignature(_1, WorkIndex);
VTKM_EXEC
vtkm::Float32 operator()(vtkm::Int64 x, vtkm::Id& index) const {
return (vtkm::Sin(1.0*x));
}
};
}
}
// Run a simple worklet, and compute an isosurface
int main(int argc, char* argv[])
{
vtkm::Int64 N = 1024*1024*1024;
if (argc > 1) N = N*atoi(argv[1]);
else N = N*4;
std::cout << "Testing streaming worklet with size " << N << std::endl;
vtkm::cont::ArrayHandle<vtkm::Int64> input;
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::vector<vtkm::Int64> data(N);
for (vtkm::Int64 i=0; i<N; i++) data[i] = i;
input = vtkm::cont::make_ArrayHandle(data);
typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
vtkm::worklet::SineWorklet sineWorklet;
#ifdef VTKM_USE_UNIFIED_MEMORY
std::cout << "Testing with unified memory" << std::endl;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet>
dispatcher(sineWorklet);
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
#else
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet>
dispatcher(sineWorklet);
vtkm::Id NBlocks = N/(1024*1024*1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" << std::endl;
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
#endif
int dim = 128;
if (argc > 2) dim = atoi(argv[2]);
std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
vtkm::Id3 dims(dim, dim, dim);
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32,3> > verticesArray, normalsArray;
vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
vtkm::filter::MarchingCubes filter;
filter.SetGenerateNormals(true);
filter.SetMergeDuplicatePoints( false );
filter.SetIsoValue( 0.5 );
vtkm::filter::ResultDataSet result =
filter.Execute( dataSet, dataSet.GetField("nodevar") );
filter.MapFieldOntoOutput(result, dataSet.GetField("nodevar"));
//need to extract vertices, normals, and scalars
vtkm::cont::DataSet& outputData = result.GetDataSet();
typedef vtkm::cont::ArrayHandle< vtkm::Vec<vtkm::Float32,3> > VertType;
vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
verticesArray = coords.GetData().Cast<VertType>();
normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
scalarsArray = outputData.GetField("nodevar").GetData().Cast< vtkm::cont::ArrayHandle<vtkm::Float32> >();
std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
std::cout << "vertices: ";
vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
std::cout << std::endl;
std::cout << "normals: ";
vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
std::cout << std::endl;
std::cout << "scalars: ";
vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
std::cout << std::endl;
return 0;
}

@ -77,8 +77,13 @@ public:
typedef vtkm::exec::cuda::internal::ConstArrayPortalFromThrust< T > PortalConstType;
VTKM_CONT
#ifdef VTKM_USE_UNIFIED_MEMORY
ArrayManagerExecutionThrustDevice(StorageType *storage)
: Storage(storage), Pointer(0), Length(0)
#else
ArrayManagerExecutionThrustDevice(StorageType *storage)
: Storage(storage), Array()
#endif
{
}
@ -93,7 +98,11 @@ public:
///
VTKM_CONT
vtkm::Id GetNumberOfValues() const {
#ifdef VTKM_USE_UNIFIED_MEMORY
return this->Length;
#else
return static_cast<vtkm::Id>(this->Array.size());
#endif
}
/// Allocates the appropriate size of the array and copies the given data
@ -111,9 +120,14 @@ public:
// The data in this->Array should already be valid.
}
#ifdef VTKM_USE_UNIFIED_MEMORY
::thrust::cuda::pointer<ValueType> first(this->Pointer);
::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
return PortalType(first, last);
#else
return PortalConstType(this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()));
#endif
}
/// Workaround for nvcc 7.5 compiler warning bug.
@ -139,8 +153,14 @@ public:
// The data in this->Array should already be valid.
}
#ifdef VTKM_USE_UNIFIED_MEMORY
::thrust::cuda::pointer<ValueType> first(this->Pointer);
::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
return PortalType(first, last);
#else
return PortalType(this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()));
#endif
}
/// Workaround for nvcc 7.5 compiler warning bug.
@ -160,20 +180,43 @@ public:
{
// Resize to 0 first so that you don't have to copy data when resizing
// to a larger size.
#ifdef VTKM_USE_UNIFIED_MEMORY
if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0; this->Length = 0; }
#else
this->Array.clear();
#endif
}
try
{
#ifdef VTKM_USE_UNIFIED_MEMORY
if (numberOfValues != this->GetNumberOfValues())
{
ValueType* temp;
cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
if (numberOfValues <= this->Length) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0; this->Length = 0; }
this->Pointer = temp;
this->Length = numberOfValues;
}
#else
this->Array.resize(static_cast<std::size_t>(numberOfValues));
#endif
}
catch (std::bad_alloc error)
{
throw vtkm::cont::ErrorControlBadAllocation(error.what());
}
#ifdef VTKM_USE_UNIFIED_MEMORY
::thrust::cuda::pointer<ValueType> first(this->Pointer);
::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
return PortalType(first, last);
#else
return PortalType(this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()));
#endif
}
/// Workaround for nvcc 7.5 compiler warning bug.
@ -190,14 +233,23 @@ public:
VTKM_CONT
void RetrieveOutputData(StorageType *storage) const
{
#ifdef VTKM_USE_UNIFIED_MEMORY
storage->Allocate(this->Length);
#else
storage->Allocate(static_cast<vtkm::Id>(this->Array.size()));
#endif
try
{
{
#ifdef VTKM_USE_UNIFIED_MEMORY
cudaDeviceSynchronize();
::thrust::copy(this->Pointer, this->Pointer + this->Length, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
#else
::thrust::copy(
this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()),
vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
}
this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()),
vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
#endif
}
catch (...)
{
vtkm::cont::cuda::internal::throwAsVTKmException();
@ -211,10 +263,14 @@ public:
template <class IteratorTypeControl>
VTKM_CONT void CopyInto(IteratorTypeControl dest) const
{
#ifdef VTKM_USE_UNIFIED_MEMORY
::thrust::copy(this->Pointer, this->Pointer + this->Length, dest);
#else
::thrust::copy(
this->Array.data(),
this->Array.data() + static_cast<difference_type>(this->Array.size()),
dest);
#endif
}
/// Resizes the device vector.
@ -223,9 +279,27 @@ public:
{
// The operation will succeed even if this assertion fails, but this
// is still supposed to be a precondition to Shrink.
VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
#ifdef VTKM_USE_UNIFIED_MEMORY
VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Length));
try
{
ValueType* temp;
cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
if (this->Length > 0) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
if (this->Pointer) cudaFree(this->Pointer);
this->Pointer = temp;
this->Length = numberOfValues;
}
catch (std::bad_alloc error)
{
throw vtkm::cont::ErrorControlBadAllocation(error.what());
}
#else
VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
this->Array.resize(static_cast<std::size_t>(numberOfValues));
#endif
}
@ -233,8 +307,12 @@ public:
///
VTKM_CONT void ReleaseResources()
{
#ifdef VTKM_USE_UNIFIED_MEMORY
if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0; this->Length = 0; }
#else
this->Array.clear();
this->Array.shrink_to_fit();
#endif
}
private:
@ -246,17 +324,31 @@ private:
StorageType *Storage;
#ifdef VTKM_USE_UNIFIED_MEMORY
ValueType *Pointer;
vtkm::Id Length;
#else
::thrust::system::cuda::vector<ValueType,
UninitializedAllocator<ValueType> > Array;
#endif
VTKM_CONT
void CopyToExecution()
{
try
{
#ifdef VTKM_USE_UNIFIED_MEMORY
cudaError_t r = cudaMallocManaged(&(this->Pointer), (this->Storage->GetNumberOfValues())*sizeof(ValueType));
if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
this->Pointer);
this->Length = this->Storage->GetNumberOfValues();
#else
this->Array.assign(
vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()));
#endif
}
catch (...)
{