//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//============================================================================
|
|
|
#include "MultiDeviceGradient.h"

#include <vtkm/cont/Logging.h>
#include <vtkm/cont/RuntimeDeviceInformation.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/tbb/DeviceAdapterTBB.h>

#include <vtkm/filter/vector_analysis/Gradient.h>

#include <functional>
#include <iostream>
#include <thread>
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the TBB device adapter.
void process_partition_tbb(RuntimeTaskQueue& queue)
{
  // Restrict this thread to TBB so any vtkm::filter invoked by a task runs
  // only on that backend. The "global" runtime device tracker is actually
  // thread-local, so forcing the device here affects no other worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagTBB{});

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on tbb (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the OpenMP device adapter.
void process_partition_openMP(RuntimeTaskQueue& queue)
{
  // Restrict this thread to OpenMP so any vtkm::filter invoked by a task
  // runs only on that backend. The "global" runtime device tracker is
  // actually thread-local, so forcing the device here affects no other
  // worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on openMP (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the CUDA device adapter.
void process_partition_cuda(RuntimeTaskQueue& queue, int gpuId)
{
  // Restrict this thread to CUDA so any vtkm::filter invoked by a task runs
  // only on that backend. The "global" runtime device tracker is actually
  // thread-local, so forcing the device here affects no other worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagCuda{});
  // NOTE(review): gpuId is currently unused — no per-worker GPU selection is
  // performed here. Confirm whether binding each worker to its GPU is
  // intended before relying on multi-GPU distribution.
  (void)gpuId;

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on cuda (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
|
|
|
} //namespace
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT MultiDeviceGradient::MultiDeviceGradient()
  : ComputePointGradient(false)
  , Queue()
  , Workers()
{
  //Step 1. Ask the runtime tracker which backends are available so we know
  //what kind of workers to launch.
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  const bool haveCuda = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagCuda{});
  const bool haveOpenMP = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagOpenMP{});
  const bool haveTbb = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagTBB{});

  //Note currently the virtual implementation has some issues
  //In a multi-threaded environment only cuda can be used or
  //all SMP backends ( Serial, TBB, OpenMP ).
  //Once this issue is resolved we can enable CUDA + TBB in
  //this example

  //Step 2. Launch workers that will use cuda (if enabled). All worker
  //threads pull from the single shared Queue member.
  if (haveCuda)
  {
    std::cout << "adding cuda workers" << std::endl;
    try
    {
      vtkm::Id gpuCount = 0;
      vtkm::cont::RuntimeDeviceInformation{}
        .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda{})
        .GetMaxDevices(gpuCount);
      for (int gpu = 0; gpu < gpuCount; ++gpu)
      {
        //The number of workers per GPU is purely arbitrary currently,
        //but in general we want multiple of them so we can overlap compute
        //and transfer
        constexpr int workersPerGpu = 4;
        for (int w = 0; w < workersPerGpu; ++w)
        {
          this->Workers.emplace_back([this, gpu]() { process_partition_cuda(this->Queue, gpu); });
        }
      }
    }
    catch (const vtkm::cont::ErrorBadDevice& err)
    {
      VTKM_LOG_S(vtkm::cont::LogLevel::Error,
                 "Error getting CudaDeviceCount: " << err.GetMessage());
    }
  }
  //Step 3. Launch a worker that will use openMP (if enabled).
  else if (haveOpenMP)
  {
    std::cout << "adding a openMP worker" << std::endl;
    this->Workers.emplace_back([this]() { process_partition_openMP(this->Queue); });
  }
  //Step 4. Launch a worker that will use tbb (if enabled).
  else if (haveTbb)
  {
    std::cout << "adding a tbb worker" << std::endl;
    this->Workers.emplace_back([this]() { process_partition_tbb(this->Queue); });
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT MultiDeviceGradient::~MultiDeviceGradient()
{
  // Tell the queue no more work is coming; worker loops exit once drained.
  this->Queue.shutdown();

  // Block until every worker thread has finished before members are torn
  // down (the workers hold a reference to this->Queue).
  for (auto& worker : this->Workers)
  {
    worker.join();
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT vtkm::cont::PartitionedDataSet MultiDeviceGradient::DoExecutePartitions(
  const vtkm::cont::PartitionedDataSet& pds)
{
  //Step 1. Say that we have no more to submit for this PartitionedDataSet
  //This is needed to happen for each execute as we want to support
  //the same filter being used for multiple inputs
  this->Queue.reset();

  //Step 2. Construct the PartitionedDataSet we are going to fill. The size
  //signature to PartitionedDataSet just reserves size (one output slot per
  //input partition).
  vtkm::cont::PartitionedDataSet output;
  output.AppendPartitions(
    std::vector<vtkm::cont::DataSet>(static_cast<size_t>(pds.GetNumberOfPartitions())));
  //Raw pointer to the local `output` so worker lambdas can write results
  //back. This is safe only because we block on waitForAllTasksToComplete()
  //below, before `output` goes out of scope.
  vtkm::cont::PartitionedDataSet* outPtr = &output;

  //Step 3. Construct the filter we want to run on each partition
  vtkm::filter::vector_analysis::Gradient gradient;
  gradient.SetComputePointGradient(this->GetComputePointGradient());
  gradient.SetActiveField(this->GetActiveFieldName());

  //Step 3b. Post 1 partition up as work and block until it is
  //complete. This is needed as currently constructing the virtual
  //Point Coordinates is not thread safe.
  auto partition = pds.cbegin();
  {
    vtkm::cont::DataSet input = *partition;
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        //Each task runs on its own copy of the filter ([=] captures
        //`gradient` by value), so worker threads never share mutable
        //filter state.
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(0, result);
      });
    this->Queue.waitForAllTasksToComplete();
    partition++;
  }

  vtkm::Id index = 1;
  for (; partition != pds.cend(); ++partition)
  {
    vtkm::cont::DataSet input = *partition;
    //Step 4. For each input partition construct a lambda
    //and add it to the queue for workers to take. This
    //will allows us to have multiple works execute in a non
    //blocking manner.
    //NOTE: `index` (and `input`) are captured by value at push time, so
    //each queued task writes to its own output slot even though `index`
    //keeps incrementing in this loop.
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(index, result);
      });
    index++;
  }

  // Step 5. Wait on all workers to finish before `output` (aliased by
  // outPtr inside the tasks) is returned and the locals are destroyed.
  this->Queue.waitForAllTasksToComplete();

  return output;
}
|
|
|
|
|
|
|
|
// Single-dataset entry point: delegate to the partitioned execution path by
// wrapping the input in a one-partition PartitionedDataSet.
VTKM_CONT vtkm::cont::DataSet MultiDeviceGradient::DoExecute(const vtkm::cont::DataSet& inData)
{
  vtkm::cont::PartitionedDataSet result = this->Execute(vtkm::cont::PartitionedDataSet(inData));

  // DoExecutePartitions produces one output partition per input partition,
  // so exactly one partition must come back.
  VTKM_ASSERT(result.GetNumberOfPartitions() == 1);
  return result.GetPartition(0);
}
|