//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//============================================================================
|
|
|
#include "MultiDeviceGradient.h"

#include <vtkm/cont/Logging.h>
#include <vtkm/cont/RuntimeDeviceInformation.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/tbb/DeviceAdapterTBB.h>

#include <vtkm/filter/vector_analysis/Gradient.h>

#include <functional>
#include <iostream>
#include <thread>
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the TBB device adapter.
void process_partition_tbb(RuntimeTaskQueue& queue)
{
  // Restrict this thread to TBB so any vtkm::filter invoked by a task runs
  // only on that backend. The "global" runtime device tracker is actually
  // thread-local, so forcing the device here affects no other worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagTBB{});

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on tbb (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the OpenMP device adapter.
void process_partition_openMP(RuntimeTaskQueue& queue)
{
  // Restrict this thread to OpenMP so any vtkm::filter invoked by a task
  // runs only on that backend. The "global" runtime device tracker is
  // actually thread-local, so forcing the device here affects no other
  // worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on openMP (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}
|
|
|
|
|
|
|
|
// Worker-thread body that drains the shared task queue, executing every
// task with the CUDA device adapter.
void process_partition_cuda(RuntimeTaskQueue& queue, int gpuId)
{
  // Restrict this thread to CUDA so any vtkm::filter invoked by a task runs
  // only on that backend. The "global" runtime device tracker is actually
  // thread-local, so forcing the device here affects no other worker.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagCuda{});
  // NOTE(review): gpuId is currently unused — no per-worker GPU selection is
  // performed here. Confirm whether binding each worker to its GPU is
  // intended before relying on multi-GPU distribution.
  (void)gpuId;

  while (queue.hasTasks())
  {
    // Fetch the next unit of work. The queue can hand back an empty task
    // while it is shutting down, so validate before invoking.
    auto work = queue.pop();
    if (work != nullptr)
    {
      work();
    }

    // Let the queue know this task is finished so waiters can progress.
    queue.completedTask();
    std::cout << "finished a partition on cuda (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
|
|
|
} //namespace
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT MultiDeviceGradient::MultiDeviceGradient()
  : ComputePointGradient(false)
  , Queue()
  , Workers()
{
  //Step 1. Ask the runtime tracker which backends are available so we know
  //what kind of workers to launch.
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  const bool haveCuda = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagCuda{});
  const bool haveOpenMP = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagOpenMP{});
  const bool haveTbb = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagTBB{});

  //Note currently the virtual implementation has some issues
  //In a multi-threaded environment only cuda can be used or
  //all SMP backends ( Serial, TBB, OpenMP ).
  //Once this issue is resolved we can enable CUDA + TBB in
  //this example

  //Step 2. Launch workers that will use cuda (if enabled). All worker
  //threads pull from the single shared Queue member.
  if (haveCuda)
  {
    std::cout << "adding cuda workers" << std::endl;
    try
    {
      vtkm::Id gpuCount = 0;
      vtkm::cont::RuntimeDeviceInformation{}
        .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda{})
        .GetMaxDevices(gpuCount);
      for (int gpu = 0; gpu < gpuCount; ++gpu)
      {
        //The number of workers per GPU is purely arbitrary currently,
        //but in general we want multiple of them so we can overlap compute
        //and transfer
        constexpr int workersPerGpu = 4;
        for (int w = 0; w < workersPerGpu; ++w)
        {
          this->Workers.emplace_back([this, gpu]() { process_partition_cuda(this->Queue, gpu); });
        }
      }
    }
    catch (const vtkm::cont::ErrorBadDevice& err)
    {
      VTKM_LOG_S(vtkm::cont::LogLevel::Error,
                 "Error getting CudaDeviceCount: " << err.GetMessage());
    }
  }
  //Step 3. Launch a worker that will use openMP (if enabled).
  else if (haveOpenMP)
  {
    std::cout << "adding a openMP worker" << std::endl;
    this->Workers.emplace_back([this]() { process_partition_openMP(this->Queue); });
  }
  //Step 4. Launch a worker that will use tbb (if enabled).
  else if (haveTbb)
  {
    std::cout << "adding a tbb worker" << std::endl;
    this->Workers.emplace_back([this]() { process_partition_tbb(this->Queue); });
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT MultiDeviceGradient::~MultiDeviceGradient()
{
  // Tell the queue no more work is coming; worker loops exit once drained.
  this->Queue.shutdown();

  // Block until every worker thread has finished before members are torn
  // down (the workers hold a reference to this->Queue).
  for (auto& worker : this->Workers)
  {
    worker.join();
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
VTKM_CONT vtkm::cont::PartitionedDataSet MultiDeviceGradient::DoExecutePartitions(
  const vtkm::cont::PartitionedDataSet& pds)
{
  //Step 1. Say that we have no more to submit for this PartitionedDataSet
  //This is needed to happen for each execute as we want to support
  //the same filter being used for multiple inputs
  this->Queue.reset();

  //Step 2. Construct the PartitionedDataSet we are going to fill. The size
  //signature to PartitionedDataSet just reserves size (one output slot per
  //input partition).
  vtkm::cont::PartitionedDataSet output;
  output.AppendPartitions(
    std::vector<vtkm::cont::DataSet>(static_cast<size_t>(pds.GetNumberOfPartitions())));
  //Raw pointer to the local `output` so worker lambdas can write results
  //back. This is safe only because we block on waitForAllTasksToComplete()
  //below, before `output` goes out of scope.
  vtkm::cont::PartitionedDataSet* outPtr = &output;

  //Step 3. Construct the filter we want to run on each partition
  vtkm::filter::vector_analysis::Gradient gradient;
  gradient.SetComputePointGradient(this->GetComputePointGradient());
  gradient.SetActiveField(this->GetActiveFieldName());

  //Step 3b. Post 1 partition up as work and block until it is
  //complete. This is needed as currently constructing the virtual
  //Point Coordinates is not thread safe.
  auto partition = pds.cbegin();
  {
    vtkm::cont::DataSet input = *partition;
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        //Each task runs on its own copy of the filter ([=] captures
        //`gradient` by value), so worker threads never share mutable
        //filter state.
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(0, result);
      });
    this->Queue.waitForAllTasksToComplete();
    partition++;
  }

  vtkm::Id index = 1;
  for (; partition != pds.cend(); ++partition)
  {
    vtkm::cont::DataSet input = *partition;
    //Step 4. For each input partition construct a lambda
    //and add it to the queue for workers to take. This
    //will allows us to have multiple works execute in a non
    //blocking manner.
    //NOTE: `index` (and `input`) are captured by value at push time, so
    //each queued task writes to its own output slot even though `index`
    //keeps incrementing in this loop.
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(index, result);
      });
    index++;
  }

  // Step 5. Wait on all workers to finish before `output` (aliased by
  // outPtr inside the tasks) is returned and the locals are destroyed.
  this->Queue.waitForAllTasksToComplete();

  return output;
}
|
|
|
|
|
|
|
|
// Single-dataset entry point: delegate to the partitioned execution path by
// wrapping the input in a one-partition PartitionedDataSet.
VTKM_CONT vtkm::cont::DataSet MultiDeviceGradient::DoExecute(const vtkm::cont::DataSet& inData)
{
  vtkm::cont::PartitionedDataSet result = this->Execute(vtkm::cont::PartitionedDataSet(inData));

  // DoExecutePartitions produces one output partition per input partition,
  // so exactly one partition must come back.
  VTKM_ASSERT(result.GetNumberOfPartitions() == 1);
  return result.GetPartition(0);
}
|