2018-05-04 18:42:46 +00:00
|
|
|
//============================================================================
|
|
|
|
// Copyright (c) Kitware, Inc.
|
|
|
|
// All rights reserved.
|
|
|
|
// See LICENSE.txt for details.
|
2019-04-15 23:24:21 +00:00
|
|
|
//
|
2018-05-04 18:42:46 +00:00
|
|
|
// This software is distributed WITHOUT ANY WARRANTY; without even
|
|
|
|
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
|
|
// PURPOSE. See the above copyright notice for more information.
|
|
|
|
//============================================================================
|
|
|
|
|
|
|
|
#include <vtkm/cont/RuntimeDeviceTracker.h>
|
|
|
|
#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
|
2019-04-09 13:52:53 +00:00
|
|
|
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
|
2018-05-04 18:42:46 +00:00
|
|
|
#include <vtkm/cont/tbb/DeviceAdapterTBB.h>
|
|
|
|
|
|
|
|
#include <vtkm/filter/Gradient.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
2018-06-21 15:56:06 +00:00
|
|
|
// Report how many CUDA-capable GPUs are visible to this process.
// Returns 0 when VTK-m was built without CUDA support or when the
// CUDA runtime query does not succeed.
int determine_cuda_gpu_count()
{
#if defined(VTKM_ENABLE_CUDA)
  int numberOfDevices = 0;
  if (cudaGetDeviceCount(&numberOfDevices) == cudaSuccess)
  {
    return numberOfDevices;
  }
#endif
  return 0;
}
|
|
|
|
|
|
|
|
// Worker-thread entry point: drain tasks from the shared queue and run
// each one with the TBB device adapter.
void process_block_tbb(RuntimeTaskQueue& queue)
{
  // Pin this thread's device adapter to TBB so any vtkm::filters invoked
  // by a task execute only on TBB. The "global" runtime device tracker is
  // actually thread-local, so forcing the device here affects this worker
  // thread only.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagTBB{});

  while (queue.hasTasks())
  {
    // Fetch the next unit of work to run on TBB.
    auto work = queue.pop();

    // pop() can hand back an empty task when the queue has drained and we
    // are shutting down, so validate it before invoking.
    if (work != nullptr)
    {
      work();
    }

    // Report completion so waiters on the queue can make progress.
    queue.completedTask();
    std::cout << "finished a block on tbb (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
2019-04-09 13:52:53 +00:00
|
|
|
// Worker-thread entry point: drain tasks from the shared queue and run
// each one with the OpenMP device adapter.
//
// NOTE: the original body was copy-pasted from the TBB worker — its
// comments and log message said "tbb" even though this worker forces the
// OpenMP device. Both are corrected here.
void process_block_openMP(RuntimeTaskQueue& queue)
{
  //Step 1. Set the device adapter of this thread to openMP.
  //This makes sure that any vtkm::filters used by our
  //task operate only on openMP. The "global" thread tracker
  //is actually thread-local, so we can use that.
  //
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});

  while (queue.hasTasks())
  {
    //Step 2. Get the task to run on openMP
    auto task = queue.pop();

    //Step 3. Run the task on openMP. We check the validity
    //of the task since we could be given an empty task
    //when the queue is empty and we are shutting down
    if (task != nullptr)
    {
      task();
    }

    //Step 4. Notify the queue that we finished processing this task
    queue.completedTask();
    std::cout << "finished a block on openMP (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}
|
|
|
|
|
2018-05-04 18:42:46 +00:00
|
|
|
// Worker-thread entry point: drain tasks from the shared queue and run
// each one with the CUDA device adapter.
void process_block_cuda(RuntimeTaskQueue& queue, int gpuId)
{
  // Pin this thread's device adapter to CUDA so any vtkm::filters invoked
  // by a task execute only on CUDA. The "global" runtime device tracker is
  // actually thread-local, so forcing the device here affects this worker
  // thread only.
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagCuda{});
  (void)gpuId; // per-GPU selection is not wired up yet

  while (queue.hasTasks())
  {
    // Fetch the next unit of work to run on CUDA.
    auto work = queue.pop();

    // pop() can hand back an empty task when the queue has drained and we
    // are shutting down, so validate it before invoking.
    if (work != nullptr)
    {
      work();
    }

    // Report completion so waiters on the queue can make progress.
    queue.completedTask();
    std::cout << "finished a block on cuda (" << std::this_thread::get_id() << ")" << std::endl;
  }
}
|
|
|
|
|
|
|
|
} //namespace
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Construct the filter and launch its worker pool. CUDA (when usable)
// gets four workers per GPU; otherwise a single OpenMP or TBB worker is
// started. All workers pull from the shared this->Queue.
VTKM_CONT MultiDeviceGradient::MultiDeviceGradient()
  : ComputePointGradient(false)
  , Queue()
  , Workers()
{
  // Step 1. Discover which device adapters this build/runtime can use.
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  const bool runOnCuda = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagCuda{});
  const bool runOnOpenMP = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagOpenMP{});
  const bool runOnTbb = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagTBB{});

  // Note: the virtual implementation currently has issues — in a
  // multi-threaded environment only CUDA can be used, or else all SMP
  // backends (Serial, TBB, OpenMP). Once that is resolved this example
  // can enable CUDA + TBB simultaneously.

  // Step 2. Launch workers that will use CUDA (if enabled). The queue is
  // shared, so each worker lambda captures it through `this`.
  if (runOnCuda)
  {
    std::cout << "adding cuda workers" << std::endl;
    const int gpu_count = determine_cuda_gpu_count();
    for (int i = 0; i < gpu_count; ++i)
    {
      // Four workers per GPU is purely arbitrary, but in general we want
      // several so compute and transfer can overlap.
      for (int w = 0; w < 4; ++w)
      {
        this->Workers.emplace_back([this, i]() { process_block_cuda(this->Queue, i); });
      }
    }
  }
  // Step 3. Otherwise launch a single worker that will use OpenMP (if enabled).
  else if (runOnOpenMP)
  {
    std::cout << "adding a openMP worker" << std::endl;
    this->Workers.emplace_back([this]() { process_block_openMP(this->Queue); });
  }
  // Step 4. Otherwise launch a single worker that will use TBB (if enabled).
  else if (runOnTbb)
  {
    std::cout << "adding a tbb worker" << std::endl;
    this->Workers.emplace_back([this]() { process_block_tbb(this->Queue); });
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Tell the shared queue to stop handing out work, then wait for every
// worker thread to exit before the members are destroyed.
VTKM_CONT MultiDeviceGradient::~MultiDeviceGradient()
{
  this->Queue.shutdown();

  // Join (never detach) each worker so no thread outlives *this.
  for (auto&& worker : this->Workers)
  {
    worker.join();
  }
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
//-----------------------------------------------------------------------------
// Run the gradient filter over every block of the input MultiBlock by
// posting one task per block to the shared worker queue, then wait for
// all workers to drain it. The first block is processed serially (see
// Step 3b); the rest are queued without blocking.
//
// @param mb     input multi-block dataset; one output block is produced
//               per input block, in the same order.
// @param policy filter policy forwarded to each per-block Gradient::Execute.
// @return       a MultiBlock with the per-block gradient results.
template <typename DerivedPolicy>
inline VTKM_CONT vtkm::cont::MultiBlock MultiDeviceGradient::PrepareForExecution(
  const vtkm::cont::MultiBlock& mb,
  const vtkm::filter::PolicyBase<DerivedPolicy>& policy)
{
  //Step 1. Say that we have no more to submit for this multi block
  //This is needed to happen for each execute as we want to support
  //the same filter being used for multiple inputs
  this->Queue.reset();

  //Step 2. Construct the multi-block we are going to fill. The size signature
  //to MultiBlock just reserves size
  vtkm::cont::MultiBlock output;
  output.AddBlocks(std::vector<vtkm::cont::DataSet>(static_cast<size_t>(mb.GetNumberOfBlocks())));
  // Raw pointer so the by-value lambdas below can write results into
  // `output` without copying the MultiBlock itself. Safe because we wait
  // for all tasks before `output` goes out of scope (Step 5).
  vtkm::cont::MultiBlock* outPtr = &output;

  //Step 3. Construct the filter we want to run on each block
  vtkm::filter::Gradient gradient;
  gradient.SetComputePointGradient(this->GetComputePointGradient());
  gradient.SetActiveField(this->GetActiveFieldName());

  //Step 3b. Post 1 block up as work and block until it is
  //complete. This is needed as currently constructing the virtual
  //Point Coordinates is not thread safe.
  auto block = mb.cbegin();
  {
    vtkm::cont::DataSet input = *block;
    this->Queue.push( //build a lambda that is the work to do
      // [=] snapshots `input`, `gradient`, `outPtr`, and `policy` by value
      // so the task is self-contained once the loop variables change.
      [=]() {
        // Copy the filter so each task has its own, avoiding shared
        // mutable filter state across worker threads.
        vtkm::filter::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input, policy);
        outPtr->ReplaceBlock(0, result);
      });
    this->Queue.waitForAllTasksToComplete();
    block++;
  }

  // Remaining blocks start at output slot 1; each lambda captures the
  // current `index` by value, so every task writes its own slot.
  vtkm::Id index = 1;
  for (; block != mb.cend(); ++block)
  {
    vtkm::cont::DataSet input = *block;
    //Step 4. For each input block construct a lambda
    //and add it to the queue for workers to take. This
    //will allows us to have multiple works execute in a non
    //blocking manner
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        vtkm::filter::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input, policy);
        outPtr->ReplaceBlock(index, result);
      });
    index++;
  }

  // Step 5. Wait on all workers to finish
  this->Queue.waitForAllTasksToComplete();

  return output;
}
|