//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE. See the above copyright notice for more information.
//============================================================================

#include "MultiDeviceGradient.h"

#include <vtkm/Assert.h>
#include <vtkm/cont/DeviceAdapterTag.h>
#include <vtkm/cont/ErrorBadDevice.h>
#include <vtkm/cont/Logging.h>
#include <vtkm/cont/RuntimeDeviceInformation.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>

#include <vtkm/filter/vector_analysis/Gradient.h>

#include <cstddef>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

namespace
{

void process_partition_tbb(RuntimeTaskQueue& queue)
{
  //Step 1. Set the device adapter for this thread to TBB.
  //This makes sure that any vtkm::filters used by our
  //task operate only on TBB. The "global" thread tracker
  //is actually thread-local, so we can use that.
  //
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagTBB{});

  while (queue.hasTasks())
  {
    //Step 2. Get the next task to run on TBB
    auto task = queue.pop();

    //Step 3. Run the task on TBB. We check the validity
    //of the task since we could be given an empty task
    //when the queue is empty and we are shutting down
    if (task != nullptr)
    {
      task();
    }

    //Step 4. Notify the queue that we finished processing this task
    queue.completedTask();
    std::cout << "finished a partition on tbb (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}

void process_partition_openMP(RuntimeTaskQueue& queue)
{
  //Step 1. Set the device adapter for this thread to openMP.
  //This makes sure that any vtkm::filters used by our
  //task operate only on openMP. The "global" thread tracker
  //is actually thread-local, so we can use that.
  //
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});

  while (queue.hasTasks())
  {
    //Step 2. Get the next task to run on openMP
    auto task = queue.pop();

    //Step 3. Run the task on openMP. We check the validity
    //of the task since we could be given an empty task
    //when the queue is empty and we are shutting down
    if (task != nullptr)
    {
      task();
    }

    //Step 4. Notify the queue that we finished processing this task
    queue.completedTask();
    std::cout << "finished a partition on openMP (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}

void process_partition_cuda(RuntimeTaskQueue& queue, int gpuId)
{
  //Step 1. Set the device adapter for this thread to cuda.
  //This makes sure that any vtkm::filters used by our
  //task operate only on cuda. The "global" thread tracker
  //is actually thread-local, so we can use that.
  //
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(vtkm::cont::DeviceAdapterTagCuda{});
  (void)gpuId; //gpuId is currently unused

  while (queue.hasTasks())
  {
    //Step 2. Get the next task to run on cuda
    auto task = queue.pop();

    //Step 3. Run the task on cuda. We check the validity
    //of the task since we could be given an empty task
    //when the queue is empty and we are shutting down
    if (task != nullptr)
    {
      task();
    }

    //Step 4. Notify the queue that we finished processing this task
    queue.completedTask();
    std::cout << "finished a partition on cuda (" << std::this_thread::get_id() << ")"
              << std::endl;
  }
}

} //namespace
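//-----------------------------------------------------------------------------
//The worker loops above depend on only a handful of RuntimeTaskQueue
//operations. As a rough sketch (the real declaration lives in this example's
//headers, and the exact signatures may differ), the queue is assumed to
//provide at least:
//
//  bool hasTasks();                  //work remaining, or queue still open?
//  std::function<void()> pop();      //take a task; may be empty at shutdown
//  void completedTask();             //mark one popped task as finished
//  void push(std::function<void()>); //submit work (called by the filter)
//  void waitForAllTasksToComplete(); //block the producer until drained
//  void reset();                     //reopen the queue for another execute
//  void shutdown();                  //wake any blocked workers so they exit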
//-----------------------------------------------------------------------------
VTKM_CONT MultiDeviceGradient::MultiDeviceGradient()
  : ComputePointGradient(false)
  , Queue()
  , Workers()
{
  //Step 1. Determine the number of workers we want
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  const bool runOnCuda = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagCuda{});
  const bool runOnOpenMP = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagOpenMP{});
  const bool runOnTbb = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagTBB{});

  //Note: currently the virtual implementation has some issues.
  //In a multi-threaded environment only cuda can be used, or
  //all the SMP backends (Serial, TBB, OpenMP) together.
  //Once this issue is resolved we can enable CUDA + TBB in
  //this example.

  //Step 2. Launch workers that will use cuda (if enabled).
  //The threads share a queue object so we need to explicitly pass it
  //by reference (the std::ref call)
  if (runOnCuda)
  {
    std::cout << "adding cuda workers" << std::endl;
    try
    {
      vtkm::Id gpu_count = 0;
      vtkm::cont::RuntimeDeviceInformation{}
        .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda{})
        .GetMaxDevices(gpu_count);
      for (int i = 0; i < gpu_count; ++i)
      {
        //The number of workers per GPU is currently arbitrary,
        //but in general we want multiple workers per GPU so we can
        //overlap compute and transfer
        this->Workers.emplace_back(std::bind(process_partition_cuda, std::ref(this->Queue), i));
        this->Workers.emplace_back(std::bind(process_partition_cuda, std::ref(this->Queue), i));
        this->Workers.emplace_back(std::bind(process_partition_cuda, std::ref(this->Queue), i));
        this->Workers.emplace_back(std::bind(process_partition_cuda, std::ref(this->Queue), i));
      }
    }
    catch (const vtkm::cont::ErrorBadDevice& err)
    {
      VTKM_LOG_S(vtkm::cont::LogLevel::Error,
                 "Error getting CudaDeviceCount: " << err.GetMessage());
    }
  }
  //Step 3. Launch a worker that will use openMP (if enabled).
  //The threads share a queue object so we need to explicitly pass it
  //by reference (the std::ref call)
  else if (runOnOpenMP)
  {
    std::cout << "adding an openMP worker" << std::endl;
    this->Workers.emplace_back(std::bind(process_partition_openMP, std::ref(this->Queue)));
  }
  //Step 4. Launch a worker that will use tbb (if enabled).
  //The threads share a queue object so we need to explicitly pass it
  //by reference (the std::ref call)
  else if (runOnTbb)
  {
    std::cout << "adding a tbb worker" << std::endl;
    this->Workers.emplace_back(std::bind(process_partition_tbb, std::ref(this->Queue)));
  }
}

//-----------------------------------------------------------------------------
VTKM_CONT MultiDeviceGradient::~MultiDeviceGradient()
{
  this->Queue.shutdown();

  //wait for all workers to shut down
  for (auto&& thread : this->Workers)
  {
    thread.join();
  }
}
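//-----------------------------------------------------------------------------
//Note on the producer/consumer split below: DoExecutePartitions acts as the
//producer, pushing one lambda per partition, while the threads launched in
//the constructor consume them. Each lambda copies its inputs by value and
//writes to a distinct partition index of the shared output, so the example
//is relying on ReplacePartition being safe to call concurrently for
//different indices.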
//-----------------------------------------------------------------------------
VTKM_CONT vtkm::cont::PartitionedDataSet MultiDeviceGradient::DoExecutePartitions(
  const vtkm::cont::PartitionedDataSet& pds)
{
  //Step 1. Mark that we have nothing left to submit for this PartitionedDataSet.
  //This needs to happen for each Execute call, as we want to support the
  //same filter being used for multiple inputs
  this->Queue.reset();

  //Step 2. Construct the PartitionedDataSet we are going to fill. The
  //vector constructor just reserves the right number of (empty) partitions,
  //which the workers then replace as they finish
  vtkm::cont::PartitionedDataSet output;
  output.AppendPartitions(
    std::vector<vtkm::cont::DataSet>(static_cast<std::size_t>(pds.GetNumberOfPartitions())));
  vtkm::cont::PartitionedDataSet* outPtr = &output;

  //Step 3. Construct the filter we want to run on each partition
  vtkm::filter::vector_analysis::Gradient gradient;
  gradient.SetComputePointGradient(this->GetComputePointGradient());
  gradient.SetActiveField(this->GetActiveFieldName());

  //Step 3b. Post 1 partition up as work and block until it is
  //complete. This is needed as currently constructing the virtual
  //Point Coordinates is not thread safe.
  auto partition = pds.cbegin();
  {
    vtkm::cont::DataSet input = *partition;
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(0, result);
      });
    this->Queue.waitForAllTasksToComplete();
    partition++;
  }

  vtkm::Id index = 1;
  for (; partition != pds.cend(); ++partition)
  {
    vtkm::cont::DataSet input = *partition;
    //Step 4. For each remaining input partition, construct a lambda
    //and add it to the queue for workers to take. This allows
    //multiple workers to execute in a non-blocking manner
    this->Queue.push( //build a lambda that is the work to do
      [=]() {
        vtkm::filter::vector_analysis::Gradient perThreadGrad = gradient;

        vtkm::cont::DataSet result = perThreadGrad.Execute(input);
        outPtr->ReplacePartition(index, result);
      });
    index++;
  }

  //Step 5. Wait on all workers to finish
  this->Queue.waitForAllTasksToComplete();

  return output;
}

VTKM_CONT vtkm::cont::DataSet MultiDeviceGradient::DoExecute(const vtkm::cont::DataSet& inData)
{
  //Run the partitioned version on a single-partition data set and
  //unwrap the lone result
  vtkm::cont::PartitionedDataSet outData = this->Execute(vtkm::cont::PartitionedDataSet(inData));
  VTKM_ASSERT(outData.GetNumberOfPartitions() == 1);
  return outData.GetPartition(0);
}
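//-----------------------------------------------------------------------------
//A minimal usage sketch (illustrative only, not part of this translation
//unit; the input data and field name below are hypothetical, and the setters
//are assumed to be those declared in MultiDeviceGradient.h). The filter is
//driven like any other VTK-m filter, except that Execute on a
//PartitionedDataSet fans the partitions out across the worker threads:
//
//  vtkm::cont::PartitionedDataSet inputs = /* load or generate partitions */;
//  MultiDeviceGradient gradient;
//  gradient.SetComputePointGradient(true);
//  gradient.SetActiveField("pointvar"); //hypothetical field name
//  vtkm::cont::PartitionedDataSet result = gradient.Execute(inputs);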