From e621b6ba3c01593060e3f965dc739d780a6b1b0f Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Mon, 14 May 2018 15:55:23 -0400 Subject: [PATCH] Generalize the TBB radix sort implementation. The core algorithm will be shared by OpenMP. --- vtkm/cont/internal/CMakeLists.txt | 3 + .../internal/kxsort.h => internal/KXSort.h} | 22 + vtkm/cont/internal/ParallelRadixSort.h | 1069 +++++++++++++++++ .../internal/ParallelRadixSortInterface.h | 166 +++ .../tbb/internal/DeviceAdapterAlgorithmTBB.h | 8 +- vtkm/cont/tbb/internal/ParallelSortTBB.cxx | 890 +------------- vtkm/cont/tbb/internal/ParallelSortTBB.h | 181 +-- 7 files changed, 1349 insertions(+), 990 deletions(-) rename vtkm/cont/{tbb/internal/kxsort.h => internal/KXSort.h} (84%) create mode 100644 vtkm/cont/internal/ParallelRadixSort.h create mode 100644 vtkm/cont/internal/ParallelRadixSortInterface.h diff --git a/vtkm/cont/internal/CMakeLists.txt b/vtkm/cont/internal/CMakeLists.txt index 2faf93a7e..bd2186e80 100644 --- a/vtkm/cont/internal/CMakeLists.txt +++ b/vtkm/cont/internal/CMakeLists.txt @@ -37,6 +37,9 @@ set(headers DynamicTransform.h FunctorsGeneral.h IteratorFromArrayPortal.h + KXSort.h + ParallelRadixSort.h + ParallelRadixSortInterface.h SimplePolymorphicContainer.h StorageError.h VirtualObjectTransfer.h diff --git a/vtkm/cont/tbb/internal/kxsort.h b/vtkm/cont/internal/KXSort.h similarity index 84% rename from vtkm/cont/tbb/internal/kxsort.h rename to vtkm/cont/internal/KXSort.h index faa0def81..e1c8ef185 100644 --- a/vtkm/cont/tbb/internal/kxsort.h +++ b/vtkm/cont/internal/KXSort.h @@ -1,3 +1,25 @@ +//============================================================================= +// +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +// Copyright 2018 UT-Battelle, LLC. +// Copyright 2018 Los Alamos National Security. +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +// +//============================================================================= + /* The MIT License Copyright (c) 2016 Dinghua Li diff --git a/vtkm/cont/internal/ParallelRadixSort.h b/vtkm/cont/internal/ParallelRadixSort.h new file mode 100644 index 000000000..08be194ee --- /dev/null +++ b/vtkm/cont/internal/ParallelRadixSort.h @@ -0,0 +1,1069 @@ +//============================================================================= +// +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +// Copyright 2018 UT-Battelle, LLC. +// Copyright 2018 Los Alamos National Security. +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+// Laboratory (LANL), the U.S. Government retains certain rights in
+// this software.
+//
+//=============================================================================
+
+// Copyright 2010, Takuya Akiba
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Takuya Akiba nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Modifications of Takuya Akiba's original GitHub source code for inclusion
+// in VTK-m:
+//
+// - Made parallelization library details generic (see "Threading Interface" below).
+// - Added a minimum size threshold for parallelism; smaller inputs instead invoke
+//   the serial radix sort (kxsort).
+// - Added std::greater and std::less to the interface for descending-order sorts.
+// - Added linear scaling of the number of threads used by the algorithm for more
+//   stable performance on machines with many available threads (KNL and Haswell).
+//
+// This file contains an implementation of the Satish parallel radix sort
+// as documented in the following citation:
+//
+// Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort.
+// N. Satish, C. Kim, J. Chhugani, A. D. Nguyen, V. W. Lee, D. Kim, and P. Dubey.
+// In Proc. SIGMOD, pages 351–362, 2010.
+//
+// Threading Interface:
+//
+// To use this implementation, an object containing the following members should
+// be passed as the 'threader' argument at the entry points:
+//
+// struct ThreaderExample
+// {
+//   // Return the number of cores available:
+//   size_t GetAvailableCores();
+//
+//   // Run the supplied functor in a new thread. This task is likely to
+//   // generate children (through RunChildTasks) and must block until all
+//   // children complete.
+//   template <typename TaskType>
+//   void RunParentTask(TaskType task);
+//
+//   // Run a child task in a new thread. The function may be blocking or
+//   // non-blocking, and 'data' is an arbitrary object passed to the parent
+//   // task's operator() (see the TBB implementation for details).
+// template +// void RunChildTasks(ParentTaskThreadData data, TaskType left, TaskType right); +// }; +// +// See the sample implementations and the RunTask struct below for examples. + +#ifndef vtk_m_cont_internal_ParallelRadixSort_h +#define vtk_m_cont_internal_ParallelRadixSort_h + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +VTKM_THIRDPARTY_PRE_INCLUDE + +#include + +namespace vtkm +{ +namespace cont +{ +namespace internal +{ +namespace radix +{ + +namespace utility +{ +// Return the number of threads that would be executed in parallel regions +inline size_t GetMaxThreads(size_t num_bytes, size_t available_cores) +{ + const double CORES_PER_BYTE = + double(available_cores - 1) / double(BYTES_FOR_MAX_PARALLELISM - MIN_BYTES_FOR_PARALLEL); + const double Y_INTERCEPT = 1.0 - CORES_PER_BYTE * MIN_BYTES_FOR_PARALLEL; + + const size_t num_cores = (size_t)(CORES_PER_BYTE * double(num_bytes) + Y_INTERCEPT); + if (num_cores < 1) + { + return 1; + } + if (num_cores > available_cores) + { + return available_cores; + } + return num_cores; +} +} // namespace utility + +namespace internal +{ +// Size of the software managed buffer +const size_t kOutBufferSize = 32; + +// Ascending order radix sort is a no-op +template +struct ParallelRadixCompareInternal +{ + inline static void reverse(UnsignedType& t) { (void)t; } +}; + +// Handle descending order radix sort +template +struct ParallelRadixCompareInternal, + ValueManager, + Base> +{ + inline static void reverse(UnsignedType& t) { t = ((1 << Base) - 1) - t; } +}; + +// The algorithm is implemented in this internal class +template +class ParallelRadixSortInternal +{ +public: + using CompareInternal = + ParallelRadixCompareInternal; + + ParallelRadixSortInternal(); + ~ParallelRadixSortInternal(); + + void Init(PlainType* data, size_t num_elems, const ThreaderType& threader); + + PlainType* Sort(PlainType* data, ValueManager* value_manager); + + static void InitAndSort(PlainType* data, + size_t num_elems, + const ThreaderType& threader, + ValueManager* value_manager); + +private: + CompareInternal compare_internal_; + size_t num_elems_; + size_t num_threads_; + + UnsignedType* tmp_; + size_t** histo_; + UnsignedType*** out_buf_; + size_t** out_buf_n_; + + size_t *pos_bgn_, *pos_end_; + ValueManager* value_manager_; + ThreaderType threader_; + + void DeleteAll(); + + UnsignedType* SortInternal(UnsignedType* data, ValueManager* value_manager); + + // Compute |pos_bgn_| and |pos_end_| (associated ranges for each threads) + void ComputeRanges(); + + // First step of each iteration of sorting + // Compute the histogram of |src| using bits in [b, b + Base) + void ComputeHistogram(unsigned int b, UnsignedType* src); + + // Second step of each iteration of sorting + // Scatter elements of |src| to |dst| using the histogram + void Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst); +}; + +template +ParallelRadixSortInternal::ParallelRadixSortInternal() + : num_elems_(0) + , num_threads_(0) + , tmp_(NULL) + , histo_(NULL) + , out_buf_(NULL) + , out_buf_n_(NULL) + , pos_bgn_(NULL) + , pos_end_(NULL) +{ + assert(sizeof(PlainType) == sizeof(UnsignedType)); +} + +template +ParallelRadixSortInternal::~ParallelRadixSortInternal() +{ + DeleteAll(); +} + +template +void ParallelRadixSortInternal::DeleteAll() +{ + delete[] tmp_; + tmp_ = NULL; + + for (size_t i = 0; i < num_threads_; ++i) + delete[] histo_[i]; + delete[] histo_; + histo_ = NULL; + + for (size_t i = 0; i < num_threads_; ++i) + { + for 
(size_t j = 0; j < 1 << Base; ++j) + { + delete[] out_buf_[i][j]; + } + delete[] out_buf_n_[i]; + delete[] out_buf_[i]; + } + delete[] out_buf_; + delete[] out_buf_n_; + out_buf_ = NULL; + out_buf_n_ = NULL; + + delete[] pos_bgn_; + delete[] pos_end_; + pos_bgn_ = pos_end_ = NULL; + + num_elems_ = 0; + num_threads_ = 0; +} + +template +void ParallelRadixSortInternal::Init(PlainType* data, + size_t num_elems, + const ThreaderType& threader) +{ + (void)data; + DeleteAll(); + + threader_ = threader; + + num_elems_ = num_elems; + + num_threads_ = + utility::GetMaxThreads(num_elems_ * sizeof(PlainType), threader_.GetAvailableCores()); + + tmp_ = new UnsignedType[num_elems_]; + histo_ = new size_t*[num_threads_]; + for (size_t i = 0; i < num_threads_; ++i) + { + histo_[i] = new size_t[1 << Base]; + } + + out_buf_ = new UnsignedType**[num_threads_]; + out_buf_n_ = new size_t*[num_threads_]; + for (size_t i = 0; i < num_threads_; ++i) + { + out_buf_[i] = new UnsignedType*[1 << Base]; + out_buf_n_[i] = new size_t[1 << Base]; + for (size_t j = 0; j < 1 << Base; ++j) + { + out_buf_[i][j] = new UnsignedType[kOutBufferSize]; + } + } + + pos_bgn_ = new size_t[num_threads_]; + pos_end_ = new size_t[num_threads_]; +} + +template +PlainType* ParallelRadixSortInternal::Sort(PlainType* data, ValueManager* value_manager) +{ + UnsignedType* src = reinterpret_cast(data); + UnsignedType* res = SortInternal(src, value_manager); + return reinterpret_cast(res); +} + +template +void ParallelRadixSortInternal::InitAndSort(PlainType* data, + size_t num_elems, + const ThreaderType& threader, + ValueManager* value_manager) +{ + ParallelRadixSortInternal prs; + prs.Init(data, num_elems, threader); + const PlainType* res = prs.Sort(data, value_manager); + if (res != data) + { + for (size_t i = 0; i < num_elems; ++i) + data[i] = res[i]; + } +} + +template +UnsignedType* ParallelRadixSortInternal::SortInternal(UnsignedType* data, + ValueManager* value_manager) +{ + + value_manager_ = value_manager; + + // Compute |pos_bgn_| and |pos_end_| + ComputeRanges(); + + // Iterate from lower bits to higher bits + const size_t bits = CHAR_BIT * sizeof(UnsignedType); + UnsignedType *src = data, *dst = tmp_; + for (unsigned int b = 0; b < bits; b += Base) + { + ComputeHistogram(b, src); + Scatter(b, src, dst); + + std::swap(src, dst); + value_manager->Next(); + } + + return src; +} + +template +void ParallelRadixSortInternal::ComputeRanges() +{ + pos_bgn_[0] = 0; + for (size_t i = 0; i < num_threads_ - 1; ++i) + { + const size_t t = (num_elems_ - pos_bgn_[i]) / (num_threads_ - i); + pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t; + } + pos_end_[num_threads_ - 1] = num_elems_; +} + +template +struct RunTask +{ + RunTask(size_t binary_tree_height, + size_t binary_tree_position, + Function f, + size_t num_elems, + size_t num_threads, + const ThreaderType& threader) + : binary_tree_height_(binary_tree_height) + , binary_tree_position_(binary_tree_position) + , f_(f) + , num_elems_(num_elems) + , num_threads_(num_threads) + , threader_(threader) + { + } + + template + void operator()(ThreaderData tData = nullptr) + { + size_t num_nodes_at_current_height = (size_t)pow(2, (double)binary_tree_height_); + if (num_threads_ <= num_nodes_at_current_height) + { + const size_t my_id = binary_tree_position_ - num_nodes_at_current_height; + if (my_id < num_threads_) + { + f_(my_id); + } + } + else + { + RunTask left(binary_tree_height_ + 1, + 2 * binary_tree_position_, + f_, + num_elems_, + num_threads_, + threader_); + RunTask 
right(binary_tree_height_ + 1, + 2 * binary_tree_position_ + 1, + f_, + num_elems_, + num_threads_, + threader_); + threader_.RunChildTasks(tData, left, right); + } + } + + size_t binary_tree_height_; + size_t binary_tree_position_; + Function f_; + size_t num_elems_; + size_t num_threads_; + ThreaderType threader_; +}; + +template +void ParallelRadixSortInternal::ComputeHistogram(unsigned int b, UnsignedType* src) +{ + // Compute local histogram + + auto lambda = [=](const size_t my_id) { + const size_t my_bgn = pos_bgn_[my_id]; + const size_t my_end = pos_end_[my_id]; + size_t* my_histo = histo_[my_id]; + + memset(my_histo, 0, sizeof(size_t) * (1 << Base)); + for (size_t i = my_bgn; i < my_end; ++i) + { + const UnsignedType s = Encoder::encode(src[i]); + UnsignedType t = (s >> b) & ((1 << Base) - 1); + compare_internal_.reverse(t); + ++my_histo[t]; + } + }; + + using RunTaskType = + RunTask, ThreaderType>; + + RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_); + this->threader_.RunParentTask(root); + + // Compute global histogram + size_t s = 0; + for (size_t i = 0; i < 1 << Base; ++i) + { + for (size_t j = 0; j < num_threads_; ++j) + { + const size_t t = s + histo_[j][i]; + histo_[j][i] = s; + s = t; + } + } +} + +template +void ParallelRadixSortInternal::Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst) +{ + + auto lambda = [=](const size_t my_id) { + const size_t my_bgn = pos_bgn_[my_id]; + const size_t my_end = pos_end_[my_id]; + size_t* my_histo = histo_[my_id]; + UnsignedType** my_buf = out_buf_[my_id]; + size_t* my_buf_n = out_buf_n_[my_id]; + + memset(my_buf_n, 0, sizeof(size_t) * (1 << Base)); + for (size_t i = my_bgn; i < my_end; ++i) + { + const UnsignedType s = Encoder::encode(src[i]); + UnsignedType t = (s >> b) & ((1 << Base) - 1); + compare_internal_.reverse(t); + my_buf[t][my_buf_n[t]] = src[i]; + value_manager_->Push(my_id, t, my_buf_n[t], i); + ++my_buf_n[t]; + + if (my_buf_n[t] == kOutBufferSize) + { + size_t p = my_histo[t]; + for (size_t j = 0; j < kOutBufferSize; ++j) + { + size_t tp = p++; + dst[tp] = my_buf[t][j]; + } + value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]); + + my_histo[t] += kOutBufferSize; + my_buf_n[t] = 0; + } + } + + // Flush everything + for (size_t i = 0; i < 1 << Base; ++i) + { + size_t p = my_histo[i]; + for (size_t j = 0; j < my_buf_n[i]; ++j) + { + size_t tp = p++; + dst[tp] = my_buf[i][j]; + } + value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]); + } + }; + + using RunTaskType = + RunTask, ThreaderType>; + RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_); + this->threader_.RunParentTask(root); +} +} // namespace internal + +// Encoders encode signed/unsigned integers and floating point numbers +// to correctly ordered unsigned integers +namespace encoder +{ +class EncoderDummy +{ +}; + +class EncoderUnsigned +{ +public: + template + inline static UnsignedType encode(UnsignedType x) + { + return x; + } +}; + +class EncoderSigned +{ +public: + template + inline static UnsignedType encode(UnsignedType x) + { + return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1)); + } +}; + +class EncoderDecimal +{ +public: + template + inline static UnsignedType encode(UnsignedType x) + { + static const size_t bits = CHAR_BIT * sizeof(UnsignedType); + const UnsignedType a = x >> (bits - 1); + const UnsignedType b = (-static_cast(a)) | (UnsignedType(1) << (bits - 1)); + return x ^ b; + } +}; +} // namespace encoder + +// Value managers are used to generalize the sorting 
algorithm +// to sorting of keys and sorting of pairs +namespace value_manager +{ +class DummyValueManager +{ +public: + inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) + { + (void)thread; + (void)bucket; + (void)num; + (void)from_pos; + } + + inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) + { + (void)thread; + (void)bucket; + (void)num; + (void)to_pos; + } + + void Next() {} +}; + +template +class PairValueManager +{ +public: + PairValueManager() + : max_elems_(0) + , max_threads_(0) + , original_(NULL) + , tmp_(NULL) + , src_(NULL) + , dst_(NULL) + , out_buf_(NULL) + { + } + + ~PairValueManager() { DeleteAll(); } + + void Init(size_t max_elems, size_t available_threads); + + void Start(ValueType* original, size_t num_elems) + { + assert(num_elems <= max_elems_); + src_ = original_ = original; + dst_ = tmp_; + } + + inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) + { + out_buf_[thread][bucket][num] = src_[from_pos]; + } + + inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) + { + for (size_t i = 0; i < num; ++i) + { + dst_[to_pos++] = out_buf_[thread][bucket][i]; + } + } + + void Next() { std::swap(src_, dst_); } + + ValueType* GetResult() { return src_; } +private: + size_t max_elems_; + int max_threads_; + + static constexpr size_t kOutBufferSize = internal::kOutBufferSize; + ValueType *original_, *tmp_; + ValueType *src_, *dst_; + ValueType*** out_buf_; + + void DeleteAll(); +}; + +template +void PairValueManager::Init(size_t max_elems, size_t available_cores) +{ + DeleteAll(); + + max_elems_ = max_elems; + max_threads_ = utility::GetMaxThreads(max_elems_ * sizeof(PlainType), available_cores); + + tmp_ = new ValueType[max_elems]; + + out_buf_ = new ValueType**[max_threads_]; + for (int i = 0; i < max_threads_; ++i) + { + out_buf_[i] = new ValueType*[1 << Base]; + for (size_t j = 0; j < 1 << Base; ++j) + { + out_buf_[i][j] = new ValueType[kOutBufferSize]; + } + } +} + +template +void PairValueManager::DeleteAll() +{ + delete[] tmp_; + tmp_ = NULL; + + for (int i = 0; i < max_threads_; ++i) + { + for (size_t j = 0; j < 1 << Base; ++j) + { + delete[] out_buf_[i][j]; + } + delete[] out_buf_[i]; + } + delete[] out_buf_; + out_buf_ = NULL; + + max_elems_ = 0; + max_threads_ = 0; +} +} // namespace value_manager + +// Frontend class for sorting keys +template +class KeySort +{ + using DummyValueManager = value_manager::DummyValueManager; + using Internal = internal::ParallelRadixSortInternal; + +public: + void InitAndSort(PlainType* data, + size_t num_elems, + const ThreaderType& threader, + const CompareType& comp) + { + (void)comp; + DummyValueManager dvm; + Internal::InitAndSort(data, num_elems, threader, &dvm); + } +}; + +// Frontend class for sorting pairs +template +class PairSort +{ + using ValueManager = value_manager::PairValueManager; + using Internal = internal::ParallelRadixSortInternal; + +public: + void InitAndSort(PlainType* keys, + ValueType* vals, + size_t num_elems, + const ThreaderType& threader, + const CompareType& comp) + { + (void)comp; + ValueManager vm; + vm.Init(num_elems, threader.GetAvailableCores()); + vm.Start(vals, num_elems); + Internal::InitAndSort(keys, num_elems, threader, &vm); + ValueType* res_vals = vm.GetResult(); + if (res_vals != vals) + { + for (size_t i = 0; i < num_elems; ++i) + { + vals[i] = res_vals[i]; + } + } + } + +private: +}; + +#define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type) \ + template \ + class KeySort \ + : public 
KeySort \ + { \ + }; \ + template \ + class PairSort \ + : public PairSort \ + { \ + }; + +// Unsigned integers +KEY_SORT_CASE(unsigned int, std::less, unsigned int, Unsigned); +KEY_SORT_CASE(unsigned int, std::greater, unsigned int, Unsigned); +KEY_SORT_CASE(unsigned short int, std::less, unsigned short int, Unsigned); +KEY_SORT_CASE(unsigned short int, std::greater, unsigned short int, Unsigned); +KEY_SORT_CASE(unsigned long int, std::less, unsigned long int, Unsigned); +KEY_SORT_CASE(unsigned long int, std::greater, unsigned long int, Unsigned); +KEY_SORT_CASE(unsigned long long int, + std::less, + unsigned long long int, + Unsigned); +KEY_SORT_CASE(unsigned long long int, + std::greater, + unsigned long long int, + Unsigned); + +// Unsigned char +KEY_SORT_CASE(unsigned char, std::less, unsigned char, Unsigned); +KEY_SORT_CASE(unsigned char, std::greater, unsigned char, Unsigned); +KEY_SORT_CASE(char16_t, std::less, uint16_t, Unsigned); +KEY_SORT_CASE(char16_t, std::greater, uint16_t, Unsigned); +KEY_SORT_CASE(char32_t, std::less, uint32_t, Unsigned); +KEY_SORT_CASE(char32_t, std::greater, uint32_t, Unsigned); +KEY_SORT_CASE(wchar_t, std::less, uint32_t, Unsigned); +KEY_SORT_CASE(wchar_t, std::greater, uint32_t, Unsigned); + +// Signed integers +KEY_SORT_CASE(char, std::less, unsigned char, Signed); +KEY_SORT_CASE(char, std::greater, unsigned char, Signed); +KEY_SORT_CASE(short, std::less, unsigned short, Signed); +KEY_SORT_CASE(short, std::greater, unsigned short, Signed); +KEY_SORT_CASE(int, std::less, unsigned int, Signed); +KEY_SORT_CASE(int, std::greater, unsigned int, Signed); +KEY_SORT_CASE(long, std::less, unsigned long, Signed); +KEY_SORT_CASE(long, std::greater, unsigned long, Signed); +KEY_SORT_CASE(long long, std::less, unsigned long long, Signed); +KEY_SORT_CASE(long long, std::greater, unsigned long long, Signed); + +// |signed char| and |char| are treated as different types +KEY_SORT_CASE(signed char, std::less, unsigned char, Signed); +KEY_SORT_CASE(signed char, std::greater, unsigned char, Signed); + +// Floating point numbers +KEY_SORT_CASE(float, std::less, uint32_t, Decimal); +KEY_SORT_CASE(float, std::greater, uint32_t, Decimal); +KEY_SORT_CASE(double, std::less, uint64_t, Decimal); +KEY_SORT_CASE(double, std::greater, uint64_t, Decimal); + +#undef KEY_SORT_CASE + +template +struct run_kx_radix_sort_keys +{ + static void run(T* data, size_t num_elems, const CompareType& comp) + { + std::sort(data, data + num_elems, comp); + } +}; + +#define KX_SORT_KEYS(key_type) \ + template <> \ + struct run_kx_radix_sort_keys> \ + { \ + static void run(key_type* data, size_t num_elems, const std::less& comp) \ + { \ + (void)comp; \ + kx::radix_sort(data, data + num_elems); \ + } \ + }; + +KX_SORT_KEYS(unsigned short int); +KX_SORT_KEYS(int); +KX_SORT_KEYS(unsigned int); +KX_SORT_KEYS(long int); +KX_SORT_KEYS(unsigned long int); +KX_SORT_KEYS(long long int); +KX_SORT_KEYS(unsigned long long int); +KX_SORT_KEYS(unsigned char); + +#undef KX_SORT_KEYS + +template +bool use_serial_sort_keys(T* data, size_t num_elems, const CompareType& comp) +{ + size_t total_bytes = (num_elems) * sizeof(T); + if (total_bytes < MIN_BYTES_FOR_PARALLEL) + { + run_kx_radix_sort_keys::run(data, num_elems, comp); + return true; + } + return false; +} + +// Generate radix sort interfaces for key and key value sorts. 
+#define VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(threader_type, key_type) \ + VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ + key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater& comp) \ + { \ + using namespace vtkm::cont::internal::radix; \ + PairSort> ps; \ + ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \ + } \ + VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ + key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less& comp) \ + { \ + using namespace vtkm::cont::internal::radix; \ + PairSort> ps; \ + ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \ + } \ + VTKM_CONT_EXPORT void parallel_radix_sort( \ + key_type* data, size_t num_elems, const std::greater& comp) \ + { \ + using namespace vtkm::cont::internal::radix; \ + if (!use_serial_sort_keys(data, num_elems, comp)) \ + { \ + KeySort> ks; \ + ks.InitAndSort(data, num_elems, threader_type(), comp); \ + } \ + } \ + VTKM_CONT_EXPORT void parallel_radix_sort( \ + key_type* data, size_t num_elems, const std::less& comp) \ + { \ + using namespace vtkm::cont::internal::radix; \ + if (!use_serial_sort_keys(data, num_elems, comp)) \ + { \ + KeySort> ks; \ + ks.InitAndSort(data, num_elems, threader_type(), comp); \ + } \ + } + +#define VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER(ThreaderType) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, short int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned short int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long long int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long long int) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned char) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, signed char) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char16_t) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char32_t) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, wchar_t) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, float) \ + VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, double) + +VTKM_THIRDPARTY_POST_INCLUDE +} +} +} +} // end namespace vtkm::cont::internal::radix + +#endif // vtk_m_cont_internal_ParallelRadixSort_h diff --git a/vtkm/cont/internal/ParallelRadixSortInterface.h b/vtkm/cont/internal/ParallelRadixSortInterface.h new file mode 100644 index 000000000..1295509c3 --- /dev/null +++ b/vtkm/cont/internal/ParallelRadixSortInterface.h @@ -0,0 +1,166 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +// Copyright 2017 UT-Battelle, LLC. +// Copyright 2017 Los Alamos National Security. +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#ifndef vtk_m_cont_internal_ParallelRadixSortInterface_h +#define vtk_m_cont_internal_ParallelRadixSortInterface_h + +#include +#include + +#include +#include + +namespace vtkm +{ +namespace cont +{ +namespace internal +{ +namespace radix +{ + +const size_t MIN_BYTES_FOR_PARALLEL = 400000; +const size_t BYTES_FOR_MAX_PARALLELISM = 4000000; + +struct RadixSortTag +{ +}; + +struct PSortTag +{ +}; + +// Detect supported functors for radix sort: +template +struct is_valid_compare_type : std::integral_constant +{ +}; +template +struct is_valid_compare_type> : std::integral_constant +{ +}; +template +struct is_valid_compare_type> : std::integral_constant +{ +}; +template <> +struct is_valid_compare_type : std::integral_constant +{ +}; +template <> +struct is_valid_compare_type : std::integral_constant +{ +}; + +// Convert vtkm::Sort[Less|Greater] to the std:: equivalents: +template +BComp&& get_std_compare(BComp&& b, T&&) +{ + return std::forward(b); +} +template +std::less get_std_compare(vtkm::SortLess, T&&) +{ + return std::less{}; +} +template +std::greater get_std_compare(vtkm::SortGreater, T&&) +{ + return std::greater{}; +} + +// Determine if radix sort can be used for a given ValueType, StorageType, and +// comparison functor. +template +struct sort_tag_type +{ + using type = PSortTag; +}; +template +struct sort_tag_type +{ + using PrimT = std::is_arithmetic; + using LongDT = std::is_same; + using BComp = is_valid_compare_type; + using type = typename std::conditional::type; +}; + +template +struct sortbykey_tag_type +{ + using type = PSortTag; +}; +template +struct sortbykey_tag_type +{ + using PrimKey = std::is_arithmetic; + using PrimValue = std::is_arithmetic; + using LongDKey = std::is_same; + using BComp = is_valid_compare_type; + using type = typename std::conditional::type; +}; + +#define VTKM_INTERNAL_RADIX_SORT_DECLARE(key_type) \ + VTKM_CONT_EXPORT void parallel_radix_sort( \ + key_type* data, size_t num_elems, const std::greater& comp); \ + VTKM_CONT_EXPORT void parallel_radix_sort( \ + key_type* data, size_t num_elems, const std::less& comp); \ + VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ + key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater& comp); \ + VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ + key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less& comp); + +// Generate radix sort interfaces for key and key value sorts. 
+#define VTKM_DECLARE_RADIX_SORT() \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(short int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned short int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(long int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned long int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(long long int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned long long int) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned char) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(signed char) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(char) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(char16_t) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(char32_t) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(wchar_t) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(float) \ + VTKM_INTERNAL_RADIX_SORT_DECLARE(double) +} +} +} +} // end vtkm::cont::internal::radix + +#endif // vtk_m_cont_internal_ParallelRadixSortInterface_h diff --git a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h index 24f4b8d43..d60a2fb3a 100644 --- a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h +++ b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h @@ -251,21 +251,21 @@ public: { //this is required to get sort to work with zip handles std::less lessOp; - vtkm::cont::tbb::internal::parallel_sort(values, lessOp); + vtkm::cont::tbb::sort::parallel_sort(values, lessOp); } template VTKM_CONT static void Sort(vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare) { - vtkm::cont::tbb::internal::parallel_sort(values, binary_compare); + vtkm::cont::tbb::sort::parallel_sort(values, binary_compare); } template VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle& keys, vtkm::cont::ArrayHandle& values) { - vtkm::cont::tbb::internal::parallel_sort_bykey(keys, values, std::less()); + vtkm::cont::tbb::sort::parallel_sort_bykey(keys, values, std::less()); } template @@ -273,7 +273,7 @@ public: vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare) { - vtkm::cont::tbb::internal::parallel_sort_bykey(keys, values, binary_compare); + vtkm::cont::tbb::sort::parallel_sort_bykey(keys, values, binary_compare); } template diff --git a/vtkm/cont/tbb/internal/ParallelSortTBB.cxx b/vtkm/cont/tbb/internal/ParallelSortTBB.cxx index aa5790637..032ae2ba3 100644 --- a/vtkm/cont/tbb/internal/ParallelSortTBB.cxx +++ b/vtkm/cont/tbb/internal/ParallelSortTBB.cxx @@ -49,36 +49,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Modifications of Takuya Akiba's original GitHub source code for inclusion -// in VTK-m: -// -// - Changed parallel threading from OpenMP to TBB tasks -// - Added minimum threshold for parallel, will instead invoke serial radix sort (kxsort) -// - Added std::greater and std::less to interface for descending order sorts -// - Added linear scaling of threads used by the algorithm for more stable performance -// on machines with lots of available threads (KNL and Haswell) -// -// This file contains an implementation of Satish parallel radix sort -// as documented in the following citation: -// -// Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort. -// N. Satish, C. Kim, J. Chhugani, A. D. Nguyen, V. W. Lee, D. Kim, and P. Dubey. -// In Proc. 
SIGMOD, pages 351–362, 2010 -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -VTKM_THIRDPARTY_PRE_INCLUDE - -#include +#include #if defined(VTKM_MSVC) @@ -107,852 +78,57 @@ namespace cont { namespace tbb { -namespace internal +namespace sort { -const size_t MIN_BYTES_FOR_PARALLEL = 400000; -const size_t BYTES_FOR_MAX_PARALLELISM = 4000000; const size_t MAX_CORES = ::tbb::tbb_thread::hardware_concurrency(); -const double CORES_PER_BYTE = - double(MAX_CORES - 1) / double(BYTES_FOR_MAX_PARALLELISM - MIN_BYTES_FOR_PARALLEL); -const double Y_INTERCEPT = 1.0 - CORES_PER_BYTE * MIN_BYTES_FOR_PARALLEL; -namespace utility +// Simple TBB task wrapper around a generic functor. +template +struct TaskWrapper : public ::tbb::task { -// Return the number of threads that would be executed in parallel regions -inline size_t GetMaxThreads(size_t num_bytes) -{ - size_t num_cores = (size_t)(CORES_PER_BYTE * double(num_bytes) + Y_INTERCEPT); - if (num_cores < 1) - { - return 1; - } - if (num_cores > MAX_CORES) - { - return MAX_CORES; - } - return num_cores; -} -} // namespace utility + FunctorType Functor; -namespace internal -{ -// Size of the software managed buffer -const size_t kOutBufferSize = 32; - -// Ascending order radix sort is a no-op -template -struct ParallelRadixCompareInternal -{ - inline static void reverse(UnsignedType& t) { (void)t; } -}; - -// Handle descending order radix sort -template -struct ParallelRadixCompareInternal, - ValueManager, - Base> -{ - inline static void reverse(UnsignedType& t) { t = ((1 << Base) - 1) - t; } -}; - -// The algorithm is implemented in this internal class -template -class ParallelRadixSortInternal -{ -public: - using CompareInternal = - ParallelRadixCompareInternal; - - ParallelRadixSortInternal(); - ~ParallelRadixSortInternal(); - - void Init(PlainType* data, size_t num_elems); - - PlainType* Sort(PlainType* data, ValueManager* value_manager); - - static void InitAndSort(PlainType* data, size_t num_elems, ValueManager* value_manager); - -private: - CompareInternal compare_internal_; - size_t num_elems_; - size_t num_threads_; - - UnsignedType* tmp_; - size_t** histo_; - UnsignedType*** out_buf_; - size_t** out_buf_n_; - - size_t *pos_bgn_, *pos_end_; - ValueManager* value_manager_; - - void DeleteAll(); - - UnsignedType* SortInternal(UnsignedType* data, ValueManager* value_manager); - - // Compute |pos_bgn_| and |pos_end_| (associated ranges for each threads) - void ComputeRanges(); - - // First step of each iteration of sorting - // Compute the histogram of |src| using bits in [b, b + Base) - void ComputeHistogram(unsigned int b, UnsignedType* src); - - // Second step of each iteration of sorting - // Scatter elements of |src| to |dst| using the histogram - void Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst); -}; - -template -ParallelRadixSortInternal:: - ParallelRadixSortInternal() - : num_elems_(0) - , num_threads_(0) - , tmp_(NULL) - , histo_(NULL) - , out_buf_(NULL) - , out_buf_n_(NULL) - , pos_bgn_(NULL) - , pos_end_(NULL) -{ - assert(sizeof(PlainType) == sizeof(UnsignedType)); -} - -template -ParallelRadixSortInternal:: - ~ParallelRadixSortInternal() -{ - DeleteAll(); -} - -template -void ParallelRadixSortInternal:: - DeleteAll() -{ - delete[] tmp_; - tmp_ = NULL; - - for (size_t i = 0; i < num_threads_; ++i) - delete[] histo_[i]; - delete[] histo_; - histo_ = NULL; - - for (size_t i = 0; i < num_threads_; ++i) - { - for (size_t j = 0; j < 1 << Base; ++j) - { - delete[] 
out_buf_[i][j]; - } - delete[] out_buf_n_[i]; - delete[] out_buf_[i]; - } - delete[] out_buf_; - delete[] out_buf_n_; - out_buf_ = NULL; - out_buf_n_ = NULL; - - delete[] pos_bgn_; - delete[] pos_end_; - pos_bgn_ = pos_end_ = NULL; - - num_elems_ = 0; - num_threads_ = 0; -} - -template -void ParallelRadixSortInternal:: - Init(PlainType* data, size_t num_elems) -{ - (void)data; - DeleteAll(); - - num_elems_ = num_elems; - - num_threads_ = utility::GetMaxThreads(num_elems_ * sizeof(PlainType)); - - tmp_ = new UnsignedType[num_elems_]; - histo_ = new size_t*[num_threads_]; - for (size_t i = 0; i < num_threads_; ++i) - { - histo_[i] = new size_t[1 << Base]; - } - - out_buf_ = new UnsignedType**[num_threads_]; - out_buf_n_ = new size_t*[num_threads_]; - for (size_t i = 0; i < num_threads_; ++i) - { - out_buf_[i] = new UnsignedType*[1 << Base]; - out_buf_n_[i] = new size_t[1 << Base]; - for (size_t j = 0; j < 1 << Base; ++j) - { - out_buf_[i][j] = new UnsignedType[kOutBufferSize]; - } - } - - pos_bgn_ = new size_t[num_threads_]; - pos_end_ = new size_t[num_threads_]; -} - -template -PlainType* -ParallelRadixSortInternal::Sort( - PlainType* data, - ValueManager* value_manager) -{ - UnsignedType* src = reinterpret_cast(data); - UnsignedType* res = SortInternal(src, value_manager); - return reinterpret_cast(res); -} - -template -void ParallelRadixSortInternal:: - InitAndSort(PlainType* data, size_t num_elems, ValueManager* value_manager) -{ - ParallelRadixSortInternal prs; - prs.Init(data, num_elems); - const PlainType* res = prs.Sort(data, value_manager); - if (res != data) - { - for (size_t i = 0; i < num_elems; ++i) - data[i] = res[i]; - } -} - -template -UnsignedType* -ParallelRadixSortInternal:: - SortInternal(UnsignedType* data, ValueManager* value_manager) -{ - - value_manager_ = value_manager; - - // Compute |pos_bgn_| and |pos_end_| - ComputeRanges(); - - // Iterate from lower bits to higher bits - const size_t bits = CHAR_BIT * sizeof(UnsignedType); - UnsignedType *src = data, *dst = tmp_; - for (unsigned int b = 0; b < bits; b += Base) - { - ComputeHistogram(b, src); - Scatter(b, src, dst); - - std::swap(src, dst); - value_manager->Next(); - } - - return src; -} - -template -void ParallelRadixSortInternal:: - ComputeRanges() -{ - pos_bgn_[0] = 0; - for (size_t i = 0; i < num_threads_ - 1; ++i) - { - const size_t t = (num_elems_ - pos_bgn_[i]) / (num_threads_ - i); - pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t; - } - pos_end_[num_threads_ - 1] = num_elems_; -} - -template -class RunTask : public ::tbb::task -{ -public: - RunTask(size_t binary_tree_height, - size_t binary_tree_position, - Function f, - size_t num_elems, - size_t num_threads) - : binary_tree_height_(binary_tree_height) - , binary_tree_position_(binary_tree_position) - , f_(f) - , num_elems_(num_elems) - , num_threads_(num_threads) + TaskWrapper(FunctorType f) + : Functor(f) { } ::tbb::task* execute() { - size_t num_nodes_at_current_height = (size_t)pow(2, (double)binary_tree_height_); - if (num_threads_ <= num_nodes_at_current_height) - { - const size_t my_id = binary_tree_position_ - num_nodes_at_current_height; - if (my_id < num_threads_) - { - f_(my_id); - } - return NULL; - } - else - { - ::tbb::empty_task& p = *new (task::allocate_continuation())::tbb::empty_task(); - RunTask& left = *new (p.allocate_child()) RunTask( - binary_tree_height_ + 1, 2 * binary_tree_position_, f_, num_elems_, num_threads_); - RunTask& right = *new (p.allocate_child()) RunTask( - binary_tree_height_ + 1, 2 * binary_tree_position_ + 1, f_, 
num_elems_, num_threads_); - p.set_ref_count(2); - task::spawn(left); - task::spawn(right); - return NULL; - } - } - -private: - size_t binary_tree_height_; - size_t binary_tree_position_; - Function f_; - size_t num_elems_; - size_t num_threads_; -}; - -template -void ParallelRadixSortInternal:: - ComputeHistogram(unsigned int b, UnsignedType* src) -{ - // Compute local histogram - - auto lambda = [=](const size_t my_id) { - const size_t my_bgn = pos_bgn_[my_id]; - const size_t my_end = pos_end_[my_id]; - size_t* my_histo = histo_[my_id]; - - memset(my_histo, 0, sizeof(size_t) * (1 << Base)); - for (size_t i = my_bgn; i < my_end; ++i) - { - const UnsignedType s = Encoder::encode(src[i]); - UnsignedType t = (s >> b) & ((1 << Base) - 1); - compare_internal_.reverse(t); - ++my_histo[t]; - } - }; - - using RunTaskType = RunTask>; - - RunTaskType& root = - *new (::tbb::task::allocate_root()) RunTaskType(0, 1, lambda, num_elems_, num_threads_); - - ::tbb::task::spawn_root_and_wait(root); - - // Compute global histogram - size_t s = 0; - for (size_t i = 0; i < 1 << Base; ++i) - { - for (size_t j = 0; j < num_threads_; ++j) - { - const size_t t = s + histo_[j][i]; - histo_[j][i] = s; - s = t; - } - } -} - -template -void ParallelRadixSortInternal:: - Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst) -{ - - auto lambda = [=](const size_t my_id) { - const size_t my_bgn = pos_bgn_[my_id]; - const size_t my_end = pos_end_[my_id]; - size_t* my_histo = histo_[my_id]; - UnsignedType** my_buf = out_buf_[my_id]; - size_t* my_buf_n = out_buf_n_[my_id]; - - memset(my_buf_n, 0, sizeof(size_t) * (1 << Base)); - for (size_t i = my_bgn; i < my_end; ++i) - { - const UnsignedType s = Encoder::encode(src[i]); - UnsignedType t = (s >> b) & ((1 << Base) - 1); - compare_internal_.reverse(t); - my_buf[t][my_buf_n[t]] = src[i]; - value_manager_->Push(my_id, t, my_buf_n[t], i); - ++my_buf_n[t]; - - if (my_buf_n[t] == kOutBufferSize) - { - size_t p = my_histo[t]; - for (size_t j = 0; j < kOutBufferSize; ++j) - { - size_t tp = p++; - dst[tp] = my_buf[t][j]; - } - value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]); - - my_histo[t] += kOutBufferSize; - my_buf_n[t] = 0; - } - } - - // Flush everything - for (size_t i = 0; i < 1 << Base; ++i) - { - size_t p = my_histo[i]; - for (size_t j = 0; j < my_buf_n[i]; ++j) - { - size_t tp = p++; - dst[tp] = my_buf[i][j]; - } - value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]); - } - }; - - using RunTaskType = RunTask>; - - RunTaskType& root = - *new (::tbb::task::allocate_root()) RunTaskType(0, 1, lambda, num_elems_, num_threads_); - - ::tbb::task::spawn_root_and_wait(root); -} -} // namespace internal - -// Encoders encode signed/unsigned integers and floating point numbers -// to correctly ordered unsigned integers -namespace encoder -{ -class EncoderDummy -{ -}; - -class EncoderUnsigned -{ -public: - template - inline static UnsignedType encode(UnsignedType x) - { - return x; + this->Functor(this); + return nullptr; } }; -class EncoderSigned +struct RadixThreaderTBB { -public: - template - inline static UnsignedType encode(UnsignedType x) + size_t GetAvailableCores() const { return MAX_CORES; } + + template + void RunParentTask(TaskType task) { - return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1)); + using Task = TaskWrapper; + Task& root = *new (::tbb::task::allocate_root()) Task(task); + ::tbb::task::spawn_root_and_wait(root); + } + + template + void RunChildTasks(TaskWrapper* wrapper, TaskType left, TaskType right) + { + using Task 
= TaskWrapper; + ::tbb::empty_task& p = *new (wrapper->allocate_continuation())::tbb::empty_task(); + + Task& lchild = *new (p.allocate_child()) Task(left); + Task& rchild = *new (p.allocate_child()) Task(right); + p.set_ref_count(2); + ::tbb::task::spawn(lchild); + ::tbb::task::spawn(rchild); } }; -class EncoderDecimal -{ -public: - template - inline static UnsignedType encode(UnsignedType x) - { - static const size_t bits = CHAR_BIT * sizeof(UnsignedType); - const UnsignedType a = x >> (bits - 1); - const UnsignedType b = (-static_cast(a)) | (UnsignedType(1) << (bits - 1)); - return x ^ b; - } -}; -} // namespace encoder - -// Value managers are used to generalize the sorting algorithm -// to sorting of keys and sorting of pairs -namespace value_manager -{ -class DummyValueManager -{ -public: - inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) - { - (void)thread; - (void)bucket; - (void)num; - (void)from_pos; - } - - inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) - { - (void)thread; - (void)bucket; - (void)num; - (void)to_pos; - } - - void Next() {} -}; - -template -class PairValueManager -{ -public: - PairValueManager() - : max_elems_(0) - , max_threads_(0) - , original_(NULL) - , tmp_(NULL) - , src_(NULL) - , dst_(NULL) - , out_buf_(NULL) - { - } - - ~PairValueManager() { DeleteAll(); } - - void Init(size_t max_elems); - - void Start(ValueType* original, size_t num_elems) - { - assert(num_elems <= max_elems_); - src_ = original_ = original; - dst_ = tmp_; - } - - inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) - { - out_buf_[thread][bucket][num] = src_[from_pos]; - } - - inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) - { - for (size_t i = 0; i < num; ++i) - { - dst_[to_pos++] = out_buf_[thread][bucket][i]; - } - } - - void Next() { std::swap(src_, dst_); } - - ValueType* GetResult() { return src_; } -private: - size_t max_elems_; - int max_threads_; - - static constexpr size_t kOutBufferSize = internal::kOutBufferSize; - ValueType *original_, *tmp_; - ValueType *src_, *dst_; - ValueType*** out_buf_; - - void DeleteAll(); -}; - -template -void PairValueManager::Init(size_t max_elems) -{ - DeleteAll(); - - max_elems_ = max_elems; - max_threads_ = utility::GetMaxThreads(max_elems_ * sizeof(PlainType)); - - tmp_ = new ValueType[max_elems]; - - out_buf_ = new ValueType**[max_threads_]; - for (int i = 0; i < max_threads_; ++i) - { - out_buf_[i] = new ValueType*[1 << Base]; - for (size_t j = 0; j < 1 << Base; ++j) - { - out_buf_[i][j] = new ValueType[kOutBufferSize]; - } - } -} - -template -void PairValueManager::DeleteAll() -{ - delete[] tmp_; - tmp_ = NULL; - - for (int i = 0; i < max_threads_; ++i) - { - for (size_t j = 0; j < 1 << Base; ++j) - { - delete[] out_buf_[i][j]; - } - delete[] out_buf_[i]; - } - delete[] out_buf_; - out_buf_ = NULL; - - max_elems_ = 0; - max_threads_ = 0; -} -} // namespace value_manager - -// Frontend class for sorting keys -template -class KeySort -{ - using DummyValueManager = value_manager::DummyValueManager; - using Internal = internal::ParallelRadixSortInternal; - -public: - void InitAndSort(PlainType* data, size_t num_elems, const CompareType& comp) - { - (void)comp; - DummyValueManager dvm; - Internal::InitAndSort(data, num_elems, &dvm); - } -}; - -// Frontend class for sorting pairs -template -class PairSort -{ - using ValueManager = value_manager::PairValueManager; - using Internal = internal:: - ParallelRadixSortInternal; - -public: - void 
InitAndSort(PlainType* keys, ValueType* vals, size_t num_elems, const CompareType& comp) - { - (void)comp; - ValueManager vm; - vm.Init(num_elems); - vm.Start(vals, num_elems); - Internal::InitAndSort(keys, num_elems, &vm); - ValueType* res_vals = vm.GetResult(); - if (res_vals != vals) - { - for (size_t i = 0; i < num_elems; ++i) - { - vals[i] = res_vals[i]; - } - } - } - -private: -}; - -#define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type) \ - template <> \ - class KeySort \ - : public KeySort \ - { \ - }; \ - template \ - class PairSort \ - : public PairSort \ - { \ - }; - -// Unsigned integers -KEY_SORT_CASE(unsigned int, std::less, unsigned int, Unsigned); -KEY_SORT_CASE(unsigned int, std::greater, unsigned int, Unsigned); -KEY_SORT_CASE(unsigned short int, std::less, unsigned short int, Unsigned); -KEY_SORT_CASE(unsigned short int, std::greater, unsigned short int, Unsigned); -KEY_SORT_CASE(unsigned long int, std::less, unsigned long int, Unsigned); -KEY_SORT_CASE(unsigned long int, std::greater, unsigned long int, Unsigned); -KEY_SORT_CASE(unsigned long long int, - std::less, - unsigned long long int, - Unsigned); -KEY_SORT_CASE(unsigned long long int, - std::greater, - unsigned long long int, - Unsigned); - -// Unsigned char -KEY_SORT_CASE(unsigned char, std::less, unsigned char, Unsigned); -KEY_SORT_CASE(unsigned char, std::greater, unsigned char, Unsigned); -KEY_SORT_CASE(char16_t, std::less, uint16_t, Unsigned); -KEY_SORT_CASE(char16_t, std::greater, uint16_t, Unsigned); -KEY_SORT_CASE(char32_t, std::less, uint32_t, Unsigned); -KEY_SORT_CASE(char32_t, std::greater, uint32_t, Unsigned); -KEY_SORT_CASE(wchar_t, std::less, uint32_t, Unsigned); -KEY_SORT_CASE(wchar_t, std::greater, uint32_t, Unsigned); - -// Signed integers -KEY_SORT_CASE(char, std::less, unsigned char, Signed); -KEY_SORT_CASE(char, std::greater, unsigned char, Signed); -KEY_SORT_CASE(short, std::less, unsigned short, Signed); -KEY_SORT_CASE(short, std::greater, unsigned short, Signed); -KEY_SORT_CASE(int, std::less, unsigned int, Signed); -KEY_SORT_CASE(int, std::greater, unsigned int, Signed); -KEY_SORT_CASE(long, std::less, unsigned long, Signed); -KEY_SORT_CASE(long, std::greater, unsigned long, Signed); -KEY_SORT_CASE(long long, std::less, unsigned long long, Signed); -KEY_SORT_CASE(long long, std::greater, unsigned long long, Signed); - -// |signed char| and |char| are treated as different types -KEY_SORT_CASE(signed char, std::less, unsigned char, Signed); -KEY_SORT_CASE(signed char, std::greater, unsigned char, Signed); - -// Floating point numbers -KEY_SORT_CASE(float, std::less, uint32_t, Decimal); -KEY_SORT_CASE(float, std::greater, uint32_t, Decimal); -KEY_SORT_CASE(double, std::less, uint64_t, Decimal); -KEY_SORT_CASE(double, std::greater, uint64_t, Decimal); - -#undef KEY_SORT_CASE - -template -struct run_kx_radix_sort_keys -{ - static void run(T* data, size_t num_elems, const CompareType& comp) - { - std::sort(data, data + num_elems, comp); - } -}; - -#define KX_SORT_KEYS(key_type) \ - template <> \ - struct run_kx_radix_sort_keys> \ - { \ - static void run(key_type* data, size_t num_elems, const std::less& comp) \ - { \ - (void)comp; \ - kx::radix_sort(data, data + num_elems); \ - } \ - }; - -KX_SORT_KEYS(unsigned short int); -KX_SORT_KEYS(int); -KX_SORT_KEYS(unsigned int); -KX_SORT_KEYS(long int); -KX_SORT_KEYS(unsigned long int); -KX_SORT_KEYS(long long int); -KX_SORT_KEYS(unsigned long long int); -KX_SORT_KEYS(unsigned char); - -#undef KX_SORT_KEYS - -template -bool 
use_serial_sort_keys(T* data, size_t num_elems, const CompareType& comp) -{ - size_t total_bytes = (num_elems) * sizeof(T); - if (total_bytes < MIN_BYTES_FOR_PARALLEL) - { - run_kx_radix_sort_keys::run(data, num_elems, comp); - return true; - } - return false; -} - -// Generate radix sort interfaces for key and key value sorts. -#define VTKM_TBB_SORT_EXPORT(key_type) \ - void parallel_radix_sort_key_values( \ - key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater& comp) \ - { \ - PairSort> ps; \ - ps.InitAndSort(keys, vals, num_elems, comp); \ - } \ - void parallel_radix_sort_key_values( \ - key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less& comp) \ - { \ - PairSort> ps; \ - ps.InitAndSort(keys, vals, num_elems, comp); \ - } \ - void parallel_radix_sort(key_type* data, size_t num_elems, const std::greater& comp) \ - { \ - if (!use_serial_sort_keys(data, num_elems, comp)) \ - { \ - KeySort> ks; \ - ks.InitAndSort(data, num_elems, comp); \ - } \ - } \ - void parallel_radix_sort(key_type* data, size_t num_elems, const std::less& comp) \ - { \ - if (!use_serial_sort_keys(data, num_elems, comp)) \ - { \ - KeySort> ks; \ - ks.InitAndSort(data, num_elems, comp); \ - } \ - } - -VTKM_TBB_SORT_EXPORT(short int); -VTKM_TBB_SORT_EXPORT(unsigned short int); -VTKM_TBB_SORT_EXPORT(int); -VTKM_TBB_SORT_EXPORT(unsigned int); -VTKM_TBB_SORT_EXPORT(long int); -VTKM_TBB_SORT_EXPORT(unsigned long int); -VTKM_TBB_SORT_EXPORT(long long int); -VTKM_TBB_SORT_EXPORT(unsigned long long int); -VTKM_TBB_SORT_EXPORT(unsigned char); -VTKM_TBB_SORT_EXPORT(signed char); -VTKM_TBB_SORT_EXPORT(char); -VTKM_TBB_SORT_EXPORT(char16_t); -VTKM_TBB_SORT_EXPORT(char32_t); -VTKM_TBB_SORT_EXPORT(wchar_t); -VTKM_TBB_SORT_EXPORT(float); -VTKM_TBB_SORT_EXPORT(double); - -#undef VTKM_TBB_SORT_EXPORT - -VTKM_THIRDPARTY_POST_INCLUDE -} +VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER(RadixThreaderTBB) } } } +} // vtkm::cont::tbb::sort diff --git a/vtkm/cont/tbb/internal/ParallelSortTBB.h b/vtkm/cont/tbb/internal/ParallelSortTBB.h index 5835e4382..e9dc17686 100644 --- a/vtkm/cont/tbb/internal/ParallelSortTBB.h +++ b/vtkm/cont/tbb/internal/ParallelSortTBB.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -38,128 +39,27 @@ namespace cont { namespace tbb { -namespace internal +namespace sort { -struct RadixSortTag -{ -}; -struct PSortTag -{ -}; -template -struct is_valid_compare_type : std::integral_constant -{ -}; -template -struct is_valid_compare_type> : std::integral_constant -{ -}; -template -struct is_valid_compare_type> : std::integral_constant -{ -}; -template <> -struct is_valid_compare_type : std::integral_constant -{ -}; -template <> -struct is_valid_compare_type : std::integral_constant -{ -}; -template -BComp&& get_std_compare(BComp&& b, T&&) -{ - return std::forward(b); -} -template -std::less get_std_compare(vtkm::SortLess, T&&) -{ - return std::less{}; -} -template -std::greater get_std_compare(vtkm::SortGreater, T&&) -{ - return std::greater{}; -} - - -template -struct sort_tag_type -{ - using type = PSortTag; -}; -template -struct sort_tag_type -{ - using PrimT = std::is_arithmetic; - using LongDT = std::is_same; - using BComp = is_valid_compare_type; - using type = typename std::conditional::type; -}; - -template -struct sortbykey_tag_type -{ - using type = PSortTag; -}; -template -struct sortbykey_tag_type -{ - using PrimT = std::is_arithmetic; - using PrimU = std::is_arithmetic; - using LongDT = std::is_same; - using BComp = is_valid_compare_type; - using type = - 
typename std::conditional::type; -}; - - -#define VTKM_TBB_SORT_EXPORT(key_type) \ - VTKM_CONT_EXPORT void parallel_radix_sort( \ - key_type* data, size_t num_elems, const std::greater& comp); \ - VTKM_CONT_EXPORT void parallel_radix_sort( \ - key_type* data, size_t num_elems, const std::less& comp); \ - VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ - key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater& comp); \ - VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \ - key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less& comp); - -// Generate radix sort interfaces for key and key value sorts. -VTKM_TBB_SORT_EXPORT(short int); -VTKM_TBB_SORT_EXPORT(unsigned short int); -VTKM_TBB_SORT_EXPORT(int); -VTKM_TBB_SORT_EXPORT(unsigned int); -VTKM_TBB_SORT_EXPORT(long int); -VTKM_TBB_SORT_EXPORT(unsigned long int); -VTKM_TBB_SORT_EXPORT(long long int); -VTKM_TBB_SORT_EXPORT(unsigned long long int); -VTKM_TBB_SORT_EXPORT(unsigned char); -VTKM_TBB_SORT_EXPORT(signed char); -VTKM_TBB_SORT_EXPORT(char); -VTKM_TBB_SORT_EXPORT(char16_t); -VTKM_TBB_SORT_EXPORT(char32_t); -VTKM_TBB_SORT_EXPORT(wchar_t); -VTKM_TBB_SORT_EXPORT(float); -VTKM_TBB_SORT_EXPORT(double); -#undef VTKM_TBB_SORT_EXPORT +// Declare the compiled radix sort specializations: +VTKM_DECLARE_RADIX_SORT() +// Forward declare entry points (See stack overflow discussion 7255281 -- +// templated overloads of template functions are not specialization, and will +// be resolved during the first phase of two part lookup). template -void parallel_sort(vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare) -{ - using SortAlgorithmTag = typename sort_tag_type::type; - parallel_sort(values, binary_compare, SortAlgorithmTag{}); -} +void parallel_sort(vtkm::cont::ArrayHandle&, BinaryCompare); +template +void parallel_sort_bykey(vtkm::cont::ArrayHandle&, + vtkm::cont::ArrayHandle&, + BinaryCompare); + +// Quicksort values: template -void parallel_sort(HandleType& values, BinaryCompare binary_compare, PSortTag) +void parallel_sort(HandleType& values, + BinaryCompare binary_compare, + vtkm::cont::internal::radix::PSortTag) { auto arrayPortal = values.PrepareForInPlace(vtkm::cont::DeviceAdapterTagTBB()); @@ -169,31 +69,37 @@ void parallel_sort(HandleType& values, BinaryCompare binary_compare, PSortTag) internal::WrappedBinaryOperator wrappedCompare(binary_compare); ::tbb::parallel_sort(iterators.GetBegin(), iterators.GetEnd(), wrappedCompare); } + +// Radix sort values: template void parallel_sort(vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare, - RadixSortTag) + vtkm::cont::internal::radix::RadixSortTag) { + using namespace vtkm::cont::internal::radix; auto c = get_std_compare(binary_compare, T{}); parallel_radix_sort( values.GetStorage().GetArray(), static_cast(values.GetNumberOfValues()), c); } -template -void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, - vtkm::cont::ArrayHandle& values, - BinaryCompare binary_compare) +// Value sort -- static switch between quicksort and radix sort +template +void parallel_sort(vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare) { - using SortAlgorithmTag = - typename sortbykey_tag_type::type; - parallel_sort_bykey(keys, values, binary_compare, SortAlgorithmTag{}); + using namespace vtkm::cont::internal::radix; + using SortAlgorithmTag = typename sort_tag_type::type; + parallel_sort(values, binary_compare, SortAlgorithmTag{}); } + + +// Quicksort by key template void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, 
vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare, - PSortTag) + vtkm::cont::internal::radix::PSortTag) { + using namespace vtkm::cont::internal::radix; using KeyType = vtkm::cont::ArrayHandle; constexpr bool larger_than_64bits = sizeof(U) > sizeof(vtkm::Int64); if (larger_than_64bits) @@ -243,23 +149,28 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, zipHandle, vtkm::cont::internal::KeyCompare(binary_compare), PSortTag{}); } } + +// Radix sort by key -- Specialize for vtkm::Id values: template void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare, - RadixSortTag) + vtkm::cont::internal::radix::RadixSortTag) { + using namespace vtkm::cont::internal::radix; auto c = get_std_compare(binary_compare, T{}); parallel_radix_sort_key_values(keys.GetStorage().GetArray(), values.GetStorage().GetArray(), static_cast(keys.GetNumberOfValues()), c); } + +// Radix sort by key -- Generic impl: template void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, vtkm::cont::ArrayHandle& values, BinaryCompare binary_compare, - RadixSortTag) + vtkm::cont::internal::radix::RadixSortTag) { using KeyType = vtkm::cont::ArrayHandle; using ValueType = vtkm::cont::ArrayHandle; @@ -287,7 +198,7 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, ZipHandleType zipHandle = vtkm::cont::make_ArrayHandleZip(keys, indexArray); parallel_sort(zipHandle, vtkm::cont::internal::KeyCompare(binary_compare), - PSortTag{}); + vtkm::cont::internal::radix::PSortTag{}); } tbb::ScatterPortal(values.PrepareForInput(vtkm::cont::DeviceAdapterTagTBB()), @@ -301,9 +212,21 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, tbb::CopyPortals(inputPortal, outputPortal, 0, 0, valuesScattered.GetNumberOfValues()); } } + +// Sort by key -- static switch between radix and quick sort: +template +void parallel_sort_bykey(vtkm::cont::ArrayHandle& keys, + vtkm::cont::ArrayHandle& values, + BinaryCompare binary_compare) +{ + using namespace vtkm::cont::internal::radix; + using SortAlgorithmTag = + typename sortbykey_tag_type::type; + parallel_sort_bykey(keys, values, binary_compare, SortAlgorithmTag{}); } } } } +} // end namespace vtkm::cont::tbb::sort #endif // vtk_m_cont_tbb_internal_ParallelSort_h
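
For reference, the generalized interface above only assumes a copyable 'threader' object
providing GetAvailableCores(), RunParentTask(), and RunChildTasks(). Below is a minimal
sketch of such a threader built only on std::thread; the names StdThreadThreader and
ThreadData are hypothetical and are not part of this patch, they simply illustrate the
contract that RadixThreaderTBB satisfies with TBB tasks and that a future OpenMP backend
would satisfy with OpenMP tasks.

// Minimal sketch of a threader matching the "Threading Interface" documented
// in ParallelRadixSort.h. StdThreadThreader and ThreadData are illustrative
// names only (assumption, not from the patch).
#include <cstddef>
#include <thread>

struct StdThreadThreader
{
  // Opaque per-task data handed back to RunChildTasks; unused in this sketch.
  struct ThreadData
  {
  };

  std::size_t GetAvailableCores() const
  {
    const unsigned int n = std::thread::hardware_concurrency();
    return n > 0 ? static_cast<std::size_t>(n) : 1;
  }

  // Run the root task and block until it and all children it spawned through
  // RunChildTasks have completed. Everything below is synchronous, so simply
  // returning from task() provides that guarantee.
  template <typename TaskType>
  void RunParentTask(TaskType task) const
  {
    ThreadData data;
    task(&data);
  }

  // Run two child tasks on their own threads and join both before returning,
  // which keeps the blocking guarantee required by RunParentTask.
  template <typename TaskType>
  void RunChildTasks(ThreadData*, TaskType left, TaskType right) const
  {
    ThreadData leftData;
    ThreadData rightData;
    std::thread leftThread([&] { left(&leftData); });
    std::thread rightThread([&] { right(&rightData); });
    leftThread.join();
    rightThread.join();
  }
};

RadixThreaderTBB in this patch plays the same role: RunParentTask maps onto
::tbb::task::spawn_root_and_wait, and RunChildTasks allocates a continuation with two
spawned children. Any additional backend (such as the planned OpenMP one) only needs to
provide these same three members and pass itself to
VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER.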