Merge branch 'master' of gitlab.kitware.com:m-kim/vtk-m into advdatamodel

2024-09-16 17:22:55 +00:00 · 2019-06-26 19:37:47 -04:00 · 2019-06-26 19:37:47 -04:00 · 8dbb1c4de3
commit 8dbb1c4de3
parent f3f71eb141 0eac06f5b0
15 changed files with 2923 additions and 100 deletions
--- a/CMake/FindMPI.cmake
+++ b/CMake/FindMPI.cmake
--- a/CMake/VTKmMPI.cmake
+++ b/CMake/VTKmMPI.cmake
@ -0,0 +1,24 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##============================================================================
+
+if(VTKm_ENABLE_MPI AND NOT TARGET MPI::MPI_CXX)
+  if(CMAKE_VERSION VERSION_LESS 3.15)
+    #While CMake 3.10 introduced the new MPI module.
+    #Fixes related to MPI+CUDA that VTK-m needs are
+    #only found in CMake 3.15+.
+    find_package(MPI REQUIRED MODULE)
+  else()
+    #clunky but we need to make sure we use the upstream module if it exists
+    set(orig_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH})
+    set(CMAKE_MODULE_PATH "")
+    find_package(MPI MODULE)
+    set(CMAKE_MODULE_PATH ${orig_CMAKE_MODULE_PATH})
+  endif()
+endif()
--- a/CMake/VTKmWrappers.cmake
+++ b/CMake/VTKmWrappers.cmake
@ -12,6 +12,7 @@ include(CMakeParseArguments)

 include(VTKmDeviceAdapters)
 include(VTKmCPUVectorization)
+include(VTKmMPI)

 #-----------------------------------------------------------------------------
 # Utility to build a kit name from the current directory.
--- a/CMake/testing/VTKmTestInstall.cmake
+++ b/CMake/testing/VTKmTestInstall.cmake
@ -62,13 +62,13 @@ file(GENERATE
  OUTPUT "${${file_loc_var}}"
  CONTENT
 "
-set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING \"\")
-set(CMAKE_PREFIX_PATH ${install_prefix} CACHE STRING \"\")
-set(CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH \"\")
-set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING \"\")
-set(CMAKE_CUDA_COMPILER ${CMAKE_CUDA_COMPILER} CACHE FILEPATH \"\")
-set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} CACHE STRING \"\")
-set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER} CACHE FILEPATH \"\")
+set(CMAKE_MAKE_PROGRAM \"${CMAKE_MAKE_PROGRAM}\" CACHE FILEPATH \"\")
+set(CMAKE_PREFIX_PATH \"${CMAKE_PREFIX_PATH};${install_prefix}/\" CACHE STRING \"\")
+set(CMAKE_CXX_COMPILER \"${CMAKE_CXX_COMPILER}\" CACHE FILEPATH \"\")
+set(CMAKE_CXX_FLAGS \"$CACHE{CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
+set(CMAKE_CUDA_COMPILER \"${CMAKE_CUDA_COMPILER}\" CACHE FILEPATH \"\")
+set(CMAKE_CUDA_FLAGS \"$CACHE{CMAKE_CUDA_FLAGS}\" CACHE STRING \"\")
+set(CMAKE_CUDA_HOST_COMPILER \"${CMAKE_CUDA_HOST_COMPILER}\" CACHE FILEPATH \"\")
 "
 )

@ -81,8 +81,34 @@ function(vtkm_test_against_install dir)
  set(src_dir "${CMAKE_CURRENT_SOURCE_DIR}/${name}/")
  set(build_dir "${VTKm_BINARY_DIR}/CMakeFiles/_tmp_build/test_${name}/")

-  set(build_config "${build_dir}/build_options.cmake")
-  vtkm_generate_install_build_options(build_config)
+  set(args )
+  if(CMAKE_VERSION VERSION_LESS 3.13)
+    #Before 3.13 the config file passing to cmake via ctest --build-options
+    #was broken
+    set(args
+      -DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}
+      -DCMAKE_PREFIX_PATH:STRING=${install_prefix}
+      -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER}
+      -DCMAKE_CUDA_COMPILER:FILEPATH=${CMAKE_CUDA_COMPILER}
+      -DCMAKE_CUDA_HOST_COMPILER:FILEPATH=${CMAKE_CUDA_HOST_COMPILER}
+      -DCMAKE_CXX_FLAGS:STRING=$CACHE{CMAKE_CXX_FLAGS}
+      -DCMAKE_CUDA_FLAGS:STRING=$CACHE{CMAKE_CUDA_FLAGS}
+    )
+  else()
+    set(build_config "${build_dir}build_options.cmake")
+    vtkm_generate_install_build_options(build_config)
+    set(args -C ${build_config})
+  endif()
+
+  if(WIN32 AND TARGET vtkm::tbb)
+    #on windows we need to specify these as FindTBB won't
+    #find the installed version just with the prefix path
+    list(APPEND args
+      -DTBB_LIBRARY_DEBUG:FILEPATH=${TBB_LIBRARY_DEBUG}
+      -DTBB_LIBRARY_RELEASE:FILEPATH=${TBB_LIBRARY_RELEASE}
+      -DTBB_INCLUDE_DIR:PATH=${TBB_INCLUDE_DIR}
+    )
+  endif()

  #determine if the test is expected to compile or fail to build. We use
  #this information to built the test name to make it clear to the user
@ -93,10 +119,13 @@ function(vtkm_test_against_install dir)

  add_test(NAME ${build_name}
           COMMAND ${CMAKE_CTEST_COMMAND}
+           -C $<CONFIG>
           --build-and-test ${src_dir} ${build_dir}
           --build-generator ${CMAKE_GENERATOR}
           --build-makeprogram ${CMAKE_MAKE_PROGRAM}
-           --build-options -C "${build_config}"
+           --build-options
+            ${args}
+            --no-warn-unused-cli
           )

  set_tests_properties(${build_name} PROPERTIES LABELS ${test_label} )
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -182,14 +182,6 @@ check_type_size("long long" VTKm_SIZE_LONG_LONG BUILTIN_TYPES_ONLY)

 #-----------------------------------------------------------------------------
 # Add subdirectories
-if(VTKm_ENABLE_MPI)
-  # This `if` is temporary and will be removed once `diy` supports building
-  # without MPI.
-  if (NOT MPI_C_FOUND)
-    find_package(MPI ${VTKm_FIND_PACKAGE_QUIETLY})
-  endif()
-endif()
-
 add_subdirectory(vtkm)

 #-----------------------------------------------------------------------------
@ -241,6 +233,7 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
  install(
    FILES
      ${VTKm_SOURCE_DIR}/CMake/FindTBB.cmake
+      ${VTKm_SOURCE_DIR}/CMake/FindMPI.cmake
      ${VTKm_SOURCE_DIR}/CMake/FindOpenGL.cmake
      ${VTKm_SOURCE_DIR}/CMake/FindOpenMP.cmake
    DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
@ -253,6 +246,7 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
      ${VTKm_SOURCE_DIR}/CMake/VTKmDetectCUDAVersion.cu
      ${VTKm_SOURCE_DIR}/CMake/VTKmDeviceAdapters.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
+      ${VTKm_SOURCE_DIR}/CMake/VTKmMPI.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmRenderingContexts.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmWrappers.cmake
    DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
@ -315,6 +309,4 @@ endif()

 #-----------------------------------------------------------------------------
 # Build examples
-if(VTKm_ENABLE_EXAMPLES)
-  add_subdirectory(examples)
-endif(VTKm_ENABLE_EXAMPLES)
+add_subdirectory(examples)
--- a/LICENSE.txt
+++ b/LICENSE.txt
@ -49,7 +49,6 @@ contents of these for details on the specifics of their respective
 licenses.
 - - - - - - - - - - - - - - - - - - - - - - - - do not remove this line
 CMake/FindTBB.cmake
-CMake/FindGLEW.cmake
 Utilities
 vtkm/cont/tbb/internal/parallel_sort.h
 vtkm/cont/tbb/internal/parallel_radix_sort_tbb.h
--- a/benchmarking/BenchmarkDeviceAdapter.cxx
+++ b/benchmarking/BenchmarkDeviceAdapter.cxx
@ -57,21 +57,24 @@ enum BenchmarkName
  BITFIELD_TO_UNORDERED_SET = 1 << 0,
  COPY = 1 << 1,
  COPY_IF = 1 << 2,
-  LOWER_BOUNDS = 1 << 3,
-  REDUCE = 1 << 4,
-  REDUCE_BY_KEY = 1 << 5,
-  SCAN_INCLUSIVE = 1 << 6,
-  SCAN_EXCLUSIVE = 1 << 7,
-  SORT = 1 << 8,
-  SORT_BY_KEY = 1 << 9,
-  STABLE_SORT_INDICES = 1 << 10,
-  STABLE_SORT_INDICES_UNIQUE = 1 << 11,
-  UNIQUE = 1 << 12,
-  UPPER_BOUNDS = 1 << 13,
+  COUNT_SET_BITS = 1 << 3,
+  FILL = 1 << 4,
+  LOWER_BOUNDS = 1 << 5,
+  REDUCE = 1 << 6,
+  REDUCE_BY_KEY = 1 << 7,
+  SCAN_EXCLUSIVE = 1 << 8,
+  SCAN_INCLUSIVE = 1 << 9,
+  SORT = 1 << 10,
+  SORT_BY_KEY = 1 << 11,
+  STABLE_SORT_INDICES = 1 << 12,
+  STABLE_SORT_INDICES_UNIQUE = 1 << 13,
+  UNIQUE = 1 << 14,
+  UPPER_BOUNDS = 1 << 15,

-  ALL = BITFIELD_TO_UNORDERED_SET | COPY | COPY_IF | LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY |
-    SCAN_INCLUSIVE |
+  ALL = BITFIELD_TO_UNORDERED_SET | COPY | COPY_IF | COUNT_SET_BITS | FILL | LOWER_BOUNDS | REDUCE |
+    REDUCE_BY_KEY |
    SCAN_EXCLUSIVE |
+    SCAN_INCLUSIVE |
    SORT |
    SORT_BY_KEY |
    STABLE_SORT_INDICES |
@ -84,28 +87,20 @@ enum BenchmarkName
 /// described below:
 struct BenchDevAlgoConfig
 {
-  /// Benchmarks to run. Possible values:
-  /// Copy, CopyIf, LowerBounds, Reduce, ReduceByKey, ScanInclusive,
-  /// ScanExclusive, Sort, SortByKey, StableSortIndices, StableSortIndicesUnique,
-  /// Unique, UpperBounds, or All. (Default: All).
-  // Zero is for parsing, will change to 'all' in main if needed.
+  /// Benchmarks to run. See BenchmarkName enum.
  int BenchmarkFlags{ 0 };

  /// ValueTypes to test.
-  /// CLI arg: "TypeList [Base|Extended]" (Base is default).
+  /// CLI arg: "--base-typelist | --extended-typelist" (Base is default).
  bool ExtendedTypeList{ false };

  /// Run benchmarks using the same number of bytes for all arrays.
-  /// CLI arg: "FixBytes [n|off]" (n is the number of bytes, default: 2097152, ie. 2MiB)
-  /// @note FixBytes and FixSizes are not mutually exclusive. If both are
-  /// specified, both will run.
+  /// CLI arg: "--array-size-bytes [n]" (n is the number of bytes, default: 2097152, ie. 2MiB)
  bool TestArraySizeBytes{ true };
  vtkm::UInt64 ArraySizeBytes{ 1 << 21 };

  /// Run benchmarks using the same number of values for all arrays.
-  /// CLI arg: "FixSizes [n|off]" (n is the number of values, default: off)
-  /// @note FixBytes and FixSizes are not mutually exclusive. If both are
-  /// specified, both will run.
+  /// CLI arg: "--array-size-values [n]" (n is the number of values, default: off)
  bool TestArraySizeValues{ false };
  vtkm::UInt64 ArraySizeValues{ 1 << 21 };

@ -113,7 +108,7 @@ struct BenchDevAlgoConfig
  /// values (5%, 10%, 15%, 20%, 25%, 30%, 35%, 40%, 45%, 50%, 75%, 100%
  /// unique). If false (default), the range is limited to 5%, 25%, 50%, 75%,
  /// 100%.
-  /// CLI arg: "DetailedOutputRange" enables the extended range.
+  /// CLI arg: "--more-output-range" enables the extended range.
  bool DetailedOutputRangeScaling{ false };

  // Internal: The benchmarking code will set this depending on execution phase:
@ -289,11 +284,11 @@ public:
    {
      if (wordIdx <= this->MaxMaskedWord && (wordIdx % this->Stride) == 0)
      {
-        this->Portal.SetWord(wordIdx, this->Exemplar);
+        this->Portal.SetWordAtomic(wordIdx, this->Exemplar);
      }
      else
      {
-        this->Portal.SetWord(wordIdx, static_cast<WordType>(0));
+        this->Portal.SetWordAtomic(wordIdx, static_cast<WordType>(0));
      }
    }
  };
@ -317,8 +312,10 @@ public:
      stride = 1;
    }

+    vtkm::Id numBits = numWords * static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT);
+
    vtkm::cont::BitField bits;
-    auto portal = bits.PrepareForOutput(numWords, DeviceAdapterTag{});
+    auto portal = bits.PrepareForOutput(numBits, DeviceAdapterTag{});

    using Functor = GenerateBitFieldFunctor<WordType, decltype(portal)>;

@ -533,6 +530,184 @@ private:
  VTKM_MAKE_BENCHMARK(CopyIf75, BenchCopyIf, 75);
  VTKM_MAKE_BENCHMARK(CopyIf100, BenchCopyIf, 100);

+  template <typename WordType, typename DeviceAdapter>
+  struct BenchCountSetBits
+  {
+    vtkm::Id NumWords;
+    vtkm::Id NumBits;
+    WordType Exemplar;
+    vtkm::Id Stride;
+    vtkm::Float32 FillRatio;
+    vtkm::Id MaxMaskedIndex;
+    std::string Name;
+
+    vtkm::cont::BitField Bits;
+
+    // See GenerateBitField for details. fillRatio is used to compute
+    // maxMaskedWord.
+    VTKM_CONT
+    BenchCountSetBits(WordType exemplar,
+                      vtkm::Id stride,
+                      vtkm::Float32 fillRatio,
+                      const std::string& name)
+      : NumWords(Config.ComputeNumberOfWords<WordType>())
+      , NumBits(this->NumWords * static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT))
+      , Exemplar(exemplar)
+      , Stride(stride)
+      , FillRatio(fillRatio)
+      , MaxMaskedIndex(this->NumWords / static_cast<vtkm::Id>(1. / this->FillRatio))
+      , Name(name)
+      , Bits(GenerateBitField<WordType, DeviceAdapter>(this->Exemplar,
+                                                       this->Stride,
+                                                       this->MaxMaskedIndex,
+                                                       this->NumWords))
+    {
+    }
+
+    VTKM_CONT
+    vtkm::Float64 operator()()
+    {
+      Timer timer(DeviceAdapter{});
+      timer.Start();
+      Algorithm::CountSetBits(DeviceAdapter{}, this->Bits);
+      return timer.GetElapsedTime();
+    }
+
+    VTKM_CONT
+    std::string Description() const
+    {
+      const vtkm::Id numFilledWords = this->MaxMaskedIndex / this->Stride;
+      const vtkm::Id numSetBits = numFilledWords * vtkm::CountSetBits(this->Exemplar);
+
+      std::stringstream description;
+      description << "CountSetBits" << this->Name << " ( "
+                  << "NumWords: " << this->NumWords << " "
+                  << "Exemplar: " << std::hex << this->Exemplar << std::dec << " "
+                  << "FillRatio: " << this->FillRatio << " "
+                  << "Stride: " << this->Stride << " "
+                  << "NumSetBits: " << numSetBits << " )";
+      return description.str();
+    }
+  };
+  VTKM_MAKE_BENCHMARK(CountSetBitsNull, BenchCountSetBits, 0x00000000, 1, 0.f, "Null");
+  VTKM_MAKE_BENCHMARK(CountSetBitsFull, BenchCountSetBits, 0xffffffff, 1, 1.f, "Full");
+  VTKM_MAKE_BENCHMARK(CountSetBitsHalfWord, BenchCountSetBits, 0xffff0000, 1, 1.f, "HalfWord");
+  VTKM_MAKE_BENCHMARK(CountSetBitsHalfField, BenchCountSetBits, 0xffffffff, 1, 0.5f, "HalfField");
+  VTKM_MAKE_BENCHMARK(CountSetBitsAlternateWords,
+                      BenchCountSetBits,
+                      0xffffffff,
+                      2,
+                      1.f,
+                      "AlternateWords");
+  VTKM_MAKE_BENCHMARK(CountSetBitsAlternateBits,
+                      BenchCountSetBits,
+                      0x55555555,
+                      1,
+                      1.f,
+                      "AlternateBits");
+
+  template <typename ValueType, typename DeviceAdapter>
+  struct BenchFillArrayHandle
+  {
+    vtkm::cont::ArrayHandle<ValueType> Handle;
+
+    VTKM_CONT
+    BenchFillArrayHandle() {}
+
+    VTKM_CONT
+    vtkm::Float64 operator()()
+    {
+      const vtkm::Id numVals = Config.ComputeSize<ValueType>();
+      Timer timer(DeviceAdapter{});
+      timer.Start();
+      Algorithm::Fill(DeviceAdapter{}, this->Handle, TestValue(19, ValueType{}), numVals);
+      return timer.GetElapsedTime();
+    }
+
+    VTKM_CONT
+    std::string Description() const
+    {
+      const vtkm::Id numVals = Config.ComputeSize<ValueType>();
+      std::stringstream description;
+      description << "Fill (ArrayHandle, " << numVals << " values)";
+      return description.str();
+    }
+  };
+  VTKM_MAKE_BENCHMARK(FillArrayHandle, BenchFillArrayHandle);
+
+  template <typename WordType, typename DeviceAdapter>
+  struct BenchFillBitField
+  {
+    vtkm::Id NumWords;
+    vtkm::Id NumBits;
+    bool UseBool;
+    WordType Pattern;
+    std::string Name;
+
+    vtkm::cont::BitField Bits;
+
+    VTKM_CONT
+    BenchFillBitField(bool useBool, WordType pattern, const std::string& name)
+      : NumWords(Config.ComputeNumberOfWords<WordType>())
+      , NumBits(this->NumWords * static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT))
+      , UseBool(useBool)
+      , Pattern(pattern)
+      , Name(name)
+    {
+    }
+
+    VTKM_CONT
+    vtkm::Float64 operator()()
+    {
+      Timer timer(DeviceAdapter{});
+      if (this->UseBool)
+      {
+        timer.Start();
+        Algorithm::Fill(DeviceAdapter{}, this->Bits, this->Pattern != 0, this->NumBits);
+        return timer.GetElapsedTime();
+      }
+      else
+      {
+        timer.Start();
+        Algorithm::Fill(DeviceAdapter{}, this->Bits, this->Pattern, this->NumBits);
+        return timer.GetElapsedTime();
+      }
+    }
+
+    VTKM_CONT
+    std::string Description() const
+    {
+      std::stringstream description;
+      description << "Fill (BitField)" << this->Name << " ( "
+                  << "FillPattern: " << std::hex << this->Pattern << std::dec << " "
+                  << "UseBool: " << this->UseBool << " "
+                  << "NumBits: " << this->NumBits << " )";
+      return description.str();
+    }
+  };
+  VTKM_MAKE_BENCHMARK(FillBitFieldTrue, BenchFillBitField, true, 0x1, "True");
+  VTKM_MAKE_BENCHMARK(FillBitFieldFalse, BenchFillBitField, true, 0x0, "False");
+  VTKM_MAKE_BENCHMARK(FillBitField8Bit,
+                      BenchFillBitField,
+                      false,
+                      static_cast<vtkm::UInt8>(0xcc),
+                      "8Bit");
+  VTKM_MAKE_BENCHMARK(FillBitField16Bit,
+                      BenchFillBitField,
+                      false,
+                      static_cast<vtkm::UInt16>(0xcccc),
+                      "16Bit");
+  VTKM_MAKE_BENCHMARK(FillBitField32Bit,
+                      BenchFillBitField,
+                      false,
+                      static_cast<vtkm::UInt32>(0xcccccccc),
+                      "32Bit");
+  VTKM_MAKE_BENCHMARK(FillBitField64Bit,
+                      BenchFillBitField,
+                      false,
+                      static_cast<vtkm::UInt64>(0xcccccccccccccccc),
+                      "64Bit");
+
  template <typename Value, typename DeviceAdapter>
  struct BenchLowerBounds
  {
@ -1154,17 +1329,46 @@ public:
  template <typename ValueTypes>
  static VTKM_CONT void RunInternal(vtkm::cont::DeviceAdapterId id)
  {
-    using BitFieldWordTypes = vtkm::ListTagBase<vtkm::UInt32>;
+    using UInt8Type = vtkm::ListTagBase<vtkm::UInt8>;
+    using UInt16Type = vtkm::ListTagBase<vtkm::UInt16>;
+    using UInt32Type = vtkm::ListTagBase<vtkm::UInt32>;
+    using UInt64Type = vtkm::ListTagBase<vtkm::UInt64>;

+    // These need specific word types:
    if (Config.BenchmarkFlags & BITFIELD_TO_UNORDERED_SET)
    {
      std::cout << DIVIDER << "\nBenchmarking BitFieldToUnorderedSet\n";
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetNull, BitFieldWordTypes{}, id);
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetFull, BitFieldWordTypes{}, id);
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfWord, BitFieldWordTypes{}, id);
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfField, BitFieldWordTypes{}, id);
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateWords, BitFieldWordTypes{}, id);
-      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateBits, BitFieldWordTypes{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetNull, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetFull, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfWord, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfField, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateWords, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateBits, UInt32Type{}, id);
+    }
+
+    if (Config.BenchmarkFlags & COUNT_SET_BITS)
+    {
+      std::cout << DIVIDER << "\nBenchmarking CountSetBits\n";
+      VTKM_RUN_BENCHMARK(CountSetBitsNull, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(CountSetBitsFull, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(CountSetBitsHalfWord, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(CountSetBitsHalfField, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(CountSetBitsAlternateWords, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(CountSetBitsAlternateBits, UInt32Type{}, id);
+    }
+
+    if (Config.BenchmarkFlags & FILL)
+    {
+      std::cout << DIVIDER << "\nBenchmarking Fill (ArrayHandle)\n";
+      VTKM_RUN_BENCHMARK(FillArrayHandle, ValueTypes{}, id);
+
+      std::cout << DIVIDER << "\nBenchmarking Fill (BitField)\n";
+      VTKM_RUN_BENCHMARK(FillBitFieldTrue, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(FillBitFieldFalse, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(FillBitField8Bit, UInt8Type{}, id);
+      VTKM_RUN_BENCHMARK(FillBitField16Bit, UInt16Type{}, id);
+      VTKM_RUN_BENCHMARK(FillBitField32Bit, UInt32Type{}, id);
+      VTKM_RUN_BENCHMARK(FillBitField64Bit, UInt64Type{}, id);
    }

    if (Config.BenchmarkFlags & COPY)
@ -1413,7 +1617,7 @@ struct Arg : vtkm::cont::internal::option::Arg
    const char* c = option.arg;
    while (argIsNum && (*c != '\0'))
    {
-      argIsNum &= static_cast<bool>(std::isdigit(*c));
+      argIsNum = argIsNum && static_cast<bool>(std::isdigit(*c));
      ++c;
    }

@ -1524,7 +1728,8 @@ int main(int argc, char* argv[])
                    "",
                    "",
                    Arg::None,
-                    "\tCopy, CopyIf, LowerBounds, Reduce, ReduceByKey, ScanExclusive, "
+                    "\tBitFieldToUnorderedSet, Copy, CopyIf, CountSetBits, FillBitField, "
+                    "LowerBounds, Reduce, ReduceByKey, ScanExclusive, "
                    "ScanInclusive, Sort, SortByKey, StableSortIndices, StableSortIndicesUnique, "
                    "Unique, UpperBounds" });
  usage.push_back(
@ -1631,6 +1836,14 @@ int main(int argc, char* argv[])
    {
      config.BenchmarkFlags |= vtkm::benchmarking::COPY_IF;
    }
+    else if (arg == "countsetbits")
+    {
+      config.BenchmarkFlags |= vtkm::benchmarking::COUNT_SET_BITS;
+    }
+    else if (arg == "fill")
+    {
+      config.BenchmarkFlags |= vtkm::benchmarking::FILL;
+    }
    else if (arg == "lowerbounds")
    {
      config.BenchmarkFlags |= vtkm::benchmarking::LOWER_BOUNDS;
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -10,28 +10,28 @@

 #add the directory that contains the VTK-m config file to the cmake
 #path so that our examples can find VTK-m
-set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})
-
-add_subdirectory(clipping)
-add_subdirectory(contour_tree)
-add_subdirectory(contour_tree_augmented)
-add_subdirectory(cosmotools)
-add_subdirectory(demo)
-add_subdirectory(game_of_life)
-add_subdirectory(hello_world)
-add_subdirectory(histogram)
-add_subdirectory(isosurface)
-add_subdirectory(lagrangian)
-add_subdirectory(multi_backend)
-add_subdirectory(oscillator)
-add_subdirectory(particle_advection)
-add_subdirectory(redistribute_points)
-add_subdirectory(rendering)
-add_subdirectory(streamline)
-add_subdirectory(temporal_advection)
-add_subdirectory(tetrahedra)
-# add_subdirectory(unified_memory)

+if(VTKm_ENABLE_EXAMPLES)
+  set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})
+  add_subdirectory(clipping)
+  add_subdirectory(contour_tree)
+  add_subdirectory(contour_tree_augmented)
+  add_subdirectory(cosmotools)
+  add_subdirectory(demo)
+  add_subdirectory(game_of_life)
+  add_subdirectory(hello_world)
+  add_subdirectory(histogram)
+  add_subdirectory(isosurface)
+  add_subdirectory(lagrangian)
+  add_subdirectory(multi_backend)
+  add_subdirectory(oscillator)
+  add_subdirectory(particle_advection)
+  add_subdirectory(redistribute_points)
+  add_subdirectory(rendering)
+  add_subdirectory(streamline)
+  add_subdirectory(temporal_advection)
+  add_subdirectory(tetrahedra)
+endif()

 if (VTKm_ENABLE_TESTING)
  # These need to be fast to build as they will
--- a/vtkm/cont/Algorithm.h
+++ b/vtkm/cont/Algorithm.h
@ -108,6 +108,32 @@ struct CopySubRangeFunctor
  }
 };

+struct CountSetBitsFunctor
+{
+  vtkm::Id PopCount{ 0 };
+
+  template <typename Device, typename... Args>
+  VTKM_CONT bool operator()(Device, Args&&... args)
+  {
+    this->PopCount = vtkm::cont::DeviceAdapterAlgorithm<Device>::CountSetBits(
+      PrepareArgForExec<Device>(std::forward<Args>(args))...);
+    return true;
+  }
+};
+
+struct FillFunctor
+{
+  vtkm::Id PopCount{ 0 };
+
+  template <typename Device, typename... Args>
+  VTKM_CONT bool operator()(Device, Args&&... args)
+  {
+    vtkm::cont::DeviceAdapterAlgorithm<Device>::Fill(
+      PrepareArgForExec<Device>(std::forward<Args>(args))...);
+    return true;
+  }
+};
+
 struct LowerBoundsFunctor
 {

@ -489,6 +515,109 @@ struct Algorithm
                        outputIndex);
  }

+  VTKM_CONT static vtkm::Id CountSetBits(vtkm::cont::DeviceAdapterId devId,
+                                         const vtkm::cont::BitField& bits)
+  {
+    detail::CountSetBitsFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, bits);
+    return functor.PopCount;
+  }
+
+  VTKM_CONT static vtkm::Id CountSetBits(const vtkm::cont::BitField& bits)
+  {
+    return CountSetBits(vtkm::cont::DeviceAdapterTagAny{}, bits);
+  }
+
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::BitField& bits,
+                             bool value,
+                             vtkm::Id numBits)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, bits, value, numBits);
+  }
+
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value, vtkm::Id numBits)
+  {
+    Fill(vtkm::cont::DeviceAdapterTagAny{}, bits, value, numBits);
+  }
+
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::BitField& bits,
+                             bool value)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, bits, value);
+  }
+
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value)
+  {
+    Fill(vtkm::cont::DeviceAdapterTagAny{}, bits, value);
+  }
+
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::BitField& bits,
+                             WordType word,
+                             vtkm::Id numBits)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, bits, word, numBits);
+  }
+
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word, vtkm::Id numBits)
+  {
+    Fill(vtkm::cont::DeviceAdapterTagAny{}, bits, word, numBits);
+  }
+
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::BitField& bits,
+                             WordType word)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, bits, word);
+  }
+
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word)
+  {
+    FillBitField(vtkm::cont::DeviceAdapterTagAny{}, bits, word);
+  }
+
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::ArrayHandle<T, S>& handle,
+                             const T& value)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, handle, value);
+  }
+
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& handle, const T& value)
+  {
+    Fill(vtkm::cont::DeviceAdapterTagAny{}, handle, value);
+  }
+
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::DeviceAdapterId devId,
+                             vtkm::cont::ArrayHandle<T, S>& handle,
+                             const T& value,
+                             const vtkm::Id numValues)
+  {
+    detail::FillFunctor functor;
+    vtkm::cont::TryExecuteOnDevice(devId, functor, handle, value, numValues);
+  }
+
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& handle,
+                             const T& value,
+                             const vtkm::Id numValues)
+  {
+    Fill(vtkm::cont::DeviceAdapterTagAny{}, handle, value, numValues);
+  }

  template <typename T, class CIn, class CVal, class COut>
  VTKM_CONT static void LowerBounds(vtkm::cont::DeviceAdapterId devId,
--- a/vtkm/cont/DeviceAdapterAlgorithm.h
+++ b/vtkm/cont/DeviceAdapterAlgorithm.h
@ -119,6 +119,35 @@ struct DeviceAdapterAlgorithm
                                     vtkm::cont::ArrayHandle<U, COut>& output,
                                     vtkm::Id outputIndex = 0);

+  /// \brief Returns the total number of "1" bits in BitField.
+  VTKM_CONT static vtkm::Id CountSetBits(const vtkm::cont::BitField& bits);
+
+  /// \brief Fill the BitField with a specific pattern of bits.
+  /// For boolean values, all bits are set to 1 if value is true, or 0 if value
+  /// is false.
+  /// For word masks, the word type must be an unsigned integral type, which
+  /// will be stamped across the BitField.
+  /// If numBits is provided, the BitField is resized appropriately.
+  /// @{
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value, vtkm::Id numBits);
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value);
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word, vtkm::Id numBits);
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word, bool value);
+  /// @}
+
+  /// Fill @a array with @a value. If @a numValues is specified, the array will
+  /// be resized.
+  /// @{
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& array, const T& value);
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& array,
+                             const T& value,
+                             const vtkm::Id numValues);
+  /// @}
+
  /// \brief Output is the first index in input for each item in values that wouldn't alter the ordering of input
  ///
  /// LowerBounds is a vectorized search. From each value in \c values it finds
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -439,6 +439,95 @@ private:
    }
  }

+
+  template <typename BitsPortal, typename GlobalPopCountType>
+  struct CountSetBitsFunctor : public vtkm::exec::FunctorBase
+  {
+    VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
+                                            std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
+                                            std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
+                           "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
+
+    //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
+    //which is the GCC required compiler for CUDA 9.2 on summit/power9
+    using Word = typename vtkm::cont::internal::AtomicInterfaceExecution<
+      DeviceAdapterTagCuda>::WordTypePreferred;
+
+    VTKM_CONT
+    CountSetBitsFunctor(const BitsPortal& portal, GlobalPopCountType* globalPopCount)
+      : Portal{ portal }
+      , GlobalPopCount{ globalPopCount }
+      , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
+      , FinalWordMask{ portal.GetFinalWordMask() }
+    {
+    }
+
+    ~CountSetBitsFunctor() {}
+
+    VTKM_CONT void Initialize()
+    {
+      assert(this->GlobalPopCount != nullptr);
+      VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
+    }
+
+    VTKM_SUPPRESS_EXEC_WARNINGS
+    __device__ void operator()(vtkm::Id wordIdx) const
+    {
+      Word word = this->Portal.GetWord(wordIdx);
+
+      // The last word may be partial -- mask out trailing bits if needed.
+      const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
+
+      word &= mask;
+
+      if (word != 0)
+      {
+        this->LocalPopCount = vtkm::CountSetBits(word);
+        this->Reduce();
+      }
+    }
+
+    VTKM_CONT vtkm::Id Finalize() const
+    {
+      assert(this->GlobalPopCount != nullptr);
+      GlobalPopCountType result;
+      VTKM_CUDA_CALL(cudaMemcpy(
+        &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
+      return static_cast<vtkm::Id>(result);
+    }
+
+  private:
+    // Every thread with a non-zero local popcount calls this function, which
+    // computes the total popcount for the coalesced threads and atomically
+    // increasing the global popcount.
+    VTKM_SUPPRESS_EXEC_WARNINGS
+    __device__ void Reduce() const
+    {
+      const auto activeLanes = cooperative_groups::coalesced_threads();
+      const int activeRank = activeLanes.thread_rank();
+      const int activeSize = activeLanes.size();
+
+      // Reduction value:
+      vtkm::Int32 rVal = this->LocalPopCount;
+      for (int delta = 1; delta < activeSize; delta *= 2)
+      {
+        rVal += activeLanes.shfl_down(rVal, delta);
+      }
+
+      if (activeRank == 0)
+      {
+        atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
+      }
+    }
+
+    const BitsPortal Portal;
+    GlobalPopCountType* GlobalPopCount;
+    mutable vtkm::Int32 LocalPopCount{ 0 };
+    // Used to mask trailing bits the in last word.
+    vtkm::Id FinalWordIndex{ 0 };
+    Word FinalWordMask{ 0 };
+  };
+
  template <class InputPortal, class ValuesPortal, class OutputPortal>
  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
                                          const ValuesPortal& values,
@ -959,6 +1048,21 @@ private:
    return functor.Finalize();
  }

+  template <typename GlobalPopCountType, typename BitsPortal>
+  VTKM_CONT static vtkm::Id CountSetBitsPortal(const BitsPortal& bits)
+  {
+    using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
+
+    // RAII for the global atomic counter.
+    auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
+    Functor functor{ bits, globalCount.get() };
+
+    functor.Initialize();
+    Schedule(functor, bits.GetNumberOfWords());
+    Synchronize(); // Ensure kernel is done before checking final atomic count
+    return functor.Finalize();
+  }
+
  //-----------------------------------------------------------------------------

 public:
@ -1095,6 +1199,14 @@ public:
    return true;
  }

+  VTKM_CONT static vtkm::Id CountSetBits(const vtkm::cont::BitField& bits)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+    auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{});
+    // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
+    return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
+  }
+
  template <typename T, class SIn, class SVal, class SOut>
  VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<T, SIn>& input,
                                    const vtkm::cont::ArrayHandle<T, SVal>& values,
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
@ -269,6 +269,170 @@ public:
    return true;
  }

+  //--------------------------------------------------------------------------
+  // Count Set Bits
+  VTKM_CONT static vtkm::Id CountSetBits(const vtkm::cont::BitField& bits)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    auto bitsPortal = bits.PrepareForInput(DeviceAdapterTag{});
+
+    std::atomic<vtkm::UInt64> popCount;
+    popCount.store(0, std::memory_order_relaxed);
+
+    using Functor = CountSetBitsFunctor<decltype(bitsPortal)>;
+    Functor functor{ bitsPortal, popCount };
+
+    DerivedAlgorithm::Schedule(functor, functor.GetNumberOfInstances());
+    DerivedAlgorithm::Synchronize();
+
+    return static_cast<vtkm::Id>(popCount.load(std::memory_order_seq_cst));
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill Bit Field (bool, resize)
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value, vtkm::Id numBits)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    if (numBits == 0)
+    {
+      bits.Shrink(0);
+      return;
+    }
+
+    auto portal = bits.PrepareForOutput(numBits, DeviceAdapterTag{});
+
+    using WordType =
+      typename vtkm::cont::BitField::template ExecutionTypes<DeviceAdapterTag>::WordTypePreferred;
+
+    using Functor = FillBitFieldFunctor<decltype(portal), WordType>;
+    Functor functor{ portal, value ? ~WordType{ 0 } : WordType{ 0 } };
+
+    const vtkm::Id numWords = portal.template GetNumberOfWords<WordType>();
+    DerivedAlgorithm::Schedule(functor, numWords);
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill Bit Field (bool)
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, bool value)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    const vtkm::Id numBits = bits.GetNumberOfBits();
+    if (numBits == 0)
+    {
+      return;
+    }
+
+    auto portal = bits.PrepareForOutput(numBits, DeviceAdapterTag{});
+
+    using WordType =
+      typename vtkm::cont::BitField::template ExecutionTypes<DeviceAdapterTag>::WordTypePreferred;
+
+    using Functor = FillBitFieldFunctor<decltype(portal), WordType>;
+    Functor functor{ portal, value ? ~WordType{ 0 } : WordType{ 0 } };
+
+    const vtkm::Id numWords = portal.template GetNumberOfWords<WordType>();
+    DerivedAlgorithm::Schedule(functor, numWords);
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill Bit Field (mask, resize)
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word, vtkm::Id numBits)
+  {
+    VTKM_STATIC_ASSERT_MSG(vtkm::cont::BitField::IsValidWordType<WordType>{}, "Invalid word type.");
+
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    if (numBits == 0)
+    {
+      bits.Shrink(0);
+      return;
+    }
+
+    auto portal = bits.PrepareForOutput(numBits, DeviceAdapterTag{});
+
+    // If less than 32 bits, repeat the word until we get a 32 bit pattern.
+    // Using this for the pattern prevents races while writing small numbers
+    // to adjacent memory locations.
+    auto repWord = RepeatTo32BitsIfNeeded(word);
+    using RepWordType = decltype(repWord);
+
+    using Functor = FillBitFieldFunctor<decltype(portal), RepWordType>;
+    Functor functor{ portal, repWord };
+
+    const vtkm::Id numWords = portal.template GetNumberOfWords<RepWordType>();
+    DerivedAlgorithm::Schedule(functor, numWords);
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill Bit Field (mask)
+  template <typename WordType>
+  VTKM_CONT static void Fill(vtkm::cont::BitField& bits, WordType word)
+  {
+    VTKM_STATIC_ASSERT_MSG(vtkm::cont::BitField::IsValidWordType<WordType>{}, "Invalid word type.");
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    const vtkm::Id numBits = bits.GetNumberOfBits();
+    if (numBits == 0)
+    {
+      return;
+    }
+
+    auto portal = bits.PrepareForOutput(numBits, DeviceAdapterTag{});
+
+    // If less than 32 bits, repeat the word until we get a 32 bit pattern.
+    // Using this for the pattern prevents races while writing small numbers
+    // to adjacent memory locations.
+    auto repWord = RepeatTo32BitsIfNeeded(word);
+    using RepWordType = decltype(repWord);
+
+    using Functor = FillBitFieldFunctor<decltype(portal), RepWordType>;
+    Functor functor{ portal, repWord };
+
+    const vtkm::Id numWords = portal.template GetNumberOfWords<RepWordType>();
+    DerivedAlgorithm::Schedule(functor, numWords);
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill ArrayHandle
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& handle, const T& value)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+
+    const vtkm::Id numValues = handle.GetNumberOfValues();
+    if (numValues == 0)
+    {
+      return;
+    }
+
+    auto portal = handle.PrepareForOutput(numValues, DeviceAdapterTag{});
+    FillArrayHandleFunctor<decltype(portal)> functor{ portal, value };
+    DerivedAlgorithm::Schedule(functor, numValues);
+  }
+
+  //--------------------------------------------------------------------------
+  // Fill ArrayHandle (resize)
+  template <typename T, typename S>
+  VTKM_CONT static void Fill(vtkm::cont::ArrayHandle<T, S>& handle,
+                             const T& value,
+                             const vtkm::Id numValues)
+  {
+    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
+    if (numValues == 0)
+    {
+      handle.Shrink(0);
+      return;
+    }
+
+    auto portal = handle.PrepareForOutput(numValues, DeviceAdapterTag{});
+    FillArrayHandleFunctor<decltype(portal)> functor{ portal, value };
+    DerivedAlgorithm::Schedule(functor, numValues);
+  }
+
  //--------------------------------------------------------------------------
  // Lower Bounds
  template <typename T, class CIn, class CVal, class COut>
--- a/vtkm/cont/internal/FunctorsGeneral.h
+++ b/vtkm/cont/internal/FunctorsGeneral.h
@ -494,6 +494,149 @@ struct CopyKernel
  void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer&) {}
 };

+template <typename BitsPortal>
+struct CountSetBitsFunctor : public vtkm::exec::FunctorBase
+{
+  using WordType = typename BitsPortal::WordTypePreferred;
+
+  // This functor executes a number of instances, where each instance handles
+  // two cachelines worth of data. This reduces the number of atomic operations.
+  // Figure out how many words that is:
+  static constexpr vtkm::Id CacheLineSize = VTKM_ALLOCATION_ALIGNMENT;
+  static constexpr vtkm::Id WordsPerCacheLine =
+    CacheLineSize / static_cast<vtkm::Id>(sizeof(WordType));
+  static constexpr vtkm::Id CacheLinesPerInstance = 2;
+  static constexpr vtkm::Id WordsPerInstance = CacheLinesPerInstance * WordsPerCacheLine;
+
+  VTKM_CONT
+  CountSetBitsFunctor(const BitsPortal& input, std::atomic<vtkm::UInt64>& popCount)
+    : Input{ input }
+    , PopCount{ popCount }
+    , FinalWordIndex{ input.GetNumberOfWords() - 1 }
+    , FinalWordMask{ input.GetFinalWordMask() }
+  {
+  }
+
+  VTKM_CONT vtkm::Id GetNumberOfInstances() const
+  {
+    const auto numWords = this->Input.GetNumberOfWords();
+    return (numWords + WordsPerInstance - 1) / WordsPerInstance;
+  }
+
+  VTKM_EXEC void operator()(vtkm::Id instanceIdx) const
+  {
+    const vtkm::Id numWords = this->Input.GetNumberOfWords();
+    const vtkm::Id wordStart = vtkm::Min(instanceIdx * WordsPerInstance, numWords);
+    const vtkm::Id wordEnd = vtkm::Min(wordStart + WordsPerInstance, numWords);
+
+    if (wordStart != wordEnd) // range is valid
+    {
+      this->ExecuteRange(wordStart, wordEnd);
+    }
+  }
+
+  VTKM_CONT vtkm::UInt64 GetPopCount() const { return PopCount.load(std::memory_order_relaxed); }
+
+private:
+  VTKM_EXEC void ExecuteRange(vtkm::Id wordStart, vtkm::Id wordEnd) const
+  {
+#ifndef VTKM_CUDA_DEVICE_PASS // for std::atomic call from VTKM_EXEC function:
+    // Count bits and allocate space for output:
+    vtkm::UInt64 chunkBits = this->CountChunkBits(wordStart, wordEnd);
+    this->PopCount.fetch_add(chunkBits, std::memory_order_relaxed);
+#else
+    (void)wordStart;
+    (void)wordEnd;
+#endif
+  }
+
+  VTKM_EXEC vtkm::UInt64 CountChunkBits(vtkm::Id wordStart, vtkm::Id wordEnd) const
+  {
+    // Need to mask out trailing bits from the final word:
+    const bool isFinalChunk = wordEnd == (this->FinalWordIndex + 1);
+
+    if (isFinalChunk)
+    {
+      wordEnd = this->FinalWordIndex;
+    }
+
+    vtkm::Int32 tmp = 0;
+    for (vtkm::Id i = wordStart; i < wordEnd; ++i)
+    {
+      tmp += vtkm::CountSetBits(this->Input.GetWord(i));
+    }
+
+    if (isFinalChunk)
+    {
+      tmp += vtkm::CountSetBits(this->Input.GetWord(this->FinalWordIndex) & this->FinalWordMask);
+    }
+
+    return static_cast<vtkm::UInt64>(tmp);
+  }
+
+  BitsPortal Input;
+  std::atomic<vtkm::UInt64>& PopCount;
+  // Used to mask trailing bits the in last word.
+  vtkm::Id FinalWordIndex{ 0 };
+  WordType FinalWordMask{ 0 };
+};
+
+// For a given unsigned integer less than 32 bits, repeat its bits until we
+// have a 32 bit pattern. This is used to make all fill patterns at least
+// 32 bits in size, since concurrently writing to adjacent locations smaller
+// than 32 bits may race on some platforms.
+template <typename WordType, typename = typename std::enable_if<(sizeof(WordType) >= 4)>::type>
+static constexpr VTKM_CONT WordType RepeatTo32BitsIfNeeded(WordType pattern)
+{ // for 32 bits or more, just pass the type through.
+  return pattern;
+}
+
+static inline constexpr VTKM_CONT vtkm::UInt32 RepeatTo32BitsIfNeeded(vtkm::UInt16 pattern)
+{
+  return static_cast<vtkm::UInt32>(pattern << 16 | pattern);
+}
+
+static inline constexpr VTKM_CONT vtkm::UInt32 RepeatTo32BitsIfNeeded(vtkm::UInt8 pattern)
+{
+  return RepeatTo32BitsIfNeeded(static_cast<vtkm::UInt16>(pattern << 8 | pattern));
+}
+
+template <typename BitsPortal, typename WordType>
+struct FillBitFieldFunctor : public vtkm::exec::FunctorBase
+{
+  VTKM_CONT
+  FillBitFieldFunctor(const BitsPortal& portal, WordType mask)
+    : Portal{ portal }
+    , Mask{ mask }
+  {
+  }
+
+  VTKM_EXEC void operator()(vtkm::Id wordIdx) const { this->Portal.SetWord(wordIdx, this->Mask); }
+
+private:
+  BitsPortal Portal;
+  WordType Mask;
+};
+
+template <typename PortalType>
+struct FillArrayHandleFunctor : public vtkm::exec::FunctorBase
+{
+  using ValueType = typename PortalType::ValueType;
+
+  VTKM_CONT
+  FillArrayHandleFunctor(const PortalType& portal, ValueType value)
+    : Portal{ portal }
+    , Value{ value }
+  {
+  }
+
+  VTKM_EXEC void operator()(vtkm::Id idx) const { this->Portal.Set(idx, this->Value); }
+
+private:
+  PortalType Portal;
+  ValueType Value;
+};
+
 template <class InputPortalType, class ValuesPortalType, class OutputPortalType>
 struct LowerBoundsKernel
 {
--- a/vtkm/cont/testing/TestingDeviceAdapter.h
+++ b/vtkm/cont/testing/TestingDeviceAdapter.h
@ -2438,7 +2438,7 @@ private:

    auto testRepeatedMask = [&](WordType mask) {
      std::cout << "Testing BitFieldToUnorderedSet with repeated 32-bit word 0x" << std::hex << mask
-                << std::endl;
+                << std::dec << std::endl;

      BitField bits;
      {
@ -2455,7 +2455,7 @@ private:

    auto testRandomMask = [&](WordType seed) {
      std::cout << "Testing BitFieldToUnorderedSet with random sequence seeded with 0x" << std::hex
-                << seed << std::endl;
+                << seed << std::dec << std::endl;

      std::mt19937 mt{ seed };
      std::uniform_int_distribution<std::mt19937::result_type> rng;
@ -2486,11 +2486,244 @@ private:
    testRandomMask(0xdeadbeef);
  }

+  static VTKM_CONT void TestCountSetBits()
+  {
+    using WordType = WordTypeDefault;
+
+    // Test that everything works correctly with a partial word at the end.
+    static constexpr vtkm::Id BitsPerWord = static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT);
+    // +5 to get a partial word:
+    static constexpr vtkm::Id NumFullWords = 1024;
+    static constexpr vtkm::Id NumBits = NumFullWords * BitsPerWord + 5;
+    static constexpr vtkm::Id NumWords = (NumBits + BitsPerWord - 1) / BitsPerWord;
+
+    auto verifyPopCount = [](const BitField& bits) {
+      vtkm::Id refPopCount = 0;
+      const vtkm::Id numBits = bits.GetNumberOfBits();
+      auto portal = bits.GetPortalConstControl();
+      for (vtkm::Id idx = 0; idx < numBits; ++idx)
+      {
+        if (portal.GetBit(idx))
+        {
+          ++refPopCount;
+        }
+      }
+
+      const vtkm::Id popCount = Algorithm::CountSetBits(bits);
+
+      VTKM_TEST_ASSERT(
+        refPopCount == popCount, "CountSetBits returned ", popCount, ", expected ", refPopCount);
+    };
+
+    auto testRepeatedMask = [&](WordType mask) {
+      std::cout << "Testing CountSetBits with repeated word 0x" << std::hex << mask << std::dec
+                << std::endl;
+
+      BitField bits;
+      {
+        bits.Allocate(NumBits);
+        auto fillPortal = bits.GetPortalControl();
+        for (vtkm::Id i = 0; i < NumWords; ++i)
+        {
+          fillPortal.SetWord(i, mask);
+        }
+      }
+
+      verifyPopCount(bits);
+    };
+
+    auto testRandomMask = [&](WordType seed) {
+      std::cout << "Testing CountSetBits with random sequence seeded with 0x" << std::hex << seed
+                << std::dec << std::endl;
+
+      std::mt19937 mt{ seed };
+      std::uniform_int_distribution<std::mt19937::result_type> rng;
+
+      BitField bits;
+      {
+        bits.Allocate(NumBits);
+        auto fillPortal = bits.GetPortalControl();
+        for (vtkm::Id i = 0; i < NumWords; ++i)
+        {
+          fillPortal.SetWord(i, static_cast<WordType>(rng(mt)));
+        }
+      }
+
+      verifyPopCount(bits);
+    };
+
+    testRepeatedMask(0x00000000);
+    testRepeatedMask(0xeeeeeeee);
+    testRepeatedMask(0xffffffff);
+    testRepeatedMask(0x1c0fd395);
+    testRepeatedMask(0xdeadbeef);
+
+    testRandomMask(0x00000000);
+    testRandomMask(0xeeeeeeee);
+    testRandomMask(0xffffffff);
+    testRandomMask(0x1c0fd395);
+    testRandomMask(0xdeadbeef);
+  }
+
+  template <typename WordType>
+  static VTKM_CONT void TestFillBitFieldMask(WordType mask)
+  {
+    std::cout << "Testing Fill with " << (sizeof(WordType) * CHAR_BIT) << " bit mask: " << std::hex
+              << vtkm::UInt64{ mask } << std::dec << std::endl;
+
+    // Test that everything works correctly with a partial word at the end.
+    static constexpr vtkm::Id BitsPerWord = static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT);
+    // +5 to get a partial word:
+    static constexpr vtkm::Id NumFullWords = 1024;
+    static constexpr vtkm::Id NumBits = NumFullWords * BitsPerWord + 5;
+    static constexpr vtkm::Id NumWords = (NumBits + BitsPerWord - 1) / BitsPerWord;
+
+    vtkm::cont::BitField bits;
+    {
+      Algorithm::Fill(bits, mask, NumBits);
+
+      vtkm::Id numBits = bits.GetNumberOfBits();
+      VTKM_TEST_ASSERT(numBits == NumBits, "Unexpected number of bits.");
+      vtkm::Id numWords = bits.GetNumberOfWords<WordType>();
+      VTKM_TEST_ASSERT(numWords == NumWords, "Unexpected number of words.");
+
+      auto portal = bits.GetPortalConstControl();
+      for (vtkm::Id wordIdx = 0; wordIdx < NumWords; ++wordIdx)
+      {
+        VTKM_TEST_ASSERT(portal.GetWord<WordType>(wordIdx) == mask,
+                         "Incorrect word in result BitField; expected 0x",
+                         std::hex,
+                         vtkm::UInt64{ mask },
+                         ", got 0x",
+                         vtkm::UInt64{ portal.GetWord<WordType>(wordIdx) },
+                         std::dec,
+                         " for word ",
+                         wordIdx,
+                         "/",
+                         NumWords);
+      }
+    }
+
+    // Now fill the BitField with the reversed mask to test the no-alloc
+    // overload:
+    {
+      WordType invWord = static_cast<WordType>(~mask);
+      Algorithm::Fill(bits, invWord);
+
+      vtkm::Id numBits = bits.GetNumberOfBits();
+      VTKM_TEST_ASSERT(numBits == NumBits, "Unexpected number of bits.");
+      vtkm::Id numWords = bits.GetNumberOfWords<WordType>();
+      VTKM_TEST_ASSERT(numWords == NumWords, "Unexpected number of words.");
+
+      auto portal = bits.GetPortalConstControl();
+      for (vtkm::Id wordIdx = 0; wordIdx < NumWords; ++wordIdx)
+      {
+        VTKM_TEST_ASSERT(portal.GetWord<WordType>(wordIdx) == invWord,
+                         "Incorrect word in result BitField; expected 0x",
+                         std::hex,
+                         vtkm::UInt64{ invWord },
+                         ", got 0x",
+                         vtkm::UInt64{ portal.GetWord<WordType>(wordIdx) },
+                         std::dec,
+                         " for word ",
+                         wordIdx,
+                         "/",
+                         NumWords);
+      }
+    }
+  }
+
+  static VTKM_CONT void TestFillBitFieldBool(bool value)
+  {
+    std::cout << "Testing Fill with bool: " << value << std::endl;
+
+    // Test that everything works correctly with a partial word at the end.
+    // +5 to get a partial word:
+    static constexpr vtkm::Id NumBits = 1024 * 32 + 5;
+
+    vtkm::cont::BitField bits;
+    {
+      Algorithm::Fill(bits, value, NumBits);
+
+      vtkm::Id numBits = bits.GetNumberOfBits();
+      VTKM_TEST_ASSERT(numBits == NumBits, "Unexpected number of bits.");
+
+      auto portal = bits.GetPortalConstControl();
+      for (vtkm::Id bitIdx = 0; bitIdx < NumBits; ++bitIdx)
+      {
+        VTKM_TEST_ASSERT(portal.GetBit(bitIdx) == value, "Incorrect bit in result BitField.");
+      }
+    }
+
+    // Now fill the BitField with the reversed mask to test the no-alloc
+    // overload:
+    {
+      Algorithm::Fill(bits, !value);
+
+      vtkm::Id numBits = bits.GetNumberOfBits();
+      VTKM_TEST_ASSERT(numBits == NumBits, "Unexpected number of bits.");
+
+      auto portal = bits.GetPortalConstControl();
+      for (vtkm::Id bitIdx = 0; bitIdx < NumBits; ++bitIdx)
+      {
+        VTKM_TEST_ASSERT(portal.GetBit(bitIdx) == !value, "Incorrect bit in result BitField.");
+      }
+    }
+  }
+
+  static VTKM_CONT void TestFillBitField()
+  {
+    TestFillBitFieldBool(true);
+    TestFillBitFieldBool(false);
+    TestFillBitFieldMask<vtkm::UInt8>(vtkm::UInt8{ 0 });
+    TestFillBitFieldMask<vtkm::UInt8>(static_cast<vtkm::UInt8>(~vtkm::UInt8{ 0 }));
+    TestFillBitFieldMask<vtkm::UInt8>(vtkm::UInt8{ 0xab });
+    TestFillBitFieldMask<vtkm::UInt8>(vtkm::UInt8{ 0x4f });
+    TestFillBitFieldMask<vtkm::UInt16>(vtkm::UInt16{ 0 });
+    TestFillBitFieldMask<vtkm::UInt16>(static_cast<vtkm::UInt16>(~vtkm::UInt16{ 0 }));
+    TestFillBitFieldMask<vtkm::UInt16>(vtkm::UInt16{ 0xfade });
+    TestFillBitFieldMask<vtkm::UInt16>(vtkm::UInt16{ 0xbeef });
+    TestFillBitFieldMask<vtkm::UInt32>(vtkm::UInt32{ 0 });
+    TestFillBitFieldMask<vtkm::UInt32>(static_cast<vtkm::UInt32>(~vtkm::UInt32{ 0 }));
+    TestFillBitFieldMask<vtkm::UInt32>(vtkm::UInt32{ 0xfacecafe });
+    TestFillBitFieldMask<vtkm::UInt32>(vtkm::UInt32{ 0xbaddecaf });
+    TestFillBitFieldMask<vtkm::UInt64>(vtkm::UInt64{ 0 });
+    TestFillBitFieldMask<vtkm::UInt64>(static_cast<vtkm::UInt64>(~vtkm::UInt64{ 0 }));
+    TestFillBitFieldMask<vtkm::UInt64>(vtkm::UInt64{ 0xbaddefacedfacade });
+    TestFillBitFieldMask<vtkm::UInt64>(vtkm::UInt64{ 0xfeeddeadbeef2dad });
+  }
+
+  static VTKM_CONT void TestFillArrayHandle()
+  {
+    vtkm::cont::ArrayHandle<vtkm::Int32> handle;
+    Algorithm::Fill(handle, 867, ARRAY_SIZE);
+
+    {
+      auto portal = handle.GetPortalConstControl();
+      VTKM_TEST_ASSERT(portal.GetNumberOfValues() == ARRAY_SIZE);
+      for (vtkm::Id i = 0; i < ARRAY_SIZE; ++i)
+      {
+        VTKM_TEST_ASSERT(portal.Get(i) == 867);
+      }
+    }
+
+    Algorithm::Fill(handle, 5309);
+    {
+      auto portal = handle.GetPortalConstControl();
+      VTKM_TEST_ASSERT(portal.GetNumberOfValues() == ARRAY_SIZE);
+      for (vtkm::Id i = 0; i < ARRAY_SIZE; ++i)
+      {
+        VTKM_TEST_ASSERT(portal.Get(i) == 5309);
+      }
+    }
+  }
+
  struct TestAll
  {
    VTKM_CONT void operator()() const
    {
      std::cout << "Doing DeviceAdapter tests" << std::endl;
+
      TestArrayTransfer();
      TestOutOfMemory();
      TestTimer();
@ -2541,6 +2774,10 @@ private:
      TestAtomicArray();

      TestBitFieldToUnorderedSet();
+      TestCountSetBits();
+      TestFillBitField();
+
+      TestFillArrayHandle();
    }
  };

--- a/vtkm/thirdparty/diy/CMakeLists.txt
+++ b/vtkm/thirdparty/diy/CMakeLists.txt
@ -24,23 +24,7 @@ target_include_directories(vtkm_diy INTERFACE
  $<INSTALL_INTERFACE:${VTKm_INSTALL_INCLUDE_DIR}/vtkm/thirdparty/diy>)

 if(VTKm_ENABLE_MPI)
-  set(arg)
-  foreach(apath IN LISTS MPI_C_INCLUDE_PATH MPI_CXX_INCLUDE_PATH)
-    list(APPEND arg $<BUILD_INTERFACE:${apath}>)
-  endforeach()
-  list(REMOVE_DUPLICATES arg)
-  target_include_directories(vtkm_diy INTERFACE ${arg})
-  target_link_libraries(vtkm_diy INTERFACE
-    $<BUILD_INTERFACE:${MPI_C_LIBRARIES}>
-    $<BUILD_INTERFACE:${MPI_CXX_LIBRARIES}>)
-  if(MPI_C_COMPILE_DEFINITIONS)
-    target_compile_definitions(vtkm_diy INTERFACE
-      $<$<COMPILE_LANGUAGE:C>:${MPI_C_COMPILE_DEFINITIONS}>)
-  endif()
-  if(MPI_CXX_COMPILE_DEFNITIONS)
-    target_compile_definitions(vtkm_diy INTERFACE
-      $<$<COMPILE_LANGUAGE:CXX>:${MPI_CXX_COMPILE_DEFNITIONS>)
-  endif()
+  target_link_libraries(vtkm_diy INTERFACE MPI::MPI_CXX)
 endif()

 install(TARGETS vtkm_diy