Cycles: make TBB a required library dependency, and use in a few places

Now that the rest of Blender also relies on TBB, no point in maintaining custom
code for parallel_for and thread local storage.
Author: Brecht Van Lommel
Date:   2020-06-05 12:53:38 +02:00
Parent: ace3268482
Commit: d8c2092b15

10 changed files with 58 additions and 138 deletions
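For context, the two TBB primitives adopted in this commit can be sketched in a small standalone program (an illustrative sketch, not code from the commit; the buffer, row count and grain size are made up): tbb::parallel_for over a blocked_range replaces manual chunking through a task pool, and tbb::enumerable_thread_specific replaces per-thread storage indexed by a thread id.

#include <tbb/blocked_range.h>
#include <tbb/enumerable_thread_specific.h>
#include <tbb/parallel_for.h>

#include <cstdio>
#include <vector>

int main()
{
  const size_t height = 1024; /* hypothetical row count */
  std::vector<float> rows(height, 0.0f);

  /* Per-thread scratch storage, created lazily per thread on first use.
   * This replaces keeping a vector of storage entries indexed by thread id. */
  tbb::enumerable_thread_specific<std::vector<float>> scratch;

  /* Iterate rows in chunks (grain size 64) so each task amortizes the
   * scheduling overhead; this replaces pushing fixed chunks into a task pool. */
  tbb::parallel_for(tbb::blocked_range<size_t>(0, height, 64),
                    [&](const tbb::blocked_range<size_t> &r) {
                      std::vector<float> &local = scratch.local();
                      local.assign(r.size(), 0.0f);
                      for (size_t i = r.begin(); i != r.end(); i++) {
                        rows[i] = float(i) * 0.5f; /* placeholder work */
                      }
                    });

  printf("rows[1] = %f\n", double(rows[1]));
  return 0;
}

The hunks below use the same pattern, with divide_up(...) or a fixed OBJECTS_PER_TASK constant choosing the grain size so each task covers roughly a constant amount of work.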

@@ -654,6 +654,7 @@ if(WITH_BOOST AND NOT (WITH_CYCLES OR WITH_OPENIMAGEIO OR WITH_INTERNATIONAL OR
set(WITH_BOOST OFF)
endif()
set_and_warn_dependency(WITH_TBB WITH_CYCLES OFF)
set_and_warn_dependency(WITH_TBB WITH_USD OFF)
set_and_warn_dependency(WITH_TBB WITH_OPENIMAGEDENOISE OFF)
set_and_warn_dependency(WITH_TBB WITH_OPENVDB OFF)

@@ -286,6 +286,7 @@ include_directories(
${OPENEXR_INCLUDE_DIR}
${OPENEXR_INCLUDE_DIRS}
${PUGIXML_INCLUDE_DIR}
${TBB_INCLUDE_DIRS}
)
if(CYCLES_STANDALONE_REPOSITORY)

@@ -423,22 +423,6 @@ BVHNode *BVHBuild::run()
}
spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
if (params.use_spatial_split) {
/* NOTE: The API here tries to be as much ready for multi-threaded build
* as possible, but at the same time it tries not to introduce any
* changes in behavior for until all refactoring needed for threading is
* finished.
*
* So we currently allocate single storage for now, which is only used by
* the only thread working on the spatial BVH build.
*/
spatial_storage.resize(TaskScheduler::num_threads() + 1);
size_t num_bins = max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1;
foreach (BVHSpatialStorage &storage, spatial_storage) {
storage.right_bounds.clear();
}
spatial_storage[0].right_bounds.resize(num_bins);
}
spatial_free_index = 0;
need_prim_time = params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0;
@@ -475,6 +459,9 @@ BVHNode *BVHBuild::run()
task_pool.wait_work();
}
/* clean up temporary memory usage by threads */
spatial_storage.clear();
/* delete if we canceled */
if (rootnode) {
if (progress.get_cancel()) {
@@ -551,19 +538,18 @@ void BVHBuild::thread_build_node(InnerNode *inner, int child, BVHObjectBinning *
}
}
void BVHBuild::thread_build_spatial_split_node(InnerNode *inner,
int child,
BVHRange *range,
vector<BVHReference> *references,
int level,
int thread_id)
void BVHBuild::thread_build_spatial_split_node(
InnerNode *inner, int child, BVHRange *range, vector<BVHReference> *references, int level)
{
if (progress.get_cancel()) {
return;
}
/* Get per-thread memory for spatial split. */
BVHSpatialStorage *local_storage = &spatial_storage.local();
/* build nodes */
BVHNode *node = build_node(*range, references, level, thread_id);
BVHNode *node = build_node(*range, references, level, local_storage);
/* set child in inner node */
inner->children[child] = node;
@@ -690,7 +676,7 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
BVHNode *BVHBuild::build_node(const BVHRange &range,
vector<BVHReference> *references,
int level,
int thread_id)
BVHSpatialStorage *storage)
{
/* Update progress.
*
@@ -712,7 +698,6 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
}
/* Perform splitting test. */
BVHSpatialStorage *storage = &spatial_storage[thread_id];
BVHMixedSplit split(this, storage, range, references, level);
if (!(range.size() > 0 && params.top_level && level == 0)) {

@@ -76,7 +76,7 @@ class BVHBuild {
BVHNode *build_node(const BVHRange &range,
vector<BVHReference> *references,
int level,
int thread_id);
BVHSpatialStorage *storage);
BVHNode *build_node(const BVHObjectBinning &range, int level);
BVHNode *create_leaf_node(const BVHRange &range, const vector<BVHReference> &references);
BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);
@@ -87,12 +87,8 @@ class BVHBuild {
/* Threads. */
enum { THREAD_TASK_SIZE = 4096 };
void thread_build_node(InnerNode *node, int child, BVHObjectBinning *range, int level);
void thread_build_spatial_split_node(InnerNode *node,
int child,
BVHRange *range,
vector<BVHReference> *references,
int level,
int thread_id);
void thread_build_spatial_split_node(
InnerNode *node, int child, BVHRange *range, vector<BVHReference> *references, int level);
thread_mutex build_mutex;
/* Progress. */
@@ -127,7 +123,7 @@ class BVHBuild {
/* Spatial splitting. */
float spatial_min_overlap;
vector<BVHSpatialStorage> spatial_storage;
enumerable_thread_specific<BVHSpatialStorage> spatial_storage;
size_t spatial_free_index;
thread_spin_lock spatial_spin_lock;

@@ -20,6 +20,7 @@
#include "util/util_logging.h"
#include "util/util_path.h"
#include "util/util_sky_model.h"
#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -58,26 +59,21 @@ bool SkyLoader::load_pixels(const ImageMetaData &metadata,
float altitude_f = (float)altitude;
/* precompute sky texture */
const int num_chunks = TaskScheduler::num_threads();
const int chunk_size = height / num_chunks;
TaskPool pool;
for (int chunk = 0; chunk < num_chunks; chunk++) {
const int chunk_start = chunk * chunk_size;
const int chunk_end = (chunk + 1 < num_chunks) ? (chunk + 1) * chunk_size : height;
pool.push(function_bind(&nishita_skymodel_precompute_texture,
pixel_data,
metadata.channels,
chunk_start,
chunk_end,
width,
height,
sun_elevation,
altitude_f,
air_density,
dust_density,
ozone_density));
}
pool.wait_work();
const int rows_per_task = divide_up(1024, width);
parallel_for(blocked_range<size_t>(0, height, rows_per_task),
[&](const blocked_range<size_t> &r) {
nishita_skymodel_precompute_texture(pixel_data,
metadata.channels,
r.begin(),
r.end(),
width,
height,
sun_elevation,
altitude_f,
air_density,
dust_density,
ozone_density);
});
return true;
}

@@ -680,29 +680,13 @@ void LightManager::device_update_background(Device *device,
float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_width * res.y);
double time_start = time_dt();
if (max(res.x, res.y) < 512) {
/* Small enough resolution, faster to do single-threaded. */
background_cdf(0, res.y, res.x, res.y, &pixels, cond_cdf);
}
else {
/* Threaded evaluation for large resolution. */
const int num_blocks = TaskScheduler::num_threads();
const int chunk_size = res.y / num_blocks;
int start_row = 0;
TaskPool pool;
for (int i = 0; i < num_blocks; ++i) {
const int current_chunk_size = (i != num_blocks - 1) ? chunk_size : (res.y - i * chunk_size);
pool.push(function_bind(&background_cdf,
start_row,
start_row + current_chunk_size,
res.x,
res.y,
&pixels,
cond_cdf));
start_row += current_chunk_size;
}
pool.wait_work();
}
/* Create CDF in parallel. */
const int rows_per_task = divide_up(10240, res.x);
parallel_for(blocked_range<size_t>(0, res.y, rows_per_task),
[&](const blocked_range<size_t> &r) {
background_cdf(r.begin(), r.end(), res.x, res.y, &pixels, cond_cdf);
});
/* marginal CDFs (column, V direction, sum of rows) */
marg_cdf[0].x = cond_cdf[res.x].x;

@@ -78,7 +78,6 @@ struct UpdateObjectTransformState {
Scene *scene;
/* Some locks to keep everything thread-safe. */
thread_spin_lock queue_lock;
thread_spin_lock surface_area_lock;
/* First unused object index in the queue. */
@@ -551,41 +550,6 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
}
}
bool ObjectManager::device_update_object_transform_pop_work(UpdateObjectTransformState *state,
int *start_index,
int *num_objects)
{
/* Tweakable parameter, number of objects per chunk.
* Too small value will cause some extra overhead due to spin lock,
* too big value might not use all threads nicely.
*/
static const int OBJECTS_PER_TASK = 32;
bool have_work = false;
state->queue_lock.lock();
int num_scene_objects = state->scene->objects.size();
if (state->queue_start_object < num_scene_objects) {
int count = min(OBJECTS_PER_TASK, num_scene_objects - state->queue_start_object);
*start_index = state->queue_start_object;
*num_objects = count;
state->queue_start_object += count;
have_work = true;
}
state->queue_lock.unlock();
return have_work;
}
void ObjectManager::device_update_object_transform_task(UpdateObjectTransformState *state)
{
int start_index, num_objects;
while (device_update_object_transform_pop_work(state, &start_index, &num_objects)) {
for (int i = 0; i < num_objects; ++i) {
const int object_index = start_index + i;
Object *ob = state->scene->objects[object_index];
device_update_object_transform(state, ob);
}
}
}
void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, Progress &progress)
{
UpdateObjectTransformState state;
@@ -631,29 +595,16 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
numparticles += psys->particles.size();
}
/* NOTE: If it's just a handful of objects we deal with them in a single
* thread to avoid threading overhead. However, this threshold is might
* need some tweaks to make mid-complex scenes optimal.
*/
if (scene->objects.size() < 64) {
foreach (Object *ob, scene->objects) {
device_update_object_transform(&state, ob);
if (progress.get_cancel()) {
return;
}
}
}
else {
const int num_threads = TaskScheduler::num_threads();
TaskPool pool;
for (int i = 0; i < num_threads; ++i) {
pool.push(function_bind(&ObjectManager::device_update_object_transform_task, this, &state));
}
pool.wait_work();
if (progress.get_cancel()) {
return;
}
}
/* Parallel object update, with grain size to avoid too much threading overhead
* for individual objects. */
static const int OBJECTS_PER_TASK = 32;
parallel_for(blocked_range<size_t>(0, scene->objects.size(), OBJECTS_PER_TASK),
[&](const blocked_range<size_t> &r) {
for (size_t i = r.begin(); i != r.end(); i++) {
Object *ob = state.scene->objects[i];
device_update_object_transform(&state, ob);
}
});
dscene->objects.copy_to_device();
if (state.need_motion == Scene::MOTION_PASS) {

@@ -94,8 +94,7 @@ void SVMShaderManager::device_update(Device *device,
scene,
scene->shaders[i],
&progress,
&shader_svm_nodes[i]),
false);
&shader_svm_nodes[i]));
}
task_pool.wait_work();

@@ -29,7 +29,7 @@ set(SRC
)
set(LIB
${TBB_LIBRARIES}
)
if(WITH_CYCLES_STANDALONE)

@@ -22,8 +22,15 @@
#include "util/util_thread.h"
#include "util/util_vector.h"
#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
#include <tbb/tbb.h>
CCL_NAMESPACE_BEGIN
using tbb::blocked_range;
using tbb::enumerable_thread_specific;
using tbb::parallel_for;
class Task;
class TaskPool;
class TaskScheduler;