diff --git a/source/blender/blenlib/BLI_threads.h b/source/blender/blenlib/BLI_threads.h index c7aae6d7972..60ed5f0544a 100644 --- a/source/blender/blenlib/BLI_threads.h +++ b/source/blender/blenlib/BLI_threads.h @@ -210,13 +210,6 @@ void BLI_thread_queue_nowait(ThreadQueue *queue); # define BLI_thread_local_set(name, value) name = value #endif /* defined(__APPLE__) */ -/* **** Special functions to help performance on crazy NUMA setups. **** */ - -/* Make sure process/thread is using NUMA node with fast memory access. */ - -void BLI_thread_put_process_on_fast_node(void); -void BLI_thread_put_thread_on_fast_node(void); - #ifdef __cplusplus } #endif diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index 0db0d9a67b3..3958fd8e2d2 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -25,7 +25,6 @@ set(INC ../../../intern/atomic ../../../intern/eigen ../../../intern/guardedalloc - ../../../intern/numaapi/include ../../../extern/wcwidth ../../../extern/json/include ) @@ -334,7 +333,6 @@ set(SRC set(LIB bf_intern_eigen bf_intern_guardedalloc - bf_intern_numaapi extern_wcwidth ${ZLIB_LIBRARIES} diff --git a/source/blender/blenlib/intern/threads.cc b/source/blender/blenlib/intern/threads.cc index 3589c952409..f131a464650 100644 --- a/source/blender/blenlib/intern/threads.cc +++ b/source/blender/blenlib/intern/threads.cc @@ -52,7 +52,6 @@ #endif #include "atomic_ops.h" -#include "numaapi.h" #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && \ !defined(__clang__) @@ -125,7 +124,6 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t mainid; -static bool is_numa_available = false; static unsigned int thread_levels = 0; /* threads can be invoked inside threads */ static int num_threads_override = 0; @@ -143,9 +141,6 @@ struct ThreadSlot { void BLI_threadapi_init() { mainid = pthread_self(); - if (numaAPI_Initialize() == NUMAAPI_SUCCESS) { - is_numa_available = true; - } } void BLI_threadapi_exit() @@ -807,110 +802,3 @@ void BLI_thread_queue_wait_finish(ThreadQueue *queue) pthread_mutex_unlock(&queue->mutex); } - -/* **** Special functions to help performance on crazy NUMA setups. **** */ - -#if 0 /* UNUSED */ -static bool check_is_threadripper2_alike_topology(){ - /* NOTE: We hope operating system does not support CPU hot-swap to - * a different brand. And that SMP of different types is also not - * encouraged by the system. */ - static bool is_initialized = false; - static bool is_threadripper2 = false; - if (is_initialized) { - return is_threadripper2; - } - is_initialized = true; - char *cpu_brand = BLI_cpu_brand_string(); - if (cpu_brand == nullptr) { - return false; - } - if (strstr(cpu_brand, "Threadripper")) { - /* NOTE: We consider all Thread-rippers having similar topology to - * the second one. This is because we are trying to utilize NUMA node - * 0 as much as possible. This node does exist on earlier versions of - * thread-ripper and setting affinity to it should not have negative - * effect. - * This allows us to avoid per-model check, making the code more - * reliable for the CPUs which are not yet released. - */ - if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) { - is_threadripper2 = true; - } - } - /* NOTE: While all dies of EPYC has memory controller, only two f them - * has access to a lower-indexed DDR slots. Those dies are same as on - * Threadripper2 with the memory controller. - * Now, it is rather likely that reasonable amount of users don't max - * up their DR slots, making it only two dies connected to a DDR slot - * with actual memory in it. */ - if (strstr(cpu_brand, "EPYC")) { - /* NOTE: Similarly to Thread-ripper we do not do model check. */ - is_threadripper2 = true; - } - MEM_freeN(cpu_brand); - return is_threadripper2; -} - -static void threadripper_put_process_on_fast_node(){ - if (!is_numa_available) { - return; - } - /* NOTE: Technically, we can use NUMA nodes 0 and 2 and using both of - * them in the affinity mask will allow OS to schedule threads more - * flexible,possibly increasing overall performance when multiple apps - * are crunching numbers. - * - * However, if scene fits into memory adjacent to a single die we don't - * want OS to re-schedule the process to another die since that will make - * it further away from memory allocated for .blend file. */ - /* NOTE: Even if NUMA is available in the API but is disabled in BIOS on - * this workstation we still process here. If NUMA is disabled it will be a - * single node, so our action is no-visible-changes, but allows to keep - * things simple and unified. */ - numaAPI_RunProcessOnNode(0); -} - -static void threadripper_put_thread_on_fast_node(){ - if (!is_numa_available) { - return; - } - /* NOTE: This is where things becomes more interesting. On the one hand - * we can use nodes 0 and 2 and allow operating system to do balancing - * of processes/threads for the maximum performance when multiple apps - * are running. - * On another hand, however, we probably want to use same node as the - * main thread since that's where the memory of .blend file is likely - * to be allocated. - * Since the main thread is currently on node 0, we also put thread on - * same node. */ - /* See additional note about NUMA disabled in BIOS above. */ - numaAPI_RunThreadOnNode(0); -} -#endif /* UNUSED */ - -void BLI_thread_put_process_on_fast_node() -{ - /* Disabled for now since this causes only 16 threads to be used on a - * thread-ripper for computations like sculpting and fluid sim. The problem - * is that all threads created as children from this thread will inherit - * the NUMA node and so will end up on the same node. This can be fixed - * case-by-case by assigning the NUMA node for every child thread, however - * this is difficult for external libraries and OpenMP, and out of our - * control for plugins like external renderers. */ -#if 0 - if (check_is_threadripper2_alike_topology()) { - threadripper_put_process_on_fast_node(); - } -#endif -} - -void BLI_thread_put_thread_on_fast_node() -{ - /* Disabled for now, see comment above. */ -#if 0 - if (check_is_threadripper2_alike_topology()) { - threadripper_put_thread_on_fast_node(); - } -#endif -} diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c index 66277ec57f4..e7ad654d3e5 100644 --- a/source/blender/windowmanager/intern/wm_jobs.c +++ b/source/blender/windowmanager/intern/wm_jobs.c @@ -381,7 +381,6 @@ static void *do_job_thread(void *job_v) { wmJob *wm_job = job_v; - BLI_thread_put_thread_on_fast_node(); wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress); wm_job->ready = true; diff --git a/source/creator/creator.c b/source/creator/creator.c index c50b6ac4c79..d69e4d075fd 100644 --- a/source/creator/creator.c +++ b/source/creator/creator.c @@ -408,7 +408,6 @@ int main(int argc, BKE_appdir_program_path_init(argv[0]); BLI_threadapi_init(); - BLI_thread_put_process_on_fast_node(); DNA_sdna_current_init();