Sculpt/dyntopo: Make the omp threads configurable to overcome performance issues

- autodetect optimal default, which typically avoids HT threads
- can store setting in .blend per scene
- this does not touch the general OpenMP max-thread setting, because I found other areas where the calculations scale well with a huge core count
- Intel notes that some older-generation processors with Hyper-Threading do not provide a significant performance boost for FPU-intensive applications. On those systems you might want to set OMP_NUM_THREADS to the total number of cores (not the total number of hardware threads).
This commit is contained in:
Jens Verwiebe 2014-03-31 13:51:40 +02:00
parent e05d35bfaf
commit 277fb1a31f
8 changed files with 108 additions and 8 deletions

@ -1283,7 +1283,8 @@ class VIEW3D_PT_sculpt_options(Panel, View3DPaintPanel):
def draw(self, context):
layout = self.layout
scene = context.scene
toolsettings = context.tool_settings
sculpt = toolsettings.sculpt
capabilities = sculpt.brush.sculpt_capabilities
@ -1293,6 +1294,14 @@ class VIEW3D_PT_sculpt_options(Panel, View3DPaintPanel):
col.label(text="Gravity:")
col.prop(sculpt, "gravity", slider=True, text="Factor")
col.prop(sculpt, "gravity_object")
col.separator()
col.label(text="OpenMP Threads:")
col.row(align=True).prop(scene, "omp_mode", expand=True)
sub = col.column(align=True)
sub.enabled = scene.omp_mode == 'MANUAL'
sub.prop(scene, "omp_num_threads")
col.separator()
layout.prop(sculpt, "use_threaded", text="Threaded Sculpt")
layout.prop(sculpt, "show_low_resolution")

@ -137,6 +137,8 @@ bool BKE_scene_check_rigidbody_active(const struct Scene *scene);
int BKE_scene_num_threads(const struct Scene *scene);
int BKE_render_num_threads(const struct RenderData *r);
/* Number of OpenMP threads to use for this scene: the result of
 * BLI_omp_thread_count() when scene->omp_mode is SCE_OMP_AUTO,
 * otherwise the stored scene->omp_num_threads. */
int BKE_scene_num_omp_threads(const struct Scene *scene);
/* NOTE(review): called from the RNA callback rna_omp_threads_update();
 * presumably applies the scene's OpenMP thread setting -- confirm against
 * the definition. */
void BKE_scene_omp_threads_update(const struct Scene *scene);
#ifdef __cplusplus
}
#endif

@ -638,6 +638,9 @@ Scene *BKE_scene_add(Main *bmain, const char *name)
sce->gm.exitkey = 218; // Blender key code for ESC
sce->omp_mode = SCE_OMP_AUTO;
sce->omp_num_threads = 1;
sound_create_scene(sce);
/* color management */
@ -1868,3 +1871,10 @@ int BKE_scene_num_threads(const Scene *scene)
return BKE_render_num_threads(&scene->r);
}
/* Resolve how many OpenMP threads should be used for the given scene.
 *
 * Returns the thread count stored in the scene when any mode other than
 * SCE_OMP_AUTO is selected, otherwise the count reported by
 * BLI_omp_thread_count(). */
int BKE_scene_num_omp_threads(const struct Scene *scene)
{
	/* Anything but auto-detect means the stored value is used as-is. */
	if (scene->omp_mode != SCE_OMP_AUTO) {
		return scene->omp_num_threads;
	}

	return BLI_omp_thread_count();
}

@ -75,6 +75,8 @@ int BLI_system_thread_count(void); /* gets the number of threads the system
void BLI_system_num_threads_override_set(int num);
int BLI_system_num_threads_override_get(void);
int BLI_omp_thread_count(void); /* gets the number of openmp threads the system can make use of */
/* Global Mutex Locks
*
* One custom lock available now. can be extended. */

@ -54,10 +54,25 @@
# include <sys/time.h>
#endif
#if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__)
#ifdef _OPENMP
#include <omp.h>
#endif
#if defined(__APPLE__)
#if defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__)
# define USE_APPLE_OMP_FIX
#endif
/* How many physical cores the machine has, i.e. not counting the extra
 * logical CPUs exposed by Hyper-Threading.
 *
 * Queries the "hw.physicalcpu" sysctl. Returns 1 when the query fails or
 * reports a non-positive value, so callers always get a usable thread count. */
static int system_physical_thread_count(void)
{
	int ptcount = 0;
	size_t ptcount_len = sizeof(ptcount);

	/* A failed query may not have written ptcount at all, so the return
	 * code has to be checked before the value is trusted. */
	if (sysctlbyname("hw.physicalcpu", &ptcount, &ptcount_len, NULL, 0) != 0 || ptcount < 1) {
		return 1;
	}

	return ptcount;
}
#endif // __APPLE__
#ifdef USE_APPLE_OMP_FIX
/* ************** libgomp (Apple gcc 4.2.1) TLS bug workaround *************** */
extern pthread_key_t gomp_tls_key;
@ -335,6 +350,22 @@ void BLI_end_threads(ListBase *threadbase)
/* System Information */
/* Gets the number of OpenMP threads the system can make use of.
 *
 * - Built without OpenMP: always 1.
 * - OpenMP on Apple: the physical core count from
 *   system_physical_thread_count().
 * - OpenMP elsewhere: the processor count reported by omp_get_num_procs(). */
int BLI_omp_thread_count(void)
{
#if !defined(_OPENMP)
	/* No OpenMP runtime available, so only the calling thread does work. */
	return 1;
#elif defined(__APPLE__)
	return system_physical_thread_count();
#else
	return omp_get_num_procs();
#endif
}
/* how many threads are native on this system? */
int BLI_system_thread_count(void)
{

@ -67,6 +67,7 @@
#include "BKE_multires.h"
#include "BKE_paint.h"
#include "BKE_report.h"
#include "BKE_scene.h"
#include "BKE_lattice.h" /* for armature_deform_verts */
#include "BKE_node.h"
#include "BKE_object.h"
@ -1541,10 +1542,10 @@ static void do_multires_smooth_brush(Sculpt *sd, SculptSession *ss, PBVHNode *no
grid_hidden = BKE_pbvh_grid_hidden(ss->pbvh);
thread_num = 0;
#ifdef _OPENMP
if (sd->flags & SCULPT_USE_OPENMP)
thread_num = omp_get_thread_num();
thread_num = omp_get_thread_num();
#else
thread_num = 0;
#endif
tmpgrid_co = ss->cache->tmpgrid_co[thread_num];
tmprow_co = ss->cache->tmprow_co[thread_num];
@ -3769,7 +3770,7 @@ static void sculpt_init_mirror_clipping(Object *ob, SculptSession *ss)
}
}
static void sculpt_omp_start(Sculpt *sd, SculptSession *ss)
static void sculpt_omp_start(Scene *scene, Sculpt *sd, SculptSession *ss)
{
StrokeCache *cache = ss->cache;
@ -3779,15 +3780,17 @@ static void sculpt_omp_start(Sculpt *sd, SculptSession *ss)
* Justification: Empirically I've found that two threads per
* processor gives higher throughput. */
if (sd->flags & SCULPT_USE_OPENMP) {
cache->num_threads = omp_get_num_procs();
cache->num_threads = BKE_scene_num_omp_threads(scene);
}
else {
cache->num_threads = 1;
}
omp_set_num_threads(cache->num_threads);
#else
(void)sd;
cache->num_threads = 1;
#endif
// printf("Sculpt omp threadcount: %d\n", cache->num_threads);
if (ss->multires) {
int i, gridsize, array_mem_size;
BKE_pbvh_node_get_grids(ss->pbvh, NULL, NULL, NULL, NULL,
@ -4002,7 +4005,7 @@ static void sculpt_update_cache_invariants(bContext *C, Sculpt *sd, SculptSessio
cache->previous_vertex_rotation = 0;
cache->init_dir_set = false;
sculpt_omp_start(sd, ss);
sculpt_omp_start(scene, sd, ss);
}
static void sculpt_update_brush_delta(UnifiedPaintSettings *ups, Object *ob, Brush *brush)
@ -4626,6 +4629,12 @@ static void sculpt_stroke_done(const bContext *C, struct PaintStroke *UNUSED(str
WM_event_add_notifier(C, NC_OBJECT | ND_DRAW, ob);
}
#ifdef _OPENMP
if (!(sd->flags & SCULPT_USE_OPENMP))
omp_set_num_threads(BLI_system_thread_count());
// printf("Reseted to omp threadcount: %d\n", BLI_system_thread_count());
#endif
sculpt_brush_exit_tex(sd);
}

@ -1224,6 +1224,10 @@ typedef struct Scene {
/* RigidBody simulation world+settings */
struct RigidBodyWorld *rigidbody_world;
/* Openmp Global Settings */
int omp_num_threads;
int omp_mode;
} Scene;
@ -1769,6 +1773,10 @@ typedef enum SculptFlags {
#define USER_UNIT_OPT_SPLIT 1
#define USER_UNIT_ROT_RADIANS 2
/* OpenMP settings: values stored in Scene.omp_mode */
/* Thread count is taken from BLI_omp_thread_count() */
#define SCE_OMP_AUTO 0
/* Thread count is taken from Scene.omp_num_threads */
#define SCE_OMP_MANUAL 1
#ifdef __cplusplus
}
#endif

@ -43,6 +43,7 @@
#include "BKE_freestyle.h"
#include "BKE_editmesh.h"
#include "BKE_paint.h"
#include "BKE_scene.h"
#include "RNA_define.h"
#include "RNA_enum_types.h"
@ -680,6 +681,17 @@ static char *rna_RenderSettings_path(PointerRNA *UNUSED(ptr))
return BLI_sprintfN("render");
}
/* Shaped like an RNA property update callback: forwards the scene to
 * BKE_scene_omp_threads_update(). The bmain and ptr arguments are unused.
 * NOTE(review): neither the "omp_num_threads" nor the "omp_mode" property
 * definition visible in RNA_def_scene() registers this callback -- confirm
 * that it is actually hooked up. */
static void rna_omp_threads_update(Main *UNUSED(bmain), Scene *scene, PointerRNA *UNUSED(ptr))
{
BKE_scene_omp_threads_update(scene);
}
/* RNA getter used by the "omp_num_threads" property.
 * Returns whatever BKE_scene_num_omp_threads() resolves for the scene
 * stored in ptr->data. */
static int rna_omp_threads_get(PointerRNA *ptr)
{
	const Scene *sce = (const Scene *)ptr->data;

	return BKE_scene_num_omp_threads(sce);
}
static int rna_RenderSettings_threads_get(PointerRNA *ptr)
{
RenderData *rd = (RenderData *)ptr->data;
@ -5088,6 +5100,12 @@ void RNA_def_scene(BlenderRNA *brna)
{0, NULL, 0, NULL, NULL}
};
static EnumPropertyItem omp_threads_mode_items[] = {
{SCE_OMP_AUTO, "AUTO", 0, "Auto-detect", "Automatically determine the number of threads, based on CPUs"},
{SCE_OMP_MANUAL, "MANUAL", 0, "Manual", "Manually determine the number of threads"},
{0, NULL, 0, NULL, NULL}
};
/* Struct definition */
srna = RNA_def_struct(brna, "Scene", "ID");
RNA_def_struct_ui_text(srna, "Scene",
@ -5450,6 +5468,17 @@ void RNA_def_scene(BlenderRNA *brna)
RNA_def_property_struct_type(prop, "ColorManagedSequencerColorspaceSettings");
RNA_def_property_ui_text(prop, "Sequencer Color Space Settings", "Settings of color space sequencer is working in");
/* OpenMP thread count, limited to 1..BLENDER_MAX_THREADS and read through
 * rna_omp_threads_get(). */
prop = RNA_def_property(srna, "omp_num_threads", PROP_INT, PROP_NONE);
RNA_def_property_range(prop, 1, BLENDER_MAX_THREADS);
RNA_def_property_int_funcs(prop, "rna_omp_threads_get", NULL, NULL);
/* Adjacent string literals are concatenated verbatim, so the first one needs
 * a trailing space to keep "openmp" and "(for ..." apart in the tooltip. */
RNA_def_property_ui_text(prop, "OpenMP Threads",
                         "Number of CPU threads to use simultaneously for openmp "
                         "(for multi-core/CPU systems)");
prop = RNA_def_property(srna, "omp_mode", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_items(prop, omp_threads_mode_items);
RNA_def_property_ui_text(prop, "OpenMP Mode", "Determine the amount of openmp threads used");
/* Nestled Data */
/* *** Non-Animated *** */
RNA_define_animate_sdna(false);