diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index fc193d99a57..6fa6260c81e 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -8,10 +8,6 @@ include(cmake/external_libs.cmake) # Build Flags -if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD) - set(WITH_CYCLES_OPTIMIZED_KERNEL ON) -endif() - if(WIN32 AND MSVC) # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) @@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS}) add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {) add_definitions(-DCCL_NAMESPACE_END=}) -if(WITH_CYCLES_OPTIMIZED_KERNEL) - add_definitions(-DWITH_OPTIMIZED_KERNEL) -endif() - if(WITH_CYCLES_NETWORK) add_definitions(-DWITH_NETWORK) endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 448375a04ff..e31fb5bed96 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', ' incs.append(env['BF_PTHREADS_INC']) # optimized kernel -if env['WITH_BF_RAYOPTIMIZATION']: - sse2_cxxflags = Split(env['CXXFLAGS']) - sse3_cxxflags = Split(env['CXXFLAGS']) - sse41_cxxflags = Split(env['CXXFLAGS']) +sse2_cxxflags = Split(env['CXXFLAGS']) +sse3_cxxflags = Split(env['CXXFLAGS']) +sse41_cxxflags = Split(env['CXXFLAGS']) - if env['OURPLATFORM'] == 'win32-vc': - # there is no /arch:SSE3, but intrinsics are available anyway - sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - elif env['OURPLATFORM'] == 'win64-vc': - sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) - else: - sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split()) - sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split()) - sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split()) - - defs.append('WITH_OPTIMIZED_KERNEL') - optim_defs = defs[:] +if env['OURPLATFORM'] == 'win32-vc': + # there is no /arch:SSE3, but intrinsics are available anyway + sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) +elif env['OURPLATFORM'] == 'win64-vc': + sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) +else: + sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split()) + sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split()) + sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split()) - if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']: - cycles_sse41 = cycles.Clone() - sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')] - cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags) - defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41') +defs.append('WITH_OPTIMIZED_KERNEL') +optim_defs = defs[:] - cycles_sse3 = cycles.Clone() - sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')] - cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags) +if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']: + cycles_sse41 = cycles.Clone() + sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')] + cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags) + defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41') - cycles_sse2 = cycles.Clone() - sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')] - cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags) +cycles_sse3 = cycles.Clone() +sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')] +cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags) + +cycles_sse2 = cycles.Clone() +sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')] +cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags) cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index ea632b744dc..b29d64eb454 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -166,7 +166,6 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; -#ifdef WITH_OPTIMIZED_KERNEL #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -189,6 +188,7 @@ public: } else #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int sample = start_sample; sample < end_sample; sample++) { if (task.get_cancel() || task_pool.canceled()) { @@ -208,7 +208,10 @@ public: task.update_progress(tile); } } - else if(system_cpu_support_sse2()) { + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { for(int sample = start_sample; sample < end_sample; sample++) { if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) @@ -267,7 +270,6 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { -#ifdef WITH_OPTIMIZED_KERNEL #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -276,14 +278,18 @@ public: sample_scale, x, y, task.offset, task.stride); } else -#endif +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, sample_scale, x, y, task.offset, task.stride); } - else if(system_cpu_support_sse2()) { + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, @@ -299,7 +305,6 @@ public: } } else { -#ifdef WITH_OPTIMIZED_KERNEL #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -309,13 +314,17 @@ public: } else #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, sample_scale, x, y, task.offset, task.stride); } - else if(system_cpu_support_sse2()) { + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, @@ -340,7 +349,6 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif -#ifdef WITH_OPTIMIZED_KERNEL #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { @@ -352,6 +360,7 @@ public: } else #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); @@ -360,7 +369,10 @@ public: break; } } - else if(system_cpu_support_sse2()) { + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 998d1a3540f..81499bbfda8 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -192,10 +192,8 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -if(WITH_CYCLES_OPTIMIZED_KERNEL) - set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") -endif() +set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") +set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41) set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index 3e2727fde9a..3fe1e80890b 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -84,6 +84,11 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t assert(0); } +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */ +#if defined(__x86_64__) || defined(_M_X64) +#define __KERNEL_SSE2__ +#endif + /* Path Tracing */ void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 105a3887da0..b6db92f26e9 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -17,9 +17,10 @@ #ifndef __KERNEL_H__ #define __KERNEL_H__ -/* CPU Kernel Interfae */ +/* CPU Kernel Interface */ #include "util_types.h" +#include "util_optimization.h" CCL_NAMESPACE_BEGIN @@ -43,7 +44,7 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); -#ifdef WITH_OPTIMIZED_KERNEL +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -52,7 +53,9 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -61,7 +64,9 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 9c69e519dca..953c3e4f9c9 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -17,8 +17,10 @@ /* Optimized CPU kernel entry points. This file is compiled with SSE2 * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -#ifdef WITH_OPTIMIZED_KERNEL + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 /* SSE optimization disabled for now on 32 bit, see bug #36316 */ #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) @@ -70,4 +72,3 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int CCL_NAMESPACE_END #endif - diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 05877a41b4a..2a36c974191 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -17,8 +17,10 @@ /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -#ifdef WITH_OPTIMIZED_KERNEL + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 /* SSE optimization disabled for now on 32 bit, see bug #36316 */ #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) @@ -72,4 +74,3 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int CCL_NAMESPACE_END #endif - diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp index 0c68fd3651b..6583feaeb45 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernel_sse41.cpp @@ -17,8 +17,10 @@ /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -#ifdef WITH_OPTIMIZED_KERNEL + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 /* SSE optimization disabled for now on 32 bit, see bug #36316 */ #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) @@ -73,4 +75,3 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, in CCL_NAMESPACE_END #endif - diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 389f76e6df2..df188f8236a 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -46,6 +46,7 @@ set(SRC_HEADERS util_md5.h util_opencl.h util_opengl.h + util_optimization.h util_param.h util_path.h util_progress.h diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h new file mode 100644 index 00000000000..6ffc7545335 --- /dev/null +++ b/intern/cycles/util/util_optimization.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#if defined(__x86_64__) || defined(_M_X64) + +/* no SSE2 kernel on x86-64, part of regular kernel */ +#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + +#endif + +#if defined(i386) || defined(_M_IX86) + +#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + +#endif