Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 can be assumed on that architecture, so the regular kernel is built with SSE2 intrinsics and re-used instead. This saves 500kb in the Blender binary.

Reviewed by: brecht
Differential Revision: https://developer.blender.org/D199
2014-01-14 20:39:21 +01:00 · 2014-01-14 20:39:21 +01:00 · 9351ac0d85
commit 9351ac0d85
parent d980c3eccb
11 changed files with 109 additions and 63 deletions
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)

 # Build Flags

-if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
-	set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-endif()
-
 if(WIN32 AND MSVC)
 	# there is no /arch:SSE3, but intrinsics are available anyway
 	if(CMAKE_CL_64)
@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
 add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
 add_definitions(-DCCL_NAMESPACE_END=})

-if(WITH_CYCLES_OPTIMIZED_KERNEL)
-	add_definitions(-DWITH_OPTIMIZED_KERNEL)
-endif()
-
 if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
    incs.append(env['BF_PTHREADS_INC'])

 # optimized kernel
-if env['WITH_BF_RAYOPTIMIZATION']:
-    sse2_cxxflags = Split(env['CXXFLAGS'])
-    sse3_cxxflags = Split(env['CXXFLAGS'])
-    sse41_cxxflags = Split(env['CXXFLAGS'])
+sse2_cxxflags = Split(env['CXXFLAGS'])
+sse3_cxxflags = Split(env['CXXFLAGS'])
+sse41_cxxflags = Split(env['CXXFLAGS'])

-    if env['OURPLATFORM'] == 'win32-vc':
-        # there is no /arch:SSE3, but intrinsics are available anyway
-        sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-    elif env['OURPLATFORM'] == 'win64-vc':
-        sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-    else:
-        sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
-        sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
-        sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
-    
-    defs.append('WITH_OPTIMIZED_KERNEL')
-    optim_defs = defs[:]
+if env['OURPLATFORM'] == 'win32-vc':
+    # there is no /arch:SSE3, but intrinsics are available anyway
+    sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+elif env['OURPLATFORM'] == 'win64-vc':
+    sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+else:
+    sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+    sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
+    sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())

-    if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
-        cycles_sse41 = cycles.Clone()
-        sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
-        cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
-        defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
+defs.append('WITH_OPTIMIZED_KERNEL')
+optim_defs = defs[:]

-    cycles_sse3 = cycles.Clone()
-    sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
-    cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
+    cycles_sse41 = cycles.Clone()
+    sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
+    cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
+    defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')

-    cycles_sse2 = cycles.Clone()
-    sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
-    cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
+cycles_sse3 = cycles.Clone()
+sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+cycles_sse2 = cycles.Clone()
+sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)

 cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)

--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@ -166,7 +166,6 @@ public:
 			int start_sample = tile.start_sample;
 			int end_sample = tile.start_sample + tile.num_samples;

-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
@ -189,6 +188,7 @@ public:
 			}
 			else
 #endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 			if(system_cpu_support_sse3()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
@ -208,7 +208,10 @@ public:
 					task.update_progress(tile);
 				}
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
 						if(task.need_finish_queue == false)
@ -267,7 +270,6 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);

 		if(task.rgba_half) {
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@ -276,14 +278,18 @@ public:
 							sample_scale, x, y, task.offset, task.stride);
 			}
 			else
-#endif				
+#endif		
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
 							sample_scale, x, y, task.offset, task.stride);
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
@ -299,7 +305,6 @@ public:
 			}
 		}
 		else {
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@ -309,13 +314,17 @@ public:
 			}
 			else
 #endif			
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
 							sample_scale, x, y, task.offset, task.stride);
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
@ -340,7 +349,6 @@ public:
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif

-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
@ -352,6 +360,7 @@ public:
 		}
 		else
 #endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
@ -360,7 +369,10 @@ public:
 					break;
 			}
 		}
-		else if(system_cpu_support_sse2()) {
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		if(system_cpu_support_sse2()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -192,10 +192,8 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

-if(WITH_CYCLES_OPTIMIZED_KERNEL)
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-endif()
+set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")

 if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41)
 	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@ -84,6 +84,11 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		assert(0);
 }

+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
+#if defined(__x86_64__) || defined(_M_X64)
+#define __KERNEL_SSE2__
+#endif
+
 /* Path Tracing */

 void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@ -17,9 +17,10 @@
 #ifndef __KERNEL_H__
 #define __KERNEL_H__

-/* CPU Kernel Interfae */
+/* CPU Kernel Interface */

 #include "util_types.h"
+#include "util_optimization.h"

 CCL_NAMESPACE_BEGIN

@ -43,7 +44,7 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
 	int type, int i);

-#ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@ -52,7 +53,9 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
 	int type, int i);
+#endif

+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@ -61,7 +64,9 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
 	int type, int i);
+#endif

+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@ -17,8 +17,10 @@
 /* Optimized CPU kernel entry points. This file is compiled with SSE2
 * optimization flags and nearly all functions inlined, while kernel.cpp
 * is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2

 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -70,4 +72,3 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
 CCL_NAMESPACE_END

 #endif
-
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@ -17,8 +17,10 @@
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
 * optimization flags and nearly all functions inlined, while kernel.cpp
 * is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3

 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -72,4 +74,3 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
 CCL_NAMESPACE_END

 #endif
-
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@ -17,8 +17,10 @@
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
 * optimization flags and nearly all functions inlined, while kernel.cpp
 * is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41

 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -73,4 +75,3 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, in
 CCL_NAMESPACE_END

 #endif
-
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@ -46,6 +46,7 @@ set(SRC_HEADERS
 	util_md5.h
 	util_opencl.h
 	util_opengl.h
+	util_optimization.h
 	util_param.h
 	util_path.h
 	util_progress.h
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+/* no SSE2 kernel on x86-64, part of regular kernel */
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#endif
+
+#if defined(i386) || defined(_M_IX86)
+
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#endif