Cycles: Skip the compilation of the dedicated SSE2 kernel on x86-64, we can assume SSE2 here, so just re-use the regular one. Saves 500kb in the blender binary.

Reviewed by: brecht
Differential Revision: https://developer.blender.org/D199
This commit is contained in:
Thomas Dinges 2014-01-14 20:39:21 +01:00
parent d980c3eccb
commit 9351ac0d85
11 changed files with 109 additions and 63 deletions

@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)
# Build Flags
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
endif()
if(WIN32 AND MSVC)
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
if(WITH_CYCLES_OPTIMIZED_KERNEL)
add_definitions(-DWITH_OPTIMIZED_KERNEL)
endif()
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()

@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
incs.append(env['BF_PTHREADS_INC'])
# optimized kernel
if env['WITH_BF_RAYOPTIMIZATION']:
sse2_cxxflags = Split(env['CXXFLAGS'])
sse3_cxxflags = Split(env['CXXFLAGS'])
sse41_cxxflags = Split(env['CXXFLAGS'])
sse2_cxxflags = Split(env['CXXFLAGS'])
sse3_cxxflags = Split(env['CXXFLAGS'])
sse41_cxxflags = Split(env['CXXFLAGS'])
if env['OURPLATFORM'] == 'win32-vc':
# there is no /arch:SSE3, but intrinsics are available anyway
sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
elif env['OURPLATFORM'] == 'win64-vc':
sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
else:
sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
defs.append('WITH_OPTIMIZED_KERNEL')
optim_defs = defs[:]
if env['OURPLATFORM'] == 'win32-vc':
# there is no /arch:SSE3, but intrinsics are available anyway
sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
elif env['OURPLATFORM'] == 'win64-vc':
sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
else:
sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
cycles_sse41 = cycles.Clone()
sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
defs.append('WITH_OPTIMIZED_KERNEL')
optim_defs = defs[:]
cycles_sse3 = cycles.Clone()
sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
cycles_sse41 = cycles.Clone()
sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
cycles_sse2 = cycles.Clone()
sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
cycles_sse3 = cycles.Clone()
sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
cycles_sse2 = cycles.Clone()
sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)

@ -166,7 +166,6 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@ -189,6 +188,7 @@ public:
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
@ -208,7 +208,10 @@ public:
task.update_progress(tile);
}
}
else if(system_cpu_support_sse2()) {
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
@ -267,7 +270,6 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@ -276,14 +278,18 @@ public:
sample_scale, x, y, task.offset, task.stride);
}
else
#endif
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else if(system_cpu_support_sse2()) {
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
@ -299,7 +305,6 @@ public:
}
}
else {
#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@ -309,13 +314,17 @@ public:
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
else if(system_cpu_support_sse2()) {
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
@ -340,7 +349,6 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
@ -352,6 +360,7 @@ public:
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
@ -360,7 +369,10 @@ public:
break;
}
}
else if(system_cpu_support_sse2()) {
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

@ -192,10 +192,8 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
if(WITH_CYCLES_OPTIMIZED_KERNEL)
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
endif()
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41)
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")

@ -84,6 +84,11 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
assert(0);
}
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
#if defined(__x86_64__) || defined(_M_X64)
#define __KERNEL_SSE2__
#endif
/* Path Tracing */
void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)

@ -17,9 +17,10 @@
#ifndef __KERNEL_H__
#define __KERNEL_H__
/* CPU Kernel Interfae */
/* CPU Kernel Interface */
#include "util_types.h"
#include "util_optimization.h"
CCL_NAMESPACE_BEGIN
@ -43,7 +44,7 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@ -52,7 +53,9 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@ -61,7 +64,9 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,

@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#ifdef WITH_OPTIMIZED_KERNEL
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -70,4 +72,3 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
CCL_NAMESPACE_END
#endif

@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#ifdef WITH_OPTIMIZED_KERNEL
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -72,4 +74,3 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
CCL_NAMESPACE_END
#endif

@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#ifdef WITH_OPTIMIZED_KERNEL
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@ -73,4 +75,3 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, in
CCL_NAMESPACE_END
#endif

@ -46,6 +46,7 @@ set(SRC_HEADERS
util_md5.h
util_opencl.h
util_opengl.h
util_optimization.h
util_param.h
util_path.h
util_progress.h

@ -0,0 +1,31 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*/
#if defined(__x86_64__) || defined(_M_X64)
/* no SSE2 kernel on x86-64, part of regular kernel */
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#endif
#if defined(i386) || defined(_M_IX86)
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#endif