forked from bartvdbraak/blender
Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 can be assumed there, so the regular kernel is built with SSE2 and re-used instead. This saves 500 KB in the Blender binary.
Reviewed by: brecht. Differential Revision: https://developer.blender.org/D199
This commit is contained in:
parent
d980c3eccb
commit
9351ac0d85
@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)
|
||||
|
||||
# Build Flags
|
||||
|
||||
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
|
||||
set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
|
||||
endif()
|
||||
|
||||
if(WIN32 AND MSVC)
|
||||
# there is no /arch:SSE3, but intrinsics are available anyway
|
||||
if(CMAKE_CL_64)
|
||||
@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
|
||||
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
|
||||
add_definitions(-DCCL_NAMESPACE_END=})
|
||||
|
||||
if(WITH_CYCLES_OPTIMIZED_KERNEL)
|
||||
add_definitions(-DWITH_OPTIMIZED_KERNEL)
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_NETWORK)
|
||||
add_definitions(-DWITH_NETWORK)
|
||||
endif()
|
||||
|
@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
|
||||
incs.append(env['BF_PTHREADS_INC'])
|
||||
|
||||
# optimized kernel
|
||||
if env['WITH_BF_RAYOPTIMIZATION']:
|
||||
sse2_cxxflags = Split(env['CXXFLAGS'])
|
||||
sse3_cxxflags = Split(env['CXXFLAGS'])
|
||||
sse41_cxxflags = Split(env['CXXFLAGS'])
|
||||
sse2_cxxflags = Split(env['CXXFLAGS'])
|
||||
sse3_cxxflags = Split(env['CXXFLAGS'])
|
||||
sse41_cxxflags = Split(env['CXXFLAGS'])
|
||||
|
||||
if env['OURPLATFORM'] == 'win32-vc':
|
||||
# there is no /arch:SSE3, but intrinsics are available anyway
|
||||
sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
elif env['OURPLATFORM'] == 'win64-vc':
|
||||
sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
else:
|
||||
sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
|
||||
sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
|
||||
sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
|
||||
if env['OURPLATFORM'] == 'win32-vc':
|
||||
# there is no /arch:SSE3, but intrinsics are available anyway
|
||||
sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
elif env['OURPLATFORM'] == 'win64-vc':
|
||||
sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
|
||||
else:
|
||||
sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
|
||||
sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
|
||||
sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
|
||||
|
||||
defs.append('WITH_OPTIMIZED_KERNEL')
|
||||
optim_defs = defs[:]
|
||||
defs.append('WITH_OPTIMIZED_KERNEL')
|
||||
optim_defs = defs[:]
|
||||
|
||||
if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
|
||||
cycles_sse41 = cycles.Clone()
|
||||
sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
|
||||
cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
|
||||
defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
|
||||
if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
|
||||
cycles_sse41 = cycles.Clone()
|
||||
sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
|
||||
cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
|
||||
defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
|
||||
|
||||
cycles_sse3 = cycles.Clone()
|
||||
sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
|
||||
cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
|
||||
cycles_sse3 = cycles.Clone()
|
||||
sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
|
||||
cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
|
||||
|
||||
cycles_sse2 = cycles.Clone()
|
||||
sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
|
||||
cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
|
||||
cycles_sse2 = cycles.Clone()
|
||||
sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
|
||||
cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
|
||||
|
||||
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
|
||||
|
||||
|
@ -166,7 +166,6 @@ public:
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
@ -189,6 +188,7 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
if(system_cpu_support_sse3()) {
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if (task.get_cancel() || task_pool.canceled()) {
|
||||
@ -208,7 +208,10 @@ public:
|
||||
task.update_progress(tile);
|
||||
}
|
||||
}
|
||||
else if(system_cpu_support_sse2()) {
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
if(system_cpu_support_sse2()) {
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if (task.get_cancel() || task_pool.canceled()) {
|
||||
if(task.need_finish_queue == false)
|
||||
@ -267,7 +270,6 @@ public:
|
||||
float sample_scale = 1.0f/(task.sample + 1);
|
||||
|
||||
if(task.rgba_half) {
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
@ -277,13 +279,17 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
if(system_cpu_support_sse3()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
|
||||
sample_scale, x, y, task.offset, task.stride);
|
||||
}
|
||||
else if(system_cpu_support_sse2()) {
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
if(system_cpu_support_sse2()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
|
||||
@ -299,7 +305,6 @@ public:
|
||||
}
|
||||
}
|
||||
else {
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
@ -309,13 +314,17 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
if(system_cpu_support_sse3()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
|
||||
sample_scale, x, y, task.offset, task.stride);
|
||||
}
|
||||
else if(system_cpu_support_sse2()) {
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
if(system_cpu_support_sse2()) {
|
||||
for(int y = task.y; y < task.y + task.h; y++)
|
||||
for(int x = task.x; x < task.x + task.w; x++)
|
||||
kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
|
||||
@ -340,7 +349,6 @@ public:
|
||||
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
||||
#endif
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
|
||||
@ -352,6 +360,7 @@ public:
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
if(system_cpu_support_sse3()) {
|
||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
|
||||
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
|
||||
@ -360,7 +369,10 @@ public:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if(system_cpu_support_sse2()) {
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
if(system_cpu_support_sse2()) {
|
||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
|
||||
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
|
||||
|
||||
|
@ -192,10 +192,8 @@ endif()
|
||||
include_directories(${INC})
|
||||
include_directories(SYSTEM ${INC_SYS})
|
||||
|
||||
if(WITH_CYCLES_OPTIMIZED_KERNEL)
|
||||
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
endif()
|
||||
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
|
||||
if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41)
|
||||
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
|
@ -84,6 +84,11 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
|
||||
assert(0);
|
||||
}
|
||||
|
||||
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
/* Path Tracing */
|
||||
|
||||
void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
|
||||
|
@ -17,9 +17,10 @@
|
||||
#ifndef __KERNEL_H__
|
||||
#define __KERNEL_H__
|
||||
|
||||
/* CPU Kernel Interfae */
|
||||
/* CPU Kernel Interface */
|
||||
|
||||
#include "util_types.h"
|
||||
#include "util_optimization.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
@ -43,7 +44,7 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
|
||||
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
|
||||
int type, int i);
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
|
||||
int sample, int x, int y, int offset, int stride);
|
||||
void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
|
||||
@ -52,7 +53,9 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
|
||||
float sample_scale, int x, int y, int offset, int stride);
|
||||
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
|
||||
int type, int i);
|
||||
#endif
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
|
||||
int sample, int x, int y, int offset, int stride);
|
||||
void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
|
||||
@ -61,7 +64,9 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
|
||||
float sample_scale, int x, int y, int offset, int stride);
|
||||
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
|
||||
int type, int i);
|
||||
#endif
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
|
||||
int sample, int x, int y, int offset, int stride);
|
||||
void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
|
||||
|
@ -18,7 +18,9 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
@ -70,4 +72,3 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -18,7 +18,9 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
@ -72,4 +74,3 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -18,7 +18,9 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
@ -73,4 +75,3 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, in
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -46,6 +46,7 @@ set(SRC_HEADERS
|
||||
util_md5.h
|
||||
util_opencl.h
|
||||
util_opengl.h
|
||||
util_optimization.h
|
||||
util_param.h
|
||||
util_path.h
|
||||
util_progress.h
|
||||
|
31
intern/cycles/util/util_optimization.h
Normal file
31
intern/cycles/util/util_optimization.h
Normal file
@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License
|
||||
*/
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
|
||||
/* no SSE2 kernel on x86-64, part of regular kernel */
|
||||
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(i386) || defined(_M_IX86)
|
||||
|
||||
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user