Merge branch 'master' into blender2.8

Julian Eisel 2017-05-08 00:19:22 +02:00
commit 9181f13af7
138 changed files with 584641 additions and 1253 deletions

@ -241,3 +241,15 @@ def register_passes(engine, scene, srl):
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
if crl.use_denoising and crl.denoising_store_passes:
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE');
engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE');
engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR');

@ -1189,6 +1189,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
)
cls.use_denoising = BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
default=False,
)
cls.denoising_diffuse_direct = BoolProperty(
name="Diffuse Direct",
description="Denoise the direct diffuse lighting",
default=True,
)
cls.denoising_diffuse_indirect = BoolProperty(
name="Diffuse Indirect",
description="Denoise the indirect diffuse lighting",
default=True,
)
cls.denoising_glossy_direct = BoolProperty(
name="Glossy Direct",
description="Denoise the direct glossy lighting",
default=True,
)
cls.denoising_glossy_indirect = BoolProperty(
name="Glossy Indirect",
description="Denoise the indirect glossy lighting",
default=True,
)
cls.denoising_transmission_direct = BoolProperty(
name="Transmission Direct",
description="Denoise the direct transmission lighting",
default=True,
)
cls.denoising_transmission_indirect = BoolProperty(
name="Transmission Indirect",
description="Denoise the indirect transmission lighting",
default=True,
)
cls.denoising_subsurface_direct = BoolProperty(
name="Subsurface Direct",
description="Denoise the direct subsurface lighting",
default=True,
)
cls.denoising_subsurface_indirect = BoolProperty(
name="Subsurface Indirect",
description="Denoise the indirect subsurface lighting",
default=True,
)
cls.denoising_strength = FloatProperty(
name="Denoising Strength",
description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
min=0.0, max=1.0,
default=0.5,
)
cls.denoising_feature_strength = FloatProperty(
name="Denoising Feature Strength",
description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
min=0.0, max=1.0,
default=0.5,
)
cls.denoising_radius = IntProperty(
name="Denoising Radius",
description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
min=1, max=50,
default=8,
)
cls.denoising_relative_pca = BoolProperty(
name="Relative filter",
description="When removing that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
default=False,
)
cls.denoising_store_passes = BoolProperty(
name="Store denoising passes",
description="Store the denoising feature passes and the noisy image",
default=False,
)
@classmethod
def unregister(cls):
del bpy.types.SceneRenderLayer.cycles

@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_emit", text="Emission")
col.prop(rl, "use_pass_environment")
if context.scene.cycles.feature_set == 'EXPERIMENTAL':
col.separator()
sub = col.column()
sub.active = crl.use_denoising
sub.prop(crl, "denoising_store_passes", text="Denoising")
if _cycles.with_cycles_debug:
col = layout.column()
col.prop(crl, "pass_debug_bvh_traversed_nodes")
@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
row.prop(rv, "camera_suffix", text="")
class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
bl_label = "Denoising"
bl_context = "render_layer"
bl_options = {'DEFAULT_CLOSED'}
def draw_header(self, context):
rd = context.scene.render
rl = rd.layers.active
crl = rl.cycles
cscene = context.scene.cycles
layout = self.layout
layout.active = not cscene.use_progressive_refine
layout.prop(crl, "use_denoising", text="")
def draw(self, context):
layout = self.layout
scene = context.scene
cscene = scene.cycles
rd = scene.render
rl = rd.layers.active
crl = rl.cycles
layout.active = crl.use_denoising and not cscene.use_progressive_refine
split = layout.split()
col = split.column()
sub = col.column(align=True)
sub.prop(crl, "denoising_radius", text="Radius")
sub.prop(crl, "denoising_strength", slider=True, text="Strength")
col = split.column()
sub = col.column(align=True)
sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
sub.prop(crl, "denoising_relative_pca")
layout.separator()
row = layout.row()
row.label(text="Diffuse:")
sub = row.row(align=True)
sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Glossy:")
sub = row.row(align=True)
sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Transmission:")
sub = row.row(align=True)
sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Subsurface:")
sub = row.row(align=True)
sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@ -1732,6 +1803,7 @@ classes = (
CyclesRender_PT_layer_options,
CyclesRender_PT_layer_passes,
CyclesRender_PT_views,
CyclesRender_PT_denoising,
Cycles_PT_post_processing,
CyclesCamera_PT_dof,
Cycles_PT_context_material,

@ -303,12 +303,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
static void end_render_result(BL::RenderEngine& b_engine,
BL::RenderResult& b_rr,
bool cancel,
bool highlight,
bool do_merge_results)
{
b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results);
}
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
{
BufferParams& params = rtile.buffers->params;
int x = params.full_x - session->tile_manager.params.full_x;
@ -344,37 +345,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
update_render_result(b_rr, b_rlay, rtile);
}
end_render_result(b_engine, b_rr, true, true);
end_render_result(b_engine, b_rr, true, highlight, true);
}
else {
/* write result */
write_render_result(b_rr, b_rlay, rtile);
end_render_result(b_engine, b_rr, false, true);
end_render_result(b_engine, b_rr, false, false, true);
}
}
void BlenderSession::write_render_tile(RenderTile& rtile)
{
do_write_update_render_tile(rtile, false);
do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::update_render_tile(RenderTile& rtile)
void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
{
/* use final write for preview renders, otherwise the render result wouldn't
* be updated on the Blender side
* would need to be investigated a bit further, but for now shall be fine
*/
if(!b_engine.is_preview())
do_write_update_render_tile(rtile, true);
do_write_update_render_tile(rtile, true, highlight);
else
do_write_update_render_tile(rtile, false);
do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::render()
{
/* set callback to write out render results */
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@ -395,7 +396,7 @@ void BlenderSession::render()
/* layer will be missing if it was disabled in the UI */
if(b_single_rlay == b_rr.layers.end()) {
end_render_result(b_engine, b_rr, true, false);
end_render_result(b_engine, b_rr, true, true, false);
continue;
}
@ -411,6 +412,29 @@ void BlenderSession::render()
}
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
buffer_params.denoising_data_pass = use_denoising;
session->tile_manager.schedule_denoising = use_denoising;
session->params.use_denoising = use_denoising;
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
scene->film->denoising_flags = 0;
if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
session->params.denoising_radius = get_int(crl, "denoising_radius");
session->params.denoising_strength = get_float(crl, "denoising_strength");
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
@ -464,7 +488,7 @@ void BlenderSession::render()
}
/* free result without merging */
end_render_result(b_engine, b_rr, true, false);
end_render_result(b_engine, b_rr, true, true, false);
if(session->progress.get_cancel())
break;
@ -670,6 +694,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
/* copy pixels */
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
}
else {
int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
if(denoising_offset >= 0) {
read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
}
}
if(!read) {
memset(&pixels[0], 0, pixels.size()*sizeof(float));

@ -81,7 +81,7 @@ public:
void update_render_result(BL::RenderResult& b_rr,
BL::RenderLayer& b_rlay,
RenderTile& rtile);
void update_render_tile(RenderTile& rtile);
void update_render_tile(RenderTile& rtile, bool highlight);
/* interactive updates */
void synchronize();
@ -150,7 +150,7 @@ protected:
BL::RenderLayer& b_rlay,
RenderTile& rtile,
bool do_update_only);
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
int builtin_image_frame(const string &builtin_name);
void builtin_image_info(const string &builtin_name,

@ -509,6 +509,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
return PASS_NONE;
}
int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
{
string name = b_pass.name();
if(name.substr(0, 10) != "Denoising ") {
return -1;
}
name = name.substr(10);
#define MAP_PASS(passname, offset) if(name == passname) return offset;
MAP_PASS("Normal", DENOISING_PASS_NORMAL);
MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
MAP_PASS("Depth", DENOISING_PASS_DEPTH);
MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
MAP_PASS("Image", DENOISING_PASS_COLOR);
MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
#undef MAP_PASS
return -1;
}
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
BL::SceneRenderLayer& b_srlay)
{
@ -528,8 +552,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
Pass::add(pass_type, passes);
}
#ifdef __KERNEL_DEBUG__
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
if(get_boolean(crp, "denoising_store_passes")) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
}
#ifdef __KERNEL_DEBUG__
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);

@ -97,6 +97,7 @@ public:
int width, int height);
static PassType get_pass_type(BL::RenderPass& b_pass);
static int get_denoising_pass(BL::RenderPass& b_pass);
private:
/* sync */

@ -25,6 +25,7 @@ set(SRC
device.cpp
device_cpu.cpp
device_cuda.cpp
device_denoising.cpp
device_multi.cpp
device_opencl.cpp
device_split_kernel.cpp
@ -48,6 +49,7 @@ endif()
set(SRC_HEADERS
device.h
device_denoising.h
device_memory.h
device_intern.h
device_network.h

@ -549,4 +549,16 @@ void Device::free_memory()
devices.free_memory();
}
device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
: device(device)
{
ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
}
device_sub_ptr::~device_sub_ptr()
{
device->mem_free_sub_ptr(ptr);
}
CCL_NAMESPACE_END

@ -228,6 +228,7 @@ struct DeviceDrawParams {
};
class Device {
friend class device_sub_ptr;
protected:
enum {
FALLBACK_SHADER_STATUS_NONE = 0,
@ -250,6 +251,14 @@ protected:
bool bind_fallback_display_space_shader(const float width, const float height);
virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
{
/* Only required for devices that implement denoising. */
assert(false);
return (device_ptr) 0;
}
virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
public:
virtual ~Device();
@ -278,6 +287,8 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
virtual int mem_address_alignment() { return 16; }
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@ -326,6 +337,8 @@ public:
/* multi device */
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
virtual int device_number(Device * /*sub_device*/) { return 0; }
virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
/* static */
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);

File diff suppressed because it is too large

@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
#include "kernel/filter/filter_defines.h"
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@ -113,7 +116,7 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
@ -170,7 +173,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@ -301,7 +304,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
const DeviceRequestedFeatures& requested_features, bool split=false)
const DeviceRequestedFeatures& requested_features,
bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@ -316,7 +320,7 @@ public:
machine,
cuda_version,
include_path.c_str());
if(use_adaptive_compilation()) {
if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@ -364,8 +368,22 @@ public:
return true;
}
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
string compile_kernel(const DeviceRequestedFeatures& requested_features,
bool filter=false, bool split=false)
{
const char *name, *source;
if(filter) {
name = "filter";
source = "filter.cu";
}
else if(split) {
name = "kernel_split";
source = "kernel_split.cu";
}
else {
name = "kernel";
source = "kernel.cu";
}
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@ -373,9 +391,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
: "lib/kernel_sm_%d%d.cubin",
major, minor));
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@ -384,7 +401,7 @@ public:
}
const string common_cflags =
compile_kernel_get_common_cflags(requested_features, split);
compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@ -395,9 +412,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
: "cycles_kernel_sm%d%d_%s.cubin",
major, minor,
const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@ -432,7 +448,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@ -480,11 +496,14 @@ public:
return false;
/* get kernel */
string cubin = compile_kernel(requested_features, use_split_kernel());
string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
string filter_cubin = compile_kernel(requested_features, true, false);
if(filter_cubin == "")
return false;
/* open module */
cuda_push_context();
@ -499,6 +518,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
if(path_read_text(filter_cubin, cubin_data))
result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
else
result = CUDA_ERROR_FILE_NOT_FOUND;
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
cuda_pop_context();
return (result == CUDA_SUCCESS);
@ -581,6 +608,11 @@ public:
}
}
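/* CUDA can address into the middle of an allocation directly, so a
 * sub-pointer is just the base pointer plus the element offset in bytes. */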
virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
{
return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
}
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@ -881,6 +913,368 @@ public:
}
}
bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
for(int i = 0; i < 9; i++) {
tiles->buffers[i] = buffers[i];
}
mem_copy_to(task->tiles_mem);
return !have_error();
}
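/* Helpers for launching 2D filter kernels: pick a square block size from the
 * kernel's maximum threads per block, then cover the (w, h) area with enough
 * blocks in each dimension. */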
#define CUDA_GET_BLOCKSIZE(func, w, h) \
int threads_per_block; \
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
int threads = (int)sqrt((float)threads_per_block); \
int xblocks = ((w) + threads - 1)/threads; \
int yblocks = ((h) + threads - 1)/threads;
#define CUDA_LAUNCH_KERNEL(func, args) \
cuda_assert(cuLaunchKernel(func, \
xblocks, yblocks, 1, \
threads, threads, 1, \
0, 0, args, 0));
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
int4 rect = task->rect;
int w = rect.z-rect.x;
int h = rect.w-rect.y;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
int dx, dy;
int4 local_rect;
int channel_offset = 0;
void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
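/* Loop over all (dx, dy) offsets in the (2r+1)x(2r+1) search window;
 * i is the row-major linearization of the offset. */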
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
dy = i / (2*r+1) - r;
dx = i % (2*r+1) - r;
local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
}
local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_construct_transform(DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterConstructTransform;
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
task->storage.w,
task->storage.h);
void *args[] = {&task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->filter_area,
&task->rect,
&task->radius,
&task->pca_threshold,
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
cuda_push_context();
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
int r = task->radius;
int f = 4;
float a = 1.0f;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int local_rect[4] = {max(0, -dx), max(0, -dy),
task->reconstruction_state.source_w - max(0, dx),
task->reconstruction_state.source_h - max(0, dy)};
void *calc_difference_args[] = {&dx, &dy,
&guide_ptr,
&guide_variance_ptr,
&difference,
&local_rect,
&task->buffer.w,
&task->buffer.pass_stride,
&a,
&task->nlm_k_2};
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
void *blur_args[] = {&difference,
&blurDifference,
&local_rect,
&task->buffer.w,
&f};
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
void *calc_weight_args[] = {&blurDifference,
&difference,
&local_rect,
&task->buffer.w,
&f};
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
/* Reuse previous arguments. */
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
void *construct_gramian_args[] = {&dx, &dy,
&blurDifference,
&task->buffer.mem.device_pointer,
&color_ptr,
&color_variance_ptr,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->storage.XtWX.device_pointer,
&task->storage.XtWY.device_pointer,
&local_rect,
&task->reconstruction_state.filter_rect,
&task->buffer.w,
&task->buffer.h,
&f,
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
}
void *finalize_args[] = {&task->buffer.w,
&task->buffer.h,
&output_ptr,
&task->storage.rank.device_pointer,
&task->storage.XtWX.device_pointer,
&task->storage.XtWY.device_pointer,
&task->filter_area,
&task->reconstruction_state.buffer_params.x,
&task->render_buffer.samples};
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
device_ptr mean_ptr, device_ptr variance_ptr,
int r, int4 rect, DenoisingTask *task)
{
(void) task;
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterCombineHalves;
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
void *args[] = {&mean_ptr,
&variance_ptr,
&a_ptr,
&b_ptr,
&rect,
&r};
CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr, DenoisingTask *task)
{
(void) task;
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterDivideShadow;
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&a_ptr,
&b_ptr,
&sample_variance_ptr,
&sv_variance_ptr,
&buffer_variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset,
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterGetFeature;
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&mean_offset,
&variance_offset,
&mean_ptr,
&variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset,
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
void denoise(RenderTile &rtile, const DeviceTask &task)
{
DenoisingTask denoising(this);
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;
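/* Gather the 3x3 neighborhood of tiles (index 4 is the center tile being
 * denoised), so the filter can read pixels across tile borders. */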
RenderTile rtiles[9];
rtiles[4] = rtile;
task.map_neighbor_tiles(rtiles, this);
denoising.tiles_from_rendertiles(rtiles);
denoising.init_from_devicetask(task);
denoising.run_denoising();
task.unmap_neighbor_tiles(rtiles, this);
}
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@ -1326,7 +1720,7 @@ public:
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::PATH_TRACE) {
if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@ -1334,30 +1728,8 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
if(!use_split_kernel()) {
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample, branched);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
}
}
else {
DeviceRequestedFeatures requested_features;
DeviceRequestedFeatures requested_features;
if(use_split_kernel()) {
if(!use_adaptive_compilation()) {
requested_features.max_closure = 64;
}
@ -1366,17 +1738,46 @@ public:
split_kernel = new CUDASplitKernel(this);
split_kernel->load_kernels(requested_features);
}
}
while(task->acquire_tile(this, tile)) {
device_memory void_buffer;
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
task->release_tile(tile);
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
if(tile.task == RenderTile::PATH_TRACE) {
if(use_split_kernel()) {
device_memory void_buffer;
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
}
else {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample, branched);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
}
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
}
}

@ -0,0 +1,218 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/device_denoising.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
void DenoisingTask::init_from_devicetask(const DeviceTask &task)
{
radius = task.denoising_radius;
nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
if(task.denoising_relative_pca) {
pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
}
else {
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
}
render_buffer.pass_stride = task.pass_stride;
render_buffer.denoising_data_offset = task.pass_denoising_data;
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
rect = make_int4(max(tiles->x[0], filter_area.x - radius),
max(tiles->y[0], filter_area.y - radius),
min(tiles->x[3], filter_area.x + filter_area.z + radius),
min(tiles->y[3], filter_area.y + filter_area.w + radius));
}
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
{
tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
device_ptr buffers[9];
for(int i = 0; i < 9; i++) {
buffers[i] = rtiles[i].buffer;
tiles->offsets[i] = rtiles[i].offset;
tiles->strides[i] = rtiles[i].stride;
}
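/* Boundaries of the 3x3 neighborhood along each axis: x[0..3] are the left
 * edges of the left, center and right tiles plus the right edge of the right
 * tile; y[0..3] likewise in the vertical direction. */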
tiles->x[0] = rtiles[3].x;
tiles->x[1] = rtiles[4].x;
tiles->x[2] = rtiles[5].x;
tiles->x[3] = rtiles[5].x + rtiles[5].w;
tiles->y[0] = rtiles[1].y;
tiles->y[1] = rtiles[4].y;
tiles->y[2] = rtiles[7].y;
tiles->y[3] = rtiles[7].y + rtiles[7].h;
render_buffer.offset = rtiles[4].offset;
render_buffer.stride = rtiles[4].stride;
render_buffer.ptr = rtiles[4].buffer;
functions.set_tiles(buffers);
}
bool DenoisingTask::run_denoising()
{
/* Allocate denoising buffer. */
buffer.passes = 14;
buffer.w = align_up(rect.z - rect.x, 4);
buffer.h = rect.w - rect.y;
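/* Round the pass stride up so that every pass starts at an offset satisfying
 * the device's address alignment (the alignment is given in bytes, the
 * stride is counted in floats). */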
buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
buffer.mem.resize(buffer.pass_stride * buffer.passes);
device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
device_ptr null_ptr = (device_ptr) 0;
/* Prefilter shadow feature. */
{
device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
nlm_state.temporary_1_ptr = *nlm_temporary_1;
nlm_state.temporary_2_ptr = *nlm_temporary_2;
nlm_state.temporary_3_ptr = *nlm_temporary_3;
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var,
filtered_b = *sample_var;
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
device_ptr residual_var = *sample_var_var;
/* Estimate the residual variance between the two filtered halves. */
functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
device_ptr final_a = *unfiltered_a,
final_b = *unfiltered_b;
/* Use the residual variance for a second filter pass. */
nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
/* Combine the two double-filtered halves to a final shadow feature. */
device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
}
/* Prefilter general features. */
{
device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
nlm_state.temporary_1_ptr = *nlm_temporary_1;
nlm_state.temporary_2_ptr = *nlm_temporary_2;
nlm_state.temporary_3_ptr = *nlm_temporary_3;
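/* Channel indices of each feature's mean and variance inside the denoising
 * data pass, and the slot in the denoising buffer that receives the
 * prefiltered result. */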
int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
for(int pass = 0; pass < 7; pass++) {
device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
/* Get the unfiltered pass and its variance from the RenderBuffers. */
functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
/* Smooth the pass and store the result in the denoising buffers. */
nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
}
}
/* Copy color passes. */
{
int mean_from[] = {20, 21, 22};
int variance_from[] = {23, 24, 25};
int mean_to[] = { 8, 9, 10};
int variance_to[] = {11, 12, 13};
int num_color_passes = 3;
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
}
}
storage.w = filter_area.z;
storage.h = filter_area.w;
storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
storage.rank.resize(storage.w*storage.h);
device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
functions.construct_transform();
device_only_memory<float> temporary_1;
device_only_memory<float> temporary_2;
temporary_1.resize(buffer.w*buffer.h);
temporary_2.resize(buffer.w*buffer.h);
device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
render_buffer.stride,
render_buffer.pass_stride,
render_buffer.denoising_clean_offset);
reconstruction_state.source_w = rect.z-rect.x;
reconstruction_state.source_h = rect.w-rect.y;
{
device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
}
device->mem_free(storage.XtWX);
device->mem_free(storage.XtWY);
device->mem_free(storage.transform);
device->mem_free(storage.rank);
device->mem_free(temporary_1);
device->mem_free(temporary_2);
device->mem_free(buffer.mem);
device->mem_free(tiles_mem);
return true;
}
CCL_NAMESPACE_END

@ -0,0 +1,145 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __DEVICE_DENOISING_H__
#define __DEVICE_DENOISING_H__
#include "device/device.h"
#include "render/buffers.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
class DenoisingTask {
public:
/* Parameters of the denoising algorithm. */
int radius;
float nlm_k_2;
float pca_threshold;
/* Pointer and parameters of the RenderBuffers. */
struct RenderBuffers {
int denoising_data_offset;
int denoising_clean_offset;
int pass_stride;
int offset;
int stride;
device_ptr ptr;
int samples;
} render_buffer;
TilesInfo *tiles;
device_vector<int> tiles_mem;
void tiles_from_rendertiles(RenderTile *rtiles);
int4 rect;
int4 filter_area;
struct DeviceFunctions {
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
device_ptr variance_ptr, /* Contains the variance of the guide image. */
device_ptr out_ptr /* The filtered output is written into this image. */
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr
)> reconstruct;
function<bool()> construct_transform;
function<bool(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r,
int4 rect
)> combine_halves;
function<bool(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr
)> divide_shadow;
function<bool(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr
)> get_feature;
function<bool(device_ptr*)> set_tiles;
} functions;
/* Stores state of the current Reconstruction operation,
* which is accessed by the device in order to perform the operation. */
struct ReconstructionState {
device_ptr temporary_1_ptr; /* These two images are used as temporary storage. */
device_ptr temporary_2_ptr;
int4 filter_rect;
int4 buffer_params;
int source_w;
int source_h;
} reconstruction_state;
/* Stores state of the current NLM operation,
* which is accessed by the device in order to perform the operation. */
struct NLMState {
device_ptr temporary_1_ptr; /* These three images are used as temporary storage. */
device_ptr temporary_2_ptr;
device_ptr temporary_3_ptr;
int r; /* Search radius of the filter. */
int f; /* Patch size of the filter. */
float a; /* Variance compensation factor in the MSE estimation. */
float k_2; /* Squared value of the k parameter of the filter. */
void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_; k_2 = k_2_; }
} nlm_state;
struct Storage {
device_only_memory<float> transform;
device_only_memory<int> rank;
device_only_memory<float> XtWX;
device_only_memory<float3> XtWY;
int w;
int h;
} storage;
DenoisingTask(Device *device) : device(device) {}
void init_from_devicetask(const DeviceTask &task);
bool run_denoising();
struct DenoiseBuffers {
int pass_stride;
int passes;
int w;
int h;
device_only_memory<float> mem;
} buffer;
protected:
Device *device;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_DENOISING_H__ */

@ -35,6 +35,8 @@
CCL_NAMESPACE_BEGIN
class Device;
enum MemoryType {
MEM_READ_ONLY,
MEM_WRITE_ONLY,
@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
static const int num_elements = 3;
static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
@ -173,6 +175,9 @@ class device_memory
{
public:
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
size_t memory_elements_size(int elements) {
return elements*data_elements*datatype_size(data_type);
}
/* data information */
DataType data_type;
@ -213,6 +218,22 @@ protected:
device_memory& operator = (const device_memory&);
};
template<typename T>
class device_only_memory : public device_memory
{
public:
device_only_memory()
{
data_type = device_type_traits<T>::data_type;
data_elements = max(device_type_traits<T>::num_elements, 1);
}
void resize(size_t num)
{
device_memory::resize(num*sizeof(T));
}
};
/* Device Vector */
template<typename T> class device_vector : public device_memory
@ -299,6 +320,27 @@ private:
array<T> data;
};
/* A device_sub_ptr is a pointer into an existing memory allocation.
* Therefore, it is not allocated separately, but just created from the already allocated base memory.
* It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
* Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
class device_sub_ptr
{
public:
device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
~device_sub_ptr();
/* No copying. */
device_sub_ptr& operator = (const device_sub_ptr&);
device_ptr operator*() const
{
return ptr;
}
protected:
Device *device;
device_ptr ptr;
};
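/* Usage sketch (hypothetical names; 'device' and 'pass_stride' are assumed
 * to exist), splitting one allocation into per-pass views as run_denoising()
 * does:
 *
 *   device_only_memory<float> mem;
 *   mem.resize(2*pass_stride);
 *   device->mem_alloc("Example buffer", mem, MEM_READ_WRITE);
 *   {
 *       device_sub_ptr pass_a(device, mem, 0, pass_stride, MEM_READ_WRITE);
 *       device_sub_ptr pass_b(device, mem, pass_stride, pass_stride, MEM_READ_WRITE);
 *       // *pass_a and *pass_b yield device_ptr views into the same allocation.
 *   }  // Sub-pointers are freed here, before the base memory.
 *   device->mem_free(mem);
 */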
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */

@ -302,6 +302,60 @@ public:
return -1;
}
void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
{
for(int i = 0; i < 9; i++) {
if(!tiles[i].buffers) {
continue;
}
/* If the tile was rendered on another device, copy its memory to
* the current device now, for the duration of the denoising task.
* Note that this temporarily modifies the RenderBuffers and calls
* the device, so this function is not thread safe. */
if(tiles[i].buffers->device != sub_device) {
device_vector<float> &mem = tiles[i].buffers->buffer;
tiles[i].buffers->copy_from_device();
device_ptr original_ptr = mem.device_pointer;
mem.device_pointer = 0;
sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
sub_device->mem_copy_to(mem);
tiles[i].buffer = mem.device_pointer;
mem.device_pointer = original_ptr;
}
}
}
void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
{
for(int i = 0; i < 9; i++) {
if(!tiles[i].buffers) {
continue;
}
if(tiles[i].buffers->device != sub_device) {
device_vector<float> &mem = tiles[i].buffers->buffer;
device_ptr original_ptr = mem.device_pointer;
mem.device_pointer = tiles[i].buffer;
/* Copy denoised tile to the host. */
if(i == 4) {
tiles[i].buffers->copy_from_device(sub_device);
}
size_t mem_size = mem.device_size;
sub_device->mem_free(mem);
mem.device_pointer = original_ptr;
mem.device_size = mem_size;
/* Copy denoised tile to the original device. */
if(i == 4) {
tiles[i].buffers->device->mem_copy_to(mem);
}
}
}
}
int get_split_task_count(DeviceTask& task)
{
int total_tasks = 0;

@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
/* Allocate work_pool_wgs memory. */
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
work_pool_wgs.resize(max_work_groups);
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
queue_index.resize(NUM_QUEUES * sizeof(int));
queue_index.resize(NUM_QUEUES);
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
use_queues_flag.resize(sizeof(char));
use_queues_flag.resize(1);
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
ray_state.resize(num_global_elements);

@ -80,16 +80,16 @@ private:
*/
device_memory split_data;
device_vector<uchar> ray_state;
device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
/* Flag to make sceneintersect and lampemission kernel use queues. */
device_memory use_queues_flag;
device_only_memory<char> use_queues_flag;
/* Approximate time it takes to complete one sample */
double avg_time_per_sample;
/* Work pool with respect to each work group. */
device_memory work_pool_wgs;
device_only_memory<unsigned int> work_pool_wgs;
/* clos_max value for which the kernels have been loaded currently. */
int current_max_closure;

@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
if(type == SHADER) {
num = min(shader_w, num);
}
else if(type == PATH_TRACE) {
else if(type == RENDER) {
}
else {
num = min(h, num);
@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
tasks.push_back(task);
}
}
else if(type == PATH_TRACE) {
else if(type == RENDER) {
for(int i = 0; i < num; i++)
tasks.push_back(*this);
}
@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
if((type != PATH_TRACE) &&
if((type != RENDER) &&
(type != SHADER))
return;

@ -34,7 +34,7 @@ class Tile;
class DeviceTask : public Task {
public:
typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
Type type;
int x, y, w, h;
@ -53,7 +53,7 @@ public:
int passes_size;
explicit DeviceTask(Type type = PATH_TRACE);
explicit DeviceTask(Type type = RENDER);
int get_subtask_count(int num, int max_size = 0);
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@ -65,6 +65,16 @@ public:
function<void(RenderTile&)> update_tile_sample;
function<void(RenderTile&)> release_tile;
function<bool(void)> get_cancel;
function<void(RenderTile*, Device*)> map_neighbor_tiles;
function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
int denoising_radius;
float denoising_strength;
float denoising_feature_strength;
bool denoising_relative_pca;
int pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
bool need_finish_queue;
bool integrator_branched;

@ -17,6 +17,7 @@
#ifdef WITH_OPENCL
#include "device/device.h"
#include "device/device_denoising.h"
#include "util/util_map.h"
#include "util/util_param.h"
@ -129,6 +130,8 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
static int mem_address_alignment(cl_device_id device_id);
/* Get somewhat more readable device name.
* Main difference is AMD OpenCL here which only gives code name
* for the regular device name. This will give more sane device
@ -218,7 +221,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@ -282,7 +285,7 @@ public:
map<ustring, cl_kernel> kernels;
};
OpenCLProgram base_program;
OpenCLProgram base_program, denoising_program;
typedef map<string, device_vector<uchar>*> ConstMemMap;
typedef map<string, device_ptr> MemMap;
@ -320,6 +323,9 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
int mem_address_alignment();
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
@ -328,12 +334,14 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
void denoise(RenderTile& tile, const DeviceTask& task);
class OpenCLDeviceTask : public DeviceTask {
public:
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@ -367,9 +375,48 @@ public:
virtual void thread_run(DeviceTask * /*task*/) = 0;
virtual bool is_split_kernel() = 0;
protected:
string kernel_build_options(const string *debug_src = NULL);
void mem_zero_kernel(device_ptr ptr, size_t size);
bool denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
device_ptr out_ptr,
DenoisingTask *task);
bool denoising_construct_transform(DenoisingTask *task);
bool denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task);
bool denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r, int4 rect,
DenoisingTask *task);
bool denoising_divide_shadow(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr,
DenoisingTask *task);
bool denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task);
bool denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task);
device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
void mem_free_sub_ptr(device_ptr ptr);
class ArgumentWrapper {
public:
ArgumentWrapper() : size(0), pointer(NULL)

@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
denoising_program.add_kernel(ustring("filter_divide_shadow"));
denoising_program.add_kernel(ustring("filter_get_feature"));
denoising_program.add_kernel(ustring("filter_combine_halves"));
denoising_program.add_kernel(ustring("filter_construct_transform"));
denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
denoising_program.add_kernel(ustring("filter_nlm_blur"));
denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
denoising_program.add_kernel(ustring("filter_nlm_update_output"));
denoising_program.add_kernel(ustring("filter_nlm_normalize"));
denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
denoising_program.add_kernel(ustring("filter_finalize"));
denoising_program.add_kernel(ustring("filter_set_tiles"));
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
size_t global_size[] = {1024, 1024};
size_t num_threads = global_size[0] * global_size[1];
cl_mem d_buffer = CL_MEM_PTR(mem);
cl_ulong d_offset = 0;
cl_ulong d_size = 0;
while(d_offset < size) {
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
ckZeroBuffer,
2,
NULL,
global_size,
NULL,
0,
NULL,
NULL);
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
d_offset += d_size;
}
}
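/* Worked example (editor's note, not part of the diff): with the fixed
* 1024x1024 global size, each pass zeroes up to num_threads*sizeof(float4)
* = 16 MiB, so the loop issues one kernel launch per 16 MiB chunk of the
* allocation. */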
void OpenCLDeviceBase::mem_zero(device_memory& mem)
{
if(mem.device_pointer) {
if(base_program.is_loaded()) {
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
size_t global_size[] = {1024, 1024};
size_t num_threads = global_size[0] * global_size[1];
cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
cl_ulong d_offset = 0;
cl_ulong d_size = 0;
while(d_offset < mem.memory_size()) {
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
ckZeroBuffer,
2,
NULL,
global_size,
NULL,
0,
NULL,
NULL);
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
d_offset += d_size;
}
mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
int OpenCLDeviceBase::mem_address_alignment()
{
return OpenCLInfo::mem_address_alignment(cdDevice);
}
device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
{
cl_mem_flags mem_flag;
if(type == MEM_READ_ONLY)
mem_flag = CL_MEM_READ_ONLY;
else if(type == MEM_WRITE_ONLY)
mem_flag = CL_MEM_WRITE_ONLY;
else
mem_flag = CL_MEM_READ_WRITE;
cl_buffer_region info;
info.origin = mem.memory_elements_size(offset);
info.size = mem.memory_elements_size(size);
device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
mem_flag,
CL_BUFFER_CREATE_TYPE_REGION,
&info,
&ciErr);
opencl_assert_err(ciErr, "clCreateSubBuffer");
return sub_buf;
}
void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
{
if(device_pointer && device_pointer != null_mem) {
opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
}
}
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
workgroup_size = max_workgroup_size;
}
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
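/* Worked example (editor's note): a workgroup_size of 256 yields a 16x16
* local size. 256 is also the max_workgroup_size the denoising transform
* and Gramian kernels request below, presumably because they need more
* resources per work item than the render kernels. */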
@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
device_ptr out_ptr,
DenoisingTask *task)
{
int4 rect = task->rect;
int w = rect.z-rect.x;
int h = rect.w-rect.y;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
cl_mem image_mem = CL_MEM_PTR(image_ptr);
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem out_mem = CL_MEM_PTR(out_ptr);
mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
mem_zero_kernel(out_ptr, sizeof(float)*w*h);
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
kernel_set_args(ckNLMCalcDifference, 0,
dx, dy, guide_mem, variance_mem,
difference, local_rect, w, 0, a, k_2);
kernel_set_args(ckNLMBlur, 0,
difference, blurDifference, local_rect, w, f);
kernel_set_args(ckNLMCalcWeight, 0,
blurDifference, difference, local_rect, w, f);
kernel_set_args(ckNLMUpdateOutput, 0,
dx, dy, blurDifference, image_mem,
out_mem, weightAccum, local_rect, w, f);
enqueue_kernel(ckNLMCalcDifference, w, h);
enqueue_kernel(ckNLMBlur, w, h);
enqueue_kernel(ckNLMCalcWeight, w, h);
enqueue_kernel(ckNLMBlur, w, h);
enqueue_kernel(ckNLMUpdateOutput, w, h);
}
int4 local_rect = make_int4(0, 0, w, h);
kernel_set_args(ckNLMNormalize, 0,
out_mem, weightAccum, local_rect, w);
enqueue_kernel(ckNLMNormalize, w, h);
return true;
}
bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
{
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
kernel_set_args(ckFilterConstructTransform, 0,
buffer_mem,
transform_mem,
rank_mem,
task->filter_area,
task->rect,
task->buffer.pass_stride,
task->radius,
task->pca_threshold);
enqueue_kernel(ckFilterConstructTransform,
task->storage.w,
task->storage.h,
256);
return true;
}
bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
cl_mem color_mem = CL_MEM_PTR(color_ptr);
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
cl_mem output_mem = CL_MEM_PTR(output_ptr);
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
int r = task->radius;
int f = 4;
float a = 1.0f;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int local_rect[4] = {max(0, -dx), max(0, -dy),
task->reconstruction_state.source_w - max(0, dx),
task->reconstruction_state.source_h - max(0, dy)};
kernel_set_args(ckNLMCalcDifference, 0,
dx, dy,
guide_mem,
guide_variance_mem,
difference,
local_rect,
task->buffer.w,
task->buffer.pass_stride,
a, task->nlm_k_2);
enqueue_kernel(ckNLMCalcDifference,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMBlur, 0,
difference,
blurDifference,
local_rect,
task->buffer.w,
f);
enqueue_kernel(ckNLMBlur,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMCalcWeight, 0,
blurDifference,
difference,
local_rect,
task->buffer.w,
f);
enqueue_kernel(ckNLMCalcWeight,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
/* Reuse previous arguments. */
enqueue_kernel(ckNLMBlur,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMConstructGramian, 0,
dx, dy,
blurDifference,
buffer_mem,
color_mem,
color_variance_mem,
transform_mem,
rank_mem,
XtWX_mem,
XtWY_mem,
local_rect,
task->reconstruction_state.filter_rect,
task->buffer.w,
task->buffer.h,
f,
task->buffer.pass_stride);
enqueue_kernel(ckNLMConstructGramian,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h,
256);
}
kernel_set_args(ckFinalize, 0,
task->buffer.w,
task->buffer.h,
output_mem,
rank_mem,
XtWX_mem,
XtWY_mem,
task->filter_area,
task->reconstruction_state.buffer_params,
task->render_buffer.samples);
enqueue_kernel(ckFinalize,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
return true;
}
bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r, int4 rect,
DenoisingTask *task)
{
(void) task;
cl_mem a_mem = CL_MEM_PTR(a_ptr);
cl_mem b_mem = CL_MEM_PTR(b_ptr);
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
kernel_set_args(ckFilterCombineHalves, 0,
mean_mem,
variance_mem,
a_mem,
b_mem,
rect,
r);
enqueue_kernel(ckFilterCombineHalves,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr,
DenoisingTask *task)
{
(void) task;
cl_mem a_mem = CL_MEM_PTR(a_ptr);
cl_mem b_mem = CL_MEM_PTR(b_ptr);
cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples,
tiles_mem,
a_mem,
b_mem,
sample_variance_mem,
sv_variance_mem,
buffer_variance_mem,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset,
split_kernel);
enqueue_kernel(ckFilterDivideShadow,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task)
{
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples,
tiles_mem,
mean_offset,
variance_offset,
mean_mem,
variance_mem,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset,
split_kernel);
enqueue_kernel(ckFilterGetFeature,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task)
{
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
mem_copy_to(task->tiles_mem);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
for(int i = 0; i < 9; i++) {
cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
}
enqueue_kernel(ckFilterSetTiles, 1, 1);
return true;
}
void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
{
DenoisingTask denoising(this);
denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;
RenderTile rtiles[9];
rtiles[4] = rtile;
task.map_neighbor_tiles(rtiles, this);
denoising.tiles_from_rendertiles(rtiles);
denoising.init_from_devicetask(task);
denoising.run_denoising();
task.unmap_neighbor_tiles(rtiles, this);
}
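/* Editor's note: rtiles[4] is the center tile of the 3x3 neighborhood,
* matching the ytile*3 + xtile indexing in the filter kernels; the
* map/unmap callbacks let the host expose the neighboring tiles so the
* filter can read feature data across tile borders. */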
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */

@ -108,41 +108,53 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
if(tile.task == RenderTile::PATH_TRACE) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
path_trace(tile, sample);
tile.sample = sample + 1;
/* Complete kernel execution before release tile */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
/* Complete kernel execution before release tile */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
task->release_tile(tile);
}
}
}
bool is_split_kernel()
{
return false;
}
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)

@ -104,7 +104,7 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@ -127,21 +127,29 @@ public:
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
split_kernel->path_trace(task,
tile,
kgbuffer,
*const_mem_map["__data"]);
if(tile.task == RenderTile::PATH_TRACE) {
assert(tile.task == RenderTile::PATH_TRACE);
split_kernel->path_trace(task,
tile,
kgbuffer,
*const_mem_map["__data"]);
/* Complete kernel execution before release tile. */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
/* Complete kernel execution before release tile. */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
}
@ -150,6 +158,11 @@ public:
}
}
bool is_split_kernel()
{
return true;
}
protected:
/* ** Those guys are for working around some compiler-specific bugs ** */

@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
{
int base_align_bits;
if(clGetDeviceInfo(device_id,
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(int),
&base_align_bits,
NULL) == CL_SUCCESS)
{
return base_align_bits/8;
}
return 1;
}
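/* Editor's note: OpenCL requires a sub-buffer's origin to be a multiple
* of CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits, hence the /8), so
* this byte value is what callers must align the offsets handed to
* mem_alloc_sub_ptr to, e.g. with the existing align_up() helper. */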
CCL_NAMESPACE_END
#endif

@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
kernels/cpu/kernel_split_avx.cpp
kernels/cpu/kernel_split_avx2.cpp
kernels/cpu/filter.cpp
kernels/cpu/filter_sse2.cpp
kernels/cpu/filter_sse3.cpp
kernels/cpu/filter_sse41.cpp
kernels/cpu/filter_avx.cpp
kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@ -32,8 +48,10 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
kernels/cpu/filter_cpu.h
kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
set(SRC_FILTER_HEADERS
filter/filter.h
filter/filter_defines.h
filter/filter_features.h
filter/filter_features_sse.h
filter/filter_kernel.h
filter/filter_nlm_cpu.h
filter/filter_nlm_gpu.h
filter/filter_prefilter.h
filter/filter_reconstruction.h
filter/filter_transform.h
filter/filter_transform_gpu.h
filter/filter_transform_sse.h
)
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
../util/util_math_int2.h
../util/util_math_int3.h
../util/util_math_int4.h
../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_filter_sources kernels/cuda/filter.cu
${SRC_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
if(${split})
set(cuda_extra_flags "-D__SPLIT__")
set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
set(cuda_cubin kernel)
endif()
macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
set(cuda_cubin ${cuda_cubin}_experimental)
set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
set(name ${name}_experimental)
endif()
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
if(split)
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
else()
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
endif()
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
${cuda_extra_flags}
${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
DEPENDS ${cuda_sources})
DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
list(APPEND SRC
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
)
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_split_avx.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split_avx2.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@ -432,6 +452,7 @@ add_library(cycles_kernel
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)

@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
/* Classifies a closure as diffuse-like or specular-like.
* This is needed for generating the denoising feature passes,
* which are written on the first bounce where more than 25%
* of the sampling weight belongs to diffuse-like closures. */
ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
{
if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
return true;
}
if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
}
return false;
}
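/* For the microfacet branch above, 0.075f*0.075f = 0.005625f: a closure
* counts as specular-like once the product of its two alpha (roughness)
* values is at or below that of a 0.075-roughness isotropic lobe. */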
CCL_NAMESPACE_END

@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)

@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} DiffuseBsdf;
/* DIFFUSE */

@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float3 *colors;
} DiffuseRampBsdf;

@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */

@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
if(cosI > 0.9999f || cosI < 1e-6f) {
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
const float phi = M_2PI_F * randU.y;
return make_float2(r*cosf(phi), r*sinf(phi));

@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float roughness;
float a;
float b;

@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;

@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
SHADER_CLOSURE_BASE;
float roughness;
float3 N;
} PrincipledDiffuseBsdf;
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,

@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} PrincipledSheenBsdf;
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,

@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float size;
float smooth;
} ToonBsdf;

@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
float texture_blur;
float albedo;
float roughness;
float3 N;
} Bssrdf;
/* Planar Truncated Gaussian

@ -0,0 +1,52 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_H__
#define __FILTER_H__
/* CPU Filter Kernel Interface */
#include "util/util_types.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
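/* For illustration: with KERNEL_ARCH defined as cpu_sse2,
* KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow) expands through the two
* helper macros to kernel_cpu_sse2_filter_divide_shadow, giving every
* ISA-specific compilation unit its own set of filter entry points. */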
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu.h"
CCL_NAMESPACE_END
#endif /* __FILTER_H__ */

@ -0,0 +1,38 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_DEFINES_H__
#define __FILTER_DEFINES_H__
#define DENOISE_FEATURES 10
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
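/* With DENOISE_FEATURES = 10 this gives TRANSFORM_SIZE = 100,
* XTWY_SIZE = 11 and XTWX_SIZE = 66, the number of entries in one
* triangle (including the diagonal) of the symmetric 11x11 matrix
* X^T W X for the ten features plus a constant term. */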
typedef struct TilesInfo {
int offsets[9];
int strides[9];
int x[4];
int y[4];
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
#else
long long int buffers[9];
#endif
} TilesInfo;
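/* For illustration: the nine entries describe the 3x3 neighborhood of
* render tiles around the one being denoised, indexed as ytile*3 + xtile
* (so index 4 is the center tile), while x[0..3] and y[0..3] hold the
* column and row boundaries used to map a pixel to its tile. */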
#endif /* __FILTER_DEFINES_H__*/

@ -0,0 +1,120 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
* pixel_buffer always points to the current pixel in the first pass. */
#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
#define END_FOR_PIXEL_WINDOW } \
pixel_buffer += buffer_w - (high.x - low.x); \
}
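/* Minimal usage sketch (editor's illustration; sum_first_feature and its
* parameters are hypothetical): the macros expect pixel, pixel_buffer,
* buffer_w and pass_stride to be declared in the enclosing scope. */
ccl_device_inline float sum_first_feature(ccl_global float *buffer,
int4 rect, int2 low, int2 high,
int buffer_w, int pass_stride)
{
int2 pixel;
ccl_global float *pixel_buffer;
float sum = 0.0f;
FOR_PIXEL_WINDOW
sum += ccl_get_feature(pixel_buffer, 0);
END_FOR_PIXEL_WINDOW
return sum;
}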
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
{
features[0] = pixel.x;
features[1] = pixel.y;
features[2] = ccl_get_feature(buffer, 0);
features[3] = ccl_get_feature(buffer, 1);
features[4] = ccl_get_feature(buffer, 2);
features[5] = ccl_get_feature(buffer, 3);
features[6] = ccl_get_feature(buffer, 4);
features[7] = ccl_get_feature(buffer, 5);
features[8] = ccl_get_feature(buffer, 6);
features[9] = ccl_get_feature(buffer, 7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] -= mean[i];
}
}
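/* Editor's reading of the layout: features 0-1 are the pixel coordinates
* and buffer passes 0-7 appear to be depth, the normal vector, shadow and
* the albedo color, matching the scalar-vector-scalar-vector grouping in
* filter_get_feature_scales below. */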
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
ccl_get_feature(buffer, 2) - mean[4],
ccl_get_feature(buffer, 3) - mean[5]));
scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
ccl_get_feature(buffer, 6) - mean[8],
ccl_get_feature(buffer, 7) - mean[9]));
}
ccl_device_inline void filter_calculate_scale(float *scale)
{
scale[0] = 1.0f/max(scale[0], 0.01f);
scale[1] = 1.0f/max(scale[1], 0.01f);
scale[2] = 1.0f/max(scale[2], 0.01f);
scale[6] = 1.0f/max(scale[4], 0.01f);
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
}
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
}
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
}
ccl_device_inline void design_row_add(float *design_row,
int rank,
ccl_global float ccl_restrict_ptr transform,
int stride,
int row,
float feature)
{
for(int i = 0; i < rank; i++) {
design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
}
}
/* Fill the design row. */
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
ccl_global float ccl_restrict_ptr p_buffer,
int2 q_pixel,
ccl_global float ccl_restrict_ptr q_buffer,
int pass_stride,
int rank,
float *design_row,
ccl_global float ccl_restrict_ptr transform,
int stride)
{
design_row[0] = 1.0f;
math_vector_zero(design_row+1, rank);
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
}
CCL_NAMESPACE_END

@ -0,0 +1,95 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixels in the first pass.
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
__m128 y4 = _mm_set1_ps(pixel.y); \
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
__m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
__m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
#define END_FOR_PIXEL_WINDOW_SSE } \
pixel_buffer += buffer_w - (pixel.x - low.x); \
}
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
{
features[0] = x;
features[1] = y;
features[2] = ccl_get_feature_sse(0);
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
features[6] = ccl_get_feature_sse(4);
features[7] = ccl_get_feature_sse(5);
features[8] = ccl_get_feature_sse(6);
features[9] = ccl_get_feature_sse(7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_sub_ps(features[i], mean[i]);
}
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_mask_ps(features[i], active_pixels);
}
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
__m128 diff, scale;
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[3] = _mm_mask_ps(scale, active_pixels);
scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[5] = _mm_mask_ps(scale, active_pixels);
}
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
{
scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
}
CCL_NAMESPACE_END

@ -0,0 +1,50 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/util_color.h"
#include "util/util_math.h"
#include "util/util_math_fast.h"
#include "util/util_texture.h"
#include "util/util_atomic.h"
#include "util/util_math_matrix.h"
#include "kernel/filter/filter_defines.h"
#include "kernel/filter/filter_features.h"
#ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_features_sse.h"
#endif
#include "kernel/filter/filter_prefilter.h"
#ifdef __KERNEL_GPU__
# include "kernel/filter/filter_transform_gpu.h"
#else
# ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_transform_sse.h"
# else
# include "kernel/filter/filter_transform.h"
# endif
#endif
#include "kernel/filter/filter_reconstruction.h"
#ifdef __KERNEL_CPU__
# include "kernel/filter/filter_nlm_cpu.h"
#else
# include "kernel/filter/filter_nlm_gpu.h"
#endif

@ -0,0 +1,163 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
}
}
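/* The per-channel term above is the variance-cancelling NLM patch
* distance d = ((I_p - I_q)^2 - a*(Var_p + min(Var_p, Var_q))) /
* (1e-8 + k_2*(Var_p + Var_q)): subtracting the expected noise means
* two equally noisy pixels with the same mean score close to zero. */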
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
#ifdef __KERNEL_SSE3__
int aligned_lowx = (rect.x & ~(3));
int aligned_highx = ((rect.z + 3) & ~(3));
#endif
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
for(int y1 = low; y1 < high; y1++) {
#ifdef __KERNEL_SSE3__
for(int x = aligned_lowx; x < aligned_highx; x+=4) {
_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
}
#else
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] += differenceImage[y1*w+x];
}
#endif
}
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] *= 1.0f/(high - low);
}
}
}
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
}
for(int dx = -f; dx <= f; dx++) {
int pos_dx = max(0, dx);
int neg_dx = min(0, dx);
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
outImage[y*w+x] += differenceImage[y*w+dx+x];
}
}
}
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
}
}
}
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
accumImage[y*w+x] += weight;
outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
}
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride)
{
/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
int y = fy + filter_rect.y;
for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
int *l_rank = rank + storage_ofs;
kernel_filter_construct_gramian(x, y, 1,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
l_transform, l_rank,
weight, l_XtWX, l_XtWY, 0);
}
}
}
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] /= accumImage[y*w+x];
}
}
}
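/* Taken together these kernels form one NLM pass: for every offset
* (dx, dy) the host runs calc_difference -> blur -> calc_weight -> blur
* -> update_output, as the OpenCL denoising_non_local_means above does,
* and the final normalize divides the output by the accumulated weights. */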
CCL_NAMESPACE_END

@ -0,0 +1,147 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2)
{
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int y1 = low; y1 < high; y1++) {
sum += differenceImage[y1*w+x];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = sum;
}
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = expf(-max(sum, 0.0f));
}
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
if(outImage) {
accumImage[y*w+x] += sum;
outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
}
else {
accumImage[y*w+x] = sum;
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride,
int localIdx)
{
int y = fy + filter_rect.y;
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
transform += storage_ofs;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_construct_gramian(x, y,
filter_rect.z*filter_rect.w,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
transform, rank,
weight, XtWX, XtWY,
localIdx);
}
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect, int w)
{
outImage[y*w+x] /= accumImage[y*w+x];
}
CCL_NAMESPACE_END

@ -0,0 +1,145 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
/* First step of the shadow prefiltering, performs the shadow division and stores all data
* in a nice and easy rectangular array that can be passed to the NLM filter.
*
* Calculates:
* unfiltered: Contains the two half images of the shadow feature pass
* sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
* sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
*/
ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
int x, int y,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
int offset = tiles->offsets[tile];
int stride = tiles->strides[tile];
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
center_buffer += buffer_denoising_offset + 14;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
float varA = center_buffer[2];
float varB = center_buffer[5];
int odd_sample = (sample+1)/2;
int even_sample = sample/2;
if(use_split_variance) {
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
}
varA /= (odd_sample - 1);
varB /= (even_sample - 1);
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
}
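/* Illustrative host-side sketch, not part of the kernel: with two independent half
 * buffers A and B estimating the same mean, E[(A-B)^2] = Var(A) + Var(B), so
 * 0.5*(A-B)^2 is an unbiased (if very noisy) single-sample variance estimate --
 * exactly what bufferVariance stores above. Assuming plain C with rand():
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	double accum = 0.0;
	const int trials = 100000;
	for(int i = 0; i < trials; i++) {
		/* Two independent uniform samples on [0,1): Var = 1/12 each. */
		double A = rand() / (double)RAND_MAX;
		double B = rand() / (double)RAND_MAX;
		accum += 0.5 * (A - B) * (A - B);
	}
	printf("mean of 0.5*(A-B)^2 = %f (expect 1/12 ~= 0.0833)\n", accum / trials);
	return 0;
}
#endif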
/* Load a regular feature from the render buffers into the denoise buffer.
* Parameters:
* - sample: The number of samples accumulated in the buffer, used to normalize it.
* - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
* - x, y: Current pixel
* - mean, variance: Target denoise buffers.
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
*/
ccl_device void kernel_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset, int v_offset,
int x, int y,
ccl_global float *mean,
ccl_global float *variance,
int4 rect, int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
mean[idx] = center_buffer[m_offset] / sample;
if(use_split_variance) {
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
}
else {
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
}
}
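/* Illustrative host-side sketch, not part of the kernel: in the split-variance
 * path the buffer holds the running sums S = sum(x) and Q = sum(x^2), and the
 * variance of the mean follows from (Q - N*mean^2) / (N*(N-1)). Assuming plain C:
 */
#if 0
#include <stdio.h>

int main(void)
{
	const float x[4] = {1.0f, 2.0f, 2.0f, 3.0f};
	const int N = 4;
	float S = 0.0f, Q = 0.0f;
	for(int i = 0; i < N; i++) { S += x[i]; Q += x[i]*x[i]; }
	float mean = S / N;
	/* Unbiased variance of the mean, matching the use_split_variance branch. */
	float var_of_mean = (Q - mean*mean*N) / (N * (N - 1));
	printf("mean = %f, var(mean) = %f\n", mean, var_of_mean);  /* 2.0, 0.166667 */
	return 0;
}
#endif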
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
ccl_device void kernel_filter_combine_halves(int x, int y,
ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 rect, int r)
{
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
if(variance) {
if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
else {
variance[idx] = 0.0f;
float values[25];
int numValues = 0;
for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
int pidx = (py-rect.y)*buffer_w + (px-rect.x);
values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
}
}
/* Insertion-sort the variances (fast enough for 25 elements). */
for(int i = 1; i < numValues; i++) {
float v = values[i];
int j;
for(j = i-1; j >= 0 && values[j] > v; j--)
values[j+1] = values[j];
values[j+1] = v;
}
variance[idx] = values[(7*numValues)/8];
}
}
}
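/* Illustrative host-side sketch, not part of the kernel: taking a high quantile of
 * the local single-sample estimates gives a conservative variance that is robust
 * against the heavy noise of individual (a-b)^2 terms, as in the r > 0 branch
 * above. Toy 9-element window, assuming plain C; the data is illustrative only.
 */
#if 0
#include <stdio.h>

int main(void)
{
	float values[9] = {0.9f, 0.1f, 0.2f, 0.1f, 5.0f, 0.3f, 0.2f, 0.1f, 0.5f};
	const int n = 9;
	/* Insertion sort, as in the kernel (fast enough for tiny arrays). */
	for(int i = 1; i < n; i++) {
		float v = values[i];
		int j;
		for(j = i-1; j >= 0 && values[j] > v; j--)
			values[j+1] = values[j];
		values[j+1] = v;
	}
	/* The 7/8 quantile skips the single 5.0 outlier but stays conservative. */
	printf("estimate = %f\n", values[(7*n)/8]);  /* values[7] = 0.9 */
	return 0;
}
#endif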
CCL_NAMESPACE_END

@ -0,0 +1,103 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
int storage_stride,
int dx, int dy,
int w, int h,
int pass_stride,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
float weight,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int localIdx)
{
int p_offset = y*w + x;
int q_offset = (y+dy)*w + (x+dx);
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
(void)localIdx;
float design_row[DENOISE_FEATURES+1];
#elif defined(__KERNEL_CUDA__)
const int stride = storage_stride;
ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#else
const int stride = storage_stride;
float design_row[DENOISE_FEATURES+1];
#endif
float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
return;
}
filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
make_int2(x+dx, y+dy), buffer + q_offset,
pass_stride, *rank, design_row, transform, stride);
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
}
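/* Illustrative host-side sketch, not part of the kernel: every accepted neighbour
 * adds a rank-one update to the weighted normal equations XtWX += w*x*x^T and
 * XtWY += w*x*y of a local least-squares fit. A 2x2 analogue (constant term plus
 * one feature), assuming plain C; names and data are illustrative only.
 */
#if 0
#include <stdio.h>

int main(void)
{
	/* Three samples of y = 1 + 2*f with decreasing weights. */
	const float f[3] = {0.0f, 1.0f, 2.0f};
	const float y[3] = {1.0f, 3.0f, 5.0f};
	const float w[3] = {1.0f, 0.5f, 0.25f};
	float XtWX[2][2] = {{0.0f}}, XtWY[2] = {0.0f};
	for(int i = 0; i < 3; i++) {
		const float row[2] = {1.0f, f[i]};  /* design row: [1, feature] */
		for(int a = 0; a < 2; a++) {
			XtWY[a] += w[i] * row[a] * y[i];
			for(int b = 0; b < 2; b++)
				XtWX[a][b] += w[i] * row[a] * row[b];
		}
	}
	/* Solve the 2x2 system directly (Cramer's rule). */
	float det = XtWX[0][0]*XtWX[1][1] - XtWX[0][1]*XtWX[1][0];
	float c0 = (XtWY[0]*XtWX[1][1] - XtWY[1]*XtWX[0][1]) / det;
	float c1 = (XtWX[0][0]*XtWY[1] - XtWX[1][0]*XtWY[0]) / det;
	printf("fit: y = %f + %f*f\n", c0, c1);  /* recovers 1 and 2 */
	return 0;
}
#endif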
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
ccl_global float *buffer,
ccl_global int *rank,
int storage_stride,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 buffer_params,
int sample)
{
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
#else
const int stride = storage_stride;
#endif
math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
float3 final_color = XtWY[0];
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
final_color *= sample;
if(buffer_params.w) {
final_color.x += combined_buffer[buffer_params.w+0];
final_color.y += combined_buffer[buffer_params.w+1];
final_color.z += combined_buffer[buffer_params.w+2];
}
combined_buffer[0] = final_color.x;
combined_buffer[1] = final_color.y;
combined_buffer[2] = final_color.z;
}
#undef STORAGE_TYPE
CCL_NAMESPACE_END

@ -0,0 +1,113 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
float features[DENOISE_FEATURES];
/* Temporary storage, used in different steps of the algorithm. */
float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
float tempvector[2*DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to a range of [-1; 1]; the scale will be baked into the transform later. === */
float *feature_scale = tempvector;
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float* feature_matrix = tempmatrix;
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < (*rank); i++) {
math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
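/* Illustrative host-side sketch, not part of the kernel: for pca_threshold < 0 the
 * rank is grown until the kept eigenvalues cover a fraction (1 + pca_threshold) of
 * the total energy, with at least two components always kept. Toy example assuming
 * plain C and eigenvalues already sorted in decreasing order on the diagonal:
 */
#if 0
#include <stdio.h>

int main(void)
{
	const float eigen[5] = {10.0f, 5.0f, 1.0f, 0.5f, 0.1f};
	const float pca_threshold = -0.1f;  /* keep 90% of the energy */
	float total = 0.0f;
	for(int i = 0; i < 5; i++) total += eigen[i];
	float threshold_energy = total * (1.0f - (-pca_threshold));
	int rank = 0;
	float reduced_energy = 0.0f;
	for(int i = 0; i < 5; i++, rank++) {
		if(i >= 2 && reduced_energy >= threshold_energy)
			break;
		reduced_energy += eigen[i];
	}
	printf("rank = %d\n", rank);  /* 2: 10+5 = 15 >= 0.9*16.6 */
	return 0;
}
#endif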
CCL_NAMESPACE_END

@ -0,0 +1,117 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
ccl_global float *transform,
ccl_global int *rank,
int radius, float pca_threshold,
int transform_stride, int localIdx)
{
int buffer_w = align_up(rect.z - rect.x, 4);
#ifdef __KERNEL_CUDA__
ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
#else
float features[DENOISE_FEATURES];
#endif
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
ccl_global float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to a range of [-1; 1]; the scale will be baked into the transform later. === */
float feature_scale[DENOISE_FEATURES];
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
for(int j = 0; j < (*rank); j++) {
transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
}
}
}
CCL_NAMESPACE_END

@ -0,0 +1,102 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
__m128 features[DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
__m128 feature_means[DENOISE_FEATURES];
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
} END_FOR_PIXEL_WINDOW_SSE
__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
for(int i = 0; i < DENOISE_FEATURES; i++) {
feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
}
__m128 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW_SSE
filter_calculate_scale_sse(feature_scale);
__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
}
}
CCL_NAMESPACE_END

@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */

@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END

@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_depth = 0.0f;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
float3 ao,
int bounce)
float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf;
#endif
}
ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
BsdfEval *bsdf_eval,
float3 shadow,
float shadow_fac,
bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf_eval->sum_no_mis;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf_eval->sum_no_mis;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
}
}
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_albedo += state->denoising_feature_weight * value;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
{
#ifdef __PASSES__
kernel_assert(L->use_light_pass);
*clean = L->emission + L->background;
*noisy = L->direct_scatter + L->indirect_scatter;
# define ADD_COMPONENT(flag, component) \
if(kernel_data.film.denoising_flags & flag) \
*clean += component; \
else \
*noisy += component;
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
# undef ADD_COMPONENT
#else
*noisy = L->emission;
*clean = make_float3(0.0f, 0.0f, 0.0f);
#endif
*noisy = ensure_finite3(*noisy);
*clean = ensure_finite3(*clean);
}
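/* Illustrative host-side sketch, not part of the kernel: each denoising flag
 * routes one light component either into the clean pass (treated as noise-free)
 * or into the noisy pass that gets filtered. Minimal analogue with two components,
 * assuming plain C; the flag names and values here are hypothetical.
 */
#if 0
#include <stdio.h>

#define CLEAN_DIFFUSE_DIR (1 << 0)  /* hypothetical flag values */
#define CLEAN_GLOSSY_DIR  (1 << 1)

int main(void)
{
	const int flags = CLEAN_DIFFUSE_DIR;  /* direct diffuse marked noise-free */
	const float direct_diffuse = 0.8f, direct_glossy = 0.3f;
	float clean = 0.0f, noisy = 0.0f;
	if(flags & CLEAN_DIFFUSE_DIR) clean += direct_diffuse; else noisy += direct_diffuse;
	if(flags & CLEAN_GLOSSY_DIR)  clean += direct_glossy;  else noisy += direct_glossy;
	printf("clean = %f, noisy = %f\n", clean, noisy);  /* 0.8, 0.3 */
	return 0;
}
#endif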
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
ccl_addr_space float* L_transparent)
float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
*L_transparent = shadow;
*alpha = 1.0f-shadow;
L_sum = make_float3(0.0f, 0.0f, 0.0f);
}
else {

@ -42,6 +42,8 @@
#include "util/util_types.h"
#include "util/util_texture.h"
#define ccl_restrict_ptr const * __restrict
#define ccl_addr_space
#define ccl_local_id(d) 0

@ -55,6 +55,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
#define ccl_restrict_ptr const * __restrict__
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
/* No assert supported for CUDA */
#define kernel_assert(cond)

@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
#define ccl_restrict_ptr const * __restrict__
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)

@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;

@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
#ifdef __DENOISING_FEATURES__
ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
{
kernel_write_pass_float(buffer, sample, value);
/* The online one-pass variance update that's used for the megakernel can't easily be implemented
* with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+1, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float(buffer+1, sample, 0.0f);
}
else {
float new_mean = buffer[0] * (1.0f / (sample + 1));
float old_mean = (buffer[0] - value) * (1.0f / sample);
kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
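/* Illustrative host-side sketch, not part of the kernel: the megakernel path above
 * accumulates (x - new_mean)*(x - old_mean) per sample, which sums exactly to the
 * squared deviations about the final mean (Welford's online update); the split
 * kernel instead stores sum(x^2) and reconstructs the variance later. Numerical
 * check of the online identity, assuming plain C:
 */
#if 0
#include <stdio.h>

int main(void)
{
	const float x[5] = {1.0f, 4.0f, 2.0f, 8.0f, 5.0f};
	float sum = 0.0f, m2 = 0.0f;
	for(int sample = 0; sample < 5; sample++) {
		sum += x[sample];
		if(sample > 0) {
			float new_mean = sum / (sample + 1);
			float old_mean = (sum - x[sample]) / sample;
			m2 += (x[sample] - new_mean) * (x[sample] - old_mean);
		}
	}
	/* Compare with the direct two-pass sum of squared deviations. */
	float mean = sum / 5, direct = 0.0f;
	for(int i = 0; i < 5; i++) direct += (x[i] - mean) * (x[i] - mean);
	printf("online = %f, two-pass = %f\n", m2, direct);  /* both 30.0 */
	return 0;
}
#endif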
# if defined(__SPLIT_KERNEL__)
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
# else
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
{
buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
}
# endif
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{
kernel_write_pass_float3_unaligned(buffer, sample, value);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
}
else {
float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
float3 new_mean = sum * (1.0f / (sample + 1));
float3 old_mean = (sum - value) * (1.0f / sample);
kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
int sample, float path_total, float path_total_shaded)
{
if(kernel_data.film.pass_denoising_data == 0)
return;
buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
kernel_write_pass_float(buffer, sample/2, path_total);
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
float value = path_total_shaded / max(path_total, 1e-7f);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+2, sample/2, value*value);
# else
if(sample < 2) {
kernel_write_pass_float(buffer+2, sample/2, 0.0f);
}
else {
float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
float new_value = buffer[1] / max(buffer[0], 1e-7f);
kernel_write_pass_float(buffer+2, sample/2, (value - new_value) * (value - old_value));
}
# endif
}
#endif /* __DENOISING_FEATURES__ */
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
ShaderData *sd,
ccl_global PathState *state,
PathRadiance *L)
{
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight == 0.0f) {
return;
}
L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
float3 normal = make_float3(0.0f, 0.0f, 0.0f);
float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for(int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
continue;
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
if(!bsdf_is_specular_like(sc)) {
albedo += sc->weight;
sum_nonspecular_weight += sc->sample_weight;
}
}
/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
if(sum_weight != 0.0f) {
normal /= sum_weight;
}
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
state->denoising_feature_weight = 0.0f;
}
#else
(void) kg;
(void) sd;
(void) state;
(void) L;
#endif /* __DENOISING_FEATURES__ */
}
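/* Illustrative host-side sketch, not part of the kernel: the 75% rule above defers
 * the feature write to the next bounce while specular-like closures hold 75% or
 * more of the sample weight. Toy decision with hypothetical weights, assuming
 * plain C:
 */
#if 0
#include <stdio.h>

int main(void)
{
	/* Sample weights of one diffuse and one specular closure. */
	const float diffuse_weight = 0.2f, specular_weight = 0.8f;
	float sum_weight = diffuse_weight + specular_weight;
	float sum_nonspecular_weight = diffuse_weight;
	if(sum_nonspecular_weight * 4.0f > sum_weight)
		printf("write features at this bounce\n");
	else
		printf("defer to next bounce (>= 75%% specular)\n");  /* taken here */
	return 0;
}
#endif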
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
{
if(L) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(is_shadow_catcher) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
# ifdef __SHADOW_TRICKS__
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
# else
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
# endif
if(kernel_data.film.pass_denoising_clean) {
float3 noisy, clean;
path_radiance_split_denoising(kg, L, &noisy, &clean);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, noisy);
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, clean);
}
else {
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, L_sum);
}
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, L->denoising_normal);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, L->denoising_albedo);
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, L->denoising_depth);
}
#endif /* __DENOISING_FEATURES__ */
}
else {
kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, 0.0f);
if(kernel_data.film.pass_denoising_clean) {
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, make_float3(0.0f, 0.0f, 0.0f));
}
}
#endif /* __DENOISING_FEATURES__ */
}
}
CCL_NAMESPACE_END

@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, sd, state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer)
ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
&L,
L,
&state,
rng,
&ray,
@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
&L,
L,
&throughput);
}
else {
@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}

@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
/* transparency is not handled here, but in outer loop */
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
int num_samples;
@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
&bsdf_ray))
&bsdf_ray,
sum_sample_weight))
{
continue;
}
@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
&L, &state, rng, &ray, throughput);
L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}
@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END

@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
state->denoising_feature_weight = 1.0f;
}
else {
state->denoising_feature_weight = 0.0f;
}
#endif /* __DENOISING_FEATURES__ */
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
}
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)

@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
ccl_addr_space Ray *ray)
ccl_addr_space Ray *ray,
float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
#ifdef __DENOISING_FEATURES__
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
#endif
/* modify path state */
path_state_next(kg, state, label);
@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}

@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}

@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
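/* A zero direction would make len(dir) zero and divide by zero in acosf(dir.z / len(dir)) below, so map it to (0, 0). */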
if(is_zero(dir))
return make_float2(0.0f, 0.0f);
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;

@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {

@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
#define __DENOISING_FEATURES__
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@ -314,31 +316,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
PATH_RAY_CAMERA = 1,
PATH_RAY_REFLECT = 2,
PATH_RAY_TRANSMIT = 4,
PATH_RAY_DIFFUSE = 8,
PATH_RAY_GLOSSY = 16,
PATH_RAY_SINGULAR = 32,
PATH_RAY_TRANSPARENT = 64,
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
PATH_RAY_DIFFUSE = (1 << 3),
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
PATH_RAY_SHADOW_OPAQUE = 128,
PATH_RAY_SHADOW_TRANSPARENT = 256,
PATH_RAY_SHADOW_OPAQUE = (1 << 7),
PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
PATH_RAY_NODE_UNALIGNED = 2048,
PATH_RAY_NODE_UNALIGNED = (1 << 11),
PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
PATH_RAY_MIS_SKIP = 4096,
PATH_RAY_DIFFUSE_ANCESTOR = 8192,
PATH_RAY_SINGLE_PASS_DONE = 16384,
PATH_RAY_SHADOW_CATCHER = 32768,
PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
PATH_RAY_MIS_SKIP = (1 << 12),
PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
PATH_RAY_SHADOW_CATCHER = (1 << 15),
PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@ -394,6 +397,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
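/* Per-pixel channel offsets of the denoising feature passes within the
* denoising data buffer: each offset is the previous one plus that pass's
* channel count (3 for vector/color passes, 1 for depth), so
* DENOISING_PASS_SIZE_BASE is the total stride of the base passes and
* DENOISING_PASS_SIZE_CLEAN the stride of the optional clean pass. */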
typedef enum DenoisingPassOffsets {
DENOISING_PASS_NORMAL = 0,
DENOISING_PASS_NORMAL_VAR = 3,
DENOISING_PASS_ALBEDO = 6,
DENOISING_PASS_ALBEDO_VAR = 9,
DENOISING_PASS_DEPTH = 12,
DENOISING_PASS_DEPTH_VAR = 13,
DENOISING_PASS_SHADOW_A = 14,
DENOISING_PASS_SHADOW_B = 17,
DENOISING_PASS_COLOR = 20,
DENOISING_PASS_COLOR_VAR = 23,
DENOISING_PASS_SIZE_BASE = 26,
DENOISING_PASS_SIZE_CLEAN = 3,
} DenoisingPassOffsets;
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
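/* Flags selecting light components that skip the denoising filter; these appear to be accumulated into the separate clean pass (pass_denoising_clean) and composited back after filtering. */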
typedef enum DenoiseFlag {
DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
} DenoiseFlag;
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
/* Color of the background on which shadow is alpha-overed. */
float3 shadow_color;
#endif
#ifdef __DENOISING_FEATURES__
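/* Denoising feature accumulators for the current path, written out to the denoising data passes when the path terminates. */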
float3 denoising_normal;
float3 denoising_albedo;
float denoising_depth;
#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
float sample_weight \
float sample_weight; \
float3 N
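/* Adding the 16-byte float3 N to the base costs four floats, so the scratch data below shrinks accordingly to keep ShaderClosure at 80 bytes. */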
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
float data[14]; /* pad to 80 bytes */
float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@ -960,6 +998,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
#ifdef __DENOISING_FEATURES__
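/* Remaining weight with which this path contributes to the denoising feature passes; attenuated at each branched bounce. */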
float denoising_feature_weight;
#endif /* __DENOISING_FEATURES__ */
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
int pass_denoising_data;
int pass_denoising_clean;
int denoising_flags;
int pad;
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;

@ -0,0 +1,61 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE2__
#endif
/* When building the kernel for the native machine, detect kernel features from
* the flags set by the compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_1__
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
# define __KERNEL_SSE__
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# define __KERNEL_AVX2__
# endif
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__)
/* do nothing */
#endif
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -0,0 +1,39 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -0,0 +1,40 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -0,0 +1,132 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common declaration part of all CPU kernels. */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleV,
float *sampleVV,
float *bufferV,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean,
float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r);
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* rect,
int pass_stride,
int radius,
float pca_threshold);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int* rect,
int w,
int channel_offset,
float a,
float k_2);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int* rect,
int w);
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample);
#undef KERNEL_ARCH

@ -0,0 +1,259 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common implementation part of all CPU kernels.
*
* The idea is that particular .cpp files set the needed optimization flags and
* simply include this file without having to copy the actual implementation over.
*/
#include "kernel/kernel_compat_cpu.h"
#include "kernel/filter/filter_kernel.h"
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
/* Denoise filter */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
#else
kernel_filter_divide_shadow(sample, tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean, float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
#else
kernel_filter_get_feature(sample, tiles,
m_offset, v_offset,
x, y,
mean, variance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
#else
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* prefilter_rect,
int pass_stride,
int radius,
float pca_threshold)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
#else
rank += storage_ofs;
transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
x, y,
load_int4(prefilter_rect),
pass_stride,
transform,
rank,
radius,
pca_threshold);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int *rect,
int w,
int channel_offset,
float a,
float k_2)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
#else
kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
#else
kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
#else
kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int *rect,
int w)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
#else
kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_finalize);
#else
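/* Reconstruction state is stored contiguously per pixel; advance to this pixel's slot. */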
XtWX += storage_ofs*XTWX_SIZE;
XtWY += storage_ofs*XTWY_SIZE;
rank += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
#endif
}
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END

@ -0,0 +1,34 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -0,0 +1,36 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -0,0 +1,37 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu_impl.h"

@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
#undef KERNEL_ARCH

@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef __BRANCHED_PATH__
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, path_trace);
#else
# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
#endif
# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
#endif /* KERNEL_STUB */
}
/* Film */
@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, shader);
#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
#ifdef __BAKING__
# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
#endif
# endif
}
else {
kernel_shader_evaluate(kg,
@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
#ifdef KERNEL_STUB
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
STUB_ASSERT(KERNEL_ARCH, name); \
}
#else
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
#define REGISTER_NAME_STRING(name) #name
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
REGISTER(path_trace);
REGISTER(convert_to_byte);
REGISTER(convert_to_half_float);
REGISTER(shader);
REGISTER(data_init);
REGISTER(path_init);
REGISTER(scene_intersect);
REGISTER(lamp_emission);
REGISTER(do_volume);
REGISTER(queue_enqueue);
REGISTER(indirect_background);
REGISTER(shader_setup);
REGISTER(shader_sort);
REGISTER(shader_eval);
REGISTER(holdout_emission_blurring_pathtermination_ao);
REGISTER(subsurface_scatter);
REGISTER(direct_lighting);
REGISTER(shadow_blocked_ao);
REGISTER(shadow_blocked_dl);
REGISTER(next_iteration_setup);
REGISTER(indirect_subsurface);
REGISTER(buffer_update);
#undef REGISTER
#undef REGISTER_EVAL_NAME
#undef REGISTER_NAME_STRING
}
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END

@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu//kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

@ -0,0 +1,235 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CUDA kernel entry points */
#ifdef __CUDA_ARCH__
#include "kernel_config.h"
#include "kernel/kernel_compat_cuda.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
TilesInfo *tiles,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
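/* Map the CUDA thread grid onto the prefilter rectangle: (x, y) of the rect is its top-left corner, (z, w) the exclusive bottom-right. */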
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
float *mean,
float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
float *transform, int *rank,
int4 filter_area, int4 rect,
int radius, float pca_threshold,
int pass_stride)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int *l_rank = rank + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
float ccl_restrict_ptr weightImage,
float ccl_restrict_ptr varianceImage,
float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr image,
float *outImage, float *accumImage,
int4 rect, int w,
int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float const* __restrict__ transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride) {
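/* Process only the region where the NLM search window rect overlaps the filter rect, with coordinates relative to the filter rect origin. */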
int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
float *buffer, int *rank,
float *XtWX, float3 *XtWY,
int4 filter_area, int4 buffer_params,
int sample) {
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
#endif

@ -0,0 +1,262 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* OpenCL kernel entry points */
#include "kernel/kernel_compat_opencl.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
__kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset,
int v_offset,
ccl_global float *mean,
ccl_global float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 prefilter_rect,
int r)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
ccl_global float *transform,
ccl_global int *rank,
int4 filter_area,
int4 rect,
int pass_stride,
int radius,
float pca_threshold)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
ccl_global int *l_rank = rank + y*filter_area.z + x;
ccl_global float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect,
int w,
int channel_offset,
float a,
float k_2) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect,
int w) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w,
int h,
int f,
int pass_stride) {
int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_finalize(int w,
int h,
ccl_global float *buffer,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 filter_area,
int4 buffer_params,
int sample) {
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
ccl_global float *buffer_1,
ccl_global float *buffer_2,
ccl_global float *buffer_3,
ccl_global float *buffer_4,
ccl_global float *buffer_5,
ccl_global float *buffer_6,
ccl_global float *buffer_7,
ccl_global float *buffer_8,
ccl_global float *buffer_9)
{
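/* A single work item fills in the shared TilesInfo; the nine buffers presumably cover the 3x3 neighborhood of tiles whose borders the filter can read. */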
if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
tiles->buffers[0] = buffer_1;
tiles->buffers[1] = buffer_2;
tiles->buffers[2] = buffer_3;
tiles->buffers[3] = buffer_4;
tiles->buffers[4] = buffer_5;
tiles->buffers[5] = buffer_6;
tiles->buffers[6] = buffer_7;
tiles->buffers[7] = buffer_8;
tiles->buffers[8] = buffer_9;
}
}

@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
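/* Sum the BSDF sample weights up front so that kernel_branched_path_surface_bounce can normalize each closure's denoising feature weight by its selection probability. */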
if(ps->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in the outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
tp,
ps,
L,
bsdf_ray))
bsdf_ray,
sum_sample_weight))
{
continue;
}

@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L_rad);
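/* Summing, clamping and writing of the result (including the shadow catcher and light passes) is now consolidated in kernel_write_result(). */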
bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);

@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_CAMERA) {
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
kernel_update_denoising_features(kg, sd, state, L);
}
}

@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
&shadow))
{
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}

@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFs. energy will not be conserved and
* the throughput can blow up after multiple bounces. we

@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
if(is_zero(normal_out)) {
normal_out = normal_in;
}
else {
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
}
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);

@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);

@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
co = normalize(co);
co = safe_normalize(co);
if(projection == 0)
uv = direction_to_equirectangular(co);
else

@ -402,7 +402,6 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@ -423,14 +422,13 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@ -465,13 +463,16 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
(type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@ -480,7 +481,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
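
All CLOSURE_IS_* tests are plain range checks over ClosureType, so the reordering above (moving REFLECTION, TRANSLUCENT and MULTI_GGX_GLASS) only works because each category remains contiguous in the enum. A hypothetical compile-time guard, not present in the source, would make that invariant explicit:

/* Illustrative only: range-based classification breaks silently if the
 * enum is reordered, so guards like these would catch it at compile time. */
static_assert(CLOSURE_BSDF_REFLECTION_ID < CLOSURE_BSDF_HAIR_REFLECTION_ID,
              "glossy closure IDs must stay contiguous");
static_assert(CLOSURE_BSDF_TRANSLUCENT_ID < CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
              "transmission closure IDs must stay contiguous");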

@ -42,6 +42,9 @@ BufferParams::BufferParams()
full_width = 0;
full_height = 0;
denoising_data_pass = false;
denoising_clean_pass = false;
Pass::add(PASS_COMBINED, passes);
}
@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
for(size_t i = 0; i < passes.size(); i++)
size += passes[i].components;
if(denoising_data_pass) {
size += DENOISING_PASS_SIZE_BASE;
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
}
return align_up(size, 4);
}
int BufferParams::get_denoising_offset()
{
int offset = 0;
for(size_t i = 0; i < passes.size(); i++)
offset += passes[i].components;
return offset;
}
/* Render Buffer Task */
RenderTile::RenderTile()
@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
}
bool RenderBuffers::copy_from_device()
bool RenderBuffers::copy_from_device(Device *from_device)
{
if(!buffer.device_pointer)
return false;
device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
if(!from_device) {
from_device = device;
}
from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
return true;
}
bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
{
float scale = 1.0f/sample;
if(offset == DENOISING_PASS_COLOR) {
scale *= exposure;
}
else if(offset == DENOISING_PASS_COLOR_VAR) {
scale *= exposure*exposure;
}
offset += params.get_denoising_offset();
float *in = (float*)buffer.data_pointer + offset;
int pass_stride = params.get_passes_size();
int size = params.width*params.height;
if(components == 1) {
for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
pixels[0] = in[0]*scale;
}
}
else if(components == 3) {
for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
pixels[0] = in[0]*scale;
pixels[1] = in[1]*scale;
pixels[2] = in[2]*scale;
}
}
else {
return false;
}
return true;
}
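
Layout-wise, each per-pixel record is simply all regular passes followed by the denoising block, so get_denoising_offset() is just the component sum of the regular passes. Note also the scaling above: DENOISING_PASS_COLOR is multiplied by exposure, while DENOISING_PASS_COLOR_VAR is multiplied by exposure squared, since variance scales with the square of any factor applied to the signal. A small hypothetical helper (not in the source) showing how a denoising pass pixel would be addressed:

/* Hypothetical addressing helper, assuming the buffer was already copied
 * back from the device. pass_offset is one of the DENOISING_PASS_* values. */
float *denoising_pass_pixel(RenderBuffers *buffers, int pass_offset, int x, int y)
{
	BufferParams& p = buffers->params;
	float *data = (float*)buffers->buffer.data_pointer;
	int pixel_index = y*p.width + x;
	return data + pixel_index*p.get_passes_size()
	            + p.get_denoising_offset() + pass_offset;
}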

@ -51,6 +51,9 @@ public:
/* passes */
array<Pass> passes;
bool denoising_data_pass;
/* If only some light path types should be denoised, an additional pass is needed. */
bool denoising_clean_pass;
/* functions */
BufferParams();
@ -59,6 +62,7 @@ public:
bool modified(const BufferParams& params);
void add_pass(PassType type);
int get_passes_size();
int get_denoising_offset();
};
/* Render Buffers */
@ -73,18 +77,19 @@ public:
/* random number generator state */
device_vector<uint> rng_state;
Device *device;
explicit RenderBuffers(Device *device);
~RenderBuffers();
void reset(Device *device, BufferParams& params);
bool copy_from_device();
bool copy_from_device(Device *from_device = NULL);
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
protected:
void device_free();
Device *device;
};
/* Display Buffer
@ -131,6 +136,9 @@ protected:
class RenderTile {
public:
typedef enum { PATH_TRACE, DENOISE } Task;
Task task;
int x, y, w, h;
int start_sample;
int num_samples;
@ -138,6 +146,7 @@ public:
int resolution;
int offset;
int stride;
int tile_index;
device_ptr buffer;
device_ptr rng_state;
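
Tagging tiles with a Task lets one tile queue drive both rendering and denoising. A sketch of the dispatch a device thread might perform (function names are hypothetical):

void process_tile(Device *device, RenderTile& tile)
{
	if(tile.task == RenderTile::PATH_TRACE) {
		/* Render samples start_sample..start_sample+num_samples into tile.buffer. */
		path_trace_tile(device, tile);
	}
	else {
		/* tile.task == RenderTile::DENOISE: filter an already rendered tile;
		 * tile_index allows looking up neighboring tiles for filter overlap. */
		denoise_tile(device, tile);
	}
}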

@ -279,6 +279,10 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
return type;
}
@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += pass.components;
}
kfilm->pass_denoising_data = 0;
kfilm->pass_denoising_clean = 0;
kfilm->denoising_flags = 0;
if(denoising_data_pass) {
kfilm->pass_denoising_data = kfilm->pass_stride;
kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
kfilm->denoising_flags = denoising_flags;
if(denoising_clean_pass) {
kfilm->pass_denoising_clean = kfilm->pass_stride;
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
kfilm->use_light_pass = 1;
}
}
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
kfilm->pass_alpha_threshold = pass_alpha_threshold;
@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
kfilm->mist_falloff = mist_falloff;
pass_stride = kfilm->pass_stride;
denoising_data_offset = kfilm->pass_denoising_data;
denoising_clean_offset = kfilm->pass_denoising_clean;
need_update = false;
}
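
The denoising data is appended after the regular passes within the same per-pixel stride, and the final stride is padded to a multiple of four floats so per-pixel records stay nicely aligned. For reference, align_up() is the usual power-of-two rounding (e.g. a 31-float record is padded to 32):

ccl_device_inline size_t align_up(size_t offset, size_t alignment)
{
	return (offset + alignment - 1) & ~(alignment - 1);
}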

Some files were not shown because too many files have changed in this diff.