Merge branch 'master' into blender2.8
This commit is contained in:
commit
9181f13af7
@ -241,3 +241,15 @@ def register_passes(engine, scene, srl):
|
||||
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
|
||||
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
|
||||
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
|
||||
|
||||
if crl.use_denoising and crl.denoising_store_passes:
|
||||
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR');
|
||||
engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR');
|
||||
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR');
|
||||
engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR');
|
||||
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE');
|
||||
engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE');
|
||||
engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR');
|
||||
engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR');
|
||||
engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR');
|
||||
engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR');
|
@ -1189,6 +1189,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
|
||||
default=False,
|
||||
)
|
||||
|
||||
cls.use_denoising = BoolProperty(
|
||||
name="Use Denoising",
|
||||
description="Denoise the rendered image",
|
||||
default=False,
|
||||
)
|
||||
cls.denoising_diffuse_direct = BoolProperty(
|
||||
name="Diffuse Direct",
|
||||
description="Denoise the direct diffuse lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_diffuse_indirect = BoolProperty(
|
||||
name="Diffuse Indirect",
|
||||
description="Denoise the indirect diffuse lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_glossy_direct = BoolProperty(
|
||||
name="Glossy Direct",
|
||||
description="Denoise the direct glossy lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_glossy_indirect = BoolProperty(
|
||||
name="Glossy Indirect",
|
||||
description="Denoise the indirect glossy lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_transmission_direct = BoolProperty(
|
||||
name="Transmission Direct",
|
||||
description="Denoise the direct transmission lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_transmission_indirect = BoolProperty(
|
||||
name="Transmission Indirect",
|
||||
description="Denoise the indirect transmission lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_subsurface_direct = BoolProperty(
|
||||
name="Subsurface Direct",
|
||||
description="Denoise the direct subsurface lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_subsurface_indirect = BoolProperty(
|
||||
name="Subsurface Indirect",
|
||||
description="Denoise the indirect subsurface lighting",
|
||||
default=True,
|
||||
)
|
||||
cls.denoising_strength = FloatProperty(
|
||||
name="Denoising Strength",
|
||||
description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
|
||||
min=0.0, max=1.0,
|
||||
default=0.5,
|
||||
)
|
||||
cls.denoising_feature_strength = FloatProperty(
|
||||
name="Denoising Feature Strength",
|
||||
description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
|
||||
min=0.0, max=1.0,
|
||||
default=0.5,
|
||||
)
|
||||
cls.denoising_radius = IntProperty(
|
||||
name="Denoising Radius",
|
||||
description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
|
||||
min=1, max=50,
|
||||
default=8,
|
||||
)
|
||||
cls.denoising_relative_pca = BoolProperty(
|
||||
name="Relative filter",
|
||||
description="When removing that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
|
||||
default=False,
|
||||
)
|
||||
cls.denoising_store_passes = BoolProperty(
|
||||
name="Store denoising passes",
|
||||
description="Store the denoising feature passes and the noisy image",
|
||||
default=False,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def unregister(cls):
|
||||
del bpy.types.SceneRenderLayer.cycles
|
||||
|
@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
|
||||
col.prop(rl, "use_pass_emit", text="Emission")
|
||||
col.prop(rl, "use_pass_environment")
|
||||
|
||||
if context.scene.cycles.feature_set == 'EXPERIMENTAL':
|
||||
col.separator()
|
||||
sub = col.column()
|
||||
sub.active = crl.use_denoising
|
||||
sub.prop(crl, "denoising_store_passes", text="Denoising")
|
||||
|
||||
if _cycles.with_cycles_debug:
|
||||
col = layout.column()
|
||||
col.prop(crl, "pass_debug_bvh_traversed_nodes")
|
||||
@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
|
||||
row.prop(rv, "camera_suffix", text="")
|
||||
|
||||
|
||||
class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
|
||||
bl_label = "Denoising"
|
||||
bl_context = "render_layer"
|
||||
bl_options = {'DEFAULT_CLOSED'}
|
||||
|
||||
def draw_header(self, context):
|
||||
rd = context.scene.render
|
||||
rl = rd.layers.active
|
||||
crl = rl.cycles
|
||||
cscene = context.scene.cycles
|
||||
layout = self.layout
|
||||
|
||||
layout.active = not cscene.use_progressive_refine
|
||||
layout.prop(crl, "use_denoising", text="")
|
||||
|
||||
def draw(self, context):
|
||||
layout = self.layout
|
||||
|
||||
scene = context.scene
|
||||
cscene = scene.cycles
|
||||
rd = scene.render
|
||||
rl = rd.layers.active
|
||||
crl = rl.cycles
|
||||
|
||||
layout.active = crl.use_denoising and not cscene.use_progressive_refine
|
||||
|
||||
split = layout.split()
|
||||
|
||||
col = split.column()
|
||||
sub = col.column(align=True)
|
||||
sub.prop(crl, "denoising_radius", text="Radius")
|
||||
sub.prop(crl, "denoising_strength", slider=True, text="Strength")
|
||||
|
||||
col = split.column()
|
||||
sub = col.column(align=True)
|
||||
sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
|
||||
sub.prop(crl, "denoising_relative_pca")
|
||||
|
||||
layout.separator()
|
||||
|
||||
row = layout.row()
|
||||
row.label(text="Diffuse:")
|
||||
sub = row.row(align=True)
|
||||
sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
|
||||
sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
|
||||
|
||||
row = layout.row()
|
||||
row.label(text="Glossy:")
|
||||
sub = row.row(align=True)
|
||||
sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
|
||||
sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
|
||||
|
||||
row = layout.row()
|
||||
row.label(text="Transmission:")
|
||||
sub = row.row(align=True)
|
||||
sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
|
||||
sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
|
||||
|
||||
row = layout.row()
|
||||
row.label(text="Subsurface:")
|
||||
sub = row.row(align=True)
|
||||
sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
|
||||
sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
|
||||
|
||||
|
||||
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
|
||||
bl_label = "Post Processing"
|
||||
bl_options = {'DEFAULT_CLOSED'}
|
||||
@ -1732,6 +1803,7 @@ classes = (
|
||||
CyclesRender_PT_layer_options,
|
||||
CyclesRender_PT_layer_passes,
|
||||
CyclesRender_PT_views,
|
||||
CyclesRender_PT_denoising,
|
||||
Cycles_PT_post_processing,
|
||||
CyclesCamera_PT_dof,
|
||||
Cycles_PT_context_material,
|
||||
|
@ -303,12 +303,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
|
||||
static void end_render_result(BL::RenderEngine& b_engine,
|
||||
BL::RenderResult& b_rr,
|
||||
bool cancel,
|
||||
bool highlight,
|
||||
bool do_merge_results)
|
||||
{
|
||||
b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
|
||||
b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
|
||||
}
|
||||
|
||||
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
|
||||
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
|
||||
{
|
||||
BufferParams& params = rtile.buffers->params;
|
||||
int x = params.full_x - session->tile_manager.params.full_x;
|
||||
@ -344,37 +345,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
|
||||
update_render_result(b_rr, b_rlay, rtile);
|
||||
}
|
||||
|
||||
end_render_result(b_engine, b_rr, true, true);
|
||||
end_render_result(b_engine, b_rr, true, highlight, true);
|
||||
}
|
||||
else {
|
||||
/* write result */
|
||||
write_render_result(b_rr, b_rlay, rtile);
|
||||
end_render_result(b_engine, b_rr, false, true);
|
||||
end_render_result(b_engine, b_rr, false, false, true);
|
||||
}
|
||||
}
|
||||
|
||||
void BlenderSession::write_render_tile(RenderTile& rtile)
|
||||
{
|
||||
do_write_update_render_tile(rtile, false);
|
||||
do_write_update_render_tile(rtile, false, false);
|
||||
}
|
||||
|
||||
void BlenderSession::update_render_tile(RenderTile& rtile)
|
||||
void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
|
||||
{
|
||||
/* use final write for preview renders, otherwise render result wouldn't be
|
||||
* be updated in blender side
|
||||
* would need to be investigated a bit further, but for now shall be fine
|
||||
*/
|
||||
if(!b_engine.is_preview())
|
||||
do_write_update_render_tile(rtile, true);
|
||||
do_write_update_render_tile(rtile, true, highlight);
|
||||
else
|
||||
do_write_update_render_tile(rtile, false);
|
||||
do_write_update_render_tile(rtile, false, false);
|
||||
}
|
||||
|
||||
void BlenderSession::render()
|
||||
{
|
||||
/* set callback to write out render results */
|
||||
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
|
||||
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
|
||||
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
|
||||
|
||||
/* get buffer parameters */
|
||||
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
|
||||
@ -395,7 +396,7 @@ void BlenderSession::render()
|
||||
|
||||
/* layer will be missing if it was disabled in the UI */
|
||||
if(b_single_rlay == b_rr.layers.end()) {
|
||||
end_render_result(b_engine, b_rr, true, false);
|
||||
end_render_result(b_engine, b_rr, true, true, false);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -411,6 +412,29 @@ void BlenderSession::render()
|
||||
}
|
||||
|
||||
buffer_params.passes = passes;
|
||||
|
||||
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
|
||||
bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
|
||||
buffer_params.denoising_data_pass = use_denoising;
|
||||
session->tile_manager.schedule_denoising = use_denoising;
|
||||
session->params.use_denoising = use_denoising;
|
||||
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
|
||||
scene->film->denoising_flags = 0;
|
||||
if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
|
||||
if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
|
||||
if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
|
||||
if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
|
||||
if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
|
||||
if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
|
||||
if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
|
||||
if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
|
||||
scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
|
||||
buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
|
||||
session->params.denoising_radius = get_int(crl, "denoising_radius");
|
||||
session->params.denoising_strength = get_float(crl, "denoising_strength");
|
||||
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
|
||||
session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
|
||||
|
||||
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
|
||||
scene->film->tag_passes_update(scene, passes);
|
||||
scene->film->tag_update(scene);
|
||||
@ -464,7 +488,7 @@ void BlenderSession::render()
|
||||
}
|
||||
|
||||
/* free result without merging */
|
||||
end_render_result(b_engine, b_rr, true, false);
|
||||
end_render_result(b_engine, b_rr, true, true, false);
|
||||
|
||||
if(session->progress.get_cancel())
|
||||
break;
|
||||
@ -670,6 +694,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
|
||||
/* copy pixels */
|
||||
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
|
||||
}
|
||||
else {
|
||||
int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
|
||||
if(denoising_offset >= 0) {
|
||||
read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
|
||||
}
|
||||
}
|
||||
|
||||
if(!read) {
|
||||
memset(&pixels[0], 0, pixels.size()*sizeof(float));
|
||||
|
@ -81,7 +81,7 @@ public:
|
||||
void update_render_result(BL::RenderResult& b_rr,
|
||||
BL::RenderLayer& b_rlay,
|
||||
RenderTile& rtile);
|
||||
void update_render_tile(RenderTile& rtile);
|
||||
void update_render_tile(RenderTile& rtile, bool highlight);
|
||||
|
||||
/* interactive updates */
|
||||
void synchronize();
|
||||
@ -150,7 +150,7 @@ protected:
|
||||
BL::RenderLayer& b_rlay,
|
||||
RenderTile& rtile,
|
||||
bool do_update_only);
|
||||
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
|
||||
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
|
||||
|
||||
int builtin_image_frame(const string &builtin_name);
|
||||
void builtin_image_info(const string &builtin_name,
|
||||
|
@ -509,6 +509,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
|
||||
return PASS_NONE;
|
||||
}
|
||||
|
||||
int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
|
||||
{
|
||||
string name = b_pass.name();
|
||||
if(name.substr(0, 10) != "Denoising ") {
|
||||
return -1;
|
||||
}
|
||||
name = name.substr(10);
|
||||
|
||||
#define MAP_PASS(passname, offset) if(name == passname) return offset;
|
||||
MAP_PASS("Normal", DENOISING_PASS_NORMAL);
|
||||
MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
|
||||
MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
|
||||
MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
|
||||
MAP_PASS("Depth", DENOISING_PASS_DEPTH);
|
||||
MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
|
||||
MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
|
||||
MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
|
||||
MAP_PASS("Image", DENOISING_PASS_COLOR);
|
||||
MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
|
||||
#undef MAP_PASS
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
|
||||
BL::SceneRenderLayer& b_srlay)
|
||||
{
|
||||
@ -528,8 +552,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
|
||||
Pass::add(pass_type, passes);
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
|
||||
if(get_boolean(crp, "denoising_store_passes")) {
|
||||
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str());
|
||||
b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
|
||||
}
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
|
||||
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
|
||||
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
|
||||
|
@ -97,6 +97,7 @@ public:
|
||||
int width, int height);
|
||||
|
||||
static PassType get_pass_type(BL::RenderPass& b_pass);
|
||||
static int get_denoising_pass(BL::RenderPass& b_pass);
|
||||
|
||||
private:
|
||||
/* sync */
|
||||
|
@ -25,6 +25,7 @@ set(SRC
|
||||
device.cpp
|
||||
device_cpu.cpp
|
||||
device_cuda.cpp
|
||||
device_denoising.cpp
|
||||
device_multi.cpp
|
||||
device_opencl.cpp
|
||||
device_split_kernel.cpp
|
||||
@ -48,6 +49,7 @@ endif()
|
||||
|
||||
set(SRC_HEADERS
|
||||
device.h
|
||||
device_denoising.h
|
||||
device_memory.h
|
||||
device_intern.h
|
||||
device_network.h
|
||||
|
@ -549,4 +549,16 @@ void Device::free_memory()
|
||||
devices.free_memory();
|
||||
}
|
||||
|
||||
|
||||
device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
|
||||
: device(device)
|
||||
{
|
||||
ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
|
||||
}
|
||||
|
||||
device_sub_ptr::~device_sub_ptr()
|
||||
{
|
||||
device->mem_free_sub_ptr(ptr);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -228,6 +228,7 @@ struct DeviceDrawParams {
|
||||
};
|
||||
|
||||
class Device {
|
||||
friend class device_sub_ptr;
|
||||
protected:
|
||||
enum {
|
||||
FALLBACK_SHADER_STATUS_NONE = 0,
|
||||
@ -250,6 +251,14 @@ protected:
|
||||
|
||||
bool bind_fallback_display_space_shader(const float width, const float height);
|
||||
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
|
||||
{
|
||||
/* Only required for devices that implement denoising. */
|
||||
assert(false);
|
||||
return (device_ptr) 0;
|
||||
}
|
||||
virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
|
||||
|
||||
public:
|
||||
virtual ~Device();
|
||||
|
||||
@ -278,6 +287,8 @@ public:
|
||||
virtual void mem_zero(device_memory& mem) = 0;
|
||||
virtual void mem_free(device_memory& mem) = 0;
|
||||
|
||||
virtual int mem_address_alignment() { return 16; }
|
||||
|
||||
/* constant memory */
|
||||
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
|
||||
|
||||
@ -326,6 +337,8 @@ public:
|
||||
/* multi device */
|
||||
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
|
||||
virtual int device_number(Device * /*sub_device*/) { return 0; }
|
||||
virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
|
||||
virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
|
||||
|
||||
/* static */
|
||||
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -21,11 +21,14 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "device/device.h"
|
||||
#include "device/device_denoising.h"
|
||||
#include "device/device_intern.h"
|
||||
#include "device/device_split_kernel.h"
|
||||
|
||||
#include "render/buffers.h"
|
||||
|
||||
#include "kernel/filter/filter_defines.h"
|
||||
|
||||
#ifdef WITH_CUDA_DYNLOAD
|
||||
# include "cuew.h"
|
||||
#else
|
||||
@ -113,7 +116,7 @@ public:
|
||||
DedicatedTaskPool task_pool;
|
||||
CUdevice cuDevice;
|
||||
CUcontext cuContext;
|
||||
CUmodule cuModule;
|
||||
CUmodule cuModule, cuFilterModule;
|
||||
map<device_ptr, bool> tex_interp_map;
|
||||
map<device_ptr, uint> tex_bindless_map;
|
||||
int cuDevId;
|
||||
@ -170,7 +173,7 @@ public:
|
||||
CUresult result = stmt; \
|
||||
\
|
||||
if(result != CUDA_SUCCESS) { \
|
||||
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
|
||||
string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
|
||||
if(error_msg == "") \
|
||||
error_msg = message; \
|
||||
fprintf(stderr, "%s\n", message.c_str()); \
|
||||
@ -301,7 +304,8 @@ public:
|
||||
* kernel sources md5 and only depends on compiler or compilation settings.
|
||||
*/
|
||||
string compile_kernel_get_common_cflags(
|
||||
const DeviceRequestedFeatures& requested_features, bool split=false)
|
||||
const DeviceRequestedFeatures& requested_features,
|
||||
bool filter=false, bool split=false)
|
||||
{
|
||||
const int cuda_version = cuewCompilerVersion();
|
||||
const int machine = system_cpu_bits();
|
||||
@ -316,7 +320,7 @@ public:
|
||||
machine,
|
||||
cuda_version,
|
||||
include_path.c_str());
|
||||
if(use_adaptive_compilation()) {
|
||||
if(!filter && use_adaptive_compilation()) {
|
||||
cflags += " " + requested_features.get_build_options();
|
||||
}
|
||||
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
|
||||
@ -364,8 +368,22 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
|
||||
string compile_kernel(const DeviceRequestedFeatures& requested_features,
|
||||
bool filter=false, bool split=false)
|
||||
{
|
||||
const char *name, *source;
|
||||
if(filter) {
|
||||
name = "filter";
|
||||
source = "filter.cu";
|
||||
}
|
||||
else if(split) {
|
||||
name = "kernel_split";
|
||||
source = "kernel_split.cu";
|
||||
}
|
||||
else {
|
||||
name = "kernel";
|
||||
source = "kernel.cu";
|
||||
}
|
||||
/* Compute cubin name. */
|
||||
int major, minor;
|
||||
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
|
||||
@ -373,9 +391,8 @@ public:
|
||||
|
||||
/* Attempt to use kernel provided with Blender. */
|
||||
if(!use_adaptive_compilation()) {
|
||||
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
|
||||
: "lib/kernel_sm_%d%d.cubin",
|
||||
major, minor));
|
||||
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
|
||||
name, major, minor));
|
||||
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
|
||||
if(path_exists(cubin)) {
|
||||
VLOG(1) << "Using precompiled kernel.";
|
||||
@ -384,7 +401,7 @@ public:
|
||||
}
|
||||
|
||||
const string common_cflags =
|
||||
compile_kernel_get_common_cflags(requested_features, split);
|
||||
compile_kernel_get_common_cflags(requested_features, filter, split);
|
||||
|
||||
/* Try to use locally compiled kernel. */
|
||||
const string source_path = path_get("source");
|
||||
@ -395,9 +412,8 @@ public:
|
||||
*/
|
||||
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
|
||||
|
||||
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
|
||||
: "cycles_kernel_sm%d%d_%s.cubin",
|
||||
major, minor,
|
||||
const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
|
||||
name, major, minor,
|
||||
cubin_md5.c_str());
|
||||
const string cubin = path_cache_get(path_join("kernels", cubin_file));
|
||||
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
|
||||
@ -432,7 +448,7 @@ public:
|
||||
const string kernel = path_join(
|
||||
path_join(source_path, "kernel"),
|
||||
path_join("kernels",
|
||||
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
|
||||
path_join("cuda", source)));
|
||||
double starttime = time_dt();
|
||||
printf("Compiling CUDA kernel ...\n");
|
||||
|
||||
@ -480,11 +496,14 @@ public:
|
||||
return false;
|
||||
|
||||
/* get kernel */
|
||||
string cubin = compile_kernel(requested_features, use_split_kernel());
|
||||
|
||||
string cubin = compile_kernel(requested_features, false, use_split_kernel());
|
||||
if(cubin == "")
|
||||
return false;
|
||||
|
||||
string filter_cubin = compile_kernel(requested_features, true, false);
|
||||
if(filter_cubin == "")
|
||||
return false;
|
||||
|
||||
/* open module */
|
||||
cuda_push_context();
|
||||
|
||||
@ -499,6 +518,14 @@ public:
|
||||
if(cuda_error_(result, "cuModuleLoad"))
|
||||
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
|
||||
|
||||
if(path_read_text(filter_cubin, cubin_data))
|
||||
result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
|
||||
else
|
||||
result = CUDA_ERROR_FILE_NOT_FOUND;
|
||||
|
||||
if(cuda_error_(result, "cuModuleLoad"))
|
||||
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
|
||||
|
||||
cuda_pop_context();
|
||||
|
||||
return (result == CUDA_SUCCESS);
|
||||
@ -581,6 +608,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
|
||||
{
|
||||
return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
|
||||
}
|
||||
|
||||
void const_copy_to(const char *name, void *host, size_t size)
|
||||
{
|
||||
CUdeviceptr mem;
|
||||
@ -881,6 +913,368 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
|
||||
{
|
||||
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
|
||||
|
||||
TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
|
||||
for(int i = 0; i < 9; i++) {
|
||||
tiles->buffers[i] = buffers[i];
|
||||
}
|
||||
|
||||
mem_copy_to(task->tiles_mem);
|
||||
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
#define CUDA_GET_BLOCKSIZE(func, w, h) \
|
||||
int threads_per_block; \
|
||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
||||
int threads = (int)sqrt((float)threads_per_block); \
|
||||
int xblocks = ((w) + threads - 1)/threads; \
|
||||
int yblocks = ((h) + threads - 1)/threads;
|
||||
|
||||
#define CUDA_LAUNCH_KERNEL(func, args) \
|
||||
cuda_assert(cuLaunchKernel(func, \
|
||||
xblocks, yblocks, 1, \
|
||||
threads, threads, 1, \
|
||||
0, 0, args, 0));
|
||||
|
||||
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
int4 rect = task->rect;
|
||||
int w = rect.z-rect.x;
|
||||
int h = rect.w-rect.y;
|
||||
int r = task->nlm_state.r;
|
||||
int f = task->nlm_state.f;
|
||||
float a = task->nlm_state.a;
|
||||
float k_2 = task->nlm_state.k_2;
|
||||
|
||||
CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
|
||||
CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
|
||||
CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
|
||||
|
||||
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
|
||||
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
|
||||
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
|
||||
|
||||
int dx, dy;
|
||||
int4 local_rect;
|
||||
int channel_offset = 0;
|
||||
void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
|
||||
void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
|
||||
void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
|
||||
void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
|
||||
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
dy = i / (2*r+1) - r;
|
||||
dx = i % (2*r+1) - r;
|
||||
local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
|
||||
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
|
||||
}
|
||||
|
||||
local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
|
||||
void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
bool denoising_construct_transform(DenoisingTask *task)
|
||||
{
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
CUfunction cuFilterConstructTransform;
|
||||
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
|
||||
CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
|
||||
task->storage.w,
|
||||
task->storage.h);
|
||||
|
||||
void *args[] = {&task->buffer.mem.device_pointer,
|
||||
&task->storage.transform.device_pointer,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->filter_area,
|
||||
&task->rect,
|
||||
&task->radius,
|
||||
&task->pca_threshold,
|
||||
&task->buffer.pass_stride};
|
||||
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
bool denoising_reconstruct(device_ptr color_ptr,
|
||||
device_ptr color_variance_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr guide_variance_ptr,
|
||||
device_ptr output_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
mem_zero(task->storage.XtWX);
|
||||
mem_zero(task->storage.XtWY);
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
|
||||
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
|
||||
CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
|
||||
|
||||
int r = task->radius;
|
||||
int f = 4;
|
||||
float a = 1.0f;
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
|
||||
int local_rect[4] = {max(0, -dx), max(0, -dy),
|
||||
task->reconstruction_state.source_w - max(0, dx),
|
||||
task->reconstruction_state.source_h - max(0, dy)};
|
||||
|
||||
void *calc_difference_args[] = {&dx, &dy,
|
||||
&guide_ptr,
|
||||
&guide_variance_ptr,
|
||||
&difference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&task->buffer.pass_stride,
|
||||
&a,
|
||||
&task->nlm_k_2};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
|
||||
|
||||
void *blur_args[] = {&difference,
|
||||
&blurDifference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&f};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
|
||||
void *calc_weight_args[] = {&blurDifference,
|
||||
&difference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&f};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
|
||||
|
||||
/* Reuse previous arguments. */
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
|
||||
void *construct_gramian_args[] = {&dx, &dy,
|
||||
&blurDifference,
|
||||
&task->buffer.mem.device_pointer,
|
||||
&color_ptr,
|
||||
&color_variance_ptr,
|
||||
&task->storage.transform.device_pointer,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->storage.XtWX.device_pointer,
|
||||
&task->storage.XtWY.device_pointer,
|
||||
&local_rect,
|
||||
&task->reconstruction_state.filter_rect,
|
||||
&task->buffer.w,
|
||||
&task->buffer.h,
|
||||
&f,
|
||||
&task->buffer.pass_stride};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
|
||||
}
|
||||
|
||||
void *finalize_args[] = {&task->buffer.w,
|
||||
&task->buffer.h,
|
||||
&output_ptr,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->storage.XtWX.device_pointer,
|
||||
&task->storage.XtWY.device_pointer,
|
||||
&task->filter_area,
|
||||
&task->reconstruction_state.buffer_params.x,
|
||||
&task->render_buffer.samples};
|
||||
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
|
||||
device_ptr mean_ptr, device_ptr variance_ptr,
|
||||
int r, int4 rect, DenoisingTask *task)
|
||||
{
|
||||
(void) task;
|
||||
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
CUfunction cuFilterCombineHalves;
|
||||
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
|
||||
CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
void *args[] = {&mean_ptr,
|
||||
&variance_ptr,
|
||||
&a_ptr,
|
||||
&b_ptr,
|
||||
&rect,
|
||||
&r};
|
||||
CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
|
||||
device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
|
||||
device_ptr buffer_variance_ptr, DenoisingTask *task)
|
||||
{
|
||||
(void) task;
|
||||
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
CUfunction cuFilterDivideShadow;
|
||||
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
|
||||
CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
bool use_split_variance = use_split_kernel();
|
||||
void *args[] = {&task->render_buffer.samples,
|
||||
&task->tiles_mem.device_pointer,
|
||||
&a_ptr,
|
||||
&b_ptr,
|
||||
&sample_variance_ptr,
|
||||
&sv_variance_ptr,
|
||||
&buffer_variance_ptr,
|
||||
&task->rect,
|
||||
&task->render_buffer.pass_stride,
|
||||
&task->render_buffer.denoising_data_offset,
|
||||
&use_split_variance};
|
||||
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
bool denoising_get_feature(int mean_offset,
|
||||
int variance_offset,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
if(have_error())
|
||||
return false;
|
||||
|
||||
cuda_push_context();
|
||||
|
||||
CUfunction cuFilterGetFeature;
|
||||
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
|
||||
CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
bool use_split_variance = use_split_kernel();
|
||||
void *args[] = {&task->render_buffer.samples,
|
||||
&task->tiles_mem.device_pointer,
|
||||
&mean_offset,
|
||||
&variance_offset,
|
||||
&mean_ptr,
|
||||
&variance_ptr,
|
||||
&task->rect,
|
||||
&task->render_buffer.pass_stride,
|
||||
&task->render_buffer.denoising_data_offset,
|
||||
&use_split_variance};
|
||||
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
cuda_pop_context();
|
||||
return !have_error();
|
||||
}
|
||||
|
||||
void denoise(RenderTile &rtile, const DeviceTask &task)
|
||||
{
|
||||
DenoisingTask denoising(this);
|
||||
|
||||
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
|
||||
denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
|
||||
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
|
||||
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
|
||||
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
|
||||
denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
|
||||
denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
|
||||
|
||||
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
|
||||
denoising.render_buffer.samples = rtile.sample;
|
||||
|
||||
RenderTile rtiles[9];
|
||||
rtiles[4] = rtile;
|
||||
task.map_neighbor_tiles(rtiles, this);
|
||||
denoising.tiles_from_rendertiles(rtiles);
|
||||
|
||||
denoising.init_from_devicetask(task);
|
||||
|
||||
denoising.run_denoising();
|
||||
|
||||
task.unmap_neighbor_tiles(rtiles, this);
|
||||
}
|
||||
|
||||
void path_trace(RenderTile& rtile, int sample, bool branched)
|
||||
{
|
||||
if(have_error())
|
||||
@ -1326,7 +1720,7 @@ public:
|
||||
|
||||
void thread_run(DeviceTask *task)
|
||||
{
|
||||
if(task->type == DeviceTask::PATH_TRACE) {
|
||||
if(task->type == DeviceTask::RENDER) {
|
||||
RenderTile tile;
|
||||
|
||||
bool branched = task->integrator_branched;
|
||||
@ -1334,30 +1728,8 @@ public:
|
||||
/* Upload Bindless Mapping */
|
||||
load_bindless_mapping();
|
||||
|
||||
if(!use_split_kernel()) {
|
||||
/* keep rendering tiles until done */
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
|
||||
path_trace(tile, sample, branched);
|
||||
|
||||
tile.sample = sample + 1;
|
||||
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
|
||||
task->release_tile(tile);
|
||||
}
|
||||
}
|
||||
else {
|
||||
DeviceRequestedFeatures requested_features;
|
||||
DeviceRequestedFeatures requested_features;
|
||||
if(use_split_kernel()) {
|
||||
if(!use_adaptive_compilation()) {
|
||||
requested_features.max_closure = 64;
|
||||
}
|
||||
@ -1366,17 +1738,46 @@ public:
|
||||
split_kernel = new CUDASplitKernel(this);
|
||||
split_kernel->load_kernels(requested_features);
|
||||
}
|
||||
}
|
||||
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
device_memory void_buffer;
|
||||
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
|
||||
|
||||
task->release_tile(tile);
|
||||
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
/* keep rendering tiles until done */
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
if(tile.task == RenderTile::PATH_TRACE) {
|
||||
if(use_split_kernel()) {
|
||||
device_memory void_buffer;
|
||||
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
|
||||
}
|
||||
else {
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
|
||||
path_trace(tile, sample, branched);
|
||||
|
||||
tile.sample = sample + 1;
|
||||
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(tile.task == RenderTile::DENOISE) {
|
||||
tile.sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
denoise(tile, *task);
|
||||
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
|
||||
task->release_tile(tile);
|
||||
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
218
intern/cycles/device/device_denoising.cpp
Normal file
218
intern/cycles/device/device_denoising.cpp
Normal file
@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "device/device_denoising.h"
|
||||
|
||||
#include "kernel/filter/filter_defines.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
void DenoisingTask::init_from_devicetask(const DeviceTask &task)
|
||||
{
|
||||
radius = task.denoising_radius;
|
||||
nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
|
||||
if(task.denoising_relative_pca) {
|
||||
pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
|
||||
}
|
||||
else {
|
||||
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
|
||||
}
|
||||
|
||||
render_buffer.pass_stride = task.pass_stride;
|
||||
render_buffer.denoising_data_offset = task.pass_denoising_data;
|
||||
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
|
||||
|
||||
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
|
||||
rect = make_int4(max(tiles->x[0], filter_area.x - radius),
|
||||
max(tiles->y[0], filter_area.y - radius),
|
||||
min(tiles->x[3], filter_area.x + filter_area.z + radius),
|
||||
min(tiles->y[3], filter_area.y + filter_area.w + radius));
|
||||
}
|
||||
|
||||
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
|
||||
{
|
||||
tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
|
||||
|
||||
device_ptr buffers[9];
|
||||
for(int i = 0; i < 9; i++) {
|
||||
buffers[i] = rtiles[i].buffer;
|
||||
tiles->offsets[i] = rtiles[i].offset;
|
||||
tiles->strides[i] = rtiles[i].stride;
|
||||
}
|
||||
tiles->x[0] = rtiles[3].x;
|
||||
tiles->x[1] = rtiles[4].x;
|
||||
tiles->x[2] = rtiles[5].x;
|
||||
tiles->x[3] = rtiles[5].x + rtiles[5].w;
|
||||
tiles->y[0] = rtiles[1].y;
|
||||
tiles->y[1] = rtiles[4].y;
|
||||
tiles->y[2] = rtiles[7].y;
|
||||
tiles->y[3] = rtiles[7].y + rtiles[7].h;
|
||||
|
||||
render_buffer.offset = rtiles[4].offset;
|
||||
render_buffer.stride = rtiles[4].stride;
|
||||
render_buffer.ptr = rtiles[4].buffer;
|
||||
|
||||
functions.set_tiles(buffers);
|
||||
}
|
||||
|
||||
bool DenoisingTask::run_denoising()
|
||||
{
|
||||
/* Allocate denoising buffer. */
|
||||
buffer.passes = 14;
|
||||
buffer.w = align_up(rect.z - rect.x, 4);
|
||||
buffer.h = rect.w - rect.y;
|
||||
buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
|
||||
buffer.mem.resize(buffer.pass_stride * buffer.passes);
|
||||
device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
|
||||
|
||||
device_ptr null_ptr = (device_ptr) 0;
|
||||
|
||||
/* Prefilter shadow feature. */
|
||||
{
|
||||
device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
|
||||
nlm_state.temporary_1_ptr = *nlm_temporary_1;
|
||||
nlm_state.temporary_2_ptr = *nlm_temporary_2;
|
||||
nlm_state.temporary_3_ptr = *nlm_temporary_3;
|
||||
|
||||
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
|
||||
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
|
||||
|
||||
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
|
||||
nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
|
||||
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
|
||||
|
||||
/* Reuse memory, the previous data isn't needed anymore. */
|
||||
device_ptr filtered_a = *buffer_var,
|
||||
filtered_b = *sample_var;
|
||||
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
|
||||
nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
|
||||
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
|
||||
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
|
||||
|
||||
device_ptr residual_var = *sample_var_var;
|
||||
/* Estimate the residual variance between the two filtered halves. */
|
||||
functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
|
||||
|
||||
device_ptr final_a = *unfiltered_a,
|
||||
final_b = *unfiltered_b;
|
||||
/* Use the residual variance for a second filter pass. */
|
||||
nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
|
||||
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
|
||||
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
|
||||
|
||||
/* Combine the two double-filtered halves to a final shadow feature. */
|
||||
device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
|
||||
}
|
||||
|
||||
/* Prefilter general features. */
|
||||
{
|
||||
device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
|
||||
nlm_state.temporary_1_ptr = *nlm_temporary_1;
|
||||
nlm_state.temporary_2_ptr = *nlm_temporary_2;
|
||||
nlm_state.temporary_3_ptr = *nlm_temporary_3;
|
||||
|
||||
int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
|
||||
int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
|
||||
int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
|
||||
for(int pass = 0; pass < 7; pass++) {
|
||||
device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
/* Get the unfiltered pass and its variance from the RenderBuffers. */
|
||||
functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
|
||||
/* Smooth the pass and store the result in the denoising buffers. */
|
||||
nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
|
||||
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy color passes. */
|
||||
{
|
||||
int mean_from[] = {20, 21, 22};
|
||||
int variance_from[] = {23, 24, 25};
|
||||
int mean_to[] = { 8, 9, 10};
|
||||
int variance_to[] = {11, 12, 13};
|
||||
int num_color_passes = 3;
|
||||
for(int pass = 0; pass < num_color_passes; pass++) {
|
||||
device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||
functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
|
||||
}
|
||||
}
|
||||
|
||||
storage.w = filter_area.z;
|
||||
storage.h = filter_area.w;
|
||||
storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
|
||||
storage.rank.resize(storage.w*storage.h);
|
||||
device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
|
||||
device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
|
||||
|
||||
functions.construct_transform();
|
||||
|
||||
device_only_memory<float> temporary_1;
|
||||
device_only_memory<float> temporary_2;
|
||||
temporary_1.resize(buffer.w*buffer.h);
|
||||
temporary_2.resize(buffer.w*buffer.h);
|
||||
device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
|
||||
device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
|
||||
reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
|
||||
reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
|
||||
|
||||
storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
|
||||
storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
|
||||
device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
|
||||
device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
|
||||
|
||||
reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
|
||||
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
|
||||
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
|
||||
render_buffer.stride,
|
||||
render_buffer.pass_stride,
|
||||
render_buffer.denoising_clean_offset);
|
||||
reconstruction_state.source_w = rect.z-rect.x;
|
||||
reconstruction_state.source_h = rect.w-rect.y;
|
||||
|
||||
{
|
||||
device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
|
||||
device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
|
||||
functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
|
||||
}
|
||||
|
||||
device->mem_free(storage.XtWX);
|
||||
device->mem_free(storage.XtWY);
|
||||
device->mem_free(storage.transform);
|
||||
device->mem_free(storage.rank);
|
||||
device->mem_free(temporary_1);
|
||||
device->mem_free(temporary_2);
|
||||
device->mem_free(buffer.mem);
|
||||
device->mem_free(tiles_mem);
|
||||
return true;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
145
intern/cycles/device/device_denoising.h
Normal file
145
intern/cycles/device/device_denoising.h
Normal file
@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __DEVICE_DENOISING_H__
|
||||
#define __DEVICE_DENOISING_H__
|
||||
|
||||
#include "device/device.h"
|
||||
|
||||
#include "render/buffers.h"
|
||||
|
||||
#include "kernel/filter/filter_defines.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class DenoisingTask {
|
||||
public:
|
||||
/* Parameters of the denoising algorithm. */
|
||||
int radius;
|
||||
float nlm_k_2;
|
||||
float pca_threshold;
|
||||
|
||||
/* Pointer and parameters of the RenderBuffers. */
|
||||
struct RenderBuffers {
|
||||
int denoising_data_offset;
|
||||
int denoising_clean_offset;
|
||||
int pass_stride;
|
||||
int offset;
|
||||
int stride;
|
||||
device_ptr ptr;
|
||||
int samples;
|
||||
} render_buffer;
|
||||
|
||||
TilesInfo *tiles;
|
||||
device_vector<int> tiles_mem;
|
||||
void tiles_from_rendertiles(RenderTile *rtiles);
|
||||
|
||||
int4 rect;
|
||||
int4 filter_area;
|
||||
|
||||
struct DeviceFunctions {
|
||||
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
|
||||
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
|
||||
device_ptr variance_ptr, /* Contains the variance of the guide image. */
|
||||
device_ptr out_ptr /* The filtered output is written into this image. */
|
||||
)> non_local_means;
|
||||
function<bool(device_ptr color_ptr,
|
||||
device_ptr color_variance_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr guide_variance_ptr,
|
||||
device_ptr output_ptr
|
||||
)> reconstruct;
|
||||
function<bool()> construct_transform;
|
||||
|
||||
function<bool(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
int r,
|
||||
int4 rect
|
||||
)> combine_halves;
|
||||
function<bool(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr sample_variance_ptr,
|
||||
device_ptr sv_variance_ptr,
|
||||
device_ptr buffer_variance_ptr
|
||||
)> divide_shadow;
|
||||
function<bool(int mean_offset,
|
||||
int variance_offset,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr
|
||||
)> get_feature;
|
||||
function<bool(device_ptr*)> set_tiles;
|
||||
} functions;
|
||||
|
||||
/* Stores state of the current Reconstruction operation,
|
||||
* which is accessed by the device in order to perform the operation. */
|
||||
struct ReconstructionState {
|
||||
device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
|
||||
device_ptr temporary_2_ptr;
|
||||
|
||||
int4 filter_rect;
|
||||
int4 buffer_params;
|
||||
|
||||
int source_w;
|
||||
int source_h;
|
||||
} reconstruction_state;
|
||||
|
||||
/* Stores state of the current NLM operation,
|
||||
* which is accessed by the device in order to perform the operation. */
|
||||
struct NLMState {
|
||||
device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */
|
||||
device_ptr temporary_2_ptr;
|
||||
device_ptr temporary_3_ptr;
|
||||
|
||||
int r; /* Search radius of the filter. */
|
||||
int f; /* Patch size of the filter. */
|
||||
float a; /* Variance compensation factor in the MSE estimation. */
|
||||
float k_2; /* Squared value of the k parameter of the filter. */
|
||||
|
||||
void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
|
||||
} nlm_state;
|
||||
|
||||
struct Storage {
|
||||
device_only_memory<float> transform;
|
||||
device_only_memory<int> rank;
|
||||
device_only_memory<float> XtWX;
|
||||
device_only_memory<float3> XtWY;
|
||||
int w;
|
||||
int h;
|
||||
} storage;
|
||||
|
||||
DenoisingTask(Device *device) : device(device) {}
|
||||
|
||||
void init_from_devicetask(const DeviceTask &task);
|
||||
|
||||
bool run_denoising();
|
||||
|
||||
struct DenoiseBuffers {
|
||||
int pass_stride;
|
||||
int passes;
|
||||
int w;
|
||||
int h;
|
||||
device_only_memory<float> mem;
|
||||
} buffer;
|
||||
|
||||
protected:
|
||||
Device *device;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __DEVICE_DENOISING_H__ */
|
@ -35,6 +35,8 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class Device;
|
||||
|
||||
enum MemoryType {
|
||||
MEM_READ_ONLY,
|
||||
MEM_WRITE_ONLY,
|
||||
@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
|
||||
|
||||
template<> struct device_type_traits<float3> {
|
||||
static const DataType data_type = TYPE_FLOAT;
|
||||
static const int num_elements = 3;
|
||||
static const int num_elements = 4;
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<float4> {
|
||||
@ -173,6 +175,9 @@ class device_memory
|
||||
{
|
||||
public:
|
||||
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
|
||||
size_t memory_elements_size(int elements) {
|
||||
return elements*data_elements*datatype_size(data_type);
|
||||
}
|
||||
|
||||
/* data information */
|
||||
DataType data_type;
|
||||
@ -213,6 +218,22 @@ protected:
|
||||
device_memory& operator = (const device_memory&);
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class device_only_memory : public device_memory
|
||||
{
|
||||
public:
|
||||
device_only_memory()
|
||||
{
|
||||
data_type = device_type_traits<T>::data_type;
|
||||
data_elements = max(device_type_traits<T>::num_elements, 1);
|
||||
}
|
||||
|
||||
void resize(size_t num)
|
||||
{
|
||||
device_memory::resize(num*sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
/* Device Vector */
|
||||
|
||||
template<typename T> class device_vector : public device_memory
|
||||
@ -299,6 +320,27 @@ private:
|
||||
array<T> data;
|
||||
};
|
||||
|
||||
/* A device_sub_ptr is a pointer into another existing memory.
|
||||
* Therefore, it is not allocated separately, but just created from the already allocated base memory.
|
||||
* It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
|
||||
* Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
|
||||
class device_sub_ptr
|
||||
{
|
||||
public:
|
||||
device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
|
||||
~device_sub_ptr();
|
||||
/* No copying. */
|
||||
device_sub_ptr& operator = (const device_sub_ptr&);
|
||||
|
||||
device_ptr operator*() const
|
||||
{
|
||||
return ptr;
|
||||
}
|
||||
protected:
|
||||
Device *device;
|
||||
device_ptr ptr;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __DEVICE_MEMORY_H__ */
|
||||
|
@ -302,6 +302,60 @@ public:
|
||||
return -1;
|
||||
}
|
||||
|
||||
void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
|
||||
{
|
||||
for(int i = 0; i < 9; i++) {
|
||||
if(!tiles[i].buffers) {
|
||||
continue;
|
||||
}
|
||||
/* If the tile was rendered on another device, copy its memory to
|
||||
* to the current device now, for the duration of the denoising task.
|
||||
* Note that this temporarily modifies the RenderBuffers and calls
|
||||
* the device, so this function is not thread safe. */
|
||||
if(tiles[i].buffers->device != sub_device) {
|
||||
device_vector<float> &mem = tiles[i].buffers->buffer;
|
||||
|
||||
tiles[i].buffers->copy_from_device();
|
||||
device_ptr original_ptr = mem.device_pointer;
|
||||
mem.device_pointer = 0;
|
||||
sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
|
||||
sub_device->mem_copy_to(mem);
|
||||
tiles[i].buffer = mem.device_pointer;
|
||||
mem.device_pointer = original_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
|
||||
{
|
||||
for(int i = 0; i < 9; i++) {
|
||||
if(!tiles[i].buffers) {
|
||||
continue;
|
||||
}
|
||||
if(tiles[i].buffers->device != sub_device) {
|
||||
device_vector<float> &mem = tiles[i].buffers->buffer;
|
||||
|
||||
device_ptr original_ptr = mem.device_pointer;
|
||||
mem.device_pointer = tiles[i].buffer;
|
||||
|
||||
/* Copy denoised tile to the host. */
|
||||
if(i == 4) {
|
||||
tiles[i].buffers->copy_from_device(sub_device);
|
||||
}
|
||||
|
||||
size_t mem_size = mem.device_size;
|
||||
sub_device->mem_free(mem);
|
||||
mem.device_pointer = original_ptr;
|
||||
mem.device_size = mem_size;
|
||||
|
||||
/* Copy denoised tile to the original device. */
|
||||
if(i == 4) {
|
||||
tiles[i].buffers->device->mem_copy_to(mem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int get_split_task_count(DeviceTask& task)
|
||||
{
|
||||
int total_tasks = 0;
|
||||
|
@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
|
||||
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
|
||||
|
||||
/* Allocate work_pool_wgs memory. */
|
||||
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
|
||||
work_pool_wgs.resize(max_work_groups);
|
||||
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
|
||||
|
||||
queue_index.resize(NUM_QUEUES * sizeof(int));
|
||||
queue_index.resize(NUM_QUEUES);
|
||||
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
|
||||
|
||||
use_queues_flag.resize(sizeof(char));
|
||||
use_queues_flag.resize(1);
|
||||
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
|
||||
|
||||
ray_state.resize(num_global_elements);
|
||||
|
@ -80,16 +80,16 @@ private:
|
||||
*/
|
||||
device_memory split_data;
|
||||
device_vector<uchar> ray_state;
|
||||
device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
|
||||
device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
|
||||
|
||||
/* Flag to make sceneintersect and lampemission kernel use queues. */
|
||||
device_memory use_queues_flag;
|
||||
device_only_memory<char> use_queues_flag;
|
||||
|
||||
/* Approximate time it takes to complete one sample */
|
||||
double avg_time_per_sample;
|
||||
|
||||
/* Work pool with respect to each work group. */
|
||||
device_memory work_pool_wgs;
|
||||
device_only_memory<unsigned int> work_pool_wgs;
|
||||
|
||||
/* clos_max value for which the kernels have been loaded currently. */
|
||||
int current_max_closure;
|
||||
|
@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
|
||||
if(type == SHADER) {
|
||||
num = min(shader_w, num);
|
||||
}
|
||||
else if(type == PATH_TRACE) {
|
||||
else if(type == RENDER) {
|
||||
}
|
||||
else {
|
||||
num = min(h, num);
|
||||
@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
|
||||
tasks.push_back(task);
|
||||
}
|
||||
}
|
||||
else if(type == PATH_TRACE) {
|
||||
else if(type == RENDER) {
|
||||
for(int i = 0; i < num; i++)
|
||||
tasks.push_back(*this);
|
||||
}
|
||||
@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
|
||||
|
||||
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
|
||||
{
|
||||
if((type != PATH_TRACE) &&
|
||||
if((type != RENDER) &&
|
||||
(type != SHADER))
|
||||
return;
|
||||
|
||||
|
@ -34,7 +34,7 @@ class Tile;
|
||||
|
||||
class DeviceTask : public Task {
|
||||
public:
|
||||
typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
|
||||
typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
|
||||
Type type;
|
||||
|
||||
int x, y, w, h;
|
||||
@ -53,7 +53,7 @@ public:
|
||||
|
||||
int passes_size;
|
||||
|
||||
explicit DeviceTask(Type type = PATH_TRACE);
|
||||
explicit DeviceTask(Type type = RENDER);
|
||||
|
||||
int get_subtask_count(int num, int max_size = 0);
|
||||
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
|
||||
@ -65,6 +65,16 @@ public:
|
||||
function<void(RenderTile&)> update_tile_sample;
|
||||
function<void(RenderTile&)> release_tile;
|
||||
function<bool(void)> get_cancel;
|
||||
function<void(RenderTile*, Device*)> map_neighbor_tiles;
|
||||
function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
|
||||
|
||||
int denoising_radius;
|
||||
float denoising_strength;
|
||||
float denoising_feature_strength;
|
||||
bool denoising_relative_pca;
|
||||
int pass_stride;
|
||||
int pass_denoising_data;
|
||||
int pass_denoising_clean;
|
||||
|
||||
bool need_finish_queue;
|
||||
bool integrator_branched;
|
||||
|
@ -17,6 +17,7 @@
|
||||
#ifdef WITH_OPENCL
|
||||
|
||||
#include "device/device.h"
|
||||
#include "device/device_denoising.h"
|
||||
|
||||
#include "util/util_map.h"
|
||||
#include "util/util_param.h"
|
||||
@ -129,6 +130,8 @@ public:
|
||||
cl_int* error = NULL);
|
||||
static cl_device_type get_device_type(cl_device_id device_id);
|
||||
|
||||
static int mem_address_alignment(cl_device_id device_id);
|
||||
|
||||
/* Get somewhat more readable device name.
|
||||
* Main difference is AMD OpenCL here which only gives code name
|
||||
* for the regular device name. This will give more sane device
|
||||
@ -218,7 +221,7 @@ public:
|
||||
cl_int err = stmt; \
|
||||
\
|
||||
if(err != CL_SUCCESS) { \
|
||||
string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
|
||||
string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
|
||||
if(error_msg == "") \
|
||||
error_msg = message; \
|
||||
fprintf(stderr, "%s\n", message.c_str()); \
|
||||
@ -282,7 +285,7 @@ public:
|
||||
map<ustring, cl_kernel> kernels;
|
||||
};
|
||||
|
||||
OpenCLProgram base_program;
|
||||
OpenCLProgram base_program, denoising_program;
|
||||
|
||||
typedef map<string, device_vector<uchar>*> ConstMemMap;
|
||||
typedef map<string, device_ptr> MemMap;
|
||||
@ -320,6 +323,9 @@ public:
|
||||
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
|
||||
void mem_zero(device_memory& mem);
|
||||
void mem_free(device_memory& mem);
|
||||
|
||||
int mem_address_alignment();
|
||||
|
||||
void const_copy_to(const char *name, void *host, size_t size);
|
||||
void tex_alloc(const char *name,
|
||||
device_memory& mem,
|
||||
@ -328,12 +334,14 @@ public:
|
||||
void tex_free(device_memory& mem);
|
||||
|
||||
size_t global_size_round_up(int group_size, int global_size);
|
||||
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
|
||||
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
|
||||
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
|
||||
|
||||
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
|
||||
void shader(DeviceTask& task);
|
||||
|
||||
void denoise(RenderTile& tile, const DeviceTask& task);
|
||||
|
||||
class OpenCLDeviceTask : public DeviceTask {
|
||||
public:
|
||||
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
|
||||
@ -367,9 +375,48 @@ public:
|
||||
|
||||
virtual void thread_run(DeviceTask * /*task*/) = 0;
|
||||
|
||||
virtual bool is_split_kernel() = 0;
|
||||
|
||||
protected:
|
||||
string kernel_build_options(const string *debug_src = NULL);
|
||||
|
||||
void mem_zero_kernel(device_ptr ptr, size_t size);
|
||||
|
||||
bool denoising_non_local_means(device_ptr image_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr variance_ptr,
|
||||
device_ptr out_ptr,
|
||||
DenoisingTask *task);
|
||||
bool denoising_construct_transform(DenoisingTask *task);
|
||||
bool denoising_reconstruct(device_ptr color_ptr,
|
||||
device_ptr color_variance_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr guide_variance_ptr,
|
||||
device_ptr output_ptr,
|
||||
DenoisingTask *task);
|
||||
bool denoising_combine_halves(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
int r, int4 rect,
|
||||
DenoisingTask *task);
|
||||
bool denoising_divide_shadow(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr sample_variance_ptr,
|
||||
device_ptr sv_variance_ptr,
|
||||
device_ptr buffer_variance_ptr,
|
||||
DenoisingTask *task);
|
||||
bool denoising_get_feature(int mean_offset,
|
||||
int variance_offset,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
DenoisingTask *task);
|
||||
bool denoising_set_tiles(device_ptr *buffers,
|
||||
DenoisingTask *task);
|
||||
|
||||
device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
|
||||
void mem_free_sub_ptr(device_ptr ptr);
|
||||
|
||||
class ArgumentWrapper {
|
||||
public:
|
||||
ArgumentWrapper() : size(0), pointer(NULL)
|
||||
|
@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
|
||||
base_program.add_kernel(ustring("bake"));
|
||||
base_program.add_kernel(ustring("zero_buffer"));
|
||||
|
||||
denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
|
||||
denoising_program.add_kernel(ustring("filter_divide_shadow"));
|
||||
denoising_program.add_kernel(ustring("filter_get_feature"));
|
||||
denoising_program.add_kernel(ustring("filter_combine_halves"));
|
||||
denoising_program.add_kernel(ustring("filter_construct_transform"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_blur"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_update_output"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_normalize"));
|
||||
denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
|
||||
denoising_program.add_kernel(ustring("filter_finalize"));
|
||||
denoising_program.add_kernel(ustring("filter_set_tiles"));
|
||||
|
||||
vector<OpenCLProgram*> programs;
|
||||
programs.push_back(&base_program);
|
||||
programs.push_back(&denoising_program);
|
||||
/* Call actual class to fill the vector with its programs. */
|
||||
if(!load_kernels(requested_features, programs)) {
|
||||
return false;
|
||||
@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
|
||||
NULL, NULL));
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
|
||||
{
|
||||
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
|
||||
|
||||
size_t global_size[] = {1024, 1024};
|
||||
size_t num_threads = global_size[0] * global_size[1];
|
||||
|
||||
cl_mem d_buffer = CL_MEM_PTR(mem);
|
||||
cl_ulong d_offset = 0;
|
||||
cl_ulong d_size = 0;
|
||||
|
||||
while(d_offset < size) {
|
||||
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
|
||||
|
||||
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
|
||||
|
||||
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
|
||||
ckZeroBuffer,
|
||||
2,
|
||||
NULL,
|
||||
global_size,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL);
|
||||
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
|
||||
|
||||
d_offset += d_size;
|
||||
}
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::mem_zero(device_memory& mem)
|
||||
{
|
||||
if(mem.device_pointer) {
|
||||
if(base_program.is_loaded()) {
|
||||
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
|
||||
|
||||
size_t global_size[] = {1024, 1024};
|
||||
size_t num_threads = global_size[0] * global_size[1];
|
||||
|
||||
cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
|
||||
cl_ulong d_offset = 0;
|
||||
cl_ulong d_size = 0;
|
||||
|
||||
while(d_offset < mem.memory_size()) {
|
||||
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
|
||||
|
||||
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
|
||||
|
||||
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
|
||||
ckZeroBuffer,
|
||||
2,
|
||||
NULL,
|
||||
global_size,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL);
|
||||
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
|
||||
|
||||
d_offset += d_size;
|
||||
}
|
||||
mem_zero_kernel(mem.device_pointer, mem.memory_size());
|
||||
}
|
||||
|
||||
if(mem.data_pointer) {
|
||||
@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
|
||||
}
|
||||
}
|
||||
|
||||
int OpenCLDeviceBase::mem_address_alignment()
|
||||
{
|
||||
return OpenCLInfo::mem_address_alignment(cdDevice);
|
||||
}
|
||||
|
||||
device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
|
||||
{
|
||||
cl_mem_flags mem_flag;
|
||||
if(type == MEM_READ_ONLY)
|
||||
mem_flag = CL_MEM_READ_ONLY;
|
||||
else if(type == MEM_WRITE_ONLY)
|
||||
mem_flag = CL_MEM_WRITE_ONLY;
|
||||
else
|
||||
mem_flag = CL_MEM_READ_WRITE;
|
||||
|
||||
cl_buffer_region info;
|
||||
info.origin = mem.memory_elements_size(offset);
|
||||
info.size = mem.memory_elements_size(size);
|
||||
|
||||
device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
|
||||
mem_flag,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
&info,
|
||||
&ciErr);
|
||||
opencl_assert_err(ciErr, "clCreateSubBuffer");
|
||||
return sub_buf;
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
|
||||
{
|
||||
if(device_pointer && device_pointer != null_mem) {
|
||||
opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
|
||||
}
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
|
||||
{
|
||||
ConstMemMap::iterator i = const_mem_map.find(name);
|
||||
@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
|
||||
return global_size + ((r == 0)? 0: group_size - r);
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
|
||||
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
|
||||
{
|
||||
size_t workgroup_size, max_work_items[3];
|
||||
|
||||
@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
|
||||
clGetDeviceInfo(cdDevice,
|
||||
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
|
||||
|
||||
if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
|
||||
workgroup_size = max_workgroup_size;
|
||||
}
|
||||
|
||||
/* Try to divide evenly over 2 dimensions. */
|
||||
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
||||
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
|
||||
@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
|
||||
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr variance_ptr,
|
||||
device_ptr out_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
int4 rect = task->rect;
|
||||
int w = rect.z-rect.x;
|
||||
int h = rect.w-rect.y;
|
||||
int r = task->nlm_state.r;
|
||||
int f = task->nlm_state.f;
|
||||
float a = task->nlm_state.a;
|
||||
float k_2 = task->nlm_state.k_2;
|
||||
|
||||
cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
|
||||
cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
|
||||
cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
|
||||
|
||||
cl_mem image_mem = CL_MEM_PTR(image_ptr);
|
||||
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
|
||||
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
|
||||
cl_mem out_mem = CL_MEM_PTR(out_ptr);
|
||||
|
||||
mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
|
||||
mem_zero_kernel(out_ptr, sizeof(float)*w*h);
|
||||
|
||||
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
|
||||
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
|
||||
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
|
||||
cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
|
||||
cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
|
||||
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
dx, dy, guide_mem, variance_mem,
|
||||
difference, local_rect, w, 0, a, k_2);
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference, blurDifference, local_rect, w, f);
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference, difference, local_rect, w, f);
|
||||
kernel_set_args(ckNLMUpdateOutput, 0,
|
||||
dx, dy, blurDifference, image_mem,
|
||||
out_mem, weightAccum, local_rect, w, f);
|
||||
|
||||
enqueue_kernel(ckNLMCalcDifference, w, h);
|
||||
enqueue_kernel(ckNLMBlur, w, h);
|
||||
enqueue_kernel(ckNLMCalcWeight, w, h);
|
||||
enqueue_kernel(ckNLMBlur, w, h);
|
||||
enqueue_kernel(ckNLMUpdateOutput, w, h);
|
||||
}
|
||||
|
||||
int4 local_rect = make_int4(0, 0, w, h);
|
||||
kernel_set_args(ckNLMNormalize, 0,
|
||||
out_mem, weightAccum, local_rect, w);
|
||||
enqueue_kernel(ckNLMNormalize, w, h);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
|
||||
{
|
||||
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
|
||||
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
|
||||
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
|
||||
|
||||
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
|
||||
|
||||
kernel_set_args(ckFilterConstructTransform, 0,
|
||||
buffer_mem,
|
||||
transform_mem,
|
||||
rank_mem,
|
||||
task->filter_area,
|
||||
task->rect,
|
||||
task->buffer.pass_stride,
|
||||
task->radius,
|
||||
task->pca_threshold);
|
||||
|
||||
enqueue_kernel(ckFilterConstructTransform,
|
||||
task->storage.w,
|
||||
task->storage.h,
|
||||
256);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
|
||||
device_ptr color_variance_ptr,
|
||||
device_ptr guide_ptr,
|
||||
device_ptr guide_variance_ptr,
|
||||
device_ptr output_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
mem_zero(task->storage.XtWX);
|
||||
mem_zero(task->storage.XtWY);
|
||||
|
||||
cl_mem color_mem = CL_MEM_PTR(color_ptr);
|
||||
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
|
||||
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
|
||||
cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
|
||||
cl_mem output_mem = CL_MEM_PTR(output_ptr);
|
||||
|
||||
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
|
||||
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
|
||||
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
|
||||
cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
|
||||
cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
|
||||
|
||||
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
|
||||
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
|
||||
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
|
||||
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
|
||||
cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
|
||||
|
||||
cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
|
||||
cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
|
||||
|
||||
int r = task->radius;
|
||||
int f = 4;
|
||||
float a = 1.0f;
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
|
||||
int local_rect[4] = {max(0, -dx), max(0, -dy),
|
||||
task->reconstruction_state.source_w - max(0, dx),
|
||||
task->reconstruction_state.source_h - max(0, dy)};
|
||||
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
dx, dy,
|
||||
guide_mem,
|
||||
guide_variance_mem,
|
||||
difference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
task->buffer.pass_stride,
|
||||
a, task->nlm_k_2);
|
||||
enqueue_kernel(ckNLMCalcDifference,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference,
|
||||
blurDifference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
f);
|
||||
enqueue_kernel(ckNLMBlur,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference,
|
||||
difference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
f);
|
||||
enqueue_kernel(ckNLMCalcWeight,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
/* Reuse previous arguments. */
|
||||
enqueue_kernel(ckNLMBlur,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
kernel_set_args(ckNLMConstructGramian, 0,
|
||||
dx, dy,
|
||||
blurDifference,
|
||||
buffer_mem,
|
||||
color_mem,
|
||||
color_variance_mem,
|
||||
transform_mem,
|
||||
rank_mem,
|
||||
XtWX_mem,
|
||||
XtWY_mem,
|
||||
local_rect,
|
||||
task->reconstruction_state.filter_rect,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
f,
|
||||
task->buffer.pass_stride);
|
||||
enqueue_kernel(ckNLMConstructGramian,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h,
|
||||
256);
|
||||
}
|
||||
|
||||
kernel_set_args(ckFinalize, 0,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
output_mem,
|
||||
rank_mem,
|
||||
XtWX_mem,
|
||||
XtWY_mem,
|
||||
task->filter_area,
|
||||
task->reconstruction_state.buffer_params,
|
||||
task->render_buffer.samples);
|
||||
enqueue_kernel(ckFinalize,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
int r, int4 rect,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
(void) task;
|
||||
|
||||
cl_mem a_mem = CL_MEM_PTR(a_ptr);
|
||||
cl_mem b_mem = CL_MEM_PTR(b_ptr);
|
||||
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
|
||||
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
|
||||
|
||||
cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
|
||||
|
||||
kernel_set_args(ckFilterCombineHalves, 0,
|
||||
mean_mem,
|
||||
variance_mem,
|
||||
a_mem,
|
||||
b_mem,
|
||||
rect,
|
||||
r);
|
||||
enqueue_kernel(ckFilterCombineHalves,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
|
||||
device_ptr b_ptr,
|
||||
device_ptr sample_variance_ptr,
|
||||
device_ptr sv_variance_ptr,
|
||||
device_ptr buffer_variance_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
(void) task;
|
||||
|
||||
cl_mem a_mem = CL_MEM_PTR(a_ptr);
|
||||
cl_mem b_mem = CL_MEM_PTR(b_ptr);
|
||||
cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
|
||||
cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
|
||||
cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
|
||||
|
||||
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
|
||||
|
||||
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
|
||||
|
||||
char split_kernel = is_split_kernel()? 1 : 0;
|
||||
kernel_set_args(ckFilterDivideShadow, 0,
|
||||
task->render_buffer.samples,
|
||||
tiles_mem,
|
||||
a_mem,
|
||||
b_mem,
|
||||
sample_variance_mem,
|
||||
sv_variance_mem,
|
||||
buffer_variance_mem,
|
||||
task->rect,
|
||||
task->render_buffer.pass_stride,
|
||||
task->render_buffer.denoising_data_offset,
|
||||
split_kernel);
|
||||
enqueue_kernel(ckFilterDivideShadow,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
|
||||
int variance_offset,
|
||||
device_ptr mean_ptr,
|
||||
device_ptr variance_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
|
||||
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
|
||||
|
||||
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
|
||||
|
||||
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
|
||||
|
||||
char split_kernel = is_split_kernel()? 1 : 0;
|
||||
kernel_set_args(ckFilterGetFeature, 0,
|
||||
task->render_buffer.samples,
|
||||
tiles_mem,
|
||||
mean_offset,
|
||||
variance_offset,
|
||||
mean_mem,
|
||||
variance_mem,
|
||||
task->rect,
|
||||
task->render_buffer.pass_stride,
|
||||
task->render_buffer.denoising_data_offset,
|
||||
split_kernel);
|
||||
enqueue_kernel(ckFilterGetFeature,
|
||||
task->rect.z-task->rect.x,
|
||||
task->rect.w-task->rect.y);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
|
||||
mem_copy_to(task->tiles_mem);
|
||||
|
||||
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
|
||||
|
||||
cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
|
||||
|
||||
kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
|
||||
for(int i = 0; i < 9; i++) {
|
||||
cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
|
||||
kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
|
||||
}
|
||||
|
||||
enqueue_kernel(ckFilterSetTiles, 1, 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
|
||||
{
|
||||
DenoisingTask denoising(this);
|
||||
|
||||
denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
|
||||
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
|
||||
denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
|
||||
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
|
||||
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
|
||||
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
|
||||
denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
|
||||
|
||||
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
|
||||
denoising.render_buffer.samples = rtile.sample;
|
||||
|
||||
RenderTile rtiles[9];
|
||||
rtiles[4] = rtile;
|
||||
task.map_neighbor_tiles(rtiles, this);
|
||||
denoising.tiles_from_rendertiles(rtiles);
|
||||
|
||||
denoising.init_from_devicetask(task);
|
||||
|
||||
denoising.run_denoising();
|
||||
|
||||
task.unmap_neighbor_tiles(rtiles, this);
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::shader(DeviceTask& task)
|
||||
{
|
||||
/* cast arguments to cl types */
|
||||
|
@ -108,41 +108,53 @@ public:
|
||||
else if(task->type == DeviceTask::SHADER) {
|
||||
shader(*task);
|
||||
}
|
||||
else if(task->type == DeviceTask::PATH_TRACE) {
|
||||
else if(task->type == DeviceTask::RENDER) {
|
||||
RenderTile tile;
|
||||
/* Keep rendering tiles until done. */
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
if(tile.task == RenderTile::PATH_TRACE) {
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
|
||||
path_trace(tile, sample);
|
||||
|
||||
tile.sample = sample + 1;
|
||||
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
|
||||
path_trace(tile, sample);
|
||||
|
||||
tile.sample = sample + 1;
|
||||
|
||||
/* Complete kernel execution before release tile */
|
||||
/* This helps in multi-device render;
|
||||
* The device that reaches the critical-section function
|
||||
* release_tile waits (stalling other devices from entering
|
||||
* release_tile) for all kernels to complete. If device1 (a
|
||||
* slow-render device) reaches release_tile first then it would
|
||||
* stall device2 (a fast-render device) from proceeding to render
|
||||
* next tile.
|
||||
*/
|
||||
clFinish(cqCommandQueue);
|
||||
}
|
||||
else if(tile.task == RenderTile::DENOISE) {
|
||||
tile.sample = tile.start_sample + tile.num_samples;
|
||||
denoise(tile, *task);
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
|
||||
/* Complete kernel execution before release tile */
|
||||
/* This helps in multi-device render;
|
||||
* The device that reaches the critical-section function
|
||||
* release_tile waits (stalling other devices from entering
|
||||
* release_tile) for all kernels to complete. If device1 (a
|
||||
* slow-render device) reaches release_tile first then it would
|
||||
* stall device2 (a fast-render device) from proceeding to render
|
||||
* next tile.
|
||||
*/
|
||||
clFinish(cqCommandQueue);
|
||||
|
||||
task->release_tile(tile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool is_split_kernel()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
|
||||
|
@ -104,7 +104,7 @@ public:
|
||||
else if(task->type == DeviceTask::SHADER) {
|
||||
shader(*task);
|
||||
}
|
||||
else if(task->type == DeviceTask::PATH_TRACE) {
|
||||
else if(task->type == DeviceTask::RENDER) {
|
||||
RenderTile tile;
|
||||
|
||||
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
|
||||
@ -127,21 +127,29 @@ public:
|
||||
|
||||
/* Keep rendering tiles until done. */
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
split_kernel->path_trace(task,
|
||||
tile,
|
||||
kgbuffer,
|
||||
*const_mem_map["__data"]);
|
||||
if(tile.task == RenderTile::PATH_TRACE) {
|
||||
assert(tile.task == RenderTile::PATH_TRACE);
|
||||
split_kernel->path_trace(task,
|
||||
tile,
|
||||
kgbuffer,
|
||||
*const_mem_map["__data"]);
|
||||
|
||||
/* Complete kernel execution before release tile. */
|
||||
/* This helps in multi-device render;
|
||||
* The device that reaches the critical-section function
|
||||
* release_tile waits (stalling other devices from entering
|
||||
* release_tile) for all kernels to complete. If device1 (a
|
||||
* slow-render device) reaches release_tile first then it would
|
||||
* stall device2 (a fast-render device) from proceeding to render
|
||||
* next tile.
|
||||
*/
|
||||
clFinish(cqCommandQueue);
|
||||
/* Complete kernel execution before release tile. */
|
||||
/* This helps in multi-device render;
|
||||
* The device that reaches the critical-section function
|
||||
* release_tile waits (stalling other devices from entering
|
||||
* release_tile) for all kernels to complete. If device1 (a
|
||||
* slow-render device) reaches release_tile first then it would
|
||||
* stall device2 (a fast-render device) from proceeding to render
|
||||
* next tile.
|
||||
*/
|
||||
clFinish(cqCommandQueue);
|
||||
}
|
||||
else if(tile.task == RenderTile::DENOISE) {
|
||||
tile.sample = tile.start_sample + tile.num_samples;
|
||||
denoise(tile, *task);
|
||||
task->update_progress(&tile, tile.w*tile.h);
|
||||
}
|
||||
|
||||
task->release_tile(tile);
|
||||
}
|
||||
@ -150,6 +158,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
bool is_split_kernel()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
/* ** Those guys are for workign around some compiler-specific bugs ** */
|
||||
|
||||
|
@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
|
||||
return get_device_name(device_id);
|
||||
}
|
||||
|
||||
int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
|
||||
{
|
||||
int base_align_bits;
|
||||
if(clGetDeviceInfo(device_id,
|
||||
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
|
||||
sizeof(int),
|
||||
&base_align_bits,
|
||||
NULL) == CL_SUCCESS)
|
||||
{
|
||||
return base_align_bits/8;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
@ -10,7 +10,23 @@ set(INC_SYS
|
||||
|
||||
set(SRC
|
||||
kernels/cpu/kernel.cpp
|
||||
kernels/cpu/kernel_sse2.cpp
|
||||
kernels/cpu/kernel_sse3.cpp
|
||||
kernels/cpu/kernel_sse41.cpp
|
||||
kernels/cpu/kernel_avx.cpp
|
||||
kernels/cpu/kernel_avx2.cpp
|
||||
kernels/cpu/kernel_split.cpp
|
||||
kernels/cpu/kernel_split_sse2.cpp
|
||||
kernels/cpu/kernel_split_sse3.cpp
|
||||
kernels/cpu/kernel_split_sse41.cpp
|
||||
kernels/cpu/kernel_split_avx.cpp
|
||||
kernels/cpu/kernel_split_avx2.cpp
|
||||
kernels/cpu/filter.cpp
|
||||
kernels/cpu/filter_sse2.cpp
|
||||
kernels/cpu/filter_sse3.cpp
|
||||
kernels/cpu/filter_sse41.cpp
|
||||
kernels/cpu/filter_avx.cpp
|
||||
kernels/cpu/filter_avx2.cpp
|
||||
kernels/opencl/kernel.cl
|
||||
kernels/opencl/kernel_state_buffer_size.cl
|
||||
kernels/opencl/kernel_split.cl
|
||||
@ -32,8 +48,10 @@ set(SRC
|
||||
kernels/opencl/kernel_next_iteration_setup.cl
|
||||
kernels/opencl/kernel_indirect_subsurface.cl
|
||||
kernels/opencl/kernel_buffer_update.cl
|
||||
kernels/opencl/filter.cl
|
||||
kernels/cuda/kernel.cu
|
||||
kernels/cuda/kernel_split.cu
|
||||
kernels/cuda/filter.cu
|
||||
)
|
||||
|
||||
set(SRC_BVH_HEADERS
|
||||
@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
|
||||
kernels/cpu/kernel_cpu.h
|
||||
kernels/cpu/kernel_cpu_impl.h
|
||||
kernels/cpu/kernel_cpu_image.h
|
||||
kernels/cpu/filter_cpu.h
|
||||
kernels/cpu/filter_cpu_impl.h
|
||||
)
|
||||
|
||||
set(SRC_KERNELS_CUDA_HEADERS
|
||||
@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
|
||||
geom/geom_volume.h
|
||||
)
|
||||
|
||||
set(SRC_FILTER_HEADERS
|
||||
filter/filter.h
|
||||
filter/filter_defines.h
|
||||
filter/filter_features.h
|
||||
filter/filter_features_sse.h
|
||||
filter/filter_kernel.h
|
||||
filter/filter_nlm_cpu.h
|
||||
filter/filter_nlm_gpu.h
|
||||
filter/filter_prefilter.h
|
||||
filter/filter_reconstruction.h
|
||||
filter/filter_transform.h
|
||||
filter/filter_transform_gpu.h
|
||||
filter/filter_transform_sse.h
|
||||
)
|
||||
|
||||
set(SRC_UTIL_HEADERS
|
||||
../util/util_atomic.h
|
||||
../util/util_color.h
|
||||
@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
|
||||
../util/util_math_int2.h
|
||||
../util/util_math_int3.h
|
||||
../util/util_math_int4.h
|
||||
../util/util_math_matrix.h
|
||||
../util/util_static_assert.h
|
||||
../util/util_transform.h
|
||||
../util/util_texture.h
|
||||
@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
${SRC_UTIL_HEADERS}
|
||||
)
|
||||
set(cuda_filter_sources kernels/cuda/filter.cu
|
||||
${SRC_HEADERS}
|
||||
${SRC_KERNELS_CUDA_HEADERS}
|
||||
${SRC_FILTER_HEADERS}
|
||||
${SRC_UTIL_HEADERS}
|
||||
)
|
||||
set(cuda_cubins)
|
||||
|
||||
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
|
||||
if(${split})
|
||||
set(cuda_extra_flags "-D__SPLIT__")
|
||||
set(cuda_cubin kernel_split)
|
||||
else()
|
||||
set(cuda_extra_flags "")
|
||||
set(cuda_cubin kernel)
|
||||
endif()
|
||||
|
||||
macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
|
||||
if(${experimental})
|
||||
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
|
||||
set(cuda_cubin ${cuda_cubin}_experimental)
|
||||
set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
|
||||
set(name ${name}_experimental)
|
||||
endif()
|
||||
|
||||
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
|
||||
set(cuda_cubin ${name}_${arch}.cubin)
|
||||
|
||||
if(WITH_CYCLES_DEBUG)
|
||||
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
|
||||
@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
|
||||
set(cuda_math_flags "--use_fast_math")
|
||||
|
||||
if(split)
|
||||
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
|
||||
else()
|
||||
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
|
||||
endif()
|
||||
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${cuda_cubin}
|
||||
@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
${cuda_arch_flags}
|
||||
${cuda_version_flags}
|
||||
${cuda_math_flags}
|
||||
${cuda_extra_flags}
|
||||
${flags}
|
||||
${cuda_debug_flags}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR}/..
|
||||
-DCCL_NAMESPACE_BEGIN=
|
||||
-DCCL_NAMESPACE_END=
|
||||
-DNVCC
|
||||
DEPENDS ${cuda_sources})
|
||||
DEPENDS ${sources})
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND cuda_cubins ${cuda_cubin})
|
||||
@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
|
||||
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
|
||||
# Compile regular kernel
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
|
||||
|
||||
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
|
||||
# Compile split kernel
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
|
||||
|
||||
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||
|
||||
if(CXX_HAS_SSE)
|
||||
list(APPEND SRC
|
||||
kernels/cpu/kernel_sse2.cpp
|
||||
kernels/cpu/kernel_sse3.cpp
|
||||
kernels/cpu/kernel_sse41.cpp
|
||||
kernels/cpu/kernel_split_sse2.cpp
|
||||
kernels/cpu/kernel_split_sse3.cpp
|
||||
kernels/cpu/kernel_split_sse41.cpp
|
||||
)
|
||||
|
||||
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(CXX_HAS_AVX)
|
||||
list(APPEND SRC
|
||||
kernels/cpu/kernel_avx.cpp
|
||||
kernels/cpu/kernel_split_avx.cpp
|
||||
)
|
||||
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(CXX_HAS_AVX2)
|
||||
list(APPEND SRC
|
||||
kernels/cpu/kernel_avx2.cpp
|
||||
kernels/cpu/kernel_split_avx2.cpp
|
||||
)
|
||||
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
add_library(cycles_kernel
|
||||
@ -432,6 +452,7 @@ add_library(cycles_kernel
|
||||
${SRC_KERNELS_CUDA_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
${SRC_FILTER_HEADERS}
|
||||
${SRC_SVM_HEADERS}
|
||||
${SRC_GEOM_HEADERS}
|
||||
${SRC_SPLIT_HEADERS}
|
||||
@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
|
||||
|
@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Classifies a closure as diffuse-like or specular-like.
|
||||
* This is needed for the denoising feature pass generation,
|
||||
* which are written on the first bounce where more than 25%
|
||||
* of the sampling weight belongs to diffuse-line closures. */
|
||||
ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
|
||||
{
|
||||
if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
|
||||
MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
|
||||
return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
|
||||
|
||||
float sigma;
|
||||
float invsigma2;
|
||||
float3 N;
|
||||
} VelvetBsdf;
|
||||
|
||||
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
|
||||
|
@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
typedef ccl_addr_space struct DiffuseBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
float3 N;
|
||||
} DiffuseBsdf;
|
||||
|
||||
/* DIFFUSE */
|
||||
|
@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
|
||||
typedef ccl_addr_space struct DiffuseRampBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float3 N;
|
||||
float3 *colors;
|
||||
} DiffuseRampBsdf;
|
||||
|
||||
|
@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
|
||||
float alpha_x, alpha_y, ior;
|
||||
MicrofacetExtra *extra;
|
||||
float3 T;
|
||||
float3 N;
|
||||
} MicrofacetBsdf;
|
||||
|
||||
/* Beckmann and GGX microfacet importance sampling. */
|
||||
|
@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
|
||||
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
|
||||
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
|
||||
{
|
||||
if(cosI > 0.9999f || cosI < 1e-6f) {
|
||||
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
|
||||
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
|
||||
const float phi = M_2PI_F * randU.y;
|
||||
return make_float2(r*cosf(phi), r*sinf(phi));
|
||||
|
@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
|
||||
typedef ccl_addr_space struct OrenNayarBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float3 N;
|
||||
float roughness;
|
||||
float a;
|
||||
float b;
|
||||
|
@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
|
||||
typedef ccl_addr_space struct PhongRampBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float3 N;
|
||||
float exponent;
|
||||
float3 *colors;
|
||||
} PhongRampBsdf;
|
||||
|
@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float roughness;
|
||||
float3 N;
|
||||
} PrincipledDiffuseBsdf;
|
||||
|
||||
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
|
||||
|
@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
typedef ccl_addr_space struct PrincipledSheenBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
float3 N;
|
||||
} PrincipledSheenBsdf;
|
||||
|
||||
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
|
||||
|
@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
|
||||
typedef ccl_addr_space struct ToonBsdf {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float3 N;
|
||||
float size;
|
||||
float smooth;
|
||||
} ToonBsdf;
|
||||
|
@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
|
||||
float texture_blur;
|
||||
float albedo;
|
||||
float roughness;
|
||||
float3 N;
|
||||
} Bssrdf;
|
||||
|
||||
/* Planar Truncated Gaussian
|
||||
|
52
intern/cycles/kernel/filter/filter.h
Normal file
52
intern/cycles/kernel/filter/filter.h
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __FILTER_H__
|
||||
#define __FILTER_H__
|
||||
|
||||
/* CPU Filter Kernel Interface */
|
||||
|
||||
#include "util/util_types.h"
|
||||
|
||||
#include "kernel/filter/filter_defines.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
|
||||
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
|
||||
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
|
||||
|
||||
#define KERNEL_ARCH cpu
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
#define KERNEL_ARCH cpu_sse2
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
#define KERNEL_ARCH cpu_sse3
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
#define KERNEL_ARCH cpu_sse41
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
#define KERNEL_ARCH cpu_avx
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
#define KERNEL_ARCH cpu_avx2
|
||||
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __FILTER_H__ */
|
38
intern/cycles/kernel/filter/filter_defines.h
Normal file
38
intern/cycles/kernel/filter/filter_defines.h
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __FILTER_DEFINES_H__
|
||||
#define __FILTER_DEFINES_H__
|
||||
|
||||
#define DENOISE_FEATURES 10
|
||||
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
|
||||
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
|
||||
#define XTWY_SIZE (DENOISE_FEATURES+1)
|
||||
|
||||
typedef struct TilesInfo {
|
||||
int offsets[9];
|
||||
int strides[9];
|
||||
int x[4];
|
||||
int y[4];
|
||||
/* TODO(lukas): CUDA doesn't have uint64_t... */
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
ccl_global float *buffers[9];
|
||||
#else
|
||||
long long int buffers[9];
|
||||
#endif
|
||||
} TilesInfo;
|
||||
|
||||
#endif /* __FILTER_DEFINES_H__*/
|
120
intern/cycles/kernel/filter/filter_features.h
Normal file
120
intern/cycles/kernel/filter/filter_features.h
Normal file
@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
|
||||
|
||||
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
|
||||
* pixel_buffer always points to the current pixel in the first pass. */
|
||||
#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
|
||||
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
|
||||
for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
|
||||
|
||||
#define END_FOR_PIXEL_WINDOW } \
|
||||
pixel_buffer += buffer_w - (high.x - low.x); \
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
|
||||
{
|
||||
features[0] = pixel.x;
|
||||
features[1] = pixel.y;
|
||||
features[2] = ccl_get_feature(buffer, 0);
|
||||
features[3] = ccl_get_feature(buffer, 1);
|
||||
features[4] = ccl_get_feature(buffer, 2);
|
||||
features[5] = ccl_get_feature(buffer, 3);
|
||||
features[6] = ccl_get_feature(buffer, 4);
|
||||
features[7] = ccl_get_feature(buffer, 5);
|
||||
features[8] = ccl_get_feature(buffer, 6);
|
||||
features[9] = ccl_get_feature(buffer, 7);
|
||||
if(mean) {
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||
features[i] -= mean[i];
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
|
||||
{
|
||||
scales[0] = fabsf(pixel.x - mean[0]);
|
||||
scales[1] = fabsf(pixel.y - mean[1]);
|
||||
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
|
||||
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
|
||||
ccl_get_feature(buffer, 2) - mean[4],
|
||||
ccl_get_feature(buffer, 3) - mean[5]));
|
||||
scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
|
||||
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
|
||||
ccl_get_feature(buffer, 6) - mean[8],
|
||||
ccl_get_feature(buffer, 7) - mean[9]));
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_calculate_scale(float *scale)
|
||||
{
|
||||
scale[0] = 1.0f/max(scale[0], 0.01f);
|
||||
scale[1] = 1.0f/max(scale[1], 0.01f);
|
||||
scale[2] = 1.0f/max(scale[2], 0.01f);
|
||||
scale[6] = 1.0f/max(scale[4], 0.01f);
|
||||
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
|
||||
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
|
||||
}
|
||||
|
||||
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||
{
|
||||
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
|
||||
}
|
||||
|
||||
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||
{
|
||||
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
|
||||
}
|
||||
|
||||
ccl_device_inline void design_row_add(float *design_row,
|
||||
int rank,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
int stride,
|
||||
int row,
|
||||
float feature)
|
||||
{
|
||||
for(int i = 0; i < rank; i++) {
|
||||
design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
|
||||
}
|
||||
}
|
||||
|
||||
/* Fill the design row. */
|
||||
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
|
||||
ccl_global float ccl_restrict_ptr p_buffer,
|
||||
int2 q_pixel,
|
||||
ccl_global float ccl_restrict_ptr q_buffer,
|
||||
int pass_stride,
|
||||
int rank,
|
||||
float *design_row,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
int stride)
|
||||
{
|
||||
design_row[0] = 1.0f;
|
||||
math_vector_zero(design_row+1, rank);
|
||||
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
|
||||
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
|
||||
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
|
||||
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
|
||||
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
|
||||
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
|
||||
design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
|
||||
design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
|
||||
design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
|
||||
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
95
intern/cycles/kernel/filter/filter_features_sse.h
Normal file
95
intern/cycles/kernel/filter/filter_features_sse.h
Normal file
@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
|
||||
|
||||
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
|
||||
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
|
||||
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
|
||||
|
||||
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
|
||||
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
|
||||
__m128 y4 = _mm_set1_ps(pixel.y); \
|
||||
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
|
||||
__m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
|
||||
__m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
|
||||
|
||||
#define END_FOR_PIXEL_WINDOW_SSE } \
|
||||
pixel_buffer += buffer_w - (pixel.x - low.x); \
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||
{
|
||||
features[0] = x;
|
||||
features[1] = y;
|
||||
features[2] = ccl_get_feature_sse(0);
|
||||
features[3] = ccl_get_feature_sse(1);
|
||||
features[4] = ccl_get_feature_sse(2);
|
||||
features[5] = ccl_get_feature_sse(3);
|
||||
features[6] = ccl_get_feature_sse(4);
|
||||
features[7] = ccl_get_feature_sse(5);
|
||||
features[8] = ccl_get_feature_sse(6);
|
||||
features[9] = ccl_get_feature_sse(7);
|
||||
if(mean) {
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||
features[i] = _mm_sub_ps(features[i], mean[i]);
|
||||
}
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||
features[i] = _mm_mask_ps(features[i], active_pixels);
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||
{
|
||||
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
|
||||
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
|
||||
|
||||
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
|
||||
|
||||
__m128 diff, scale;
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
|
||||
scale = _mm_mul_ps(diff, diff);
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
|
||||
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
|
||||
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||
scales[3] = _mm_mask_ps(scale, active_pixels);
|
||||
|
||||
scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
|
||||
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
|
||||
scale = _mm_mul_ps(diff, diff);
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
|
||||
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||
diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
|
||||
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||
scales[5] = _mm_mask_ps(scale, active_pixels);
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
|
||||
{
|
||||
scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
|
||||
scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
|
||||
scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
|
||||
scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
|
||||
|
||||
scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
|
||||
scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
|
||||
}
|
||||
|
||||
|
||||
CCL_NAMESPACE_END
|
50
intern/cycles/kernel/filter/filter_kernel.h
Normal file
50
intern/cycles/kernel/filter/filter_kernel.h
Normal file
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "util/util_color.h"
|
||||
#include "util/util_math.h"
|
||||
#include "util/util_math_fast.h"
|
||||
#include "util/util_texture.h"
|
||||
|
||||
#include "util/util_atomic.h"
|
||||
#include "util/util_math_matrix.h"
|
||||
|
||||
#include "kernel/filter/filter_defines.h"
|
||||
|
||||
#include "kernel/filter/filter_features.h"
|
||||
#ifdef __KERNEL_SSE3__
|
||||
# include "kernel/filter/filter_features_sse.h"
|
||||
#endif
|
||||
|
||||
#include "kernel/filter/filter_prefilter.h"
|
||||
|
||||
#ifdef __KERNEL_GPU__
|
||||
# include "kernel/filter/filter_transform_gpu.h"
|
||||
#else
|
||||
# ifdef __KERNEL_SSE3__
|
||||
# include "kernel/filter/filter_transform_sse.h"
|
||||
# else
|
||||
# include "kernel/filter/filter_transform.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "kernel/filter/filter_reconstruction.h"
|
||||
|
||||
#ifdef __KERNEL_CPU__
|
||||
# include "kernel/filter/filter_nlm_cpu.h"
|
||||
#else
|
||||
# include "kernel/filter/filter_nlm_gpu.h"
|
||||
#endif
|
163
intern/cycles/kernel/filter/filter_nlm_cpu.h
Normal file
163
intern/cycles/kernel/filter/filter_nlm_cpu.h
Normal file
@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
float diff = 0.0f;
|
||||
int numChannels = channel_offset? 3 : 1;
|
||||
for(int c = 0; c < numChannels; c++) {
|
||||
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float pvar = varianceImage[c*channel_offset + y*w+x];
|
||||
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
|
||||
}
|
||||
if(numChannels > 1) {
|
||||
diff *= 1.0f/numChannels;
|
||||
}
|
||||
differenceImage[y*w+x] = diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE3__
|
||||
int aligned_lowx = (rect.x & ~(3));
|
||||
int aligned_highx = ((rect.z + 3) & ~(3));
|
||||
#endif
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
const int low = max(rect.y, y-f);
|
||||
const int high = min(rect.w, y+f+1);
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
outImage[y*w+x] = 0.0f;
|
||||
}
|
||||
for(int y1 = low; y1 < high; y1++) {
|
||||
#ifdef __KERNEL_SSE3__
|
||||
for(int x = aligned_lowx; x < aligned_highx; x+=4) {
|
||||
_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
|
||||
}
|
||||
#else
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
outImage[y*w+x] += differenceImage[y1*w+x];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
outImage[y*w+x] *= 1.0f/(high - low);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
outImage[y*w+x] = 0.0f;
|
||||
}
|
||||
}
|
||||
for(int dx = -f; dx <= f; dx++) {
|
||||
int pos_dx = max(0, dx);
|
||||
int neg_dx = min(0, dx);
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
|
||||
outImage[y*w+x] += differenceImage[y*w+dx+x];
|
||||
}
|
||||
}
|
||||
}
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += differenceImage[y*w+x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
accumImage[y*w+x] += weight;
|
||||
outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float *transform,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int pass_stride)
|
||||
{
|
||||
/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
|
||||
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
|
||||
int y = fy + filter_rect.y;
|
||||
for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
|
||||
int x = fx + filter_rect.x;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += differenceImage[y*w+x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
|
||||
int storage_ofs = fy*filter_rect.z + fx;
|
||||
float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
|
||||
float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
|
||||
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
|
||||
int *l_rank = rank + storage_ofs;
|
||||
|
||||
kernel_filter_construct_gramian(x, y, 1,
|
||||
dx, dy, w, h,
|
||||
pass_stride,
|
||||
buffer,
|
||||
color_pass, variance_pass,
|
||||
l_transform, l_rank,
|
||||
weight, l_XtWX, l_XtWY, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
outImage[y*w+x] /= accumImage[y*w+x];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
147
intern/cycles/kernel/filter/filter_nlm_gpu.h
Normal file
147
intern/cycles/kernel/filter/filter_nlm_gpu.h
Normal file
@ -0,0 +1,147 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr weightImage,
|
||||
ccl_global float ccl_restrict_ptr varianceImage,
|
||||
ccl_global float *differenceImage,
|
||||
int4 rect, int w,
|
||||
int channel_offset,
|
||||
float a, float k_2)
|
||||
{
|
||||
float diff = 0.0f;
|
||||
int numChannels = channel_offset? 3 : 1;
|
||||
for(int c = 0; c < numChannels; c++) {
|
||||
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float pvar = varianceImage[c*channel_offset + y*w+x];
|
||||
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
|
||||
}
|
||||
if(numChannels > 1) {
|
||||
diff *= 1.0f/numChannels;
|
||||
}
|
||||
differenceImage[y*w+x] = diff;
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect, int w, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.y, y-f);
|
||||
const int high = min(rect.w, y+f+1);
|
||||
for(int y1 = low; y1 < high; y1++) {
|
||||
sum += differenceImage[y1*w+x];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
outImage[y*w+x] = sum;
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect, int w, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += differenceImage[y*w+x1];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
outImage[y*w+x] = expf(-max(sum, 0.0f));
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr image,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float *accumImage,
|
||||
int4 rect, int w, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += differenceImage[y*w+x1];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
if(outImage) {
|
||||
accumImage[y*w+x] += sum;
|
||||
outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
|
||||
}
|
||||
else {
|
||||
accumImage[y*w+x] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int pass_stride,
|
||||
int localIdx)
|
||||
{
|
||||
int y = fy + filter_rect.y;
|
||||
int x = fx + filter_rect.x;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += differenceImage[y*w+x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
|
||||
int storage_ofs = fy*filter_rect.z + fx;
|
||||
transform += storage_ofs;
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
|
||||
kernel_filter_construct_gramian(x, y,
|
||||
filter_rect.z*filter_rect.w,
|
||||
dx, dy, w, h,
|
||||
pass_stride,
|
||||
buffer,
|
||||
color_pass, variance_pass,
|
||||
transform, rank,
|
||||
weight, XtWX, XtWY,
|
||||
localIdx);
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float ccl_restrict_ptr accumImage,
|
||||
int4 rect, int w)
|
||||
{
|
||||
outImage[y*w+x] /= accumImage[y*w+x];
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
145
intern/cycles/kernel/filter/filter_prefilter.h
Normal file
145
intern/cycles/kernel/filter/filter_prefilter.h
Normal file
@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* First step of the shadow prefiltering, performs the shadow division and stores all data
|
||||
* in a nice and easy rectangular array that can be passed to the NLM filter.
|
||||
*
|
||||
* Calculates:
|
||||
* unfiltered: Contains the two half images of the shadow feature pass
|
||||
* sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
|
||||
* sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
|
||||
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
|
||||
*/
|
||||
ccl_device void kernel_filter_divide_shadow(int sample,
|
||||
ccl_global TilesInfo *tiles,
|
||||
int x, int y,
|
||||
ccl_global float *unfilteredA,
|
||||
ccl_global float *unfilteredB,
|
||||
ccl_global float *sampleVariance,
|
||||
ccl_global float *sampleVarianceV,
|
||||
ccl_global float *bufferVariance,
|
||||
int4 rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
|
||||
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
|
||||
int tile = ytile*3+xtile;
|
||||
|
||||
int offset = tiles->offsets[tile];
|
||||
int stride = tiles->strides[tile];
|
||||
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
|
||||
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
|
||||
center_buffer += buffer_denoising_offset + 14;
|
||||
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||
unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
|
||||
unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
|
||||
|
||||
float varA = center_buffer[2];
|
||||
float varB = center_buffer[5];
|
||||
int odd_sample = (sample+1)/2;
|
||||
int even_sample = sample/2;
|
||||
if(use_split_variance) {
|
||||
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
|
||||
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
|
||||
}
|
||||
varA /= (odd_sample - 1);
|
||||
varB /= (even_sample - 1);
|
||||
|
||||
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
|
||||
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
|
||||
bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
|
||||
}
|
||||
|
||||
/* Load a regular feature from the render buffers into the denoise buffer.
|
||||
* Parameters:
|
||||
* - sample: The sample amount in the buffer, used to normalize the buffer.
|
||||
* - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
|
||||
* - x, y: Current pixel
|
||||
* - mean, variance: Target denoise buffers.
|
||||
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
|
||||
*/
|
||||
ccl_device void kernel_filter_get_feature(int sample,
|
||||
ccl_global TilesInfo *tiles,
|
||||
int m_offset, int v_offset,
|
||||
int x, int y,
|
||||
ccl_global float *mean,
|
||||
ccl_global float *variance,
|
||||
int4 rect, int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
|
||||
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
|
||||
int tile = ytile*3+xtile;
|
||||
ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
|
||||
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||
|
||||
mean[idx] = center_buffer[m_offset] / sample;
|
||||
if(use_split_variance) {
|
||||
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
|
||||
}
|
||||
else {
|
||||
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
|
||||
}
|
||||
}
|
||||
|
||||
/* Combine A/B buffers.
|
||||
* Calculates the combined mean and the buffer variance. */
|
||||
ccl_device void kernel_filter_combine_halves(int x, int y,
|
||||
ccl_global float *mean,
|
||||
ccl_global float *variance,
|
||||
ccl_global float *a,
|
||||
ccl_global float *b,
|
||||
int4 rect, int r)
|
||||
{
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||
|
||||
if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
|
||||
if(variance) {
|
||||
if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
|
||||
else {
|
||||
variance[idx] = 0.0f;
|
||||
float values[25];
|
||||
int numValues = 0;
|
||||
for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
|
||||
for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
|
||||
int pidx = (py-rect.y)*buffer_w + (px-rect.x);
|
||||
values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
|
||||
}
|
||||
}
|
||||
/* Insertion-sort the variances (fast enough for 25 elements). */
|
||||
for(int i = 1; i < numValues; i++) {
|
||||
float v = values[i];
|
||||
int j;
|
||||
for(j = i-1; j >= 0 && values[j] > v; j--)
|
||||
values[j+1] = values[j];
|
||||
values[j+1] = v;
|
||||
}
|
||||
variance[idx] = values[(7*numValues)/8];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
103
intern/cycles/kernel/filter/filter_reconstruction.h
Normal file
103
intern/cycles/kernel/filter/filter_reconstruction.h
Normal file
@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
|
||||
int storage_stride,
|
||||
int dx, int dy,
|
||||
int w, int h,
|
||||
int pass_stride,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
ccl_global int *rank,
|
||||
float weight,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int localIdx)
|
||||
{
|
||||
int p_offset = y *w + x;
|
||||
int q_offset = (y+dy)*w + (x+dx);
|
||||
|
||||
#ifdef __KERNEL_CPU__
|
||||
const int stride = 1;
|
||||
(void)storage_stride;
|
||||
(void)localIdx;
|
||||
float design_row[DENOISE_FEATURES+1];
|
||||
#elif defined(__KERNEL_CUDA__)
|
||||
const int stride = storage_stride;
|
||||
ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
|
||||
ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
|
||||
#else
|
||||
const int stride = storage_stride;
|
||||
float design_row[DENOISE_FEATURES+1];
|
||||
#endif
|
||||
|
||||
float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
|
||||
float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
|
||||
|
||||
float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
|
||||
float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
|
||||
|
||||
if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
|
||||
make_int2(x+dx, y+dy), buffer + q_offset,
|
||||
pass_stride, *rank, design_row, transform, stride);
|
||||
|
||||
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
|
||||
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
|
||||
ccl_global float *buffer,
|
||||
ccl_global int *rank,
|
||||
int storage_stride,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 buffer_params,
|
||||
int sample)
|
||||
{
|
||||
#ifdef __KERNEL_CPU__
|
||||
const int stride = 1;
|
||||
(void)storage_stride;
|
||||
#else
|
||||
const int stride = storage_stride;
|
||||
#endif
|
||||
|
||||
math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
|
||||
|
||||
float3 final_color = XtWY[0];
|
||||
|
||||
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
|
||||
final_color *= sample;
|
||||
if(buffer_params.w) {
|
||||
final_color.x += combined_buffer[buffer_params.w+0];
|
||||
final_color.y += combined_buffer[buffer_params.w+1];
|
||||
final_color.z += combined_buffer[buffer_params.w+2];
|
||||
}
|
||||
combined_buffer[0] = final_color.x;
|
||||
combined_buffer[1] = final_color.y;
|
||||
combined_buffer[2] = final_color.z;
|
||||
}
|
||||
|
||||
#undef STORAGE_TYPE
|
||||
|
||||
CCL_NAMESPACE_END
|
113
intern/cycles/kernel/filter/filter_transform.h
Normal file
113
intern/cycles/kernel/filter/filter_transform.h
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
float *transform, int *rank,
|
||||
int radius, float pca_threshold)
|
||||
{
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
|
||||
float features[DENOISE_FEATURES];
|
||||
|
||||
/* Temporary storage, used in different steps of the algorithm. */
|
||||
float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
|
||||
float tempvector[2*DENOISE_FEATURES];
|
||||
float ccl_restrict_ptr pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
|
||||
|
||||
|
||||
/* === Calculate denoising window. === */
|
||||
int2 low = make_int2(max(rect.x, x - radius),
|
||||
max(rect.y, y - radius));
|
||||
int2 high = make_int2(min(rect.z, x + radius + 1),
|
||||
min(rect.w, y + radius + 1));
|
||||
|
||||
|
||||
|
||||
|
||||
/* === Shift feature passes to have mean 0. === */
|
||||
float feature_means[DENOISE_FEATURES];
|
||||
math_vector_zero(feature_means, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
|
||||
math_vector_add(feature_means, features, DENOISE_FEATURES);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
|
||||
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
|
||||
|
||||
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
|
||||
float *feature_scale = tempvector;
|
||||
math_vector_zero(feature_scale, DENOISE_FEATURES);
|
||||
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_max(feature_scale, features, DENOISE_FEATURES);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
filter_calculate_scale(feature_scale);
|
||||
|
||||
|
||||
/* === Generate the feature transformation. ===
|
||||
* This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
|
||||
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
|
||||
float* feature_matrix = tempmatrix;
|
||||
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
|
||||
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
|
||||
*rank = 0;
|
||||
if(pca_threshold < 0.0f) {
|
||||
float threshold_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
|
||||
}
|
||||
threshold_energy *= 1.0f - (-pca_threshold);
|
||||
|
||||
float reduced_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
if(i >= 2 && reduced_energy >= threshold_energy)
|
||||
break;
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
reduced_energy += s;
|
||||
}
|
||||
}
|
||||
else {
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
if(i >= 2 && sqrtf(s) < pca_threshold)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Bake the feature scaling into the transformation matrix. */
|
||||
for(int i = 0; i < (*rank); i++) {
|
||||
math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
|
||||
}
|
||||
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
117
intern/cycles/kernel/filter/filter_transform_gpu.h
Normal file
117
intern/cycles/kernel/filter/filter_transform_gpu.h
Normal file
@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
ccl_global float *transform,
|
||||
ccl_global int *rank,
|
||||
int radius, float pca_threshold,
|
||||
int transform_stride, int localIdx)
|
||||
{
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
|
||||
#ifdef __KERNEL_CUDA__
|
||||
ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
|
||||
ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
|
||||
#else
|
||||
float features[DENOISE_FEATURES];
|
||||
#endif
|
||||
|
||||
/* === Calculate denoising window. === */
|
||||
int2 low = make_int2(max(rect.x, x - radius),
|
||||
max(rect.y, y - radius));
|
||||
int2 high = make_int2(min(rect.z, x + radius + 1),
|
||||
min(rect.w, y + radius + 1));
|
||||
ccl_global float ccl_restrict_ptr pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
|
||||
|
||||
|
||||
/* === Shift feature passes to have mean 0. === */
|
||||
float feature_means[DENOISE_FEATURES];
|
||||
math_vector_zero(feature_means, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
|
||||
math_vector_add(feature_means, features, DENOISE_FEATURES);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
|
||||
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
|
||||
|
||||
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
|
||||
float feature_scale[DENOISE_FEATURES];
|
||||
math_vector_zero(feature_scale, DENOISE_FEATURES);
|
||||
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_max(feature_scale, features, DENOISE_FEATURES);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
filter_calculate_scale(feature_scale);
|
||||
|
||||
|
||||
|
||||
/* === Generate the feature transformation. ===
|
||||
* This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
|
||||
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
|
||||
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
|
||||
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW {
|
||||
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
|
||||
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
|
||||
} END_FOR_PIXEL_WINDOW
|
||||
|
||||
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
|
||||
*rank = 0;
|
||||
if(pca_threshold < 0.0f) {
|
||||
float threshold_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
|
||||
}
|
||||
threshold_energy *= 1.0f - (-pca_threshold);
|
||||
|
||||
float reduced_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
if(i >= 2 && reduced_energy >= threshold_energy)
|
||||
break;
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
reduced_energy += s;
|
||||
}
|
||||
}
|
||||
else {
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
if(i >= 2 && sqrtf(s) < pca_threshold)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
|
||||
|
||||
/* Bake the feature scaling into the transformation matrix. */
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
for(int j = 0; j < (*rank); j++) {
|
||||
transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
102
intern/cycles/kernel/filter/filter_transform_sse.h
Normal file
102
intern/cycles/kernel/filter/filter_transform_sse.h
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
float *transform, int *rank,
|
||||
int radius, float pca_threshold)
|
||||
{
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
|
||||
__m128 features[DENOISE_FEATURES];
|
||||
float ccl_restrict_ptr pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
int2 low = make_int2(max(rect.x, x - radius),
|
||||
max(rect.y, y - radius));
|
||||
int2 high = make_int2(min(rect.z, x + radius + 1),
|
||||
min(rect.w, y + radius + 1));
|
||||
|
||||
__m128 feature_means[DENOISE_FEATURES];
|
||||
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW_SSE {
|
||||
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
|
||||
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
|
||||
} END_FOR_PIXEL_WINDOW_SSE
|
||||
|
||||
__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
|
||||
}
|
||||
|
||||
__m128 feature_scale[DENOISE_FEATURES];
|
||||
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW_SSE {
|
||||
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
|
||||
} END_FOR_PIXEL_WINDOW_SSE
|
||||
|
||||
filter_calculate_scale_sse(feature_scale);
|
||||
|
||||
__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
|
||||
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
|
||||
FOR_PIXEL_WINDOW_SSE {
|
||||
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
|
||||
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
|
||||
math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
|
||||
} END_FOR_PIXEL_WINDOW_SSE
|
||||
|
||||
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
|
||||
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
|
||||
|
||||
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
|
||||
|
||||
*rank = 0;
|
||||
if(pca_threshold < 0.0f) {
|
||||
float threshold_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
|
||||
}
|
||||
threshold_energy *= 1.0f - (-pca_threshold);
|
||||
|
||||
float reduced_energy = 0.0f;
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
if(i >= 2 && reduced_energy >= threshold_energy)
|
||||
break;
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
reduced_energy += s;
|
||||
}
|
||||
}
|
||||
else {
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
|
||||
float s = feature_matrix[i*DENOISE_FEATURES+i];
|
||||
if(i >= 2 && sqrtf(s) < pca_threshold)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
|
||||
|
||||
/* Bake the feature scaling into the transformation matrix. */
|
||||
for(int i = 0; i < DENOISE_FEATURES; i++) {
|
||||
math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
|
||||
|
||||
/* Interpolate smooth vertex normal from vertices */
|
||||
|
||||
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
|
||||
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
|
||||
{
|
||||
/* load triangle vertices */
|
||||
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
|
||||
@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
|
||||
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
|
||||
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
|
||||
|
||||
return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
|
||||
float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
|
||||
|
||||
return is_zero(N)? Ng: N;
|
||||
}
|
||||
|
||||
/* Ray differentials on triangle */
|
||||
|
@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
|
||||
#define KERNEL_ARCH cpu
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# define KERNEL_ARCH cpu_sse2
|
||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||
#define KERNEL_ARCH cpu_sse2
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# define KERNEL_ARCH cpu_sse3
|
||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||
#define KERNEL_ARCH cpu_sse3
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# define KERNEL_ARCH cpu_sse41
|
||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||
#define KERNEL_ARCH cpu_sse41
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# define KERNEL_ARCH cpu_avx
|
||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||
#define KERNEL_ARCH cpu_avx
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# define KERNEL_ARCH cpu_avx2
|
||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||
#define KERNEL_ARCH cpu_avx2
|
||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
|
||||
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
|
||||
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
|
||||
#endif
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
|
||||
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
|
||||
L->denoising_depth = 0.0f;
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
|
||||
@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 throughput,
|
||||
float3 alpha,
|
||||
float3 bsdf,
|
||||
float3 ao,
|
||||
int bounce)
|
||||
float3 ao)
|
||||
{
|
||||
#ifdef __PASSES__
|
||||
if(L->use_light_pass) {
|
||||
if(bounce == 0) {
|
||||
if(state->bounce == 0) {
|
||||
/* directly visible lighting */
|
||||
L->direct_diffuse += throughput*bsdf*ao;
|
||||
L->ao += alpha*throughput*ao;
|
||||
@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
float3 light = throughput * bsdf;
|
||||
L->path_total += light;
|
||||
L->path_total_shaded += ao * light;
|
||||
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||
float3 light = throughput * bsdf;
|
||||
L->path_total += light;
|
||||
L->path_total_shaded += ao * light;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_accum_total_ao(
|
||||
PathRadiance *L,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 throughput,
|
||||
float3 bsdf)
|
||||
{
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
L->path_total += throughput * bsdf;
|
||||
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||
L->path_total += throughput * bsdf;
|
||||
}
|
||||
#else
|
||||
(void) L;
|
||||
(void) state;
|
||||
(void) throughput;
|
||||
(void) bsdf;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
|
||||
ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 throughput,
|
||||
BsdfEval *bsdf_eval,
|
||||
float3 shadow,
|
||||
float shadow_fac,
|
||||
bool is_lamp)
|
||||
{
|
||||
#ifdef __PASSES__
|
||||
if(L->use_light_pass) {
|
||||
if(bounce == 0) {
|
||||
if(state->bounce == 0) {
|
||||
/* directly visible lighting */
|
||||
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
|
||||
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
|
||||
@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
float3 light = throughput * bsdf_eval->sum_no_mis;
|
||||
L->path_total += light;
|
||||
L->path_total_shaded += shadow * light;
|
||||
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||
float3 light = throughput * bsdf_eval->sum_no_mis;
|
||||
L->path_total += light;
|
||||
L->path_total_shaded += shadow * light;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_accum_total_light(
|
||||
PathRadiance *L,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 throughput,
|
||||
const BsdfEval *bsdf_eval)
|
||||
{
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
L->path_total += throughput * bsdf_eval->sum_no_mis;
|
||||
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||
L->path_total += throughput * bsdf_eval->sum_no_mis;
|
||||
}
|
||||
#else
|
||||
(void) L;
|
||||
(void) state;
|
||||
(void) throughput;
|
||||
(void) bsdf_eval;
|
||||
#endif
|
||||
@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
L->path_total += throughput * value;
|
||||
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
|
||||
L->path_total_shaded += throughput * value;
|
||||
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||
L->path_total += throughput * value;
|
||||
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
|
||||
L->path_total_shaded += throughput * value;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
L->denoising_albedo += state->denoising_feature_weight * value;
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
|
||||
@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
|
||||
return L_sum;
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
|
||||
{
|
||||
#ifdef __PASSES__
|
||||
kernel_assert(L->use_light_pass);
|
||||
|
||||
*clean = L->emission + L->background;
|
||||
*noisy = L->direct_scatter + L->indirect_scatter;
|
||||
|
||||
# define ADD_COMPONENT(flag, component) \
|
||||
if(kernel_data.film.denoising_flags & flag) \
|
||||
*clean += component; \
|
||||
else \
|
||||
*noisy += component;
|
||||
|
||||
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
|
||||
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
|
||||
# undef ADD_COMPONENT
|
||||
#else
|
||||
*noisy = L->emission;
|
||||
*clean = make_float3(0.0f, 0.0f, 0.0f);
|
||||
#endif
|
||||
|
||||
*noisy = ensure_finite3(*noisy);
|
||||
*clean = ensure_finite3(*clean);
|
||||
}
|
||||
|
||||
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
|
||||
{
|
||||
float fac = 1.0f/num_samples;
|
||||
@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
|
||||
/* Calculate final light sum and transparency for shadow catcher object. */
|
||||
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
|
||||
const PathRadiance *L,
|
||||
ccl_addr_space float* L_transparent)
|
||||
float* alpha)
|
||||
{
|
||||
const float shadow = path_radiance_sum_shadow(L);
|
||||
float3 L_sum;
|
||||
if(kernel_data.background.transparent) {
|
||||
*L_transparent = shadow;
|
||||
*alpha = 1.0f-shadow;
|
||||
L_sum = make_float3(0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
else {
|
||||
|
@ -42,6 +42,8 @@
|
||||
#include "util/util_types.h"
|
||||
#include "util/util_texture.h"
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict
|
||||
|
||||
#define ccl_addr_space
|
||||
|
||||
#define ccl_local_id(d) 0
|
||||
|
@ -55,6 +55,10 @@
|
||||
#define ccl_restrict __restrict__
|
||||
#define ccl_align(n) __align__(n)
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict__
|
||||
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
|
||||
|
||||
|
||||
/* No assert supported for CUDA */
|
||||
|
||||
#define kernel_assert(cond)
|
||||
|
@ -50,6 +50,8 @@
|
||||
# define ccl_addr_space
|
||||
#endif
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict__
|
||||
|
||||
#define ccl_local_id(d) get_local_id(d)
|
||||
#define ccl_global_id(d) get_global_id(d)
|
||||
|
||||
|
@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
|
||||
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
|
||||
cu = clamp(cu, -1.0f, 1.0f);
|
||||
/* Compute xu. */
|
||||
float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
|
||||
float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
|
||||
xu = clamp(xu, x0, x1);
|
||||
/* Compute yv. */
|
||||
float z0sq = z0 * z0;
|
||||
|
@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
|
||||
#endif /* __SPLIT_KERNEL__ */
|
||||
}
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
|
||||
{
|
||||
kernel_write_pass_float(buffer, sample, value);
|
||||
|
||||
/* The online one-pass variance update that's used for the megakernel can't easily be implemented
|
||||
* with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
|
||||
# ifdef __SPLIT_KERNEL__
|
||||
kernel_write_pass_float(buffer+1, sample, value*value);
|
||||
# else
|
||||
if(sample == 0) {
|
||||
kernel_write_pass_float(buffer+1, sample, 0.0f);
|
||||
}
|
||||
else {
|
||||
float new_mean = buffer[0] * (1.0f / (sample + 1));
|
||||
float old_mean = (buffer[0] - value) * (1.0f / sample);
|
||||
kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
# if defined(__SPLIT_KERNEL__)
|
||||
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
|
||||
# else
|
||||
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
|
||||
{
|
||||
buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
|
||||
buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
|
||||
buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
|
||||
}
|
||||
# endif
|
||||
|
||||
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
|
||||
{
|
||||
kernel_write_pass_float3_unaligned(buffer, sample, value);
|
||||
# ifdef __SPLIT_KERNEL__
|
||||
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
|
||||
# else
|
||||
if(sample == 0) {
|
||||
kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
|
||||
}
|
||||
else {
|
||||
float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
|
||||
float3 new_mean = sum * (1.0f / (sample + 1));
|
||||
float3 old_mean = (sum - value) * (1.0f / sample);
|
||||
kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
|
||||
}
|
||||
# endif
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
|
||||
int sample, float path_total, float path_total_shaded)
|
||||
{
|
||||
if(kernel_data.film.pass_denoising_data == 0)
|
||||
return;
|
||||
|
||||
buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
|
||||
|
||||
path_total = ensure_finite(path_total);
|
||||
path_total_shaded = ensure_finite(path_total_shaded);
|
||||
|
||||
kernel_write_pass_float(buffer, sample/2, path_total);
|
||||
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
|
||||
|
||||
float value = path_total_shaded / max(path_total, 1e-7f);
|
||||
# ifdef __SPLIT_KERNEL__
|
||||
kernel_write_pass_float(buffer+2, sample/2, value*value);
|
||||
# else
|
||||
if(sample < 2) {
|
||||
kernel_write_pass_float(buffer+2, sample/2, 0.0f);
|
||||
}
|
||||
else {
|
||||
float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
|
||||
float new_value = buffer[1] / max(buffer[0], 1e-7f);
|
||||
kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
|
||||
}
|
||||
# endif
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ccl_global PathState *state,
|
||||
PathRadiance *L)
|
||||
{
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(state->denoising_feature_weight == 0.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
|
||||
|
||||
float3 normal = make_float3(0.0f, 0.0f, 0.0f);
|
||||
float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
|
||||
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
|
||||
|
||||
for(int i = 0; i < sd->num_closure; i++) {
|
||||
ShaderClosure *sc = &sd->closure[i];
|
||||
|
||||
if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
|
||||
continue;
|
||||
|
||||
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
|
||||
normal += sc->N * sc->sample_weight;
|
||||
sum_weight += sc->sample_weight;
|
||||
if(!bsdf_is_specular_like(sc)) {
|
||||
albedo += sc->weight;
|
||||
sum_nonspecular_weight += sc->sample_weight;
|
||||
}
|
||||
}
|
||||
|
||||
/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
|
||||
if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
|
||||
if(sum_weight != 0.0f) {
|
||||
normal /= sum_weight;
|
||||
}
|
||||
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
|
||||
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
|
||||
|
||||
state->denoising_feature_weight = 0.0f;
|
||||
}
|
||||
#else
|
||||
(void) kg;
|
||||
(void) sd;
|
||||
(void) state;
|
||||
(void) L;
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
|
||||
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
|
||||
{
|
||||
@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
|
||||
int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
|
||||
{
|
||||
if(L) {
|
||||
float3 L_sum;
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if(is_shadow_catcher) {
|
||||
L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
|
||||
}
|
||||
else
|
||||
#endif /* __SHADOW_TRICKS__ */
|
||||
{
|
||||
L_sum = path_radiance_clamp_and_sum(kg, L);
|
||||
}
|
||||
|
||||
kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
|
||||
|
||||
kernel_write_light_passes(kg, buffer, L, sample);
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(kernel_data.film.pass_denoising_data) {
|
||||
# ifdef __SHADOW_TRICKS__
|
||||
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
|
||||
# else
|
||||
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
|
||||
# endif
|
||||
if(kernel_data.film.pass_denoising_clean) {
|
||||
float3 noisy, clean;
|
||||
path_radiance_split_denoising(kg, L, &noisy, &clean);
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
|
||||
sample, noisy);
|
||||
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
|
||||
sample, clean);
|
||||
}
|
||||
else {
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
|
||||
sample, L_sum);
|
||||
}
|
||||
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
|
||||
sample, L->denoising_normal);
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
|
||||
sample, L->denoising_albedo);
|
||||
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
|
||||
sample, L->denoising_depth);
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
}
|
||||
else {
|
||||
kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(kernel_data.film.pass_denoising_data) {
|
||||
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
|
||||
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
|
||||
sample, make_float3(0.0f, 0.0f, 0.0f));
|
||||
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
|
||||
sample, make_float3(0.0f, 0.0f, 0.0f));
|
||||
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
|
||||
sample, make_float3(0.0f, 0.0f, 0.0f));
|
||||
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
|
||||
sample, 0.0f);
|
||||
|
||||
if(kernel_data.film.pass_denoising_clean) {
|
||||
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
|
||||
sample, make_float3(0.0f, 0.0f, 0.0f));
|
||||
}
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
|
||||
light_ray.dD = differential3_zero();
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
||||
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
|
||||
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_ao(L, throughput, ao_bsdf);
|
||||
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
|
||||
throughput /= probability;
|
||||
}
|
||||
|
||||
kernel_update_denoising_features(kg, sd, state, L);
|
||||
|
||||
#ifdef __AO__
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
|
||||
@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
|
||||
}
|
||||
|
||||
|
||||
ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
RNG *rng,
|
||||
int sample,
|
||||
Ray ray,
|
||||
ccl_global float *buffer)
|
||||
ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
|
||||
RNG *rng,
|
||||
int sample,
|
||||
Ray ray,
|
||||
ccl_global float *buffer,
|
||||
PathRadiance *L,
|
||||
bool *is_shadow_catcher)
|
||||
{
|
||||
/* initialize */
|
||||
PathRadiance L;
|
||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||
float L_transparent = 0.0f;
|
||||
|
||||
path_radiance_init(&L, kernel_data.film.use_light_pass);
|
||||
path_radiance_init(L, kernel_data.film.use_light_pass);
|
||||
|
||||
/* shader data memory used for both volumes and surfaces, saves stack space */
|
||||
ShaderData sd;
|
||||
@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
float3 emission;
|
||||
|
||||
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
|
||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||
}
|
||||
#endif /* __LAMP_MIS__ */
|
||||
|
||||
@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
|
||||
/* emission */
|
||||
if(volume_segment.closure_flag & SD_EMISSION)
|
||||
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
|
||||
|
||||
/* scattering */
|
||||
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
|
||||
@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
|
||||
/* direct light sampling */
|
||||
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
||||
&emission_sd, throughput, &state, &L, all,
|
||||
&emission_sd, throughput, &state, L, all,
|
||||
&volume_ray, &volume_segment);
|
||||
|
||||
/* indirect sample. if we use distance sampling and take just
|
||||
@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
kernel_volume_decoupled_free(kg, &volume_segment);
|
||||
|
||||
if(result == VOLUME_PATH_SCATTERED) {
|
||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||
continue;
|
||||
else
|
||||
break;
|
||||
@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
{
|
||||
/* integrate along volume segment with distance sampling */
|
||||
VolumeIntegrateResult result = kernel_volume_integrate(
|
||||
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
|
||||
kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
|
||||
|
||||
# ifdef __VOLUME_SCATTER__
|
||||
if(result == VOLUME_PATH_SCATTERED) {
|
||||
/* direct lighting */
|
||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
|
||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
|
||||
|
||||
/* indirect light bounce */
|
||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||
continue;
|
||||
else
|
||||
break;
|
||||
@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
#ifdef __BACKGROUND__
|
||||
/* sample background shader */
|
||||
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
path_radiance_accum_background(&L, &state, throughput, L_background);
|
||||
path_radiance_accum_background(L, &state, throughput, L_background);
|
||||
#endif /* __BACKGROUND__ */
|
||||
|
||||
break;
|
||||
@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||
if(state.flag & PATH_RAY_CAMERA) {
|
||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||
state.catcher_object = sd.object;
|
||||
if(!kernel_data.background.transparent) {
|
||||
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __HOLDOUT__ */
|
||||
|
||||
/* holdout mask objects do not write data passes */
|
||||
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
|
||||
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
|
||||
|
||||
/* blurring of bsdf after bounces, for rays that have a small likelihood
|
||||
* of following this particular path (diffuse, rough glossy) */
|
||||
@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
if(sd.flag & SD_EMISSION) {
|
||||
/* todo: is isect.t wrong here for transparent surfaces? */
|
||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||
}
|
||||
#endif /* __EMISSION__ */
|
||||
|
||||
@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
throughput /= probability;
|
||||
}
|
||||
|
||||
kernel_update_denoising_features(kg, &sd, &state, L);
|
||||
|
||||
#ifdef __AO__
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
|
||||
kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
|
||||
}
|
||||
#endif /* __AO__ */
|
||||
|
||||
@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
if(kernel_path_subsurface_scatter(kg,
|
||||
&sd,
|
||||
&emission_sd,
|
||||
&L,
|
||||
L,
|
||||
&state,
|
||||
rng,
|
||||
&ray,
|
||||
@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
/* direct lighting */
|
||||
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
|
||||
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
|
||||
|
||||
/* compute direct lighting and next bounce */
|
||||
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
||||
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef __SUBSURFACE__
|
||||
kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
|
||||
kernel_path_subsurface_accum_indirect(&ss_indirect, L);
|
||||
|
||||
/* Trace indirect subsurface rays by restarting the loop. this uses less
|
||||
* stack memory than invoking kernel_path_indirect.
|
||||
@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
&ss_indirect,
|
||||
&state,
|
||||
&ray,
|
||||
&L,
|
||||
L,
|
||||
&throughput);
|
||||
}
|
||||
else {
|
||||
@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
||||
}
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
float3 L_sum;
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
|
||||
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
|
||||
}
|
||||
else
|
||||
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||
#endif /* __SHADOW_TRICKS__ */
|
||||
{
|
||||
L_sum = path_radiance_clamp_and_sum(kg, &L);
|
||||
}
|
||||
|
||||
kernel_write_light_passes(kg, buffer, &L, sample);
|
||||
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
||||
#endif /* __KERNEL_DEBUG__ */
|
||||
|
||||
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
|
||||
return 1.0f - L_transparent;
|
||||
}
|
||||
|
||||
ccl_device void kernel_path_trace(KernelGlobals *kg,
|
||||
@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
|
||||
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
||||
|
||||
/* integrate */
|
||||
float4 L;
|
||||
PathRadiance L;
|
||||
bool is_shadow_catcher;
|
||||
|
||||
if(ray.t != 0.0f)
|
||||
L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
|
||||
else
|
||||
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
/* accumulate result in output buffer */
|
||||
kernel_write_pass_float4(buffer, sample, L);
|
||||
if(ray.t != 0.0f) {
|
||||
float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
|
||||
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
|
||||
}
|
||||
else {
|
||||
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
|
||||
}
|
||||
|
||||
path_rng_end(kg, rng_state, rng);
|
||||
}
|
||||
|
@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
|
||||
light_ray.dD = differential3_zero();
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
||||
path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
|
||||
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
|
||||
path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
|
||||
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
|
||||
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
|
||||
{
|
||||
float sum_sample_weight = 0.0f;
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(state->denoising_feature_weight > 0.0f) {
|
||||
for(int i = 0; i < sd->num_closure; i++) {
|
||||
const ShaderClosure *sc = &sd->closure[i];
|
||||
|
||||
/* transparency is not handled here, but in outer loop */
|
||||
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
sum_sample_weight += sc->sample_weight;
|
||||
}
|
||||
}
|
||||
else {
|
||||
sum_sample_weight = 1.0f;
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
for(int i = 0; i < sd->num_closure; i++) {
|
||||
const ShaderClosure *sc = &sd->closure[i];
|
||||
|
||||
if(!CLOSURE_IS_BSDF(sc->type))
|
||||
continue;
|
||||
/* transparency is not handled here, but in outer loop */
|
||||
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
|
||||
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int num_samples;
|
||||
|
||||
@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
|
||||
&tp,
|
||||
&ps,
|
||||
L,
|
||||
&bsdf_ray))
|
||||
&bsdf_ray,
|
||||
sum_sample_weight))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
|
||||
}
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
|
||||
ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
|
||||
RNG *rng,
|
||||
int sample,
|
||||
Ray ray,
|
||||
ccl_global float *buffer,
|
||||
PathRadiance *L,
|
||||
bool *is_shadow_catcher)
|
||||
{
|
||||
/* initialize */
|
||||
PathRadiance L;
|
||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||
float L_transparent = 0.0f;
|
||||
|
||||
path_radiance_init(&L, kernel_data.film.use_light_pass);
|
||||
path_radiance_init(L, kernel_data.film.use_light_pass);
|
||||
|
||||
/* shader data memory used for both volumes and surfaces, saves stack space */
|
||||
ShaderData sd;
|
||||
@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
int all = kernel_data.integrator.sample_all_lights_direct;
|
||||
|
||||
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
||||
&emission_sd, throughput, &state, &L, all,
|
||||
&emission_sd, throughput, &state, L, all,
|
||||
&volume_ray, &volume_segment);
|
||||
|
||||
/* indirect light sampling */
|
||||
@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
&sd,
|
||||
&tp,
|
||||
&ps,
|
||||
&L,
|
||||
L,
|
||||
&pray))
|
||||
{
|
||||
kernel_path_indirect(kg,
|
||||
@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
tp*num_samples_inv,
|
||||
num_samples,
|
||||
&ps,
|
||||
&L);
|
||||
L);
|
||||
|
||||
/* for render passes, sum and reset indirect light pass variables
|
||||
* for the next samples */
|
||||
path_radiance_sum_indirect(&L);
|
||||
path_radiance_reset_indirect(&L);
|
||||
path_radiance_sum_indirect(L);
|
||||
path_radiance_reset_indirect(L);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* emission and transmittance */
|
||||
if(volume_segment.closure_flag & SD_EMISSION)
|
||||
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
|
||||
throughput *= volume_segment.accum_transmittance;
|
||||
|
||||
/* free cached steps */
|
||||
@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
path_state_branch(&ps, j, num_samples);
|
||||
|
||||
VolumeIntegrateResult result = kernel_volume_integrate(
|
||||
kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
|
||||
kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
|
||||
|
||||
#ifdef __VOLUME_SCATTER__
|
||||
if(result == VOLUME_PATH_SCATTERED) {
|
||||
/* todo: support equiangular, MIS and all light sampling.
|
||||
* alternatively get decoupled ray marching working on the GPU */
|
||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
|
||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
|
||||
|
||||
if(kernel_path_volume_bounce(kg,
|
||||
rng,
|
||||
&sd,
|
||||
&tp,
|
||||
&ps,
|
||||
&L,
|
||||
L,
|
||||
&pray))
|
||||
{
|
||||
kernel_path_indirect(kg,
|
||||
@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
tp,
|
||||
num_samples,
|
||||
&ps,
|
||||
&L);
|
||||
L);
|
||||
|
||||
/* for render passes, sum and reset indirect light pass variables
|
||||
* for the next samples */
|
||||
path_radiance_sum_indirect(&L);
|
||||
path_radiance_reset_indirect(&L);
|
||||
path_radiance_sum_indirect(L);
|
||||
path_radiance_reset_indirect(L);
|
||||
}
|
||||
}
|
||||
#endif /* __VOLUME_SCATTER__ */
|
||||
@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
#ifdef __BACKGROUND__
|
||||
/* sample background shader */
|
||||
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
path_radiance_accum_background(&L, &state, throughput, L_background);
|
||||
path_radiance_accum_background(L, &state, throughput, L_background);
|
||||
#endif /* __BACKGROUND__ */
|
||||
|
||||
break;
|
||||
@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||
if(state.flag & PATH_RAY_CAMERA) {
|
||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||
state.catcher_object = sd.object;
|
||||
if(!kernel_data.background.transparent) {
|
||||
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
#endif /* __HOLDOUT__ */
|
||||
|
||||
/* holdout mask objects do not write data passes */
|
||||
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
|
||||
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
|
||||
|
||||
#ifdef __EMISSION__
|
||||
/* emission */
|
||||
if(sd.flag & SD_EMISSION) {
|
||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||
}
|
||||
#endif /* __EMISSION__ */
|
||||
|
||||
@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
}
|
||||
}
|
||||
|
||||
kernel_update_denoising_features(kg, &sd, &state, L);
|
||||
|
||||
#ifdef __AO__
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||
kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
|
||||
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
|
||||
}
|
||||
#endif /* __AO__ */
|
||||
|
||||
@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
/* bssrdf scatter to a different location on the same object */
|
||||
if(sd.flag & SD_BSSRDF) {
|
||||
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
|
||||
&L, &state, rng, &ray, throughput);
|
||||
L, &state, rng, &ray, throughput);
|
||||
}
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
int all = (kernel_data.integrator.sample_all_lights_direct) ||
|
||||
(state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||
kernel_branched_path_surface_connect_light(kg, rng,
|
||||
&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
|
||||
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
|
||||
}
|
||||
#endif /* __EMISSION__ */
|
||||
|
||||
/* indirect light */
|
||||
kernel_branched_path_surface_indirect_light(kg, rng,
|
||||
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
|
||||
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
|
||||
|
||||
/* continue in case of transparency */
|
||||
throughput *= shader_bsdf_transparency(kg, &sd);
|
||||
@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
||||
#endif /* __VOLUME__ */
|
||||
}
|
||||
|
||||
float3 L_sum;
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
|
||||
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
|
||||
}
|
||||
else
|
||||
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||
#endif /* __SHADOW_TRICKS__ */
|
||||
{
|
||||
L_sum = path_radiance_clamp_and_sum(kg, &L);
|
||||
}
|
||||
|
||||
kernel_write_light_passes(kg, buffer, &L, sample);
|
||||
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
||||
#endif /* __KERNEL_DEBUG__ */
|
||||
|
||||
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
|
||||
return 1.0f - L_transparent;
|
||||
}
|
||||
|
||||
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
||||
@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
||||
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
||||
|
||||
/* integrate */
|
||||
float4 L;
|
||||
PathRadiance L;
|
||||
bool is_shadow_catcher;
|
||||
|
||||
if(ray.t != 0.0f)
|
||||
L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
|
||||
else
|
||||
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
/* accumulate result in output buffer */
|
||||
kernel_write_pass_float4(buffer, sample, L);
|
||||
if(ray.t != 0.0f) {
|
||||
float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
|
||||
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
|
||||
}
|
||||
else {
|
||||
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
|
||||
}
|
||||
|
||||
path_rng_end(kg, rng_state, rng);
|
||||
}
|
||||
@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
||||
#endif /* __BRANCHED_PATH__ */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
|
||||
state->transmission_bounce = 0;
|
||||
state->transparent_bounce = 0;
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(kernel_data.film.pass_denoising_data) {
|
||||
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
|
||||
state->denoising_feature_weight = 1.0f;
|
||||
}
|
||||
else {
|
||||
state->denoising_feature_weight = 0.0f;
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
state->min_ray_pdf = FLT_MAX;
|
||||
state->ray_pdf = 0.0f;
|
||||
#ifdef __LAMP_MIS__
|
||||
@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
|
||||
|
||||
/* random number generator next bounce */
|
||||
state->rng_offset += PRNG_BOUNCE_NUM;
|
||||
|
||||
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
|
||||
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
|
||||
|
@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
|
||||
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
|
||||
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
|
||||
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
|
||||
ccl_addr_space float3 *throughput,
|
||||
ccl_addr_space PathState *state,
|
||||
PathRadiance *L,
|
||||
ccl_addr_space Ray *ray)
|
||||
ccl_addr_space Ray *ray,
|
||||
float sum_sample_weight)
|
||||
{
|
||||
/* sample BSDF */
|
||||
float bsdf_pdf;
|
||||
@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
|
||||
/* modify throughput */
|
||||
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
|
||||
#endif
|
||||
|
||||
/* modify path state */
|
||||
path_state_next(kg, state, label);
|
||||
|
||||
@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_light(L, throughput, &L_light);
|
||||
path_radiance_accum_total_light(L, state, throughput, &L_light);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
||||
|
||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
|
||||
|
||||
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
|
||||
{
|
||||
if(is_zero(dir))
|
||||
return make_float2(0.0f, 0.0f);
|
||||
|
||||
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
|
||||
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
|
||||
|
||||
|
@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
|
||||
|
||||
/* smooth normal */
|
||||
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
||||
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||
|
||||
#ifdef __DPDU__
|
||||
/* dPdu/dPdv */
|
||||
@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
|
||||
sd->N = Ng;
|
||||
|
||||
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
||||
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||
|
||||
# ifdef __DPDU__
|
||||
/* dPdu/dPdv */
|
||||
@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
|
||||
if(sd->type & PRIMITIVE_TRIANGLE) {
|
||||
/* smooth normal */
|
||||
if(sd->shader & SHADER_SMOOTH_NORMAL) {
|
||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
||||
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||
|
||||
#ifdef __INSTANCING__
|
||||
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
|
||||
|
@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
|
||||
#define __PATCH_EVAL__
|
||||
#define __SHADOW_TRICKS__
|
||||
|
||||
#define __DENOISING_FEATURES__
|
||||
|
||||
#ifdef __KERNEL_SHADING__
|
||||
# define __SVM__
|
||||
# define __EMISSION__
|
||||
@ -314,31 +316,32 @@ enum SamplingPattern {
|
||||
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
|
||||
|
||||
enum PathRayFlag {
|
||||
PATH_RAY_CAMERA = 1,
|
||||
PATH_RAY_REFLECT = 2,
|
||||
PATH_RAY_TRANSMIT = 4,
|
||||
PATH_RAY_DIFFUSE = 8,
|
||||
PATH_RAY_GLOSSY = 16,
|
||||
PATH_RAY_SINGULAR = 32,
|
||||
PATH_RAY_TRANSPARENT = 64,
|
||||
PATH_RAY_CAMERA = (1 << 0),
|
||||
PATH_RAY_REFLECT = (1 << 1),
|
||||
PATH_RAY_TRANSMIT = (1 << 2),
|
||||
PATH_RAY_DIFFUSE = (1 << 3),
|
||||
PATH_RAY_GLOSSY = (1 << 4),
|
||||
PATH_RAY_SINGULAR = (1 << 5),
|
||||
PATH_RAY_TRANSPARENT = (1 << 6),
|
||||
|
||||
PATH_RAY_SHADOW_OPAQUE = 128,
|
||||
PATH_RAY_SHADOW_TRANSPARENT = 256,
|
||||
PATH_RAY_SHADOW_OPAQUE = (1 << 7),
|
||||
PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
|
||||
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
|
||||
|
||||
PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
|
||||
PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
|
||||
PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
|
||||
PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
|
||||
|
||||
/* Special flag to tag unaligned BVH nodes. */
|
||||
PATH_RAY_NODE_UNALIGNED = 2048,
|
||||
PATH_RAY_NODE_UNALIGNED = (1 << 11),
|
||||
|
||||
PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
|
||||
PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
|
||||
|
||||
PATH_RAY_MIS_SKIP = 4096,
|
||||
PATH_RAY_DIFFUSE_ANCESTOR = 8192,
|
||||
PATH_RAY_SINGLE_PASS_DONE = 16384,
|
||||
PATH_RAY_SHADOW_CATCHER = 32768,
|
||||
PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
|
||||
PATH_RAY_MIS_SKIP = (1 << 12),
|
||||
PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
|
||||
PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
|
||||
PATH_RAY_SHADOW_CATCHER = (1 << 15),
|
||||
PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
|
||||
PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
|
||||
};
|
||||
|
||||
/* Closure Label */
|
||||
@ -394,6 +397,22 @@ typedef enum PassType {
|
||||
|
||||
#define PASS_ALL (~0)
|
||||
|
||||
typedef enum DenoisingPassOffsets {
|
||||
DENOISING_PASS_NORMAL = 0,
|
||||
DENOISING_PASS_NORMAL_VAR = 3,
|
||||
DENOISING_PASS_ALBEDO = 6,
|
||||
DENOISING_PASS_ALBEDO_VAR = 9,
|
||||
DENOISING_PASS_DEPTH = 12,
|
||||
DENOISING_PASS_DEPTH_VAR = 13,
|
||||
DENOISING_PASS_SHADOW_A = 14,
|
||||
DENOISING_PASS_SHADOW_B = 17,
|
||||
DENOISING_PASS_COLOR = 20,
|
||||
DENOISING_PASS_COLOR_VAR = 23,
|
||||
|
||||
DENOISING_PASS_SIZE_BASE = 26,
|
||||
DENOISING_PASS_SIZE_CLEAN = 3,
|
||||
} DenoisingPassOffsets;
|
||||
|
||||
typedef enum BakePassFilter {
|
||||
BAKE_FILTER_NONE = 0,
|
||||
BAKE_FILTER_DIRECT = (1 << 0),
|
||||
@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
|
||||
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
|
||||
} BakePassFilterCombos;
|
||||
|
||||
typedef enum DenoiseFlag {
|
||||
DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
|
||||
DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
|
||||
DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
|
||||
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
|
||||
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
|
||||
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
|
||||
DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
|
||||
DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
|
||||
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
|
||||
} DenoiseFlag;
|
||||
|
||||
typedef ccl_addr_space struct PathRadiance {
|
||||
#ifdef __PASSES__
|
||||
int use_light_pass;
|
||||
@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
|
||||
/* Color of the background on which shadow is alpha-overed. */
|
||||
float3 shadow_color;
|
||||
#endif
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
float3 denoising_normal;
|
||||
float3 denoising_albedo;
|
||||
float denoising_depth;
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
} PathRadiance;
|
||||
|
||||
typedef struct BsdfEval {
|
||||
@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
|
||||
#define SHADER_CLOSURE_BASE \
|
||||
float3 weight; \
|
||||
ClosureType type; \
|
||||
float sample_weight \
|
||||
float sample_weight; \
|
||||
float3 N
|
||||
|
||||
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
|
||||
SHADER_CLOSURE_BASE;
|
||||
|
||||
float data[14]; /* pad to 80 bytes */
|
||||
float data[10]; /* pad to 80 bytes */
|
||||
} ShaderClosure;
|
||||
|
||||
/* Shader Context
|
||||
@ -960,6 +998,10 @@ typedef struct PathState {
|
||||
int transmission_bounce;
|
||||
int transparent_bounce;
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
float denoising_feature_weight;
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
/* multiple importance sampling */
|
||||
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
|
||||
float ray_pdf; /* last bounce pdf */
|
||||
@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
|
||||
float mist_inv_depth;
|
||||
float mist_falloff;
|
||||
|
||||
int pass_denoising_data;
|
||||
int pass_denoising_clean;
|
||||
int denoising_flags;
|
||||
int pad;
|
||||
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
int pass_bvh_traversed_nodes;
|
||||
int pass_bvh_traversed_instances;
|
||||
|
61
intern/cycles/kernel/kernels/cpu/filter.cpp
Normal file
61
intern/cycles/kernel/kernels/cpu/filter.cpp
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* CPU kernel entry points */
|
||||
|
||||
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
|
||||
* one with SSE2 intrinsics.
|
||||
*/
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
# define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
/* When building kernel for native machine detect kernel features from the flags
|
||||
* set by compiler.
|
||||
*/
|
||||
#ifdef WITH_KERNEL_NATIVE
|
||||
# ifdef __SSE2__
|
||||
# ifndef __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE2__
|
||||
# endif
|
||||
# endif
|
||||
# ifdef __SSE3__
|
||||
# define __KERNEL_SSE3__
|
||||
# endif
|
||||
# ifdef __SSSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# endif
|
||||
# ifdef __SSE4_1__
|
||||
# define __KERNEL_SSE41__
|
||||
# endif
|
||||
# ifdef __AVX__
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
# ifdef __AVX2__
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* quiet unused define warnings */
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
/* do nothing */
|
||||
#endif
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
39
intern/cycles/kernel/kernels/cpu/filter_avx.cpp
Normal file
39
intern/cycles/kernel/kernels/cpu/filter_avx.cpp
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu_avx
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
40
intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
Normal file
40
intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX2
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu_avx2
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
132
intern/cycles/kernel/kernels/cpu/filter_cpu.h
Normal file
132
intern/cycles/kernel/kernels/cpu/filter_cpu.h
Normal file
@ -0,0 +1,132 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Templated common declaration part of all CPU kernels. */
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
|
||||
TilesInfo *tiles,
|
||||
int x,
|
||||
int y,
|
||||
float *unfilteredA,
|
||||
float *unfilteredB,
|
||||
float *sampleV,
|
||||
float *sampleVV,
|
||||
float *bufferV,
|
||||
int* prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
|
||||
TilesInfo *tiles,
|
||||
int m_offset,
|
||||
int v_offset,
|
||||
int x,
|
||||
int y,
|
||||
float *mean,
|
||||
float *variance,
|
||||
int* prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
|
||||
float *mean,
|
||||
float *variance,
|
||||
float *a,
|
||||
float *b,
|
||||
int* prefilter_rect,
|
||||
int r);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
|
||||
int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
float *transform,
|
||||
int *rank,
|
||||
int* rect,
|
||||
int pass_stride,
|
||||
int radius,
|
||||
float pca_threshold);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
int dy,
|
||||
float *weightImage,
|
||||
float *variance,
|
||||
float *differenceImage,
|
||||
int* rect,
|
||||
int w,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
|
||||
float *outImage,
|
||||
int* rect,
|
||||
int w,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
|
||||
float *outImage,
|
||||
int* rect,
|
||||
int w,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||
int dy,
|
||||
float *differenceImage,
|
||||
float *image,
|
||||
float *outImage,
|
||||
float *accumImage,
|
||||
int* rect,
|
||||
int w,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||
int dy,
|
||||
float *differenceImage,
|
||||
float *buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float *transform,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *rect,
|
||||
int *filter_rect,
|
||||
int w,
|
||||
int h,
|
||||
int f,
|
||||
int pass_stride);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
|
||||
float *accumImage,
|
||||
int* rect,
|
||||
int w);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
int w,
|
||||
int h,
|
||||
float *buffer,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *buffer_params,
|
||||
int sample);
|
||||
|
||||
#undef KERNEL_ARCH
|
259
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
Normal file
259
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
Normal file
@ -0,0 +1,259 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Templated common implementation part of all CPU kernels.
|
||||
*
|
||||
* The idea is that particular .cpp files sets needed optimization flags and
|
||||
* simply includes this file without worry of copying actual implementation over.
|
||||
*/
|
||||
|
||||
#include "kernel/kernel_compat_cpu.h"
|
||||
|
||||
#include "kernel/filter/filter_kernel.h"
|
||||
|
||||
#ifdef KERNEL_STUB
|
||||
# include "util/util_debug.h"
|
||||
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
/* Denoise filter */
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
|
||||
TilesInfo *tiles,
|
||||
int x,
|
||||
int y,
|
||||
float *unfilteredA,
|
||||
float *unfilteredB,
|
||||
float *sampleVariance,
|
||||
float *sampleVarianceV,
|
||||
float *bufferVariance,
|
||||
int* prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
|
||||
#else
|
||||
kernel_filter_divide_shadow(sample, tiles,
|
||||
x, y,
|
||||
unfilteredA,
|
||||
unfilteredB,
|
||||
sampleVariance,
|
||||
sampleVarianceV,
|
||||
bufferVariance,
|
||||
load_int4(prefilter_rect),
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
|
||||
TilesInfo *tiles,
|
||||
int m_offset,
|
||||
int v_offset,
|
||||
int x,
|
||||
int y,
|
||||
float *mean, float *variance,
|
||||
int* prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
|
||||
#else
|
||||
kernel_filter_get_feature(sample, tiles,
|
||||
m_offset, v_offset,
|
||||
x, y,
|
||||
mean, variance,
|
||||
load_int4(prefilter_rect),
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
|
||||
float *mean,
|
||||
float *variance,
|
||||
float *a,
|
||||
float *b,
|
||||
int* prefilter_rect,
|
||||
int r)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
|
||||
#else
|
||||
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
|
||||
int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
float *transform,
|
||||
int *rank,
|
||||
int* prefilter_rect,
|
||||
int pass_stride,
|
||||
int radius,
|
||||
float pca_threshold)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
|
||||
#else
|
||||
rank += storage_ofs;
|
||||
transform += storage_ofs*TRANSFORM_SIZE;
|
||||
kernel_filter_construct_transform(buffer,
|
||||
x, y,
|
||||
load_int4(prefilter_rect),
|
||||
pass_stride,
|
||||
transform,
|
||||
rank,
|
||||
radius,
|
||||
pca_threshold);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
int dy,
|
||||
float *weightImage,
|
||||
float *variance,
|
||||
float *differenceImage,
|
||||
int *rect,
|
||||
int w,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
|
||||
#else
|
||||
kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
|
||||
float *outImage,
|
||||
int *rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
|
||||
#else
|
||||
kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
|
||||
float *outImage,
|
||||
int *rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
|
||||
#else
|
||||
kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||
int dy,
|
||||
float *differenceImage,
|
||||
float *image,
|
||||
float *outImage,
|
||||
float *accumImage,
|
||||
int *rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
|
||||
#else
|
||||
kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||
int dy,
|
||||
float *differenceImage,
|
||||
float *buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float *transform,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *rect,
|
||||
int *filter_rect,
|
||||
int w,
|
||||
int h,
|
||||
int f,
|
||||
int pass_stride)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
|
||||
#else
|
||||
kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
|
||||
float *accumImage,
|
||||
int *rect,
|
||||
int w)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
|
||||
#else
|
||||
kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
int w,
|
||||
int h,
|
||||
float *buffer,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *buffer_params,
|
||||
int sample)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_finalize);
|
||||
#else
|
||||
XtWX += storage_ofs*XTWX_SIZE;
|
||||
XtWY += storage_ofs*XTWY_SIZE;
|
||||
rank += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef KERNEL_STUB
|
||||
#undef STUB_ASSERT
|
||||
#undef KERNEL_ARCH
|
||||
|
||||
CCL_NAMESPACE_END
|
34
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
Normal file
34
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE2
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu_sse2
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
36
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
Normal file
36
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu_sse3
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
37
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
Normal file
37
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||
|
||||
#include "kernel/filter/filter.h"
|
||||
#define KERNEL_ARCH cpu_sse41
|
||||
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
@ -17,21 +17,23 @@
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
#endif
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_avx
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,21 +18,23 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
#endif
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx2
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_avx2
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
|
||||
|
||||
#undef KERNEL_ARCH
|
||||
|
@ -57,6 +57,11 @@
|
||||
# include "kernel/split/kernel_buffer_update.h"
|
||||
#endif
|
||||
|
||||
#ifdef KERNEL_STUB
|
||||
# include "util/util_debug.h"
|
||||
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __SPLIT_KERNEL__
|
||||
@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
|
||||
int offset,
|
||||
int stride)
|
||||
{
|
||||
#ifdef __BRANCHED_PATH__
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, path_trace);
|
||||
#else
|
||||
# ifdef __BRANCHED_PATH__
|
||||
if(kernel_data.integrator.branched) {
|
||||
kernel_branched_path_trace(kg,
|
||||
buffer,
|
||||
@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
|
||||
stride);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
# endif
|
||||
{
|
||||
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||
}
|
||||
#endif /* KERNEL_STUB */
|
||||
}
|
||||
|
||||
/* Film */
|
||||
@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
|
||||
int offset,
|
||||
int stride)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
|
||||
#else
|
||||
kernel_film_convert_to_byte(kg,
|
||||
rgba,
|
||||
buffer,
|
||||
@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
|
||||
x, y,
|
||||
offset,
|
||||
stride);
|
||||
#endif /* KERNEL_STUB */
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
||||
@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
||||
int offset,
|
||||
int stride)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
|
||||
#else
|
||||
kernel_film_convert_to_half_float(kg,
|
||||
rgba,
|
||||
buffer,
|
||||
@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
||||
x, y,
|
||||
offset,
|
||||
stride);
|
||||
#endif /* KERNEL_STUB */
|
||||
}
|
||||
|
||||
/* Shader Evaluate */
|
||||
@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||
int offset,
|
||||
int sample)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, shader);
|
||||
#else
|
||||
if(type >= SHADER_EVAL_BAKE) {
|
||||
kernel_assert(output_luma == NULL);
|
||||
#ifdef __BAKING__
|
||||
# ifdef __BAKING__
|
||||
kernel_bake_evaluate(kg,
|
||||
input,
|
||||
output,
|
||||
@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||
i,
|
||||
offset,
|
||||
sample);
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
else {
|
||||
kernel_shader_evaluate(kg,
|
||||
@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||
i,
|
||||
sample);
|
||||
}
|
||||
#endif /* KERNEL_STUB */
|
||||
}
|
||||
|
||||
#else /* __SPLIT_KERNEL__ */
|
||||
|
||||
/* Split Kernel Path Tracing */
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
#ifdef KERNEL_STUB
|
||||
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||
{ \
|
||||
STUB_ASSERT(KERNEL_ARCH, name); \
|
||||
}
|
||||
#else
|
||||
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||
{ \
|
||||
kernel_##name(kg); \
|
||||
}
|
||||
#endif /* KERNEL_STUB */
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||
@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
|
||||
{
|
||||
#define REGISTER_NAME_STRING(name) #name
|
||||
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
|
||||
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
|
||||
|
||||
REGISTER(path_trace);
|
||||
REGISTER(convert_to_byte);
|
||||
REGISTER(convert_to_half_float);
|
||||
REGISTER(shader);
|
||||
|
||||
REGISTER(data_init);
|
||||
REGISTER(path_init);
|
||||
REGISTER(scene_intersect);
|
||||
REGISTER(lamp_emission);
|
||||
REGISTER(do_volume);
|
||||
REGISTER(queue_enqueue);
|
||||
REGISTER(indirect_background);
|
||||
REGISTER(shader_setup);
|
||||
REGISTER(shader_sort);
|
||||
REGISTER(shader_eval);
|
||||
REGISTER(holdout_emission_blurring_pathtermination_ao);
|
||||
REGISTER(subsurface_scatter);
|
||||
REGISTER(direct_lighting);
|
||||
REGISTER(shadow_blocked_ao);
|
||||
REGISTER(shadow_blocked_dl);
|
||||
REGISTER(next_iteration_setup);
|
||||
REGISTER(indirect_subsurface);
|
||||
REGISTER(buffer_update);
|
||||
|
||||
#undef REGISTER
|
||||
#undef REGISTER_EVAL_NAME
|
||||
#undef REGISTER_NAME_STRING
|
||||
}
|
||||
|
||||
#endif /* __SPLIT_KERNEL__ */
|
||||
|
||||
#undef KERNEL_STUB
|
||||
#undef STUB_ASSERT
|
||||
#undef KERNEL_ARCH
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -17,22 +17,25 @@
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_avx
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,23 +18,25 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx2
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_avx2
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,17 +18,19 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse2
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse2
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,19 +18,21 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse3
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse3
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,20 +18,22 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse41
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse41
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,15 +18,17 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse2
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse2
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,17 +18,19 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
#endif
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse3
|
||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse3
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
@ -18,18 +18,20 @@
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
#endif
|
||||
|
||||
#include "util/util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# include "kernel/kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse41
|
||||
# include "kernel/kernels/cpu//kernel_cpu_impl.h"
|
||||
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# define KERNEL_STUB
|
||||
#else
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# endif
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||
|
||||
#include "kernel/kernel.h"
|
||||
#define KERNEL_ARCH cpu_sse41
|
||||
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||
|
235
intern/cycles/kernel/kernels/cuda/filter.cu
Normal file
235
intern/cycles/kernel/kernels/cuda/filter.cu
Normal file
@ -0,0 +1,235 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* CUDA kernel entry points */
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
#include "kernel_config.h"
|
||||
|
||||
#include "kernel/kernel_compat_cuda.h"
|
||||
|
||||
#include "kernel/filter/filter_kernel.h"
|
||||
|
||||
/* kernels */
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_divide_shadow(int sample,
|
||||
TilesInfo *tiles,
|
||||
float *unfilteredA,
|
||||
float *unfilteredB,
|
||||
float *sampleVariance,
|
||||
float *sampleVarianceV,
|
||||
float *bufferVariance,
|
||||
int4 prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_divide_shadow(sample,
|
||||
tiles,
|
||||
x, y,
|
||||
unfilteredA,
|
||||
unfilteredB,
|
||||
sampleVariance,
|
||||
sampleVarianceV,
|
||||
bufferVariance,
|
||||
prefilter_rect,
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_get_feature(int sample,
|
||||
TilesInfo *tiles,
|
||||
int m_offset,
|
||||
int v_offset,
|
||||
float *mean,
|
||||
float *variance,
|
||||
int4 prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
bool use_split_variance)
|
||||
{
|
||||
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_get_feature(sample,
|
||||
tiles,
|
||||
m_offset, v_offset,
|
||||
x, y,
|
||||
mean, variance,
|
||||
prefilter_rect,
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
|
||||
{
|
||||
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
|
||||
float *transform, int *rank,
|
||||
int4 filter_area, int4 rect,
|
||||
int radius, float pca_threshold,
|
||||
int pass_stride)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < filter_area.z && y < filter_area.w) {
|
||||
int *l_rank = rank + y*filter_area.z + x;
|
||||
float *l_transform = transform + y*filter_area.z + x;
|
||||
kernel_filter_construct_transform(buffer,
|
||||
x + filter_area.x, y + filter_area.y,
|
||||
rect, pass_stride,
|
||||
l_transform, l_rank,
|
||||
radius, pca_threshold,
|
||||
filter_area.z*filter_area.w,
|
||||
threadIdx.y*blockDim.x + threadIdx.x);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
|
||||
float ccl_restrict_ptr weightImage,
|
||||
float ccl_restrict_ptr varianceImage,
|
||||
float *differenceImage,
|
||||
int4 rect, int w,
|
||||
int channel_offset,
|
||||
float a, float k_2) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_update_output(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr image,
|
||||
float *outImage, float *accumImage,
|
||||
int4 rect, int w,
|
||||
int f) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float const* __restrict__ transform,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int pass_stride) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
|
||||
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
|
||||
kernel_filter_nlm_construct_gramian(x, y,
|
||||
dx, dy,
|
||||
differenceImage,
|
||||
buffer,
|
||||
color_pass, variance_pass,
|
||||
transform, rank,
|
||||
XtWX, XtWY,
|
||||
rect, filter_rect,
|
||||
w, h, f,
|
||||
pass_stride,
|
||||
threadIdx.y*blockDim.x + threadIdx.x);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_finalize(int w, int h,
|
||||
float *buffer, int *rank,
|
||||
float *XtWX, float3 *XtWY,
|
||||
int4 filter_area, int4 buffer_params,
|
||||
int sample) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < filter_area.z && y < filter_area.w) {
|
||||
int storage_ofs = y*filter_area.z+x;
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
262
intern/cycles/kernel/kernels/opencl/filter.cl
Normal file
262
intern/cycles/kernel/kernels/opencl/filter.cl
Normal file
@ -0,0 +1,262 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* OpenCL kernel entry points */
|
||||
|
||||
#include "kernel/kernel_compat_opencl.h"
|
||||
|
||||
#include "kernel/filter/filter_kernel.h"
|
||||
|
||||
/* kernels */
|
||||
|
||||
__kernel void kernel_ocl_filter_divide_shadow(int sample,
|
||||
ccl_global TilesInfo *tiles,
|
||||
ccl_global float *unfilteredA,
|
||||
ccl_global float *unfilteredB,
|
||||
ccl_global float *sampleVariance,
|
||||
ccl_global float *sampleVarianceV,
|
||||
ccl_global float *bufferVariance,
|
||||
int4 prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
char use_split_variance)
|
||||
{
|
||||
int x = prefilter_rect.x + get_global_id(0);
|
||||
int y = prefilter_rect.y + get_global_id(1);
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_divide_shadow(sample,
|
||||
tiles,
|
||||
x, y,
|
||||
unfilteredA,
|
||||
unfilteredB,
|
||||
sampleVariance,
|
||||
sampleVarianceV,
|
||||
bufferVariance,
|
||||
prefilter_rect,
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_get_feature(int sample,
|
||||
ccl_global TilesInfo *tiles,
|
||||
int m_offset,
|
||||
int v_offset,
|
||||
ccl_global float *mean,
|
||||
ccl_global float *variance,
|
||||
int4 prefilter_rect,
|
||||
int buffer_pass_stride,
|
||||
int buffer_denoising_offset,
|
||||
char use_split_variance)
|
||||
{
|
||||
int x = prefilter_rect.x + get_global_id(0);
|
||||
int y = prefilter_rect.y + get_global_id(1);
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_get_feature(sample,
|
||||
tiles,
|
||||
m_offset, v_offset,
|
||||
x, y,
|
||||
mean, variance,
|
||||
prefilter_rect,
|
||||
buffer_pass_stride,
|
||||
buffer_denoising_offset,
|
||||
use_split_variance);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
|
||||
ccl_global float *variance,
|
||||
ccl_global float *a,
|
||||
ccl_global float *b,
|
||||
int4 prefilter_rect,
|
||||
int r)
|
||||
{
|
||||
int x = prefilter_rect.x + get_global_id(0);
|
||||
int y = prefilter_rect.y + get_global_id(1);
|
||||
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
|
||||
ccl_global float *transform,
|
||||
ccl_global int *rank,
|
||||
int4 filter_area,
|
||||
int4 rect,
|
||||
int pass_stride,
|
||||
int radius,
|
||||
float pca_threshold)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
if(x < filter_area.z && y < filter_area.w) {
|
||||
ccl_global int *l_rank = rank + y*filter_area.z + x;
|
||||
ccl_global float *l_transform = transform + y*filter_area.z + x;
|
||||
kernel_filter_construct_transform(buffer,
|
||||
x + filter_area.x, y + filter_area.y,
|
||||
rect, pass_stride,
|
||||
l_transform, l_rank,
|
||||
radius, pca_threshold,
|
||||
filter_area.z*filter_area.w,
|
||||
get_local_id(1)*get_local_size(0) + get_local_id(0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr weightImage,
|
||||
ccl_global float ccl_restrict_ptr varianceImage,
|
||||
ccl_global float *differenceImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr image,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float *accumImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
|
||||
ccl_global float ccl_restrict_ptr accumImage,
|
||||
int4 rect,
|
||||
int w) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w,
|
||||
int h,
|
||||
int f,
|
||||
int pass_stride) {
|
||||
int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
|
||||
int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
|
||||
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
|
||||
kernel_filter_nlm_construct_gramian(x, y,
|
||||
dx, dy,
|
||||
differenceImage,
|
||||
buffer,
|
||||
color_pass, variance_pass,
|
||||
transform, rank,
|
||||
XtWX, XtWY,
|
||||
rect, filter_rect,
|
||||
w, h, f,
|
||||
pass_stride,
|
||||
get_local_id(1)*get_local_size(0) + get_local_id(0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_finalize(int w,
|
||||
int h,
|
||||
ccl_global float *buffer,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 filter_area,
|
||||
int4 buffer_params,
|
||||
int sample) {
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
if(x < filter_area.z && y < filter_area.w) {
|
||||
int storage_ofs = y*filter_area.z+x;
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
|
||||
ccl_global float *buffer_1,
|
||||
ccl_global float *buffer_2,
|
||||
ccl_global float *buffer_3,
|
||||
ccl_global float *buffer_4,
|
||||
ccl_global float *buffer_5,
|
||||
ccl_global float *buffer_6,
|
||||
ccl_global float *buffer_7,
|
||||
ccl_global float *buffer_8,
|
||||
ccl_global float *buffer_9)
|
||||
{
|
||||
if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
|
||||
tiles->buffers[0] = buffer_1;
|
||||
tiles->buffers[1] = buffer_2;
|
||||
tiles->buffers[2] = buffer_3;
|
||||
tiles->buffers[3] = buffer_4;
|
||||
tiles->buffers[4] = buffer_5;
|
||||
tiles->buffers[5] = buffer_6;
|
||||
tiles->buffers[6] = buffer_7;
|
||||
tiles->buffers[7] = buffer_8;
|
||||
tiles->buffers[8] = buffer_9;
|
||||
}
|
||||
}
|
@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
||||
RNG rng = kernel_split_state.rng[ray_index];
|
||||
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
||||
float3 throughput = branched_state->throughput;
|
||||
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
|
||||
|
||||
float sum_sample_weight = 0.0f;
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
if(ps->denoising_feature_weight > 0.0f) {
|
||||
for(int i = 0; i < sd->num_closure; i++) {
|
||||
const ShaderClosure *sc = &sd->closure[i];
|
||||
|
||||
/* transparency is not handled here, but in outer loop */
|
||||
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
sum_sample_weight += sc->sample_weight;
|
||||
}
|
||||
}
|
||||
else {
|
||||
sum_sample_weight = 1.0f;
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
|
||||
const ShaderClosure *sc = &sd->closure[i];
|
||||
@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
||||
RNG bsdf_rng = cmj_hash(rng, i);
|
||||
|
||||
for(int j = branched_state->next_sample; j < num_samples; j++) {
|
||||
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
|
||||
if(reset_path_state) {
|
||||
*ps = branched_state->path_state;
|
||||
}
|
||||
@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
||||
tp,
|
||||
ps,
|
||||
L,
|
||||
bsdf_ray))
|
||||
bsdf_ray,
|
||||
sum_sample_weight))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
||||
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
|
||||
|
||||
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
|
||||
float3 L_sum;
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
|
||||
L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
|
||||
}
|
||||
else
|
||||
#endif /* __SHADOW_TRICKS__ */
|
||||
{
|
||||
L_sum = path_radiance_clamp_and_sum(kg, L);
|
||||
}
|
||||
kernel_write_light_passes(kg, buffer, L, sample);
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
|
||||
#endif
|
||||
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
|
||||
|
||||
/* accumulate result in output buffer */
|
||||
kernel_write_pass_float4(buffer, sample, L_rad);
|
||||
bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
|
||||
kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
|
||||
|
||||
path_rng_end(kg, rng_state, rng);
|
||||
|
||||
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
|
||||
|
@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||
if(state->flag & PATH_RAY_CAMERA) {
|
||||
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
||||
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||
state->catcher_object = sd->object;
|
||||
if(!kernel_data.background.transparent) {
|
||||
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
||||
@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
|
||||
kernel_split_state.throughput[ray_index] = throughput/probability;
|
||||
}
|
||||
}
|
||||
|
||||
kernel_update_denoising_features(kg, sd, state, L);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
|
||||
&shadow))
|
||||
{
|
||||
/* accumulate */
|
||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
||||
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||
}
|
||||
else {
|
||||
path_radiance_accum_total_light(L, throughput, &L_light);
|
||||
path_radiance_accum_total_light(L, state, throughput, &L_light);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
|
||||
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
||||
|
||||
if(bsdf) {
|
||||
bsdf->N = N;
|
||||
sd->flag |= bsdf_transparent_setup(bsdf);
|
||||
}
|
||||
break;
|
||||
@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
|
||||
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
||||
|
||||
if(bsdf) {
|
||||
bsdf->N = N;
|
||||
/* todo: giving a fixed weight here will cause issues when
|
||||
* mixing multiple BSDFS. energy will not be conserved and
|
||||
* the throughput can blow up after multiple bounces. we
|
||||
|
@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
|
||||
strength = max(strength, 0.0f);
|
||||
|
||||
/* compute and output perturbed normal */
|
||||
float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
|
||||
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
|
||||
float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
|
||||
if(is_zero(normal_out)) {
|
||||
normal_out = normal_in;
|
||||
}
|
||||
else {
|
||||
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
|
||||
}
|
||||
|
||||
if(use_object_space) {
|
||||
object_normal_transform(kg, sd, &normal_out);
|
||||
|
@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
|
||||
#ifdef __UV__
|
||||
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
|
||||
#endif
|
||||
default: data = make_float3(0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
stack_store_float3(stack, out_offset, data);
|
||||
|
@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
|
||||
float3 co = stack_load_float3(stack, co_offset);
|
||||
float2 uv;
|
||||
|
||||
co = normalize(co);
|
||||
|
||||
co = safe_normalize(co);
|
||||
|
||||
if(projection == 0)
|
||||
uv = direction_to_equirectangular(co);
|
||||
else
|
||||
|
@ -402,7 +402,6 @@ typedef enum ClosureType {
|
||||
CLOSURE_BSDF_DIFFUSE_TOON_ID,
|
||||
|
||||
/* Glossy */
|
||||
CLOSURE_BSDF_GLOSSY_ID,
|
||||
CLOSURE_BSDF_REFLECTION_ID,
|
||||
CLOSURE_BSDF_MICROFACET_GGX_ID,
|
||||
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
|
||||
@ -423,14 +422,13 @@ typedef enum ClosureType {
|
||||
CLOSURE_BSDF_HAIR_REFLECTION_ID,
|
||||
|
||||
/* Transmission */
|
||||
CLOSURE_BSDF_TRANSMISSION_ID,
|
||||
CLOSURE_BSDF_TRANSLUCENT_ID,
|
||||
CLOSURE_BSDF_REFRACTION_ID,
|
||||
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
|
||||
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
|
||||
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
|
||||
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
|
||||
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
|
||||
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
|
||||
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
|
||||
CLOSURE_BSDF_SHARP_GLASS_ID,
|
||||
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
|
||||
@ -465,13 +463,16 @@ typedef enum ClosureType {
|
||||
/* watch this, being lazy with memory usage */
|
||||
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
|
||||
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
|
||||
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
|
||||
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
|
||||
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
|
||||
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
|
||||
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
|
||||
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
|
||||
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
|
||||
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
|
||||
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
|
||||
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
|
||||
#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
|
||||
(type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
|
||||
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
|
||||
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
|
||||
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
||||
@ -480,7 +481,7 @@ typedef enum ClosureType {
|
||||
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
|
||||
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
|
||||
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
||||
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
|
||||
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
|
||||
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
|
||||
|
||||
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
|
||||
|
@ -42,6 +42,9 @@ BufferParams::BufferParams()
|
||||
full_width = 0;
|
||||
full_height = 0;
|
||||
|
||||
denoising_data_pass = false;
|
||||
denoising_clean_pass = false;
|
||||
|
||||
Pass::add(PASS_COMBINED, passes);
|
||||
}
|
||||
|
||||
@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
|
||||
|
||||
for(size_t i = 0; i < passes.size(); i++)
|
||||
size += passes[i].components;
|
||||
|
||||
|
||||
if(denoising_data_pass) {
|
||||
size += DENOISING_PASS_SIZE_BASE;
|
||||
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
|
||||
}
|
||||
|
||||
return align_up(size, 4);
|
||||
}
|
||||
|
||||
int BufferParams::get_denoising_offset()
|
||||
{
|
||||
int offset = 0;
|
||||
|
||||
for(size_t i = 0; i < passes.size(); i++)
|
||||
offset += passes[i].components;
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
/* Render Buffer Task */
|
||||
|
||||
RenderTile::RenderTile()
|
||||
@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
|
||||
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
|
||||
}
|
||||
|
||||
bool RenderBuffers::copy_from_device()
|
||||
bool RenderBuffers::copy_from_device(Device *from_device)
|
||||
{
|
||||
if(!buffer.device_pointer)
|
||||
return false;
|
||||
|
||||
device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
|
||||
if(!from_device) {
|
||||
from_device = device;
|
||||
}
|
||||
|
||||
from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
|
||||
{
|
||||
float scale = 1.0f/sample;
|
||||
|
||||
if(offset == DENOISING_PASS_COLOR) {
|
||||
scale *= exposure;
|
||||
}
|
||||
else if(offset == DENOISING_PASS_COLOR_VAR) {
|
||||
scale *= exposure*exposure;
|
||||
}
|
||||
|
||||
offset += params.get_denoising_offset();
|
||||
float *in = (float*)buffer.data_pointer + offset;
|
||||
int pass_stride = params.get_passes_size();
|
||||
int size = params.width*params.height;
|
||||
|
||||
if(components == 1) {
|
||||
for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
|
||||
pixels[0] = in[0]*scale;
|
||||
}
|
||||
}
|
||||
else if(components == 3) {
|
||||
for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
|
||||
pixels[0] = in[0]*scale;
|
||||
pixels[1] = in[1]*scale;
|
||||
pixels[2] = in[2]*scale;
|
||||
}
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -51,6 +51,9 @@ public:
|
||||
|
||||
/* passes */
|
||||
array<Pass> passes;
|
||||
bool denoising_data_pass;
|
||||
/* If only some light path types should be denoised, an additional pass is needed. */
|
||||
bool denoising_clean_pass;
|
||||
|
||||
/* functions */
|
||||
BufferParams();
|
||||
@ -59,6 +62,7 @@ public:
|
||||
bool modified(const BufferParams& params);
|
||||
void add_pass(PassType type);
|
||||
int get_passes_size();
|
||||
int get_denoising_offset();
|
||||
};
|
||||
|
||||
/* Render Buffers */
|
||||
@ -73,18 +77,19 @@ public:
|
||||
/* random number generator state */
|
||||
device_vector<uint> rng_state;
|
||||
|
||||
Device *device;
|
||||
|
||||
explicit RenderBuffers(Device *device);
|
||||
~RenderBuffers();
|
||||
|
||||
void reset(Device *device, BufferParams& params);
|
||||
|
||||
bool copy_from_device();
|
||||
bool copy_from_device(Device *from_device = NULL);
|
||||
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
|
||||
bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
|
||||
|
||||
protected:
|
||||
void device_free();
|
||||
|
||||
Device *device;
|
||||
};
|
||||
|
||||
/* Display Buffer
|
||||
@ -131,6 +136,9 @@ protected:
|
||||
|
||||
class RenderTile {
|
||||
public:
|
||||
typedef enum { PATH_TRACE, DENOISE } Task;
|
||||
|
||||
Task task;
|
||||
int x, y, w, h;
|
||||
int start_sample;
|
||||
int num_samples;
|
||||
@ -138,6 +146,7 @@ public:
|
||||
int resolution;
|
||||
int offset;
|
||||
int stride;
|
||||
int tile_index;
|
||||
|
||||
device_ptr buffer;
|
||||
device_ptr rng_state;
|
||||
|
@ -279,6 +279,10 @@ NODE_DEFINE(Film)
|
||||
|
||||
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
|
||||
|
||||
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
|
||||
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
|
||||
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
|
||||
kfilm->pass_stride += pass.components;
|
||||
}
|
||||
|
||||
kfilm->pass_denoising_data = 0;
|
||||
kfilm->pass_denoising_clean = 0;
|
||||
kfilm->denoising_flags = 0;
|
||||
if(denoising_data_pass) {
|
||||
kfilm->pass_denoising_data = kfilm->pass_stride;
|
||||
kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
|
||||
kfilm->denoising_flags = denoising_flags;
|
||||
if(denoising_clean_pass) {
|
||||
kfilm->pass_denoising_clean = kfilm->pass_stride;
|
||||
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
|
||||
kfilm->use_light_pass = 1;
|
||||
}
|
||||
}
|
||||
|
||||
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
|
||||
kfilm->pass_alpha_threshold = pass_alpha_threshold;
|
||||
|
||||
@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
|
||||
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
|
||||
kfilm->mist_falloff = mist_falloff;
|
||||
|
||||
pass_stride = kfilm->pass_stride;
|
||||
denoising_data_offset = kfilm->pass_denoising_data;
|
||||
denoising_clean_offset = kfilm->pass_denoising_clean;
|
||||
|
||||
need_update = false;
|
||||
}
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user