blender/intern/cycles/util/debug.h
Michael Jones 654e1e901b Cycles: Use local atomics for faster shader sorting (enabled on Metal)
This patch adds two new kernels: SORT_BUCKET_PASS and SORT_WRITE_PASS. These replace PREFIX_SUM and SORTED_PATHS_ARRAY on supported devices (currently implemented on Metal, but will be trivial to enable on the other backends). The new kernels exploit sort partitioning (see D15331) by sorting each partition separately using local atomics. This can give an overall render speedup of 2-3% depending on architecture. As before, we fall back to the original non-partitioned sorting when the shader count is "too high".

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16909
2023-02-06 11:18:26 +00:00

149 lines
3.2 KiB
C++

/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_DEBUG_H__
#define __UTIL_DEBUG_H__
#include <cassert>
#include <iostream>
#include "bvh/params.h"
CCL_NAMESPACE_BEGIN
/* Global storage for all sort of flags used to fine-tune behavior of particular
* areas for the development purposes, without officially exposing settings to
* the interface.
*/
/* Global storage for all sort of flags used to fine-tune behavior of particular
 * areas for the development purposes, without officially exposing settings to
 * the interface.
 *
 * Accessed through the DebugFlags() shorthand; the registry is a process-wide
 * singleton (see get()). */
class DebugFlags {
 public:
  /* Descriptor of CPU feature-set to be used. */
  struct CPU {
    CPU();

    /* Reset flags to their defaults. */
    void reset();

    /* Flags describing which instructions sets are allowed for use. */
    bool avx2 = true;
    bool sse41 = true;
    bool sse2 = true;

    /* Check functions to see whether instructions up to the given one
     * are allowed for use.
     *
     * Each level implies all lower ones: e.g. AVX2 is only reported as
     * available when SSE4.1 and SSE2 are allowed as well. */
    bool has_avx2() const
    {
      return has_sse41() && avx2;
    }

    bool has_sse41() const
    {
      return has_sse2() && sse41;
    }

    bool has_sse2() const
    {
      return sse2;
    }

    /* Requested BVH layout.
     *
     * By default the fastest will be used. For debugging the BVH used by other
     * CPUs and GPUs can be selected here instead. */
    BVHLayout bvh_layout = BVH_LAYOUT_AUTO;
  };

  /* Descriptor of CUDA feature-set to be used. */
  struct CUDA {
    CUDA();

    /* Reset flags to their defaults. */
    void reset();

    /* Whether adaptive feature based runtime compile is enabled or not.
     * Requires the CUDA Toolkit and only works on Linux at the moment. */
    bool adaptive_compile = false;
  };

  /* Descriptor of HIP feature-set to be used. */
  struct HIP {
    HIP();

    /* Reset flags to their defaults. */
    void reset();

    /* Whether adaptive feature based runtime compile is enabled or not. */
    bool adaptive_compile = false;
  };

  /* Descriptor of OptiX feature-set to be used. */
  struct OptiX {
    OptiX();

    /* Reset flags to their defaults. */
    void reset();

    /* Load OptiX module with debug capabilities. Will lower logging verbosity
     * level, enable validations, and lower optimization level. */
    bool use_debug = false;
  };

  /* Descriptor of Metal feature-set to be used. */
  struct Metal {
    Metal();

    /* Reset flags to their defaults. */
    void reset();

    /* Whether adaptive feature based runtime compile is enabled or not. */
    bool adaptive_compile = false;

    /* Whether local atomic sorting is enabled or not. */
    bool use_local_atomic_sort = true;
  };

  /* Get instance of debug flags registry (Meyers singleton: constructed on
   * first use, shared by the whole process). */
  static DebugFlags &get()
  {
    static DebugFlags instance;
    return instance;
  }

  /* Reset flags to their defaults. */
  void reset();

  /* Requested CPU flags. */
  CPU cpu;

  /* Requested CUDA flags. */
  CUDA cuda;

  /* Requested OptiX flags. */
  OptiX optix;

  /* Requested HIP flags. */
  HIP hip;

  /* Requested Metal flags. */
  Metal metal;

 private:
  DebugFlags();

 public:
  /* Non-copyable: there is exactly one registry per process. */
  explicit DebugFlags(DebugFlags const & /*other*/) = delete;
  void operator=(DebugFlags const & /*other*/) = delete;
};
typedef DebugFlags &DebugFlagsRef;
typedef const DebugFlags &DebugFlagsConstRef;
inline DebugFlags &DebugFlags()
{
return DebugFlags::get();
}
CCL_NAMESPACE_END
#endif /* __UTIL_DEBUG_H__ */