blender/intern/cycles/kernel/kernels/cuda/kernel.cu
Thomas Dinges 4422b3f919 Some fixes for CUDA runtime compile:
* When Baking wasn't used we got an error.
* On top of Volume Nodes (NODES_FEATURE_VOLUME), we now also check if we need volume sampling code,
so we can disable that as well and save some further compilation time.
2016-05-06 23:13:33 +02:00

210 lines
6.8 KiB
Plaintext

/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CUDA kernel entry points */
#include "../../kernel_compat_cuda.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
#include "../../kernel_film.h"
#include "../../kernel_path.h"
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
/* device data taken from CUDA occupancy calculator */
#ifdef __CUDA_ARCH__
/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 32
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.2 */
#elif __CUDA_ARCH__ == 320
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.7 */
#elif __CUDA_ARCH__ == 370
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 5.0, 5.2 and 5.3 */
#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520 || __CUDA_ARCH__ == 530
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 40
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* unknown architecture */
#else
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \
threads_block_width*threads_block_width, \
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
)
/* sanity checks */
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
# error "Maximum number of threads per block exceeded"
#endif
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
# error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
}
#ifdef __BRANCHED_PATH__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
}
#endif
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input,
float4 *output,
float *output_luma,
int type,
int sx,
int sw,
int offset,
int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw) {
kernel_shader_evaluate(NULL,
input,
output,
output_luma,
(ShaderEvalType)type,
x,
sample);
}
}
#ifdef __BAKING__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw)
kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
}
#endif
#endif