2011-04-27 11:58:34 +00:00
|
|
|
/*
|
2013-08-18 14:16:15 +00:00
|
|
|
* Copyright 2011-2013 Blender Foundation
|
2011-04-27 11:58:34 +00:00
|
|
|
*
|
2013-08-18 14:16:15 +00:00
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
2011-04-27 11:58:34 +00:00
|
|
|
*
|
2013-08-18 14:16:15 +00:00
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
2011-04-27 11:58:34 +00:00
|
|
|
*
|
2013-08-18 14:16:15 +00:00
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
2014-12-25 01:50:24 +00:00
|
|
|
* limitations under the License.
|
2011-04-27 11:58:34 +00:00
|
|
|
*/
|
|
|
|
|
2016-11-02 09:54:47 +00:00
|
|
|
#include <climits>
|
2017-02-27 14:22:51 +00:00
|
|
|
#include <limits.h>
|
2011-04-27 11:58:34 +00:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "device.h"
|
|
|
|
#include "device_intern.h"
|
2017-02-14 10:50:29 +00:00
|
|
|
#include "device_split_kernel.h"
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2012-09-04 13:29:07 +00:00
|
|
|
#include "buffers.h"
|
|
|
|
|
2016-01-14 07:24:09 +00:00
|
|
|
#ifdef WITH_CUDA_DYNLOAD
|
|
|
|
# include "cuew.h"
|
|
|
|
#else
|
|
|
|
# include "util_opengl.h"
|
|
|
|
# include <cuda.h>
|
|
|
|
# include <cudaGL.h>
|
|
|
|
#endif
|
2011-04-27 11:58:34 +00:00
|
|
|
#include "util_debug.h"
|
2014-11-15 20:58:55 +00:00
|
|
|
#include "util_logging.h"
|
2011-04-27 11:58:34 +00:00
|
|
|
#include "util_map.h"
|
2015-11-21 17:16:01 +00:00
|
|
|
#include "util_md5.h"
|
2011-04-27 11:58:34 +00:00
|
|
|
#include "util_opengl.h"
|
|
|
|
#include "util_path.h"
|
2015-01-06 09:13:21 +00:00
|
|
|
#include "util_string.h"
|
2011-09-09 12:04:39 +00:00
|
|
|
#include "util_system.h"
|
2011-04-27 11:58:34 +00:00
|
|
|
#include "util_types.h"
|
2011-09-09 12:04:39 +00:00
|
|
|
#include "util_time.h"
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2017-03-08 10:02:54 +00:00
|
|
|
#include "split/kernel_split_data_types.h"
|
2017-02-14 10:50:29 +00:00
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
|
2016-01-14 07:24:09 +00:00
|
|
|
#ifndef WITH_CUDA_DYNLOAD
|
|
|
|
|
|
|
|
/* Transparently implement some functions, so majority of the file does not need
|
|
|
|
* to worry about difference between dynamically loaded and linked CUDA at all.
|
|
|
|
*/
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
const char *cuewErrorString(CUresult result)
|
|
|
|
{
|
|
|
|
/* We can only give error code here without major code duplication, that
|
|
|
|
* should be enough since dynamic loading is only being disabled by folks
|
|
|
|
* who knows what they're doing anyway.
|
|
|
|
*
|
|
|
|
* NOTE: Avoid call from several threads.
|
|
|
|
*/
|
|
|
|
static string error;
|
|
|
|
error = string_printf("%d", result);
|
|
|
|
return error.c_str();
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *cuewCompilerPath(void)
|
|
|
|
{
|
|
|
|
return CYCLES_CUDA_NVCC_EXECUTABLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
int cuewCompilerVersion(void)
|
|
|
|
{
|
|
|
|
return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
|
|
|
|
}
|
|
|
|
|
|
|
|
} /* namespace */
|
|
|
|
#endif /* WITH_CUDA_DYNLOAD */
|
|
|
|
|
2017-02-14 10:50:29 +00:00
|
|
|
class CUDADevice;
|
|
|
|
|
|
|
|
/* Split-kernel driver for the CUDA backend. Wraps a CUDADevice and implements
 * the DeviceSplitKernel hooks used to launch the split path tracing kernels. */
class CUDASplitKernel : public DeviceSplitKernel {
	/* Owning device; used for kernel launches and memory operations. */
	CUDADevice *device;
public:
	explicit CUDASplitKernel(CUDADevice *device);

	/* Size in bytes of the per-thread split state buffer for num_threads threads. */
	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);

	/* Enqueue the data_init kernel that sets up split-kernel state buffers. */
	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
	                                            RenderTile& rtile,
	                                            int num_global_elements,
	                                            device_memory& kernel_globals,
	                                            device_memory& kernel_data_,
	                                            device_memory& split_data,
	                                            device_memory& ray_state,
	                                            device_memory& queue_index,
	                                            device_memory& use_queues_flag,
	                                            device_memory& work_pool_wgs);

	/* Look up a named split kernel entry point in the loaded module. */
	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
	/* Work-group (block) size used for split kernel launches. */
	virtual int2 split_kernel_local_size();
	/* Global launch size for the given task. */
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
|
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
class CUDADevice : public Device
|
|
|
|
{
|
|
|
|
public:
|
2013-08-30 23:09:22 +00:00
|
|
|
DedicatedTaskPool task_pool;
|
2011-04-27 11:58:34 +00:00
|
|
|
CUdevice cuDevice;
|
|
|
|
CUcontext cuContext;
|
|
|
|
CUmodule cuModule;
|
|
|
|
map<device_ptr, bool> tex_interp_map;
|
2016-05-19 10:47:41 +00:00
|
|
|
map<device_ptr, uint> tex_bindless_map;
|
2011-04-27 11:58:34 +00:00
|
|
|
int cuDevId;
|
2013-09-27 19:09:31 +00:00
|
|
|
int cuDevArchitecture;
|
2013-05-13 21:36:48 +00:00
|
|
|
bool first_error;
|
2011-04-27 11:58:34 +00:00
|
|
|
|
|
|
|
struct PixelMem {
|
|
|
|
GLuint cuPBO;
|
|
|
|
CUgraphicsResource cuPBOresource;
|
|
|
|
GLuint cuTexId;
|
|
|
|
int w, h;
|
|
|
|
};
|
|
|
|
|
|
|
|
map<device_ptr, PixelMem> pixel_mem_map;
|
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Bindless Textures */
|
|
|
|
device_vector<uint> bindless_mapping;
|
|
|
|
bool need_bindless_mapping;
|
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
/* Convert an opaque device_ptr handle back into a CUDA driver pointer. */
CUdeviceptr cuda_device_ptr(device_ptr mem)
{
	CUdeviceptr ptr = (CUdeviceptr)mem;
	return ptr;
}
|
|
|
|
|
2014-08-05 07:57:50 +00:00
|
|
|
static bool have_precompiled_kernels()
|
2011-04-27 11:58:34 +00:00
|
|
|
{
|
2014-08-05 07:57:50 +00:00
|
|
|
string cubins_path = path_get("lib");
|
|
|
|
return path_exists(cubins_path);
|
2011-04-27 11:58:34 +00:00
|
|
|
}
|
|
|
|
|
Cycles: Refactor Progress system to provide better estimates
The Progress system in Cycles had two limitations so far:
- It just counted tiles, but ignored their size. For example, when rendering a 600x500 image with 512x512 tiles, the right 88x500 tile would count for 50% of the progress, although it only covers 15% of the image.
- Scene update time was incorrectly counted as rendering time - therefore, the remaining time started very long and gradually decreased.
This patch fixes both problems:
First of all, the Progress now has a function to ignore time spans, and that is used to ignore scene update time.
The larger change is the tile size: Instead of counting samples per tile, so that the final value is num_samples*num_tiles, the code now counts every sample for every pixel, so that the final value is num_samples*num_pixels.
Along with that, some unused variables were removed from the Progress and Session classes.
Reviewers: brecht, sergey, #cycles
Subscribers: brecht, candreacchio, sergey
Differential Revision: https://developer.blender.org/D2214
2016-11-26 03:22:34 +00:00
|
|
|
virtual bool show_samples() const
|
|
|
|
{
|
|
|
|
/* The CUDADevice only processes one tile at a time, so showing samples is fine. */
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-02-13 16:46:18 +00:00
|
|
|
/*#ifdef NDEBUG
|
2011-04-27 11:58:34 +00:00
|
|
|
#define cuda_abort()
|
|
|
|
#else
|
|
|
|
#define cuda_abort() abort()
|
2013-02-13 16:46:18 +00:00
|
|
|
#endif*/
|
2013-05-13 21:36:48 +00:00
|
|
|
void cuda_error_documentation()
|
|
|
|
{
|
|
|
|
if(first_error) {
|
|
|
|
fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
|
2017-01-24 00:09:45 +00:00
|
|
|
fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
|
2013-05-13 21:36:48 +00:00
|
|
|
first_error = false;
|
|
|
|
}
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
|
|
|
/* Run a CUDA driver API call and, on failure, record the first error message
 * in error_msg, echo it to stderr, and point the user at the docs. Execution
 * continues after a failure; this is a report-and-continue check, not a hard
 * abort (the abort is commented out below). Trailing (void)0 forces a
 * semicolon at the use site. */
#define cuda_assert(stmt) \
	{ \
		CUresult result = stmt; \
		\
		if(result != CUDA_SUCCESS) { \
			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
			/*cuda_abort();*/ \
			cuda_error_documentation(); \
		} \
	} (void)0
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2013-02-15 14:54:11 +00:00
|
|
|
/* Report a failed driver call described by stmt. Returns true when result is
 * an error (caller typically bails out), false on CUDA_SUCCESS. The first
 * error encountered is kept in error_msg for the UI. */
bool cuda_error_(CUresult result, const string& stmt)
{
	if(result == CUDA_SUCCESS) {
		return false;
	}

	const string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
	if(error_msg == "") {
		error_msg = message;
	}
	fprintf(stderr, "%s\n", message.c_str());
	cuda_error_documentation();
	return true;
}

#define cuda_error(stmt) cuda_error_(stmt, #stmt)
|
|
|
|
|
|
|
|
void cuda_error_message(const string& message)
|
2011-11-22 20:49:33 +00:00
|
|
|
{
|
|
|
|
if(error_msg == "")
|
|
|
|
error_msg = message;
|
|
|
|
fprintf(stderr, "%s\n", message.c_str());
|
2013-05-13 21:36:48 +00:00
|
|
|
cuda_error_documentation();
|
2011-11-22 20:49:33 +00:00
|
|
|
}
|
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
/* Make this device's CUDA context current on the calling thread. Must be
 * paired with cuda_pop_context() around driver API usage. */
void cuda_push_context()
{
	cuda_assert(cuCtxSetCurrent(cuContext));
}
|
|
|
|
|
|
|
|
/* Detach the CUDA context from the calling thread (sets current context to NULL). */
void cuda_pop_context()
{
	cuda_assert(cuCtxSetCurrent(NULL));
}
|
|
|
|
|
2014-05-03 17:49:56 +00:00
|
|
|
/* Initialize the CUDA device: pick the device by info.num, create a context
 * (GL-interop context when running interactively), and query the compute
 * architecture. On any failure the error is recorded via cuda_error() and the
 * constructor returns early, leaving cuContext == 0 (load_kernels checks this). */
CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_)
{
	first_error = true;
	background = background_;

	cuDevId = info.num;
	cuDevice = 0;
	cuContext = 0;

	need_bindless_mapping = false;

	/* Initialize the driver API. */
	if(cuda_error(cuInit(0)))
		return;

	/* Setup device and context. */
	if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
		return;

	CUresult result;

	if(background) {
		result = cuCtxCreate(&cuContext, 0, cuDevice);
	}
	else {
		/* Interactive rendering: try an OpenGL-interop context first so the
		 * display buffer can be shared; fall back to a plain context and
		 * treat the device as background if that fails. */
		result = cuGLCtxCreate(&cuContext, 0, cuDevice);

		if(result != CUDA_SUCCESS) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
			background = true;
		}
	}

	if(cuda_error_(result, "cuCtxCreate"))
		return;

	/* Cache the compute architecture, e.g. sm_52 -> 520. */
	int major, minor;
	cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
	cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
	cuDevArchitecture = major*100 + minor*10;

	/* cuCtxCreate makes the new context current; detach it from this thread. */
	cuda_pop_context();
}
|
|
|
|
|
|
|
|
/* Tear down in dependency order: stop queued tasks first, free the bindless
 * mapping while the context still exists, then destroy the context. */
~CUDADevice()
{
	task_pool.stop();

	if(info.has_bindless_textures) {
		tex_free(bindless_mapping);
	}

	cuda_assert(cuCtxDestroy(cuContext));
}
|
|
|
|
|
2015-11-21 16:49:00 +00:00
|
|
|
bool support_device(const DeviceRequestedFeatures& /*requested_features*/)
|
2011-12-12 22:51:35 +00:00
|
|
|
{
|
2013-10-08 15:29:28 +00:00
|
|
|
int major, minor;
|
2016-11-04 13:49:54 +00:00
|
|
|
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
|
|
|
|
cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
|
2015-11-21 16:49:00 +00:00
|
|
|
|
2014-03-27 09:29:22 +00:00
|
|
|
/* We only support sm_20 and above */
|
2013-10-08 15:29:28 +00:00
|
|
|
if(major < 2) {
|
|
|
|
cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
|
|
|
|
return false;
|
2011-12-12 22:51:35 +00:00
|
|
|
}
|
2015-11-21 16:49:00 +00:00
|
|
|
|
2011-12-12 22:51:35 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-05-06 20:34:15 +00:00
|
|
|
bool use_adaptive_compilation()
|
|
|
|
{
|
|
|
|
return DebugFlags().cuda.adaptive_compile;
|
|
|
|
}
|
|
|
|
|
2017-02-14 10:50:29 +00:00
|
|
|
bool use_split_kernel()
|
|
|
|
{
|
|
|
|
return DebugFlags().cuda.split_kernel;
|
|
|
|
}
|
|
|
|
|
2016-08-02 09:24:42 +00:00
|
|
|
/* Common NVCC flags which stays the same regardless of shading model,
|
|
|
|
* kernel sources md5 and only depends on compiler or compilation settings.
|
|
|
|
*/
|
|
|
|
/* Common NVCC flags which stays the same regardless of shading model,
 * kernel sources md5 and only depends on compiler or compilation settings.
 * The returned string is also hashed into the cubin cache key, so any flag
 * change forces a rebuild. */
string compile_kernel_get_common_cflags(
        const DeviceRequestedFeatures& requested_features, bool split=false)
{
	const int cuda_version = cuewCompilerVersion();
	const int machine = system_cpu_bits();
	const string kernel_path = path_get("kernel");
	const string include = kernel_path;
	string cflags = string_printf("-m%d "
	                              "--ptxas-options=\"-v\" "
	                              "--use_fast_math "
	                              "-DNVCC "
	                              "-D__KERNEL_CUDA_VERSION__=%d "
	                              "-I\"%s\"",
	                              machine,
	                              cuda_version,
	                              include.c_str());
	/* Adaptive compilation trims the kernel to the features the scene uses. */
	if(use_adaptive_compilation()) {
		cflags += " " + requested_features.get_build_options();
	}
	/* User-supplied extra flags, e.g. for debugging or tuning. */
	const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
	if(extra_cflags) {
		cflags += string(" ") + string(extra_cflags);
	}
#ifdef WITH_CYCLES_DEBUG
	cflags += " -D__KERNEL_DEBUG__";
#endif

	if(split) {
		cflags += " -D__SPLIT__";
	}

	return cflags;
}
|
|
|
|
|
|
|
|
bool compile_check_compiler() {
|
|
|
|
const char *nvcc = cuewCompilerPath();
|
|
|
|
if(nvcc == NULL) {
|
|
|
|
cuda_error_message("CUDA nvcc compiler not found. "
|
|
|
|
"Install CUDA toolkit in default location.");
|
|
|
|
return false;
|
|
|
|
}
|
2016-08-02 09:27:59 +00:00
|
|
|
const int cuda_version = cuewCompilerVersion();
|
2016-08-02 09:24:42 +00:00
|
|
|
VLOG(1) << "Found nvcc " << nvcc
|
|
|
|
<< ", CUDA version " << cuda_version
|
|
|
|
<< ".";
|
2016-08-02 09:27:59 +00:00
|
|
|
const int major = cuda_version / 10, minor = cuda_version & 10;
|
2016-08-02 09:24:42 +00:00
|
|
|
if(cuda_version == 0) {
|
|
|
|
cuda_error_message("CUDA nvcc compiler version could not be parsed.");
|
|
|
|
return false;
|
|
|
|
}
|
2016-08-09 09:41:25 +00:00
|
|
|
if(cuda_version < 75) {
|
2016-08-02 09:24:42 +00:00
|
|
|
printf("Unsupported CUDA version %d.%d detected, "
|
|
|
|
"you need CUDA 7.5 or newer.\n",
|
2016-08-02 09:27:59 +00:00
|
|
|
major, minor);
|
2016-08-02 09:24:42 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else if(cuda_version != 75 && cuda_version != 80) {
|
|
|
|
printf("CUDA version %d.%d detected, build may succeed but only "
|
|
|
|
"CUDA 7.5 and 8.0 are officially supported.\n",
|
2016-08-02 09:27:59 +00:00
|
|
|
major, minor);
|
2016-08-02 09:24:42 +00:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-02-14 10:50:29 +00:00
|
|
|
/* Locate or build the cubin for this device. Search order:
 * 1. pre-compiled cubin shipped with Blender (unless adaptive compilation),
 * 2. previously compiled cubin in the user cache (keyed by source md5 + cflags),
 * 3. compile with nvcc (not available for official Windows builds that ship
 *    precompiled kernels — those error out instead).
 * Returns the cubin path, or "" on failure (error already reported). */
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
{
	/* Compute cubin name. */
	int major, minor;
	cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
	cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

	/* Attempt to use kernel provided with Blender. */
	if(!use_adaptive_compilation()) {
		const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
		                                                  : "lib/kernel_sm_%d%d.cubin",
		                                            major, minor));
		VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
		if(path_exists(cubin)) {
			VLOG(1) << "Using precompiled kernel.";
			return cubin;
		}
	}

	const string common_cflags =
	        compile_kernel_get_common_cflags(requested_features, split);

	/* Try to use locally compiled kernel. */
	const string kernel_path = path_get("kernel");
	const string kernel_md5 = path_files_md5_hash(kernel_path);

	/* We include cflags into md5 so changing cuda toolkit or changing other
	 * compiler command line arguments makes sure cubin gets re-built.
	 */
	const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);

	const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
	                                              : "cycles_kernel_sm%d%d_%s.cubin",
	                                        major, minor,
	                                        cubin_md5.c_str());
	const string cubin = path_cache_get(path_join("kernels", cubin_file));
	VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
	if(path_exists(cubin)) {
		VLOG(1) << "Using locally compiled kernel.";
		return cubin;
	}

#ifdef _WIN32
	/* Official Windows builds ship precompiled kernels; if those exist but
	 * none matched this GPU, report why instead of trying to compile. */
	if(have_precompiled_kernels()) {
		if(major < 2) {
			cuda_error_message(string_printf(
			        "CUDA device requires compute capability 2.0 or up, "
			        "found %d.%d. Your GPU is not supported.",
			        major, minor));
		}
		else {
			cuda_error_message(string_printf(
			        "CUDA binary kernel for this graphics card compute "
			        "capability (%d.%d) not found.",
			        major, minor));
		}
		return "";
	}
#endif

	/* Compile. */
	if(!compile_check_compiler()) {
		return "";
	}
	const char *nvcc = cuewCompilerPath();
	const string kernel = path_join(kernel_path,
	                      path_join("kernels",
	                      path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
	double starttime = time_dt();
	printf("Compiling CUDA kernel ...\n");

	path_create_directories(cubin);

	string command = string_printf("\"%s\" "
	                               "-arch=sm_%d%d "
	                               "--cubin \"%s\" "
	                               "-o \"%s\" "
	                               "%s ",
	                               nvcc,
	                               major, minor,
	                               kernel.c_str(),
	                               cubin.c_str(),
	                               common_cflags.c_str());

	printf("%s\n", command.c_str());

	if(system(command.c_str()) == -1) {
		cuda_error_message("Failed to execute compilation command, "
		                   "see console for details.");
		return "";
	}

	/* Verify if compilation succeeded. system() returning success is not
	 * enough; check the output file actually exists. */
	if(!path_exists(cubin)) {
		cuda_error_message("CUDA kernel compilation failed, "
		                   "see console for details.");
		return "";
	}

	printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

	return cubin;
}
|
2011-09-02 00:10:03 +00:00
|
|
|
|
2015-05-09 14:05:49 +00:00
|
|
|
/* Find/compile the cubin for this device and load it as cuModule.
 * Returns false if initialization failed earlier (cuContext == 0), the GPU is
 * unsupported, no cubin could be produced, or module loading failed. */
bool load_kernels(const DeviceRequestedFeatures& requested_features)
{
	/* Check if cuda init succeeded (constructor leaves cuContext == 0 on failure). */
	if(cuContext == 0)
		return false;

	/* Check if GPU is supported. */
	if(!support_device(requested_features))
		return false;

	/* Get kernel (pre-compiled, cached, or freshly built). */
	string cubin = compile_kernel(requested_features, use_split_kernel());

	if(cubin == "")
		return false;

	/* Open module. */
	cuda_push_context();

	string cubin_data;
	CUresult result;

	/* Read the cubin into memory and load from data; avoids driver-side
	 * filesystem access (e.g. non-ASCII path issues). */
	if(path_read_text(cubin, cubin_data))
		result = cuModuleLoadData(&cuModule, cubin_data.c_str());
	else
		result = CUDA_ERROR_FILE_NOT_FOUND;

	if(cuda_error_(result, "cuModuleLoad"))
		cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

	cuda_pop_context();

	return (result == CUDA_SUCCESS);
}
|
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
void load_bindless_mapping()
|
|
|
|
{
|
|
|
|
if(info.has_bindless_textures && need_bindless_mapping) {
|
|
|
|
tex_free(bindless_mapping);
|
|
|
|
tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
|
|
|
|
need_bindless_mapping = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-14 01:45:09 +00:00
|
|
|
void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
|
2011-04-27 11:58:34 +00:00
|
|
|
{
|
2016-12-14 01:45:09 +00:00
|
|
|
if(name) {
|
|
|
|
VLOG(1) << "Buffer allocate: " << name << ", "
|
2017-03-07 10:21:36 +00:00
|
|
|
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
|
|
|
<< string_human_readable_size(mem.memory_size()) << ")";
|
2016-12-14 01:45:09 +00:00
|
|
|
}
|
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
cuda_push_context();
|
|
|
|
CUdeviceptr device_pointer;
|
2012-11-05 08:04:57 +00:00
|
|
|
size_t size = mem.memory_size();
|
2014-05-03 17:49:56 +00:00
|
|
|
cuda_assert(cuMemAlloc(&device_pointer, size));
|
2011-04-27 11:58:34 +00:00
|
|
|
mem.device_pointer = (device_ptr)device_pointer;
|
2014-09-04 11:22:40 +00:00
|
|
|
mem.device_size = size;
|
2012-11-05 08:04:57 +00:00
|
|
|
stats.mem_alloc(size);
|
2011-04-27 11:58:34 +00:00
|
|
|
cuda_pop_context();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Copy the full host-side buffer (mem.data_pointer) to the device allocation.
 * Does nothing when the device pointer has not been allocated. */
void mem_copy_to(device_memory& mem)
{
	cuda_push_context();
	if(mem.device_pointer) {
		cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
		                         (void*)mem.data_pointer,
		                         mem.memory_size()));
	}
	cuda_pop_context();
}
|
|
|
|
|
2012-01-09 16:58:01 +00:00
|
|
|
/* Copy h rows of w elements (elem bytes each), starting at row y, from the
 * device back to the host buffer. When there is no device allocation the
 * corresponding host region is zeroed instead. */
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
	const size_t offset = elem*y*w;
	const size_t bytes = elem*w*h;

	cuda_push_context();

	if(!mem.device_pointer) {
		memset((char*)mem.data_pointer + offset, 0, bytes);
	}
	else {
		cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
		                         (CUdeviceptr)(mem.device_pointer + offset), bytes));
	}

	cuda_pop_context();
}
|
|
|
|
|
|
|
|
/* Zero both the host-side buffer (if any) and the device allocation (if any). */
void mem_zero(device_memory& mem)
{
	const size_t bytes = mem.memory_size();

	if(mem.data_pointer) {
		memset((void*)mem.data_pointer, 0, bytes);
	}

	cuda_push_context();
	if(mem.device_pointer) {
		cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, bytes));
	}
	cuda_pop_context();
}
|
|
|
|
|
|
|
|
/* Release the device allocation behind mem, reset its bookkeeping fields and
 * update the stats. Safe to call on memory that was never allocated. */
void mem_free(device_memory& mem)
{
	if(!mem.device_pointer) {
		return;
	}

	cuda_push_context();
	cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
	cuda_pop_context();

	mem.device_pointer = 0;

	stats.mem_free(mem.device_size);
	mem.device_size = 0;
}
|
|
|
|
|
|
|
|
void const_copy_to(const char *name, void *host, size_t size)
|
|
|
|
{
|
|
|
|
CUdeviceptr mem;
|
|
|
|
size_t bytes;
|
|
|
|
|
|
|
|
cuda_push_context();
|
2014-05-03 17:49:56 +00:00
|
|
|
cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
|
2011-09-02 00:10:03 +00:00
|
|
|
//assert(bytes == size);
|
2014-05-03 17:49:56 +00:00
|
|
|
cuda_assert(cuMemcpyHtoD(mem, host, size));
|
2011-04-27 11:58:34 +00:00
|
|
|
cuda_pop_context();
|
|
|
|
}
|
|
|
|
|
2015-07-28 11:51:10 +00:00
|
|
|
void tex_alloc(const char *name,
|
|
|
|
device_memory& mem,
|
|
|
|
InterpolationType interpolation,
|
|
|
|
ExtensionType extension)
|
2011-04-27 11:58:34 +00:00
|
|
|
{
|
2016-05-29 22:02:05 +00:00
|
|
|
VLOG(1) << "Texture allocate: " << name << ", "
|
|
|
|
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
|
|
|
<< string_human_readable_size(mem.memory_size()) << ")";
|
2014-03-29 12:03:48 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Check if we are on sm_30 or above.
|
|
|
|
* We use arrays and bindles textures for storage there */
|
|
|
|
bool has_bindless_textures = info.has_bindless_textures;
|
|
|
|
|
|
|
|
/* General variables for both architectures */
|
2016-02-15 14:40:39 +00:00
|
|
|
string bind_name = name;
|
2016-05-19 10:47:41 +00:00
|
|
|
size_t dsize = datatype_size(mem.data_type);
|
|
|
|
size_t size = mem.memory_size();
|
|
|
|
|
|
|
|
CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
|
|
|
|
switch(extension) {
|
|
|
|
case EXTENSION_REPEAT:
|
|
|
|
address_mode = CU_TR_ADDRESS_MODE_WRAP;
|
|
|
|
break;
|
|
|
|
case EXTENSION_EXTEND:
|
|
|
|
address_mode = CU_TR_ADDRESS_MODE_CLAMP;
|
|
|
|
break;
|
|
|
|
case EXTENSION_CLIP:
|
|
|
|
address_mode = CU_TR_ADDRESS_MODE_BORDER;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(0);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
CUfilter_mode filter_mode;
|
|
|
|
if(interpolation == INTERPOLATION_CLOSEST) {
|
|
|
|
filter_mode = CU_TR_FILTER_MODE_POINT;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
filter_mode = CU_TR_FILTER_MODE_LINEAR;
|
2016-02-15 14:40:39 +00:00
|
|
|
}
|
|
|
|
|
2011-04-27 11:58:34 +00:00
|
|
|
CUarray_format_enum format;
|
2016-05-19 10:47:41 +00:00
|
|
|
switch(mem.data_type) {
|
|
|
|
case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
|
|
|
|
case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
|
|
|
|
case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
|
|
|
|
case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
|
2016-08-11 20:47:53 +00:00
|
|
|
case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
|
2016-05-19 10:47:41 +00:00
|
|
|
default: assert(0); return;
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* General variables for Fermi */
|
|
|
|
CUtexref texref = NULL;
|
2012-12-23 12:53:58 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
if(!has_bindless_textures) {
|
|
|
|
if(mem.data_depth > 1) {
|
|
|
|
/* Kernel uses different bind names for 2d and 3d float textures,
|
|
|
|
* so we have to adjust couple of things here.
|
|
|
|
*/
|
|
|
|
vector<string> tokens;
|
|
|
|
string_split(tokens, name, "_");
|
|
|
|
bind_name = string_printf("__tex_image_%s_3d_%s",
|
|
|
|
tokens[2].c_str(),
|
|
|
|
tokens[3].c_str());
|
2013-09-27 19:09:31 +00:00
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2013-09-27 19:09:31 +00:00
|
|
|
cuda_push_context();
|
2016-02-15 14:40:39 +00:00
|
|
|
cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_pop_context();
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2013-09-27 19:09:31 +00:00
|
|
|
if(!texref) {
|
2012-12-23 12:53:58 +00:00
|
|
|
return;
|
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
}
|
2012-12-23 12:53:58 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Data Storage */
|
|
|
|
if(interpolation == INTERPOLATION_NONE) {
|
|
|
|
if(has_bindless_textures) {
|
2016-12-14 01:45:09 +00:00
|
|
|
mem_alloc(NULL, mem, MEM_READ_ONLY);
|
2016-05-19 10:47:41 +00:00
|
|
|
mem_copy_to(mem);
|
2013-09-27 19:09:31 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_push_context();
|
2013-09-27 19:09:31 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
CUdeviceptr cumem;
|
|
|
|
size_t cubytes;
|
2013-09-27 19:09:31 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
|
2013-09-27 19:09:31 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
if(cubytes == 8) {
|
|
|
|
/* 64 bit device pointer */
|
|
|
|
uint64_t ptr = mem.device_pointer;
|
|
|
|
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
2014-03-07 22:16:09 +00:00
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
else {
|
|
|
|
/* 32 bit device pointer */
|
|
|
|
uint32_t ptr = (uint32_t)mem.device_pointer;
|
|
|
|
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
2014-03-07 22:16:09 +00:00
|
|
|
}
|
2013-09-27 19:09:31 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_pop_context();
|
2011-04-27 11:58:34 +00:00
|
|
|
}
|
2013-09-27 19:09:31 +00:00
|
|
|
else {
|
2016-12-14 01:45:09 +00:00
|
|
|
mem_alloc(NULL, mem, MEM_READ_ONLY);
|
2013-09-27 19:09:31 +00:00
|
|
|
mem_copy_to(mem);
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2013-09-27 19:09:31 +00:00
|
|
|
cuda_push_context();
|
|
|
|
|
2014-05-03 17:49:56 +00:00
|
|
|
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
|
|
|
|
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
|
|
|
|
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
|
2016-05-19 10:47:41 +00:00
|
|
|
|
|
|
|
cuda_pop_context();
|
2013-09-27 19:09:31 +00:00
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
}
|
|
|
|
/* Texture Storage */
|
|
|
|
else {
|
|
|
|
CUarray handle = NULL;
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_push_context();
|
|
|
|
|
|
|
|
if(mem.data_depth > 1) {
|
|
|
|
CUDA_ARRAY3D_DESCRIPTOR desc;
|
|
|
|
|
|
|
|
desc.Width = mem.data_width;
|
|
|
|
desc.Height = mem.data_height;
|
|
|
|
desc.Depth = mem.data_depth;
|
|
|
|
desc.Format = format;
|
|
|
|
desc.NumChannels = mem.data_elements;
|
|
|
|
desc.Flags = 0;
|
|
|
|
|
|
|
|
cuda_assert(cuArray3DCreate(&handle, &desc));
|
2016-04-20 12:42:04 +00:00
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
else {
|
|
|
|
CUDA_ARRAY_DESCRIPTOR desc;
|
|
|
|
|
|
|
|
desc.Width = mem.data_width;
|
|
|
|
desc.Height = mem.data_height;
|
|
|
|
desc.Format = format;
|
|
|
|
desc.NumChannels = mem.data_elements;
|
|
|
|
|
|
|
|
cuda_assert(cuArrayCreate(&handle, &desc));
|
|
|
|
}
|
|
|
|
|
|
|
|
if(!handle) {
|
|
|
|
cuda_pop_context();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Allocate 3D, 2D or 1D memory */
|
2016-04-20 12:42:04 +00:00
|
|
|
if(mem.data_depth > 1) {
|
2016-05-19 10:47:41 +00:00
|
|
|
CUDA_MEMCPY3D param;
|
|
|
|
memset(¶m, 0, sizeof(param));
|
|
|
|
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
|
|
|
|
param.dstArray = handle;
|
|
|
|
param.srcMemoryType = CU_MEMORYTYPE_HOST;
|
|
|
|
param.srcHost = (void*)mem.data_pointer;
|
|
|
|
param.srcPitch = mem.data_width*dsize*mem.data_elements;
|
|
|
|
param.WidthInBytes = param.srcPitch;
|
|
|
|
param.Height = mem.data_height;
|
|
|
|
param.Depth = mem.data_depth;
|
|
|
|
|
|
|
|
cuda_assert(cuMemcpy3D(¶m));
|
2013-09-27 19:09:31 +00:00
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
else if(mem.data_height > 1) {
|
|
|
|
CUDA_MEMCPY2D param;
|
|
|
|
memset(¶m, 0, sizeof(param));
|
|
|
|
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
|
|
|
|
param.dstArray = handle;
|
|
|
|
param.srcMemoryType = CU_MEMORYTYPE_HOST;
|
|
|
|
param.srcHost = (void*)mem.data_pointer;
|
|
|
|
param.srcPitch = mem.data_width*dsize*mem.data_elements;
|
|
|
|
param.WidthInBytes = param.srcPitch;
|
|
|
|
param.Height = mem.data_height;
|
|
|
|
|
|
|
|
cuda_assert(cuMemcpy2D(¶m));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
|
2016-04-20 12:42:04 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Fermi and Kepler */
|
|
|
|
mem.device_pointer = (device_ptr)handle;
|
|
|
|
mem.device_size = size;
|
2012-11-05 08:04:57 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
stats.mem_alloc(size);
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Bindless Textures - Kepler */
|
|
|
|
if(has_bindless_textures) {
|
|
|
|
int flat_slot = 0;
|
|
|
|
if(string_startswith(name, "__tex_image")) {
|
|
|
|
int pos = string(name).rfind("_");
|
|
|
|
flat_slot = atoi(name + pos + 1);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
assert(0);
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
CUDA_RESOURCE_DESC resDesc;
|
|
|
|
memset(&resDesc, 0, sizeof(resDesc));
|
|
|
|
resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
|
|
|
|
resDesc.res.array.hArray = handle;
|
|
|
|
resDesc.flags = 0;
|
|
|
|
|
|
|
|
CUDA_TEXTURE_DESC texDesc;
|
|
|
|
memset(&texDesc, 0, sizeof(texDesc));
|
|
|
|
texDesc.addressMode[0] = address_mode;
|
|
|
|
texDesc.addressMode[1] = address_mode;
|
|
|
|
texDesc.addressMode[2] = address_mode;
|
|
|
|
texDesc.filterMode = filter_mode;
|
|
|
|
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
|
|
|
|
|
|
|
|
CUtexObject tex = 0;
|
|
|
|
cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
|
|
|
|
|
|
|
|
/* Safety check */
|
|
|
|
if((uint)tex > UINT_MAX) {
|
|
|
|
assert(0);
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Resize once */
|
2016-08-11 20:47:53 +00:00
|
|
|
if(flat_slot >= bindless_mapping.size()) {
|
|
|
|
/* Allocate some slots in advance, to reduce amount
|
|
|
|
* of re-allocations.
|
|
|
|
*/
|
|
|
|
bindless_mapping.resize(flat_slot + 128);
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Set Mapping and tag that we need to (re-)upload to device */
|
|
|
|
bindless_mapping.get_data()[flat_slot] = (uint)tex;
|
|
|
|
tex_bindless_map[mem.device_pointer] = (uint)tex;
|
|
|
|
need_bindless_mapping = true;
|
2013-09-27 19:09:31 +00:00
|
|
|
}
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Regular Textures - Fermi */
|
2013-09-27 19:09:31 +00:00
|
|
|
else {
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
|
|
|
|
cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
|
|
|
|
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
|
|
|
|
}
|
|
|
|
|
|
|
|
cuda_pop_context();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fermi, Data and Image Textures */
|
|
|
|
if(!has_bindless_textures) {
|
|
|
|
cuda_push_context();
|
|
|
|
|
|
|
|
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
|
|
|
|
cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
|
|
|
|
if(mem.data_depth > 1) {
|
|
|
|
cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
|
2013-09-27 19:09:31 +00:00
|
|
|
}
|
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
|
|
|
|
|
2013-09-27 19:09:31 +00:00
|
|
|
cuda_pop_context();
|
|
|
|
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2016-05-19 10:47:41 +00:00
|
|
|
/* Fermi and Kepler */
|
2014-03-07 22:16:09 +00:00
|
|
|
tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
|
2011-04-27 11:58:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Free a texture previously allocated by tex_alloc.
 *
 * Textures that used interpolation were backed by a CUarray (and, on Kepler,
 * an additional CUtexObject); non-interpolated textures were plain linear
 * memory and go through the regular mem_free() path.
 */
void tex_free(device_memory& mem)
{
	if(mem.device_pointer) {
		if(tex_interp_map[mem.device_pointer]) {
			cuda_push_context();
			/* For interpolated textures the device pointer is really a CUarray
			 * handle (see tex_alloc). */
			cuArrayDestroy((CUarray)mem.device_pointer);
			cuda_pop_context();

			/* Free CUtexObject (Bindless Textures) */
			if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
				uint flat_slot = tex_bindless_map[mem.device_pointer];
				/* NOTE(review): the CUtexObject handle was stored truncated to a
				 * 32-bit uint when the texture was created; destroying via that
				 * uint assumes the handle fits in 32 bits — tex_alloc performs a
				 * safety check for this. Confirm handles never exceed UINT_MAX. */
				cuTexObjectDestroy(flat_slot);
			}

			tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
			mem.device_pointer = 0;

			stats.mem_free(mem.device_size);
			mem.device_size = 0;
		}
		else {
			/* Non-interpolated texture: plain device memory. */
			tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
			mem_free(mem);
		}
	}
}
|
|
|
|
|
2013-08-23 14:34:34 +00:00
|
|
|
/* Render one sample of a tile with the megakernel path tracer.
 *
 * rtile    - tile to render; its buffer/rng_state device pointers are used
 * sample   - sample index to render
 * branched - select the branched path tracing kernel instead of the
 *            regular one
 *
 * Synchronous: waits for the kernel to finish before returning.
 */
void path_trace(RenderTile& rtile, int sample, bool branched)
{
	if(have_error())
		return;

	cuda_push_context();

	CUfunction cuPathTrace;
	CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
	CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

	/* get kernel function */
	if(branched) {
		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
	}
	else {
		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
	}

	if(have_error()) {
		/* Fix: pop the context we pushed above before bailing out, otherwise
		 * the CUDA context stack is left unbalanced on this error path. */
		cuda_pop_context();
		return;
	}

	/* pass in parameters */
	void *args[] = {&d_buffer,
	                &d_rng_state,
	                &sample,
	                &rtile.x,
	                &rtile.y,
	                &rtile.w,
	                &rtile.h,
	                &rtile.offset,
	                &rtile.stride};

	/* launch kernel */
	int threads_per_block;
	cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));

	/*int num_registers;
	cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));

	printf("threads_per_block %d\n", threads_per_block);
	printf("num_registers %d\n", num_registers);*/

	/* Use a square 2D block sized from the kernel's max threads. */
	int xthreads = (int)sqrt(threads_per_block);
	int ythreads = (int)sqrt(threads_per_block);
	int xblocks = (rtile.w + xthreads - 1)/xthreads;
	int yblocks = (rtile.h + ythreads - 1)/ythreads;

	cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));

	cuda_assert(cuLaunchKernel(cuPathTrace,
	                           xblocks , yblocks, 1, /* blocks */
	                           xthreads, ythreads, 1, /* threads */
	                           0, 0, args, 0));

	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
}
|
|
|
|
|
2013-08-30 23:49:38 +00:00
|
|
|
/* Convert the accumulated render buffer to displayable RGBA pixels.
 *
 * Exactly one of rgba_byte / rgba_half is expected to be non-zero; it
 * selects both the output buffer and the conversion kernel (byte vs. half
 * float). The output may be an OpenGL PBO, which map_pixels()/unmap_pixels()
 * handle transparently.
 */
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
{
	if(have_error())
		return;

	cuda_push_context();

	CUfunction cuFilmConvert;
	CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
	CUdeviceptr d_buffer = cuda_device_ptr(buffer);

	/* get kernel function */
	if(rgba_half) {
		cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
	}
	else {
		cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
	}

	/* Scale accumulated samples down to an average (samples are 0-based). */
	float sample_scale = 1.0f/(task.sample + 1);

	/* pass in parameters */
	void *args[] = {&d_rgba,
	                &d_buffer,
	                &sample_scale,
	                &task.x,
	                &task.y,
	                &task.w,
	                &task.h,
	                &task.offset,
	                &task.stride};

	/* launch kernel */
	int threads_per_block;
	cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));

	/* Square 2D block sized from the kernel's max threads. */
	int xthreads = (int)sqrt(threads_per_block);
	int ythreads = (int)sqrt(threads_per_block);
	int xblocks = (task.w + xthreads - 1)/xthreads;
	int yblocks = (task.h + ythreads - 1)/ythreads;

	cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));

	cuda_assert(cuLaunchKernel(cuFilmConvert,
	                           xblocks , yblocks, 1, /* blocks */
	                           xthreads, ythreads, 1, /* threads */
	                           0, 0, args, 0));

	unmap_pixels((rgba_byte)? rgba_byte: rgba_half);

	cuda_pop_context();
}
|
|
|
|
|
2011-12-31 15:18:13 +00:00
|
|
|
/* Run shader evaluation / baking over a 1D range of shader points.
 *
 * Work is processed in chunks of shader_chunk_size points per launch so
 * cancellation can be checked between launches. Bake tasks
 * (shader_eval_type >= SHADER_EVAL_BAKE) use the bake kernel and pass a
 * shader_filter argument; other tasks use the shader kernel and pass an
 * extra luma output buffer instead.
 */
void shader(DeviceTask& task)
{
	if(have_error())
		return;

	cuda_push_context();

	CUfunction cuShader;
	CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
	CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
	CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);

	/* get kernel function */
	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
	}
	else {
		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
	}

	/* do tasks in smaller chunks, so we can cancel it */
	const int shader_chunk_size = 65536;
	const int start = task.shader_x;
	const int end = task.shader_x + task.shader_w;
	int offset = task.offset;

	bool canceled = false;
	for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
		for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
			int shader_w = min(shader_chunk_size, end - shader_x);

			/* pass in parameters
			 * NOTE: the argument order and which optional arguments are
			 * included must match the selected kernel's signature exactly. */
			void *args[8];
			int arg = 0;
			args[arg++] = &d_input;
			args[arg++] = &d_output;
			if(task.shader_eval_type < SHADER_EVAL_BAKE) {
				/* Shader kernel takes an extra luma output. */
				args[arg++] = &d_output_luma;
			}
			args[arg++] = &task.shader_eval_type;
			if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
				/* Bake kernel takes a filter argument. */
				args[arg++] = &task.shader_filter;
			}
			args[arg++] = &shader_x;
			args[arg++] = &shader_w;
			args[arg++] = &offset;
			args[arg++] = &sample;

			/* launch kernel */
			int threads_per_block;
			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));

			int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;

			cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
			cuda_assert(cuLaunchKernel(cuShader,
			                           xblocks , 1, 1, /* blocks */
			                           threads_per_block, 1, 1, /* threads */
			                           0, 0, args, 0));

			cuda_assert(cuCtxSynchronize());

			if(task.get_cancel()) {
				canceled = true;
				break;
			}
		}

		task.update_progress(NULL);
	}

	cuda_pop_context();
}
|
|
|
|
|
|
|
|
/* Return a CUDA device pointer for a pixel buffer.
 *
 * In interactive (non-background) mode the pixels live in an OpenGL PBO,
 * which must be mapped into CUDA's address space first; in background mode
 * the pixels are plain device memory and the pointer is returned directly.
 * Pair with unmap_pixels() after use.
 */
CUdeviceptr map_pixels(device_ptr mem)
{
	if(background) {
		/* No GL interop: pixels are ordinary CUDA memory. */
		return cuda_device_ptr(mem);
	}

	PixelMem pmem = pixel_mem_map[mem];
	CUdeviceptr mapped_buffer;
	size_t mapped_bytes;

	cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
	cuda_assert(cuGraphicsResourceGetMappedPointer(&mapped_buffer, &mapped_bytes, pmem.cuPBOresource));

	return mapped_buffer;
}
|
|
|
|
|
|
|
|
/* Release a pixel buffer previously mapped with map_pixels().
 *
 * Only needed in interactive mode, where the buffer is a GL PBO mapped
 * into CUDA; in background mode there is nothing to undo.
 */
void unmap_pixels(device_ptr mem)
{
	if(background) {
		return;
	}

	PixelMem pmem = pixel_mem_map[mem];
	cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
}
|
|
|
|
|
|
|
|
/* Allocate a display pixel buffer.
 *
 * In interactive mode this creates an OpenGL PBO + texture pair and
 * registers the PBO with CUDA for interop. If registration fails (e.g. the
 * render device is not the display device), GL interop is permanently
 * disabled by setting background = true and allocation falls back to the
 * generic Device::pixels_alloc() path.
 */
void pixels_alloc(device_memory& mem)
{
	if(!background) {
		PixelMem pmem;

		pmem.w = mem.data_width;
		pmem.h = mem.data_height;

		cuda_push_context();

		/* PBO that CUDA will write pixels into. */
		glGenBuffers(1, &pmem.cuPBO);
		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
		if(mem.data_type == TYPE_HALF)
			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
		else
			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);

		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

		/* Texture the PBO contents get uploaded into for drawing. */
		glGenTextures(1, &pmem.cuTexId);
		glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
		if(mem.data_type == TYPE_HALF)
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
		else
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
		glBindTexture(GL_TEXTURE_2D, 0);

		CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

		if(result == CUDA_SUCCESS) {
			cuda_pop_context();

			/* The GL texture id doubles as the device pointer / map key. */
			mem.device_pointer = pmem.cuTexId;
			pixel_mem_map[mem.device_pointer] = pmem;

			mem.device_size = mem.memory_size();
			stats.mem_alloc(mem.device_size);

			return;
		}
		else {
			/* failed to register buffer, fallback to no interop */
			glDeleteBuffers(1, &pmem.cuPBO);
			glDeleteTextures(1, &pmem.cuTexId);

			cuda_pop_context();

			background = true;
		}
	}

	/* Background mode or interop fallback: generic allocation. */
	Device::pixels_alloc(mem);
}
|
|
|
|
|
|
|
|
/* Copy h rows of 4-byte RGBA pixels, starting at row y, from the display
 * buffer back to host memory (mem.data_pointer).
 *
 * Interactive mode reads the pixels out of the GL PBO via a mapped pointer;
 * background mode defers to the generic Device implementation.
 */
void pixels_copy_from(device_memory& mem, int y, int w, int h)
{
	if(background) {
		Device::pixels_copy_from(mem, y, w, h);
		return;
	}

	PixelMem pmem = pixel_mem_map[mem.device_pointer];

	cuda_push_context();

	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
	uchar *mapped = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
	/* Byte offset of the first requested row (RGBA = 4 bytes/pixel). */
	size_t byte_offset = sizeof(uchar)*4*y*w;
	memcpy((uchar*)mem.data_pointer + byte_offset, mapped + byte_offset, sizeof(uchar)*4*w*h);
	glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

	cuda_pop_context();
}
|
|
|
|
|
|
|
|
/* Free a display pixel buffer allocated by pixels_alloc.
 *
 * In interactive mode this unregisters the CUDA graphics resource and
 * deletes the GL PBO and texture; otherwise the generic Device path is
 * used.
 */
void pixels_free(device_memory& mem)
{
	if(mem.device_pointer) {
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
			glDeleteBuffers(1, &pmem.cuPBO);
			glDeleteTextures(1, &pmem.cuTexId);

			cuda_pop_context();

			pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
			mem.device_pointer = 0;

			stats.mem_free(mem.device_size);
			mem.device_size = 0;

			return;
		}

		/* Background mode: buffer was allocated generically. */
		Device::pixels_free(mem);
	}
}
|
|
|
|
|
2015-05-11 14:40:38 +00:00
|
|
|
/* Draw rendered pixels to the screen with fixed-function OpenGL.
 *
 * (y, w, h)        - sub-rectangle of the pixel buffer to display
 * (dx, dy)         - destination position in the viewport
 * (width, height)  - destination size in the viewport
 * transparent      - premultiplied-alpha blend instead of opaque draw
 * draw_params      - optional callbacks to bind/unbind a display-space shader
 *
 * Interactive mode uploads from the GL PBO into the display texture and
 * draws a textured quad; background mode defers to Device::draw_pixels.
 */
void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
	const DeviceDrawParams &draw_params)
{
	if(!background) {
		PixelMem pmem = pixel_mem_map[mem.device_pointer];
		float *vpointer;

		cuda_push_context();

		/* for multi devices, this assumes the inefficient method that we allocate
		 * all pixels on the device even though we only render to a subset */
		size_t offset = 4*y*w;

		if(mem.data_type == TYPE_HALF)
			offset *= sizeof(GLhalf);
		else
			offset *= sizeof(uint8_t);

		/* Upload the PBO contents into the display texture; the offset is a
		 * byte offset into the bound PBO, not a host pointer. */
		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
		glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
		if(mem.data_type == TYPE_HALF)
			glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
		else
			glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

		glEnable(GL_TEXTURE_2D);

		if(transparent) {
			glEnable(GL_BLEND);
			glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
		}

		glColor3f(1.0f, 1.0f, 1.0f);

		if(draw_params.bind_display_space_shader_cb) {
			draw_params.bind_display_space_shader_cb();
		}

		/* Lazily create the quad vertex buffer once. */
		if(!vertex_buffer)
			glGenBuffers(1, &vertex_buffer);

		glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
		/* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
		glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);

		vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);

		if(vpointer) {
			/* texture coordinate - vertex pair */
			vpointer[0] = 0.0f;
			vpointer[1] = 0.0f;
			vpointer[2] = dx;
			vpointer[3] = dy;

			vpointer[4] = (float)w/(float)pmem.w;
			vpointer[5] = 0.0f;
			vpointer[6] = (float)width + dx;
			vpointer[7] = dy;

			vpointer[8] = (float)w/(float)pmem.w;
			vpointer[9] = (float)h/(float)pmem.h;
			vpointer[10] = (float)width + dx;
			vpointer[11] = (float)height + dy;

			vpointer[12] = 0.0f;
			vpointer[13] = (float)h/(float)pmem.h;
			vpointer[14] = dx;
			vpointer[15] = (float)height + dy;

			glUnmapBuffer(GL_ARRAY_BUFFER);
		}

		/* Interleaved layout: 2 texcoord floats followed by 2 vertex floats. */
		glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), 0);
		glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), (char *)NULL + 2 * sizeof(float));

		glEnableClientState(GL_VERTEX_ARRAY);
		glEnableClientState(GL_TEXTURE_COORD_ARRAY);

		glDrawArrays(GL_TRIANGLE_FAN, 0, 4);

		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
		glDisableClientState(GL_VERTEX_ARRAY);

		glBindBuffer(GL_ARRAY_BUFFER, 0);

		if(draw_params.unbind_display_space_shader_cb) {
			draw_params.unbind_display_space_shader_cb();
		}

		if(transparent)
			glDisable(GL_BLEND);

		glBindTexture(GL_TEXTURE_2D, 0);
		glDisable(GL_TEXTURE_2D);

		cuda_pop_context();

		return;
	}

	Device::draw_pixels(mem, y, w, h, dx, dy, width, height, transparent, draw_params);
}
|
|
|
|
|
2012-09-04 13:29:07 +00:00
|
|
|
/* Worker entry point for a queued device task.
 *
 * PATH_TRACE tasks loop acquiring tiles and rendering them sample by sample
 * (megakernel) or via the split kernel; SHADER tasks run shader evaluation.
 * Runs on a task pool thread (see CUDADeviceTask below).
 */
void thread_run(DeviceTask *task)
{
	if(task->type == DeviceTask::PATH_TRACE) {
		RenderTile tile;

		bool branched = task->integrator_branched;

		/* Upload Bindless Mapping */
		load_bindless_mapping();

		if(!use_split_kernel()) {
			/* keep rendering tiles until done */
			while(task->acquire_tile(this, tile)) {
				int start_sample = tile.start_sample;
				int end_sample = tile.start_sample + tile.num_samples;

				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task->get_cancel()) {
						/* Stop immediately unless the queue must be drained. */
						if(task->need_finish_queue == false)
							break;
					}

					path_trace(tile, sample, branched);

					tile.sample = sample + 1;

					task->update_progress(&tile, tile.w*tile.h);
				}

				task->release_tile(tile);
			}
		}
		else {
			DeviceRequestedFeatures requested_features;
			if(!use_adaptive_compilation()) {
				/* Without adaptive compilation, compile for the maximum. */
				requested_features.max_closure = 64;
			}

			CUDASplitKernel split_kernel(this);
			split_kernel.load_kernels(requested_features);

			while(task->acquire_tile(this, tile)) {
				/* Split kernel manages its own buffers; pass dummies. */
				device_memory void_buffer;
				split_kernel.path_trace(task, tile, void_buffer, void_buffer);

				task->release_tile(tile);

				if(task->get_cancel()) {
					if(task->need_finish_queue == false)
						break;
				}
			}
		}
	}
	else if(task->type == DeviceTask::SHADER) {
		/* Upload Bindless Mapping */
		load_bindless_mapping();

		shader(*task);

		cuda_push_context();
		cuda_assert(cuCtxSynchronize());
		cuda_pop_context();
	}
}
|
|
|
|
|
2012-09-04 13:29:07 +00:00
|
|
|
/* DeviceTask wrapper that binds CUDADevice::thread_run as the task's run
 * callback, so the task pool executes it on a worker thread. */
class CUDADeviceTask : public DeviceTask {
public:
	CUDADeviceTask(CUDADevice *device, DeviceTask& task)
	: DeviceTask(task)
	{
		run = function_bind(&CUDADevice::thread_run, device, this);
	}
};
|
|
|
|
|
2015-03-27 10:47:55 +00:00
|
|
|
/* Tasks are not subdivided for the CUDA device; always a single task. */
int get_split_task_count(DeviceTask& /*task*/)
{
	return 1;
}
|
|
|
|
|
2012-09-04 13:29:07 +00:00
|
|
|
/* Queue a task for execution.
 *
 * FILM_CONVERT is executed synchronously on the calling (main) thread
 * because it touches OpenGL state; everything else is pushed onto the
 * task pool and handled by thread_run().
 */
void task_add(DeviceTask& task)
{
	if(task.type != DeviceTask::FILM_CONVERT) {
		task_pool.push(new CUDADeviceTask(this, task));
		return;
	}

	/* must be done in main thread due to opengl access */
	film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);

	cuda_push_context();
	cuda_assert(cuCtxSynchronize());
	cuda_pop_context();
}
|
2011-04-27 11:58:34 +00:00
|
|
|
|
2012-09-04 13:29:07 +00:00
|
|
|
/* Block until all queued tasks have finished. */
void task_wait()
{
	task_pool.wait();
}
|
|
|
|
|
|
|
|
/* Request cancellation of all queued/running tasks. */
void task_cancel()
{
	task_pool.cancel();
}
|
2017-02-14 10:50:29 +00:00
|
|
|
|
|
|
|
friend class CUDASplitKernelFunction;
|
|
|
|
friend class CUDASplitKernel;
|
2011-04-27 11:58:34 +00:00
|
|
|
};
|
|
|
|
|
2017-02-14 10:50:29 +00:00
|
|
|
/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
 * now that the definition of that class is complete
 *
 * NOTE: this variant reports errors through an explicit `device` pointer that
 * must be in scope at the call site (as in the split kernel classes below),
 * rather than through implicit member access.
 */
#undef cuda_assert
#define cuda_assert(stmt) \
	{ \
		CUresult result = stmt; \
		\
		if(result != CUDA_SUCCESS) { \
			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
			if(device->error_msg == "") \
				device->error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
			/*cuda_abort();*/ \
			device->cuda_error_documentation(); \
		} \
	} (void)0
|
|
|
|
|
|
|
|
/* split kernel */
|
|
|
|
|
|
|
|
class CUDASplitKernelFunction : public SplitKernelFunction{
|
|
|
|
CUDADevice* device;
|
|
|
|
CUfunction func;
|
|
|
|
public:
|
|
|
|
CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
|
|
|
|
|
|
|
|
/* enqueue the kernel, returns false if there is an error */
|
|
|
|
bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
|
|
|
|
{
|
|
|
|
return enqueue(dim, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* enqueue the kernel, returns false if there is an error */
|
|
|
|
bool enqueue(const KernelDimensions &dim, void *args[])
|
|
|
|
{
|
|
|
|
device->cuda_push_context();
|
|
|
|
|
|
|
|
if(device->have_error())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* we ignore dim.local_size for now, as this is faster */
|
|
|
|
int threads_per_block;
|
|
|
|
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
|
|
|
|
|
|
|
|
int xthreads = (int)sqrt(threads_per_block);
|
|
|
|
int ythreads = (int)sqrt(threads_per_block);
|
|
|
|
|
|
|
|
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
|
|
|
|
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
|
|
|
|
|
|
|
|
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
|
|
|
|
|
|
|
|
cuda_assert(cuLaunchKernel(func,
|
2017-03-07 10:21:36 +00:00
|
|
|
xblocks , yblocks, 1, /* blocks */
|
|
|
|
xthreads, ythreads, 1, /* threads */
|
|
|
|
0, 0, args, 0));
|
2017-02-14 10:50:29 +00:00
|
|
|
|
|
|
|
device->cuda_pop_context();
|
|
|
|
|
|
|
|
return !device->have_error();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Keep a typed pointer to the CUDA device alongside the generic base. */
CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
{
}
|
|
|
|
|
2017-03-11 10:23:11 +00:00
|
|
|
/* Query the device for the split-kernel state buffer size needed for
 * num_threads parallel path states.
 *
 * Launches the single-thread kernel_cuda_state_buffer_size kernel, which
 * writes the byte count into a one-element device buffer that is then
 * copied back to the host.
 */
uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
{
	device_vector<uint64_t> size_buffer;
	size_buffer.resize(1);
	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);

	device->cuda_push_context();

	uint threads = num_threads;
	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);

	/* Struct of pointers has the same layout as the void*[] that
	 * cuLaunchKernel expects for kernelParams. */
	struct args_t {
		uint* num_threads;
		CUdeviceptr* size;
	};

	args_t args = {
		&threads,
		&d_size
	};

	CUfunction state_buffer_size;
	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));

	/* Fix: cuLaunchKernel takes void** for kernelParams; an explicit cast
	 * from args_t* is required (as done in enqueue_split_kernel_data_init). */
	cuda_assert(cuLaunchKernel(state_buffer_size,
	                           1, 1, 1,
	                           1, 1, 1,
	                           0, 0, (void**)&args, 0));

	device->cuda_pop_context();

	/* Copy the result to host storage; freeing the device side afterwards
	 * leaves the host-side data intact for the read below. */
	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
	device->mem_free(size_buffer);

	return *size_buffer.get_data();
}
|
|
|
|
|
2017-02-14 10:50:29 +00:00
|
|
|
/* Launch the split-kernel data_init kernel that initializes per-thread
 * path state, queues and work pools for a tile.
 *
 * Returns false if the kernel could not be found or a CUDA error occurred.
 * kernel_globals/kernel_data are unused on CUDA (they live in module
 * globals instead of being passed as arguments).
 */
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                     RenderTile& rtile,
                                                     int num_global_elements,
                                                     device_memory& /*kernel_globals*/,
                                                     device_memory& /*kernel_data*/,
                                                     device_memory& split_data,
                                                     device_memory& ray_state,
                                                     device_memory& queue_index,
                                                     device_memory& use_queues_flag,
                                                     device_memory& work_pool_wgs)
{
	device->cuda_push_context();

	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);

	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);

	int end_sample = rtile.start_sample + rtile.num_samples;
	int queue_size = dim.global_size[0] * dim.global_size[1];

	/* Struct of pointers has the same layout as the void*[] kernelParams
	 * array cuLaunchKernel expects; member order must match the kernel's
	 * parameter order exactly. */
	struct args_t {
		CUdeviceptr* split_data_buffer;
		int* num_elements;
		CUdeviceptr* ray_state;
		CUdeviceptr* rng_state;
		int* start_sample;
		int* end_sample;
		int* sx;
		int* sy;
		int* sw;
		int* sh;
		int* offset;
		int* stride;
		CUdeviceptr* queue_index;
		int* queuesize;
		CUdeviceptr* use_queues_flag;
		CUdeviceptr* work_pool_wgs;
		int* num_samples;
		CUdeviceptr* buffer;
	};

	args_t args = {
		&d_split_data,
		&num_global_elements,
		&d_ray_state,
		&d_rng_state,
		&rtile.start_sample,
		&end_sample,
		&rtile.x,
		&rtile.y,
		&rtile.w,
		&rtile.h,
		&rtile.offset,
		&rtile.stride,
		&d_queue_index,
		&queue_size,
		&d_use_queues_flag,
		&d_work_pool_wgs,
		&rtile.num_samples,
		&d_buffer
	};

	CUfunction data_init;
	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
	if(device->have_error()) {
		return false;
	}

	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);

	device->cuda_pop_context();

	return !device->have_error();
}
|
|
|
|
|
|
|
|
/* Look up a split kernel entry point by name and wrap it.
 *
 * kernel_name is the suffix after the "kernel_cuda_" prefix of the
 * function in the loaded CUDA module. Returns NULL (with an error
 * message set on the device) when the function is not present; on
 * success the caller owns the returned CUDASplitKernelFunction.
 */
SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
	CUfunction func;

	device->cuda_push_context();

	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
	if(device->have_error()) {
		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
		/* Fix: pop the context on the error path too, otherwise the
		 * early return leaves it pushed. */
		device->cuda_pop_context();
		return NULL;
	}

	device->cuda_pop_context();

	return new CUDASplitKernelFunction(device, func);
}
|
|
|
|
|
|
|
|
/* Work-group (CUDA block) size used when launching split kernels. */
int2 CUDASplitKernel::split_kernel_local_size()
{
	const int2 local_size = make_int2(32, 1);
	return local_size;
}
|
|
|
|
|
2017-03-04 11:29:01 +00:00
|
|
|
/* Global work size used when launching split kernels. */
int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
{
	/* TODO(mai): implement something here to detect ideal work size */
	const int2 global_size = make_int2(256, 256);
	return global_size;
}
|
|
|
|
|
2014-08-05 07:57:50 +00:00
|
|
|
/* One-time initialization of the CUDA wrangler (CUEW).
 *
 * Returns true when CUDA is usable: either precompiled kernels exist or
 * (outside Windows) a CUDA compiler is available for runtime compilation.
 * The result is cached so repeated calls are cheap. When CUDA is linked
 * directly (no WITH_CUDA_DYNLOAD) availability is assumed.
 */
bool device_cuda_init(void)
{
#ifdef WITH_CUDA_DYNLOAD
	static bool initialized = false;
	static bool result = false;

	if(initialized)
		return result;

	initialized = true;
	int cuew_result = cuewInit();
	if(cuew_result == CUEW_SUCCESS) {
		VLOG(1) << "CUEW initialization succeeded";
		if(CUDADevice::have_precompiled_kernels()) {
			VLOG(1) << "Found precompiled kernels";
			result = true;
		}
#ifndef _WIN32
		else if(cuewCompilerPath() != NULL) {
			VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
			result = true;
		}
		else {
			/* Fix: "wad found" -> "was found" typo in the log message. */
			VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
			        << " unable to use CUDA";
		}
#endif
	}
	else {
		VLOG(1) << "CUEW initialization failed: "
		        << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED)
		            ? "Error setting up atexit() handler"
		            : "Error opening the library");
	}

	return result;
#else  /* WITH_CUDA_DYNLOAD */
	return true;
#endif /* WITH_CUDA_DYNLOAD */
}
|
|
|
|
|
2012-11-05 08:04:57 +00:00
|
|
|
/* Factory: construct a new CUDA device for the given device info. */
Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
{
	CUDADevice *device = new CUDADevice(info, stats, background);
	return device;
}
|
|
|
|
|
|
|
|
/* Enumerate usable CUDA devices and append them to the given list.
 *
 * Devices with compute capability below 2 are skipped. Devices that look
 * like display devices (kernel execution timeout enabled) are collected
 * separately and appended after the compute-only devices, so non-display
 * GPUs are preferred.
 */
void device_cuda_info(vector<DeviceInfo>& devices)
{
	int device_count = 0;

	CUresult result = cuInit(0);
	if(result != CUDA_SUCCESS) {
		/* CUDA_ERROR_NO_DEVICE simply means there is nothing to list. */
		if(result != CUDA_ERROR_NO_DEVICE)
			fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
		return;
	}

	result = cuDeviceGetCount(&device_count);
	if(result != CUDA_SUCCESS) {
		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
		return;
	}

	vector<DeviceInfo> display_devices;

	for(int num = 0; num < device_count; num++) {
		char name[256];

		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
			continue;

		int major;
		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
		if(major < 2)
			continue;

		DeviceInfo info;

		info.type = DEVICE_CUDA;
		info.description = string(name);
		info.num = num;

		info.advanced_shading = (major >= 2);
		info.has_bindless_textures = (major >= 3);
		info.pack_images = false;

		/* Identify the device by its PCI location so the id stays stable
		 * across hardware addition/removal. */
		int pci_location[3] = {0, 0, 0};
		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
		cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
		cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
		info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
		                        name,
		                        (unsigned int)pci_location[0],
		                        (unsigned int)pci_location[1],
		                        (unsigned int)pci_location[2]);

		/* if device has a kernel timeout, assume it is used for display */
		int timeout_attr = 0;
		if(cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS &&
		   timeout_attr == 1)
		{
			info.description += " (Display)";
			info.display_device = true;
			display_devices.push_back(info);
		}
		else {
			devices.push_back(info);
		}
	}

	if(!display_devices.empty())
		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}
|
|
|
|
|
2015-01-06 09:13:21 +00:00
|
|
|
/* Build a human-readable report of every CUDA device's attributes.
 *
 * Used for diagnostics (e.g. system info dumps). Returns an error string
 * when CUDA cannot be initialized or enumerated; otherwise one indented
 * section per device, listing raw CU_DEVICE_ATTRIBUTE_* values.
 */
string device_cuda_capabilities(void)
{
	CUresult result = cuInit(0);
	if(result != CUDA_SUCCESS) {
		if(result != CUDA_ERROR_NO_DEVICE) {
			return string("Error initializing CUDA: ") + cuewErrorString(result);
		}
		return "No CUDA device found\n";
	}

	int count;
	result = cuDeviceGetCount(&count);
	if(result != CUDA_SUCCESS) {
		return string("Error getting devices: ") + cuewErrorString(result);
	}

	string capabilities = "";
	for(int num = 0; num < count; num++) {
		char name[256];
		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
			continue;
		}
		capabilities += string("\t") + name + "\n";
		int value;
		/* Query one attribute and append it to the report; attributes that
		 * fail to query are silently skipped. */
#define GET_ATTR(attr) \
		{ \
			if(cuDeviceGetAttribute(&value, \
			                        CU_DEVICE_ATTRIBUTE_##attr, \
			                        num) == CUDA_SUCCESS) \
			{ \
				capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
				                              value); \
			} \
		} (void)0
		/* TODO(sergey): Strip all attributes which are not useful for us
		 * or does not depend on the driver.
		 */
		GET_ATTR(MAX_THREADS_PER_BLOCK);
		GET_ATTR(MAX_BLOCK_DIM_X);
		GET_ATTR(MAX_BLOCK_DIM_Y);
		GET_ATTR(MAX_BLOCK_DIM_Z);
		GET_ATTR(MAX_GRID_DIM_X);
		GET_ATTR(MAX_GRID_DIM_Y);
		GET_ATTR(MAX_GRID_DIM_Z);
		GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
		GET_ATTR(SHARED_MEMORY_PER_BLOCK);
		GET_ATTR(TOTAL_CONSTANT_MEMORY);
		GET_ATTR(WARP_SIZE);
		GET_ATTR(MAX_PITCH);
		GET_ATTR(MAX_REGISTERS_PER_BLOCK);
		GET_ATTR(REGISTERS_PER_BLOCK);
		GET_ATTR(CLOCK_RATE);
		GET_ATTR(TEXTURE_ALIGNMENT);
		GET_ATTR(GPU_OVERLAP);
		GET_ATTR(MULTIPROCESSOR_COUNT);
		GET_ATTR(KERNEL_EXEC_TIMEOUT);
		GET_ATTR(INTEGRATED);
		GET_ATTR(CAN_MAP_HOST_MEMORY);
		GET_ATTR(COMPUTE_MODE);
		GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
		GET_ATTR(SURFACE_ALIGNMENT);
		GET_ATTR(CONCURRENT_KERNELS);
		GET_ATTR(ECC_ENABLED);
		GET_ATTR(TCC_DRIVER);
		GET_ATTR(MEMORY_CLOCK_RATE);
		GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
		GET_ATTR(L2_CACHE_SIZE);
		GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
		GET_ATTR(ASYNC_ENGINE_COUNT);
		GET_ATTR(UNIFIED_ADDRESSING);
		GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
		GET_ATTR(CAN_TEX2D_GATHER);
		GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
		GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
		GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
		GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
		GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
		GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
		GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
		GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
		GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
		GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
		GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
		GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
		GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
		GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
		GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
		GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
		GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
		GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
		GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
		GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
		GET_ATTR(COMPUTE_CAPABILITY_MINOR);
		GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
		GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
		GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
		GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
		GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
		GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
		GET_ATTR(MANAGED_MEMORY);
		GET_ATTR(MULTI_GPU_BOARD);
		GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
#undef GET_ATTR
		capabilities += "\n";
	}

	return capabilities;
}
|
|
|
|
|
|
|
|
CCL_NAMESPACE_END
|