cycles: Add an nvrtc based cubin cli compiler.

nvcc is very picky regarding compiler versions, severely limiting the compiler we can use, this commit adds a nvrtc based compiler that'll allow us to build the cubins even if the host compiler is unsupported. for details see D2913. Differential Revision: http://developer.blender.org/D2913
2018-02-03 10:59:09 -07:00 · 2018-02-03 10:59:09 -07:00 · a5052770b8
commit a5052770b8
parent db989e1f11
9 changed files with 406 additions and 47 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -409,12 +409,14 @@ option(WITH_CYCLES_STANDALONE_GUI	"Build Cycles standalone with GUI" OFF)
 option(WITH_CYCLES_OSL				"Build Cycles with OSL support" ${_init_CYCLES_OSL})
 option(WITH_CYCLES_OPENSUBDIV		"Build Cycles with OpenSubdiv support" ${_init_CYCLES_OPENSUBDIV})
 option(WITH_CYCLES_CUDA_BINARIES	"Build Cycles CUDA binaries" OFF)
+option(WITH_CYCLES_CUBIN_COMPILER	"Build cubins with nvrtc based compiler instead of nvcc" OFF)
 set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 CACHE STRING "CUDA architectures to build binaries for")
 mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
 unset(PLATFORM_DEFAULT)
 option(WITH_CYCLES_LOGGING	"Build Cycles with logging support" ON)
 option(WITH_CYCLES_DEBUG	"Build Cycles with extra debug capabilities" OFF)
 option(WITH_CYCLES_NATIVE_ONLY	"Build Cycles with native kernel only (which fits current CPU, use for development only)" OFF)
+mark_as_advanced(WITH_CYCLES_CUBIN_COMPILER)
 mark_as_advanced(WITH_CYCLES_LOGGING)
 mark_as_advanced(WITH_CYCLES_DEBUG)
 mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@ -1323,6 +1323,7 @@ int cuewInit(void);
 const char *cuewErrorString(CUresult result);
 const char *cuewCompilerPath(void);
 int cuewCompilerVersion(void);
+int cuewNvrtcVersion(void);

 #ifdef __cplusplus
 }
--- a/extern/cuew/src/cuew.c
+++ b/extern/cuew/src/cuew.c
@ -329,7 +329,7 @@ int cuewInit(void) {
 #ifdef _WIN32
  /* Expected in c:/windows/system or similar, no path needed. */
  const char *cuda_paths[] = {"nvcuda.dll", NULL};
-  const char *nvrtc_paths[] = {"nvrtc.dll", NULL};
+  const char *nvrtc_paths[] = {"nvrtc64_80.dll", "nvrtc64_90.dll", "nvrtc64_91.dll", NULL};
 #elif defined(__APPLE__)
  /* Default installation path. */
  const char *cuda_paths[] = {"/usr/local/cuda/lib/libcuda.dylib", NULL};
@ -766,6 +766,15 @@ const char *cuewCompilerPath(void) {
  return NULL;
 }

+int cuewNvrtcVersion(void) {
+  int major, minor;
+  if (nvrtcVersion) {
+    nvrtcVersion(&major, &minor);
+    return 10 * major + minor;
+  }
+  return 0;
+}
+
 int cuewCompilerVersion(void) {
  const char *path = cuewCompilerPath();
  const char *marker = "Cuda compilation tools, release ";
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@ -242,6 +242,24 @@ if(CMAKE_COMPILER_IS_GNUCXX)
 	unset(_has_no_error_unused_macros)
 endif()

+if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
+	if(MSVC)
+		set(MAX_MSVC 1800)
+		if(${CUDA_VERSION} EQUAL "8.0")
+			set(MAX_MSVC 1900)
+		elseif(${CUDA_VERSION} EQUAL "9.0")
+			set(MAX_MSVC 1910)
+		elseif(${CUDA_VERSION} EQUAL "9.1")
+			set(MAX_MSVC 1911)
+		endif()
+		if (NOT MSVC_VERSION LESS ${MAX_MSVC})
+			message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
+			set(WITH_CYCLES_CUBIN_COMPILER ON)
+		endif()
+		unset(MAX_MSVC)
+	endif()
+endif()
+

 # Subdirectories

@ -254,7 +272,7 @@ if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()

-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK)
+if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
 	add_subdirectory(app)
 endif()

--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@ -120,3 +120,27 @@ if(WITH_CYCLES_NETWORK)
 	endif()
 	unset(SRC)
 endif()
+
+if(WITH_CYCLES_CUBIN_COMPILER)
+	# 32 bit windows is special, nvrtc is not supported on x86, so even
+	# though we are building 32 bit blender a 64 bit cubin_cc will have
+	# to be build to compile the cubins.
+	if(MSVC AND NOT CMAKE_CL_64)
+		Message("cycles_cubin_cc not supported on x86")
+	else()
+		set(SRC cycles_cubin_cc.cpp)
+		set(INC ../../../extern/cuew/include)
+		add_executable(cycles_cubin_cc ${SRC})
+		include_directories(${INC})
+		target_link_libraries(cycles_cubin_cc
+			extern_cuew
+			${OPENIMAGEIO_LIBRARIES}
+			${PLATFORM_LINKLIBS}
+		)
+		if(NOT CYCLES_STANDALONE_REPOSITORY)
+			target_link_libraries(cycles_cubin_cc bf_intern_guardedalloc)
+		endif()
+		unset(SRC)
+		unset(INC)
+	endif()
+endif()
--- a/intern/cycles/app/cycles_cubin_cc.cpp
+++ b/intern/cycles/app/cycles_cubin_cc.cpp
@ -0,0 +1,284 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include <OpenImageIO/argparse.h>
+#include <OpenImageIO/filesystem.h>
+
+#include "cuew.h"
+
+#ifdef _MSC_VER
+# include <Windows.h>
+#endif
+
+using std::string;
+using std::vector;
+
+class CompilationSettings
+{
+public:
+	CompilationSettings()
+	: target_arch(0),
+	  bits(64),
+	  verbose(false),
+	  fast_math(false)
+	{}
+
+	string cuda_toolkit_dir;
+	string input_file;
+	string output_file;
+	string ptx_file;
+	vector<string> defines;
+	vector<string> includes;
+	int target_arch;
+	int bits;
+	bool verbose;
+	bool fast_math;
+};
+
+bool compile_cuda(CompilationSettings &settings)
+{
+	const char* headers[] = {"stdlib.h" , "float.h", "math.h", "stdio.h"};
+	const char* header_content[] = {"\n", "\n", "\n", "\n"};
+
+	printf("Building %s\n", settings.input_file.c_str());
+
+	string code;
+	if(!OIIO::Filesystem::read_text_file(settings.input_file, code)) {
+		fprintf(stderr, "Error: unable to read %s\n", settings.input_file.c_str());
+		return false;
+	}
+
+	vector<string> options;
+	for(size_t i = 0; i < settings.includes.size(); i++) {
+		options.push_back("-I" + settings.includes[i]);
+	}
+
+	for(size_t i = 0; i < settings.defines.size(); i++) {
+		options.push_back("-D" + settings.defines[i]);
+	}
+
+	options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
+	options.push_back("--device-as-default-execution-space");
+	if(settings.fast_math)
+		options.push_back("--use_fast_math");
+
+	nvrtcProgram prog;
+	nvrtcResult result = nvrtcCreateProgram(&prog,
+		code.c_str(),                    // buffer
+		NULL,                            // name
+		sizeof(headers) / sizeof(void*), // numHeaders
+		header_content,                  // headers
+		headers);                        // includeNames
+
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcCreateProgram failed (%x)\n\n", result);
+		return false;
+	}
+
+	/* Tranfer options to a classic C array. */
+	vector<const char*> opts(options.size());
+	for(size_t i = 0; i < options.size(); i++) {
+		opts[i] = options[i].c_str();
+	}
+
+	result = nvrtcCompileProgram(prog, options.size(), &opts[0]);
+
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcCompileProgram failed (%x)\n\n", result);
+
+		size_t log_size;
+		nvrtcGetProgramLogSize(prog, &log_size);
+
+		vector<char> log(log_size);
+		nvrtcGetProgramLog(prog, &log[0]);
+		fprintf(stderr, "%s\n", &log[0]);
+
+		return false;
+	}
+
+	/* Retrieve the ptx code. */
+	size_t ptx_size;
+	result = nvrtcGetPTXSize(prog, &ptx_size);
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcGetPTXSize failed (%x)\n\n", result);
+		return false;
+	}
+
+	vector<char> ptx_code(ptx_size);
+	result = nvrtcGetPTX(prog, &ptx_code[0]);
+	if(result != NVRTC_SUCCESS) {
+		fprintf(stderr, "Error: nvrtcGetPTX failed (%x)\n\n", result);
+		return false;
+	}
+
+	/* Write a file in the temp folder with the ptx code. */
+	settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" + OIIO::Filesystem::unique_path();
+	FILE * f= fopen(settings.ptx_file.c_str(), "wb");
+	fwrite(&ptx_code[0], 1, ptx_size, f);
+	fclose(f);
+
+	return true;
+}
+
+bool link_ptxas(CompilationSettings &settings)
+{
+	string cudapath = "";
+	if(settings.cuda_toolkit_dir.size())
+		cudapath = settings.cuda_toolkit_dir + "/bin/";
+
+	string ptx = "\"" +cudapath + "ptxas\" " + settings.ptx_file +
+					" -o " + settings.output_file +
+					" --gpu-name sm_" + std::to_string(settings.target_arch) +
+					" -m" + std::to_string(settings.bits);
+
+	if(settings.verbose)
+		ptx += " --verbose";
+
+	int pxresult = system(ptx.c_str());
+	if(pxresult) {
+		fprintf(stderr, "Error: ptxas failed (%x)\n\n", pxresult);
+		return false;
+	}
+
+	if(!OIIO::Filesystem::remove(settings.ptx_file)) {
+		fprintf(stderr, "Error: removing %s\n\n", settings.ptx_file.c_str());
+	}
+
+	return true;
+}
+
+bool init(CompilationSettings &settings)
+{
+#ifdef _MSC_VER
+	if(settings.cuda_toolkit_dir.size()) {
+		SetDllDirectory((settings.cuda_toolkit_dir + "/bin").c_str());
+	}
+#endif
+
+	int cuewresult = cuewInit();
+	if(cuewresult != CUEW_SUCCESS) {
+		fprintf(stderr, "Error: cuew init fialed (0x%x)\n\n", cuewresult);
+		return false;
+	}
+
+	if(cuewNvrtcVersion() < 80) {
+		fprintf(stderr, "Error: only cuda 8 and higher is supported, %d\n\n", cuewCompilerVersion());
+		return false;
+	}
+
+	if(!nvrtcCreateProgram) {
+		fprintf(stderr, "Error: nvrtcCreateProgram not resolved\n");
+		return false;
+	}
+
+	if(!nvrtcCompileProgram) {
+		fprintf(stderr, "Error: nvrtcCompileProgram not resolved\n");
+		return false;
+	}
+
+	if(!nvrtcGetProgramLogSize) {
+		fprintf(stderr, "Error: nvrtcGetProgramLogSize not resolved\n");
+		return false;
+	}
+
+	if(!nvrtcGetProgramLog) {
+		fprintf(stderr, "Error: nvrtcGetProgramLog not resolved\n");
+		return false;
+	}
+
+	if(!nvrtcGetPTXSize) {
+		fprintf(stderr, "Error: nvrtcGetPTXSize not resolved\n");
+		return false;
+	}
+
+	if(!nvrtcGetPTX) {
+		fprintf(stderr, "Error: nvrtcGetPTX not resolved\n");
+		return false;
+	}
+
+	return true;
+}
+
+bool parse_parameters(int argc, const char **argv, CompilationSettings &settings)
+{
+	OIIO::ArgParse ap;
+	ap.options("Usage: cycles_cubin_cc [options]",
+		"-target %d", &settings.target_arch, "target shader model",
+		"-m %d", &settings.bits, "Cuda architecture bits",
+		"-i %s", &settings.input_file, "Input source filename",
+		"-o %s", &settings.output_file, "Output cubin filename",
+		"-I %L", &settings.includes, "Add additional includepath",
+		"-D %L", &settings.defines, "Add additional defines",
+		"-v", &settings.verbose, "Use verbose logging",
+		"--use_fast_math", &settings.fast_math, "Use fast math",
+		"-cuda-toolkit-dir %s", &settings.cuda_toolkit_dir, "path to the cuda toolkit binary directory",
+		NULL);
+
+	if(ap.parse(argc, argv) < 0) {
+		fprintf(stderr, "%s\n", ap.geterror().c_str());
+		ap.usage();
+		return false;
+	}
+
+	if(!settings.output_file.size()) {
+		fprintf(stderr, "Error: Output file not set(-o), required\n\n");
+		return false;
+	}
+
+	if(!settings.input_file.size()) {
+		fprintf(stderr, "Error: Input file not set(-i, required\n\n");
+		return false;
+	}
+
+	if(!settings.target_arch) {
+		fprintf(stderr, "Error: target shader model not set (-target), required\n\n");
+		return false;
+	}
+
+	return true;
+}
+
+int main(int argc, const char **argv)
+{
+	CompilationSettings settings;
+
+	if(!parse_parameters(argc, argv, settings)) {
+		fprintf(stderr, "Error: invalid parameters, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!init(settings)) {
+		fprintf(stderr, "Error: initialization error, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!compile_cuda(settings)) {
+		fprintf(stderr, "Error: compilation error, exiting\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if(!link_ptxas(settings)) {
+		exit(EXIT_FAILURE);
+	}
+
+	return 0;
+}
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@ -348,7 +348,6 @@ public:
 	        const DeviceRequestedFeatures& requested_features,
 	        bool filter=false, bool split=false)
 	{
-		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
 		const string source_path = path_get("source");
 		const string include_path = source_path;
@ -356,10 +355,8 @@ public:
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
 		                              "-DNVCC "
-		                              "-D__KERNEL_CUDA_VERSION__=%d "
 		                               "-I\"%s\"",
 		                              machine,
-		                              cuda_version,
 		                              include_path.c_str());
 		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -356,55 +356,67 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(cuda_cubins)

 	macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
+		set(cuda_cubin ${name}_${arch}.cubin)
+		set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+
+		set(cuda_flags
+			-D CCL_NAMESPACE_BEGIN=
+			-D CCL_NAMESPACE_END=
+			-D NVCC
+			-m ${CUDA_BITS}
+			-I ${CMAKE_CURRENT_SOURCE_DIR}/..
+			-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+			--use_fast_math
+			-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin})
+
 		if(${experimental})
-			set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+			set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
 			set(name ${name}_experimental)
 		endif()

-		set(cuda_cubin ${name}_${arch}.cubin)
-
 		if(WITH_CYCLES_DEBUG)
-			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
-		else()
-			set(cuda_debug_flags "")
+			set(cuda_flags ${cuda_flags} -D __KERNEL_DEBUG__)
 		endif()

-		set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
-		set(cuda_nvcc_version ${CUDA_VERSION})
+		if(WITH_CYCLES_CUBIN_COMPILER)
+			string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)

-		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
-		set(cuda_math_flags "--use_fast_math")
-
-		set(cuda_kernel_src "/kernels/cuda/${name}.cu")
-
-		add_custom_command(
-			OUTPUT ${cuda_cubin}
-			COMMAND ${cuda_nvcc_command}
-					-arch=${arch}
-					${CUDA_NVCC_FLAGS}
-					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-					--ptxas-options="-v"
-					${cuda_arch_flags}
-					${cuda_version_flags}
-					${cuda_math_flags}
-					${flags}
-					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/..
-					-DCCL_NAMESPACE_BEGIN=
-					-DCCL_NAMESPACE_END=
-					-DNVCC
-			DEPENDS ${sources})
+			# Needed to find libnvrtc-builtins.so. Can't do it from inside
+			# cycles_cubin_cc since the env variable is read before main()
+			if(APPLE)
+				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+					-E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+			elseif(UNIX)
+				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+					-E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+			endif()

+			add_custom_command(
+				OUTPUT ${cuda_cubin}
+				COMMAND ${CUBIN_CC_ENV}
+						"$<TARGET_FILE:cycles_cubin_cc>"
+						-target ${CUDA_ARCH}
+						-i ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+						${cuda_flags}
+						-v
+						-cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+				DEPENDS ${sources} cycles_cubin_cc)
+		else()
+			add_custom_command(
+				OUTPUT ${cuda_cubin}
+				COMMAND ${CUDA_NVCC_EXECUTABLE}
+						-arch=${arch}
+						${CUDA_NVCC_FLAGS}
+						--cubin
+						${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+						--ptxas-options="-v"
+						${cuda_flags}
+				DEPENDS ${sources})
+		endif()
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
 		list(APPEND cuda_cubins ${cuda_cubin})

-		unset(cuda_extra_flags)
 		unset(cuda_debug_flags)
-
-		unset(cuda_nvcc_command)
-		unset(cuda_nvcc_version)
 	endmacro()

 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
@ -412,12 +424,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			message(STATUS "CUDA binaries for ${arch} disabled, not supported by CUDA 9.")
 		else()
 			# Compile regular kernel
-			CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
 			CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
+			CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)

 			if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
 				# Compile split kernel
-				CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
+				CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D __SPLIT__" ${cuda_sources} FALSE)
 			endif()
 		endif()
 	endforeach()
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@ -30,10 +30,22 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif

-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <float.h>
-#include <stdint.h>
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned short half;
+typedef unsigned long long CUtexObject;
+
+#define FLT_MAX 1.175494350822287507969e-38f
+#define FLT_MIN 340282346638528859811704183484516925440.0f
+
+__device__ half __float2half(const float f)
+{
+       half val;
+       asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+       return val;
+}

 /* Qualifier wrappers for different names on different devices */