From 4bde68cdd67283abd869d97550fdcb2af600f4b8 Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Sun, 23 Jun 2024 00:52:30 +0200 Subject: [PATCH] Cycles: Compress GPU kernels to reduce file size Precompiled Cycles kernels make up a considerable fraction of the total size of Blender builds nowadays. As we add more features and support for more architectures, this will only continue to increase. However, since these kernels tend to be quite compressible, we can save a lot of storage by storing them in compressed form and decompressing the required kernel(s) during loading. By using Zstandard compression with a high level, we can get decent compression ratios (~5x for the current kernels) while keeping decompression time low (about 30ms in the worst case in my tests). And since we already require zstd for Blender, this doesn't introduce a new dependency. While the main improvement is to the size of the extracted Blender installation (which is reduced by ~400-500MB currently), this also shrinks the download on Windows, since .zip's deflate compression is less effective. It doesn't help on Linux since we're already using .tar.xz there, but the smaller installed size is still a good thing. See #123522 for initial discussion. 
Pull Request: https://projects.blender.org/blender/blender/pulls/123557 --- intern/cycles/cmake/zstd_compress.cpp | 54 ++++++++++++++++++++++ intern/cycles/device/cuda/device_impl.cpp | 6 +-- intern/cycles/device/hip/device_impl.cpp | 4 +- intern/cycles/device/hiprt/device_impl.cpp | 5 +- intern/cycles/device/optix/device_impl.cpp | 8 ++-- intern/cycles/kernel/CMakeLists.txt | 43 +++++++++++++---- intern/cycles/util/CMakeLists.txt | 2 + intern/cycles/util/path.cpp | 47 +++++++++++++++++++ intern/cycles/util/path.h | 3 ++ 9 files changed, 152 insertions(+), 20 deletions(-) create mode 100644 intern/cycles/cmake/zstd_compress.cpp diff --git a/intern/cycles/cmake/zstd_compress.cpp b/intern/cycles/cmake/zstd_compress.cpp new file mode 100644 index 00000000000..b2e64568ea9 --- /dev/null +++ b/intern/cycles/cmake/zstd_compress.cpp @@ -0,0 +1,54 @@ +/* SPDX-FileCopyrightText: 2024 Blender Foundation + * + * SPDX-License-Identifier: Apache-2.0 */ + +#include +#include +#include + +#include + +int main(int argc, const char **argv) +{ + if (argc < 3) { + return -1; + } + + /* TODO: This might fail for non-ASCII paths on Windows... 
*/ + std::ifstream in(argv[1], std::ios_base::binary); + std::ofstream out(argv[2], std::ios_base::binary); + if (!in || !out) { + return -1; + } + + in.seekg(0, std::ios_base::end); + size_t in_size = in.tellg(); + in.seekg(0, std::ios_base::beg); + if (!in) { + return -1; + } + + std::vector in_data(in_size); + in.read(in_data.data(), in_size); + if (!in) { + return -1; + } + + size_t out_size = ZSTD_compressBound(in_size); + if (ZSTD_isError(out_size)) { + return -1; + } + std::vector out_data(out_size); + + out_size = ZSTD_compress(out_data.data(), out_data.size(), in_data.data(), in_data.size(), 19); + if (ZSTD_isError(out_size)) { + return -1; + } + + out.write(out_data.data(), out_size); + if (!out) { + return -1; + } + + return 0; +} diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 5e1d8142202..cc412539636 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -256,7 +256,7 @@ string CUDADevice::compile_kernel(const string &common_cflags, /* Attempt to use kernel provided with Blender. 
*/ if (!use_adaptive_compilation()) { if (!force_ptx) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor)); VLOG_INFO << "Testing for pre-compiled kernel " << cubin << "."; if (path_exists(cubin)) { VLOG_INFO << "Using precompiled kernel."; @@ -268,7 +268,7 @@ string CUDADevice::compile_kernel(const string &common_cflags, int ptx_major = major, ptx_minor = minor; while (ptx_major >= 3) { const string ptx = path_get( - string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor)); VLOG_INFO << "Testing for pre-compiled kernel " << ptx << "."; if (path_exists(ptx)) { VLOG_INFO << "Using precompiled kernel."; @@ -440,7 +440,7 @@ bool CUDADevice::load_kernels(const uint kernel_features) string cubin_data; CUresult result; - if (path_read_text(cubin, cubin_data)) { + if (path_read_compressed_text(cubin, cubin_data)) { result = cuModuleLoadData(&cuModule, cubin_data.c_str()); } else { diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 3679275fdd8..47e0a4a0d54 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -231,7 +231,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c /* Attempt to use kernel provided with Blender. 
*/ if (!use_adaptive_compilation()) { - const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch.c_str())); + const string fatbin = path_get(string_printf("lib/%s_%s.fatbin.zst", name, arch.c_str())); VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << "."; if (path_exists(fatbin)) { VLOG_INFO << "Using precompiled kernel."; @@ -387,7 +387,7 @@ bool HIPDevice::load_kernels(const uint kernel_features) string fatbin_data; hipError_t result; - if (path_read_text(fatbin, fatbin_data)) + if (path_read_compressed_text(fatbin, fatbin_data)) result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); else result = hipErrorFileNotFound; diff --git a/intern/cycles/device/hiprt/device_impl.cpp b/intern/cycles/device/hiprt/device_impl.cpp index bfca220e1f0..c326ef3259c 100644 --- a/intern/cycles/device/hiprt/device_impl.cpp +++ b/intern/cycles/device/hiprt/device_impl.cpp @@ -141,7 +141,7 @@ string HIPRTDevice::compile_kernel(const uint kernel_features, const char *name, const std::string arch = hipDeviceArch(hipDevId); if (!use_adaptive_compilation()) { - const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb", name)); + const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb.zst", name)); VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; if (path_exists(fatbin)) { VLOG(1) << "Using precompiled kernel."; @@ -309,8 +309,7 @@ bool HIPRTDevice::load_kernels(const uint kernel_features) string fatbin_data; hipError_t result; - if (path_read_text(fatbin, fatbin_data)) { - + if (path_read_compressed_text(fatbin, fatbin_data)) { result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); } else diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index a22daf168d2..4c83b6c04bb 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -216,7 +216,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) ""; string 
ptx_filename; if (need_optix_kernels) { - ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx"); + ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst"); if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { std::string optix_include_dir = get_optix_include_dir(); if (optix_include_dir.empty()) { @@ -348,7 +348,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) string cflags = compile_kernel_get_common_cflags(kernel_features); ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true); } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { + if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) { set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); return false; } @@ -798,8 +798,8 @@ bool OptiXDevice::load_osl_kernels() osl_modules.resize(osl_kernels.size() + 1); { /* Load and compile PTX module with OSL services. */ - string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx"); - if (!path_read_text(ptx_filename, ptx_data)) { + string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst"); + if (!path_read_compressed_text(ptx_filename, ptx_data)) { set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'", ptx_filename.c_str())); return false; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 28b1ceda344..56fb1fafd98 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -416,6 +416,11 @@ set(LIB ) +# Zstd compressor for kernels +add_executable(zstd_compress ../cmake/zstd_compress.cpp) +target_include_directories(zstd_compress SYSTEM PRIVATE ${ZSTD_INCLUDE_DIRS}) +target_link_libraries(zstd_compress ${ZSTD_LIBRARIES} ${PTHREADS_LIBRARIES}) + # CUDA module if(WITH_CYCLES_CUDA_BINARIES) @@ -455,6 +460,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(format "cubin") endif() set(cuda_file 
${name}_${arch}.${format}) + set(cuda_file_compressed ${cuda_file}.zst) set(kernel_sources ${sources}) if(NOT ${prev_arch} STREQUAL "none") @@ -517,9 +523,14 @@ if(WITH_CYCLES_CUDA_BINARIES) DEPENDS ${kernel_sources}) endif() + add_custom_command( + OUTPUT ${cuda_file_compressed} + COMMAND "$" ${cuda_file} ${cuda_file_compressed} + DEPENDS ${cuda_file}) + unset(_cuda_nvcc_args) - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_file}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file_compressed}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_file_compressed}) unset(cuda_debug_flags) endmacro() @@ -603,6 +614,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) macro(CYCLES_HIP_KERNEL_ADD arch name flags sources experimental) set(format "fatbin") set(hip_file ${name}_${arch}.${format}) + set(hip_file_compressed ${hip_file}.zst) set(kernel_sources ${sources}) set(hip_kernel_src "/device/hip/${name}.cpp") @@ -657,8 +669,12 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) OUTPUT ${hip_file} COMMAND ${hip_command} ${hip_flags} DEPENDS ${kernel_sources}) - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND hip_fatbins ${hip_file}) + add_custom_command( + OUTPUT ${hip_file_compressed} + COMMAND "$" ${hip_file} ${hip_file_compressed} + DEPENDS ${hip_file}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file_compressed}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND hip_fatbins ${hip_file_compressed}) endmacro() foreach(arch ${CYCLES_HIP_BINARIES_ARCH}) @@ -680,6 +696,7 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES) ${SRC_UTIL_HEADERS}) set(bitcode_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.bc) set(hiprt_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.hipfb) + set(hiprt_file_compressed ${hiprt_file}.zst) set(kernel_sources ${hiprt_sources}) set(hiprt_kernel_src "/device/hiprt/kernel.cpp") 
if(WIN32) @@ -744,8 +761,12 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES) OUTPUT ${hiprt_file} COMMAND ${hiprt_link_command} ${hiprt_link_flags} DEPENDS ${bitcode_file}) - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file}" ${CYCLES_INSTALL_PATH}/lib) - add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file}) + add_custom_command( + OUTPUT ${hiprt_file_compressed} + COMMAND "$" ${hiprt_file} ${hiprt_file_compressed} + DEPENDS ${hiprt_file}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file_compressed}" ${CYCLES_INSTALL_PATH}/lib) + add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file_compressed}) cycles_set_solution_folder(cycles_kernel_hiprt) endif() @@ -754,6 +775,7 @@ endif() if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) macro(cycles_optix_kernel_add name input flags) set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx") + set(output_compressed "${output}.zst") set(cuda_flags ${flags} -I "${OPTIX_INCLUDE_DIR}" @@ -795,9 +817,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") - list(APPEND optix_ptx ${output}) + add_custom_command( + OUTPUT ${output_compressed} + COMMAND "$" ${output} ${output_compressed} + DEPENDS ${output}) - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND optix_ptx ${output_compressed}) + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output_compressed}" ${CYCLES_INSTALL_PATH}/lib) endmacro() cycles_optix_kernel_add( diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 2f854e3d69a..6eaa619da25 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -7,6 +7,7 @@ set(INC ) set(INC_SYS + ${ZSTD_INCLUDE_DIRS} ) set(SRC @@ -32,6 +33,7 @@ set(SRC set(LIB ${TBB_LIBRARIES} + ${ZSTD_LIBRARIES} ) set(SRC_HEADERS diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp index 
373b33f838d..2ae37ed01f4 100644 --- a/intern/cycles/util/path.cpp +++ b/intern/cycles/util/path.cpp @@ -19,6 +19,8 @@ OIIO_NAMESPACE_USING #include +#include + #if defined(_WIN32) # define DIR_SEP '\\' # define DIR_SEP_ALT '/' @@ -704,6 +706,36 @@ bool path_read_binary(const string &path, vector &binary) return true; } +bool path_read_compressed_binary(const string &path, vector &binary) +{ + if (!string_endswith(path, ".zst")) { + return path_read_binary(path, binary); + } + + vector compressed; + if (!path_read_binary(path, compressed)) { + return false; + } + + const size_t full_size = ZSTD_getFrameContentSize(compressed.data(), compressed.size()); + + if (full_size == ZSTD_CONTENTSIZE_ERROR) { + /* Potentially corrupted file? */ + return false; + } + if (full_size == ZSTD_CONTENTSIZE_UNKNOWN) { + /* Technically this is an optional field, but we can expect it to be set for now. + * Otherwise we'd need streaming decompression and repeated resizing of the vector. */ + return false; + } + + binary.resize(full_size); + + size_t err = ZSTD_decompress(binary.data(), binary.size(), compressed.data(), compressed.size()); + + return ZSTD_isError(err) == 0; +} + bool path_read_text(const string &path, string &text) { vector binary; @@ -719,6 +751,21 @@ bool path_read_text(const string &path, string &text) return true; } +bool path_read_compressed_text(const string &path, string &text) +{ + vector binary; + + if (!path_exists(path) || !path_read_compressed_binary(path, binary)) { + return false; + } + + const char *str = (const char *)&binary[0]; + size_t size = binary.size(); + text = string(str, size); + + return true; +} + uint64_t path_modified_time(const string &path) { path_stat_t st; diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h index e34c852a4ef..15c00d01faa 100644 --- a/intern/cycles/util/path.h +++ b/intern/cycles/util/path.h @@ -50,6 +50,9 @@ bool path_write_text(const string &path, string &text); bool path_read_binary(const string &path, 
vector &binary); bool path_read_text(const string &path, string &text); +bool path_read_compressed_binary(const string &path, vector &binary); +bool path_read_compressed_text(const string &path, string &text); + /* File manipulation. */ bool path_remove(const string &path);