Merged changes in the trunk up to revision 46045.

Conflicts resolved: doc/python_api/sphinx_doc_gen.py, source/blender/blenkernel/intern/subsurf_ccg.c, source/blender/editors/mesh/editmesh_tools.c, source/blender/makesdna/DNA_scene_types.h
2012-04-29 00:59:04 +00:00 · 2012-04-29 00:59:04 +00:00 · 2585d52405
commit 2585d52405
parent d8e12e8710 4465d2f419
656 changed files with 33812 additions and 31804 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -194,12 +194,7 @@ option(WITH_IMAGE_REDCODE       "Enable RedCode Image Support" OFF)
 option(WITH_IMAGE_FRAMESERVER   "Enable image FrameServer Support for rendering" ON)

 # Audio/Video format support
-if(MINGW)
-	set(PLATFORM_DEFAULT ON)
-else()
-	set(PLATFORM_DEFAULT OFF)
-endif()
-option(WITH_CODEC_FFMPEG        "Enable FFMPeg Support (http://ffmpeg.org)" ${PLATFORM_DEFAULT})
+option(WITH_CODEC_FFMPEG        "Enable FFMPeg Support (http://ffmpeg.org)" OFF)
 unset(PLATFORM_DEFAULT)

 option(WITH_CODEC_SNDFILE       "Enable libsndfile Support (http://www.mega-nerd.com/libsndfile)" OFF)
@ -233,6 +228,8 @@ if(UNIX AND NOT APPLE)
 	option(WITH_INSTALL_PORTABLE "Install redistributeable runtime, otherwise install into CMAKE_INSTALL_PREFIX" ON)
 endif()
 option(WITH_PYTHON_INSTALL       "Copy system python into the blender install folder" ON)
+option(WITH_MINGW64                   "Use the 64-bit version of MinGW" OFF)
+mark_as_advanced(WITH_MINGW64)

 # Cycles
 option(WITH_CYCLES					"Enable cycles Render Engine" ON)
@ -371,9 +368,8 @@ if(MINGW)
 		                    "line if youre a developer who wants to add support.")
 	endif()
 	
-	if((NOT WITH_CODEC_FFMPEG) AND (WITH_CYCLES OR WITH_IMAGE_OPENEXR OR WITH_IMAGE_TIFF))
-		message(FATAL_ERROR "MINGW has a problem with: WITH_CYCLES/WITH_IMAGE_OPENEXR/WITH_IMAGE_TIFF "
-		                    "when WITH_CODEC_FFMPEG is disabled, enable FFMPEG or disable CYCLES/EXR/TIFF.")
+	if((WITH_MINGW64) AND (WITH_OPENCOLLADA OR WITH_CODEC_FFMPEG))
+		message(FATAL_ERROR "MINGW64 still doesn't support: WITH_OPENCOLLADA/WITH_CODEC_FFMPEG")
 	endif()
 endif()

@ -740,6 +736,15 @@ elseif(WIN32)
 	# this file is included anyway when building under Windows with cl.exe
 	#  include(${CMAKE_ROOT}/Modules/Platform/Windows-cl.cmake)

+	if(CMAKE_COMPILER_IS_GNUCC)
+		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/mingw32)
+
+		# Use the 64-bit MinGW library directory instead of the 32-bit one when WITH_MINGW64 is enabled
+		if(WITH_MINGW64)
+			message("Set 64 bit compiler for MinGW.")
+			set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/mingw64)
+		endif()
+	else()
 		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/windows)

 		# Setup 64bit and 64bit windows systems
@ -747,6 +752,7 @@ elseif(WIN32)
 			message("64 bit compiler detected.")
 			set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/win64)
 		endif()
+	endif()
 		
 	add_definitions(-DWIN32)

@ -796,33 +802,35 @@ elseif(WIN32)
 	endif()

 	if(MSVC)
-		if(CMAKE_CL_64)
 		set(PLATFORM_LINKLIBS ws2_32 vfw32 winmm kernel32 user32 gdi32 comdlg32 advapi32 shfolder shell32 ole32 oleaut32 uuid)
-		else()
-			set(PLATFORM_LINKLIBS ws2_32 vfw32 winmm kernel32 user32 gdi32 comdlg32 advapi32 shfolder shell32 ole32 oleaut32 uuid)
-		endif()

 		add_definitions(/D_CRT_NONSTDC_NO_DEPRECATE /D_CRT_SECURE_NO_DEPRECATE /D_SCL_SECURE_NO_DEPRECATE /D_CONSOLE /D_LIB)

-		set(CMAKE_CXX_FLAGS "/nologo /J /W1 /Gd /wd4018 /wd4244 /wd4305 /wd4800 /wd4065 /wd4267 /we4013" CACHE STRING "MSVC MT C++ flags " FORCE)
-		set(CMAKE_C_FLAGS   "/nologo /J /W1 /Gd /wd4018 /wd4244 /wd4305 /wd4800 /wd4065 /wd4267 /we4013 /EHsc" CACHE STRING "MSVC MT C++ flags " FORCE)
+		set(CMAKE_CXX_FLAGS "/nologo /J /Gd /EHsc" CACHE STRING "MSVC MT C++ flags " FORCE)
+		set(CMAKE_C_FLAGS   "/nologo /J /Gd"       CACHE STRING "MSVC MT C++ flags " FORCE)

 		if(CMAKE_CL_64)
-			set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /EHsc /RTC1 /MTd /W3 /nologo /Zi /J" CACHE STRING "MSVC MT flags " FORCE)
+			set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
 		else()
-			set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /EHsc /RTC1 /MTd /W3 /nologo /ZI /J" CACHE STRING "MSVC MT flags " FORCE)
+			set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
 		endif()
-		set(CMAKE_CXX_FLAGS_RELEASE "/O2 /Ob2 /EHsc /MT /W3 /nologo /J" CACHE STRING "MSVC MT flags " FORCE)
-		set(CMAKE_CXX_FLAGS_MINSIZEREL "/O1 /Ob1 /EHsc /MT /W3 /nologo /J" CACHE STRING "MSVC MT flags " FORCE)
-		set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /EHsc /MT /W3 /nologo /Zi /J" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_CXX_FLAGS_RELEASE "/O2 /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_CXX_FLAGS_MINSIZEREL "/O1 /Ob1 /MT" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
 		if(CMAKE_CL_64)
-			set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /EHsc /RTC1 /MTd /W3 /nologo /Zi /J" CACHE STRING "MSVC MT flags " FORCE)
+			set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
 		else()
-			set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /EHsc /RTC1 /MTd /W3 /nologo /ZI /J" CACHE STRING "MSVC MT flags " FORCE)
+			set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
 		endif()
-		set(CMAKE_C_FLAGS_RELEASE "/O2 /Ob2 /EHsc /MT /W3 /nologo /J" CACHE STRING "MSVC MT flags " FORCE)
-		set(CMAKE_C_FLAGS_MINSIZEREL "/O1 /Ob1 /EHsc /MT /W3 /nologo /J" CACHE STRING "MSVC MT flags " FORCE)
-		set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /EHsc /MT /W3 /nologo /Zi /J" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_C_FLAGS_RELEASE "/O2 /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_C_FLAGS_MINSIZEREL "/O1 /Ob1 /MT" CACHE STRING "MSVC MT flags " FORCE)
+		set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
+
+		# most msvc warnings are C & C++
+		set(_WARNINGS "/W3 /wd4018 /wd4244 /wd4305 /wd4800 /wd4181 /wd4065 /wd4267 /we4013")
+		set(C_WARNINGS "${_WARNINGS}")
+		set(CXX_WARNINGS "${_WARNINGS}")
+		unset(_WARNINGS)

 		if(WITH_INTERNATIONAL)
 			set(GETTEXT ${LIBDIR}/gettext)
@ -995,29 +1003,44 @@ elseif(WIN32)

 		set(PLATFORM_LINKFLAGS_DEBUG "/NODEFAULTLIB:libcmt.lib /NODEFAULTLIB:libc.lib")

-	else()
+		# used in many places so include globally, like OpenGL
+		blender_include_dirs("${PTHREADS_INCLUDE_DIRS}")
+
+	elseif(CMAKE_COMPILER_IS_GNUCC)
 	# keep GCC specific stuff here		
-		if(CMAKE_COMPILER_IS_GNUCC)
 		set(PLATFORM_LINKLIBS "-lshell32 -lshfolder -lgdi32 -lmsvcrt -lwinmm -lmingw32 -lm -lws2_32 -lz -lstdc++ -lole32 -luuid -lwsock32 -lpsapi")
 		set(PLATFORM_CFLAGS "-pipe -funsigned-char -fno-strict-aliasing")

-			add_definitions(-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE)
+		if(WITH_MINGW64)
+			# MinGW64 builds enable extra optimization flags (MMX/SSE/SSE2, tree vectorization) by default
+			set(PLATFORM_CFLAGS "${PLATFORM_CFLAGS} -mmmx -msse -msse2 -ftree-vectorize")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive")
+			set(PLATFORM_LINKLIBS "${PLATFORM_LINKLIBS} -lpthread")
+			
+			add_definitions(-DFREE_WINDOWS64 -DMS_WIN64)
 		endif()

+		add_definitions(-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE)
+
+
 		add_definitions(-DFREE_WINDOWS)

 		if(WITH_INTERNATIONAL)
-			set(GETTEXT ${LIBDIR}/gcc/gettext)
+			set(GETTEXT ${LIBDIR}/gettext)
 			set(GETTEXT_INCLUDE_DIRS ${GETTEXT}/include)
 			set(GETTEXT_LIBPATH ${GETTEXT}/lib)
 			set(GETTEXT_LIBRARIES intl)
 		endif()
 		
-		set(PNG "${LIBDIR}/gcc/png")
+		set(PNG "${LIBDIR}/png")
 		set(PNG_INCLUDE_DIR "${PNG}/include")
 		set(PNG_LIBPATH ${PNG}/lib) # not cmake defined

+		if(WITH_MINGW64)
+			set(JPEG_LIBRARIES jpeg)
+		else()
 			set(JPEG_LIBRARIES libjpeg)
+		endif()
 		set(PNG_LIBRARIES png)

 		set(ZLIB ${LIBDIR}/zlib)
@ -1025,31 +1048,34 @@ elseif(WIN32)
 		set(ZLIB_LIBPATH ${ZLIB}/lib)
 		set(ZLIB_LIBRARIES z)

+		# MinGW64 comes with its own pthread library, so the bundled pthreads is only set up for 32-bit MinGW
+		if(NOT WITH_MINGW64)
 			set(PTHREADS ${LIBDIR}/pthreads)
 			set(PTHREADS_INCLUDE_DIRS ${PTHREADS}/include)
 			set(PTHREADS_LIBPATH ${PTHREADS}/lib)
 			set(PTHREADS_LIBRARIES pthreadGC2)
+		endif()
 		
-		set(FREETYPE ${LIBDIR}/gcc/freetype)
+		set(FREETYPE ${LIBDIR}/freetype)
 		set(FREETYPE_INCLUDE_DIRS ${FREETYPE}/include ${FREETYPE}/include/freetype2)
 		set(FREETYPE_LIBPATH ${FREETYPE}/lib)
 		set(FREETYPE_LIBRARY freetype)

 		if(WITH_FFTW3)
-			set(FFTW3 ${LIBDIR}/gcc/fftw3)
+			set(FFTW3 ${LIBDIR}/fftw3)
 			set(FFTW3_LIBRARIES fftw3)
 			set(FFTW3_INCLUDE_DIRS ${FFTW3}/include)
 			set(FFTW3_LIBPATH ${FFTW3}/lib)
 		endif()

 		if(WITH_OPENCOLLADA)
-			set(OPENCOLLADA ${LIBDIR}/gcc/opencollada)
+			set(OPENCOLLADA ${LIBDIR}/opencollada)
 			set(OPENCOLLADA_INCLUDE_DIRS
-				${LIBDIR}/gcc/opencollada/include/COLLADAStreamWriter/include
-				${LIBDIR}/gcc/opencollada/include/COLLADABaseUtils/include
-				${LIBDIR}/gcc/opencollada/include/COLLADAFramework/include
-				${LIBDIR}/gcc/opencollada/include/COLLADASaxFrameworkLoader/include
-				${LIBDIR}/gcc/opencollada/include/GeneratedSaxParser/include
+				${LIBDIR}/opencollada/include/COLLADAStreamWriter/include
+				${LIBDIR}/opencollada/include/COLLADABaseUtils/include
+				${LIBDIR}/opencollada/include/COLLADAFramework/include
+				${LIBDIR}/opencollada/include/COLLADASaxFrameworkLoader/include
+				${LIBDIR}/opencollada/include/GeneratedSaxParser/include
 			)
 			set(OPENCOLLADA_LIBPATH ${OPENCOLLADA}/lib ${OPENCOLLADA}/lib)
 			set(OPENCOLLADA_LIBRARIES OpenCOLLADAStreamWriter OpenCOLLADASaxFrameworkLoader OpenCOLLADAFramework OpenCOLLADABaseUtils GeneratedSaxParser UTF MathMLSolver expat pcre buffer ftoa)
@ -1064,14 +1090,14 @@ elseif(WIN32)
 		endif()

 		if(WITH_IMAGE_OPENEXR)
-			set(OPENEXR ${LIBDIR}/gcc/openexr)
+			set(OPENEXR ${LIBDIR}/openexr)
 			set(OPENEXR_INCLUDE_DIRS ${OPENEXR}/include/OpenEXR)
 			set(OPENEXR_LIBRARIES Half IlmImf Imath IlmThread Iex)
 			set(OPENEXR_LIBPATH ${OPENEXR}/lib)
 		endif()

 		if(WITH_IMAGE_TIFF)
-			set(TIFF ${LIBDIR}/gcc/tiff)
+			set(TIFF ${LIBDIR}/tiff)
 			set(TIFF_LIBRARY tiff)
 			set(TIFF_INCLUDE_DIR ${TIFF}/include)
 			set(TIFF_LIBPATH ${TIFF}/lib)
@ -1101,19 +1127,24 @@ elseif(WIN32)
 		if(WITH_BOOST)
 			set(BOOST ${LIBDIR}/boost)
 			set(BOOST_INCLUDE_DIR ${BOOST}/include)
+			if(WITH_MINGW64)
+				set(BOOST_POSTFIX "mgw47-mt-s-1_49")
+				set(BOOST_DEBUG_POSTFIX "mgw47-mt-sd-1_49")
+			else()
 				set(BOOST_POSTFIX "mgw46-mt-s-1_47")
 				set(BOOST_DEBUG_POSTFIX "mgw46-mt-sd-1_47")
+			endif()		
 			set(BOOST_LIBRARIES
 				optimized boost_date_time-${BOOST_POSTFIX} boost_filesystem-${BOOST_POSTFIX}
 				boost_regex-${BOOST_POSTFIX} boost_system-${BOOST_POSTFIX} boost_thread-${BOOST_POSTFIX}
 				debug boost_date_time-${BOOST_DEBUG_POSTFIX} boost_filesystem-${BOOST_DEBUG_POSTFIX}
 				boost_regex-${BOOST_DEBUG_POSTFIX} boost_system-${BOOST_DEBUG_POSTFIX} boost_thread-${BOOST_DEBUG_POSTFIX})
-			set(BOOST_LIBPATH ${BOOST}/lib/gcc)
+			set(BOOST_LIBPATH ${BOOST}/lib)
 			set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB -DBOOST_THREAD_USE_LIB ")
 		endif()
 			
 		if(WITH_OPENIMAGEIO)
-			set(OPENIMAGEIO ${LIBDIR}/gcc/openimageio)
+			set(OPENIMAGEIO ${LIBDIR}/openimageio)
 			set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
 			set(OPENIMAGEIO_LIBRARIES OpenImageIO)
 			set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
@ -1128,16 +1159,13 @@ elseif(WIN32)

 	endif()

-	# used in many places so include globally, like OpenGL
-	blender_include_dirs("${PTHREADS_INCLUDE_DIRS}")
-
 elseif(APPLE)

 	if(${CMAKE_OSX_DEPLOYMENT_TARGET} STREQUAL "10.5" OR ${CMAKE_OSX_DEPLOYMENT_TARGET} STRGREATER "10.5")
 		set(WITH_LIBS10.5 ON CACHE BOOL "Use 10.5 libs" FORCE) # valid also for 10.6/10.7
 	endif()

-	if(${XCODE_VERSION} LESS 4.3)
+	if(${XCODE_VERSION} VERSION_LESS 4.3)
 		SET(CMAKE_OSX_SYSROOT /Developer/SDKs/MacOSX${OSX_SYSTEM}.sdk CACHE PATH "" FORCE ) # use guaranteed existing sdk
 	else()
 		# note: i don't use xcode-select path on purpose, cause also /Applications/Xcode.app would be allowed
--- a/32
+++ b/32
@ -170,7 +170,7 @@ if sys.platform=='win32':
    if env['CC'] in ['cl', 'cl.exe']:
        platform = 'win64-vc' if bitness == 64 else 'win32-vc'
    elif env['CC'] in ['gcc']:
-        platform = 'win32-mingw'
+        platform = 'win64-mingw' if bitness == 64 else 'win32-mingw'

 env.SConscriptChdir(0)

@ -774,7 +774,7 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'win64-vc', 'linuxcross'):
        dllsources.append('${LCGDIR}/thumbhandler/lib/BlendThumb.dll')	
    dllsources.append('${LCGDIR}/thumbhandler/lib/BlendThumb64.dll')

-    if env['WITH_BF_OIIO']:
+    if env['WITH_BF_OIIO'] and env['OURPLATFORM'] != 'win32-mingw':
        dllsources.append('${LCGDIR}/openimageio/bin/OpenImageIO.dll')

    dllsources.append('#source/icons/blender.exe.manifest')
@ -782,6 +782,34 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'win64-vc', 'linuxcross'):
    windlls = env.Install(dir=env['BF_INSTALLDIR'], source = dllsources)
    allinstall += windlls

+if env['OURPLATFORM'] == 'win64-mingw':
+    dllsources = []
+    
+    if env['WITH_BF_PYTHON']:
+        if env['BF_DEBUG']:
+            dllsources.append('${BF_PYTHON_LIBPATH}/${BF_PYTHON_DLL}_d.dll')
+        else:
+            dllsources.append('${BF_PYTHON_LIBPATH}/${BF_PYTHON_DLL}.dll')
+
+    if env['WITH_BF_FFMPEG']:
+        dllsources += env['BF_FFMPEG_DLL'].split()
+
+    if env['WITH_BF_OPENAL']:
+        dllsources.append('${LCGDIR}/openal/lib/OpenAL32.dll')
+        dllsources.append('${LCGDIR}/openal/lib/wrap_oal.dll')
+
+    if env['WITH_BF_SNDFILE']:
+        dllsources.append('${LCGDIR}/sndfile/lib/libsndfile-1.dll')
+
+    if env['WITH_BF_SDL']:
+        dllsources.append('${LCGDIR}/sdl/lib/SDL.dll')
+	
+    dllsources.append('${LCGDIR}/thumbhandler/lib/BlendThumb64.dll')
+    dllsources.append('#source/icons/blender.exe.manifest')
+
+    windlls = env.Install(dir=env['BF_INSTALLDIR'], source = dllsources)
+    allinstall += windlls
+
 installtarget = env.Alias('install', allinstall)
 bininstalltarget = env.Alias('install-bin', blenderinstall)

--- a/build_files/buildbot/master.cfg
+++ b/build_files/buildbot/master.cfg
@ -117,6 +117,8 @@ add_builder(c, 'salad_linux_x86_64_scons', '', generic_builder, 'soc-2011-salad'
 add_builder(c, 'win32_scons', 'windows', generic_builder)
 add_builder(c, 'salad_win32_scons', 'windows', generic_builder, 'soc-2011-salad')
 add_builder(c, 'win64_scons', 'win64', generic_builder)
+add_builder(c, 'mingw_win64_scons', 'mingw64', generic_builder)
+add_builder(c, 'mingw_win32_scons', 'mingw32', generic_builder)
 #add_builder(c, 'freebsd_i386_cmake', '', generic_builder)
 #add_builder(c, 'freebsd_x86_64_cmake', '', generic_builder)

--- a/build_files/buildbot/master_unpack.py
+++ b/build_files/buildbot/master_unpack.py
@ -48,7 +48,8 @@ def get_platform(filename):
    tokens = filename.split("-")
    platforms = ('osx', 'mac', 'bsd',
                 'win', 'linux', 'source',
-                 'solaris')
+                 'solaris',
+                 'mingw')
    platform_tokens = []
    found = False

--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@ -117,6 +117,8 @@ else:
            scons_options.append('BF_BITNESS=' + bitness)
            scons_options.append('WITH_BF_CYCLES_CUDA_BINARIES=True')
            scons_options.append('BF_CYCLES_CUDA_NVCC=nvcc.exe')
+            if builder.find('mingw') != -1:
+                scons_options.append('BF_TOOLSET=mingw')

        retcode = subprocess.call(['python', 'scons/scons.py'] + scons_options)
        sys.exit(retcode)
--- a/build_files/buildbot/slave_pack.py
+++ b/build_files/buildbot/slave_pack.py
@ -81,6 +81,8 @@ if builder.find('scons') != -1:
            scons_options.append('BF_BITNESS=' + bitness)
            scons_options.append('WITH_BF_CYCLES_CUDA_BINARIES=True')
            scons_options.append('BF_CYCLES_CUDA_NVCC=nvcc.exe')
+            if builder.find('mingw') != -1:
+                scons_options.append('BF_TOOLSET=mingw')

        retcode = subprocess.call(['python', 'scons/scons.py'] + scons_options)
        sys.exit(retcode)
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@ -212,11 +212,9 @@ macro(setup_liblinks
 	target_link_libraries(${target}
 			${OPENGL_gl_LIBRARY}
 			${OPENGL_glu_LIBRARY}
-			${JPEG_LIBRARIES}
 			${PNG_LIBRARIES}
 			${ZLIB_LIBRARIES}
-			${FREETYPE_LIBRARY}
-			${PLATFORM_LINKLIBS})
+			${FREETYPE_LIBRARY})

 	# since we are using the local libs for python when compiling msvc projects, we need to add _d when compiling debug versions
 	if(WITH_PYTHON)  # AND NOT WITH_PYTHON_MODULE  # WIN32 needs
@ -271,6 +269,7 @@ macro(setup_liblinks
 	if(WITH_BOOST)
 		target_link_libraries(${target} ${BOOST_LIBRARIES})
 	endif()
+	target_link_libraries(${target} ${JPEG_LIBRARIES})
 	if(WITH_IMAGE_OPENEXR)
 		if(WIN32 AND NOT UNIX AND NOT CMAKE_COMPILER_IS_GNUCC)
 			file_list_suffix(OPENEXR_LIBRARIES_DEBUG "${OPENEXR_LIBRARIES}" "_d")
@ -328,6 +327,8 @@ macro(setup_liblinks
 	if(WIN32 AND NOT UNIX)
 		target_link_libraries(${target} ${PTHREADS_LIBRARIES})
 	endif()
+
+	target_link_libraries(${target} ${PLATFORM_LINKLIBS})
 endmacro()

 macro(TEST_SSE_SUPPORT
--- a/build_files/scons/config/win32-mingw-config.py
+++ b/build_files/scons/config/win32-mingw-config.py
@ -1,4 +1,4 @@
-LCGDIR = '#../lib/windows'
+LCGDIR = '#../lib/mingw32'
 LIBDIR = "${LCGDIR}"

 BF_PYTHON = LIBDIR + '/python'
@ -48,7 +48,7 @@ BF_PTHREADS_LIBPATH = '${BF_PTHREADS}/lib'

 WITH_BF_OPENEXR = True
 WITH_BF_STATICOPENEXR = False
-BF_OPENEXR = LIBDIR + '/gcc/openexr'
+BF_OPENEXR = LIBDIR + '/openexr'
 BF_OPENEXR_INC = '${BF_OPENEXR}/include ${BF_OPENEXR}/include/OpenEXR'
 BF_OPENEXR_LIB = 'Half IlmImf Imath IlmThread Iex'
 BF_OPENEXR_LIBPATH = '${BF_OPENEXR}/lib'
@ -64,13 +64,13 @@ BF_JPEG_LIB = 'liblibjpeg'
 BF_JPEG_LIBPATH = '${BF_JPEG}/lib'

 WITH_BF_PNG = True
-BF_PNG = LIBDIR + '/gcc/png'
+BF_PNG = LIBDIR + '/png'
 BF_PNG_INC = '${BF_PNG}/include'
 BF_PNG_LIB = 'png'
 BF_PNG_LIBPATH = '${BF_PNG}/lib'

 WITH_BF_TIFF = True
-BF_TIFF = LIBDIR + '/gcc/tiff'
+BF_TIFF = LIBDIR + '/tiff'
 BF_TIFF_INC = '${BF_TIFF}/include'
 BF_TIFF_LIB = 'tiff'
 BF_TIFF_LIBPATH = '${BF_TIFF}/lib'
@ -83,7 +83,7 @@ BF_ZLIB_LIBPATH = '${BF_ZLIB}/lib'

 WITH_BF_INTERNATIONAL = True

-BF_GETTEXT = LIBDIR + '/gcc/gettext'
+BF_GETTEXT = LIBDIR + '/gettext'
 BF_GETTEXT_INC = '${BF_GETTEXT}/include'
 BF_GETTEXT_LIB = 'intl'
 BF_GETTEXT_LIBPATH = '${BF_GETTEXT}/lib'
@ -95,7 +95,7 @@ BF_OPENJPEG_INC = '${BF_OPENJPEG}'
 BF_OPENJPEG_LIBPATH='${BF_OPENJPEG}/lib'

 WITH_BF_FFTW3 = True
-BF_FFTW3 = LIBDIR + '/gcc/fftw3'
+BF_FFTW3 = LIBDIR + '/fftw3'
 BF_FFTW3_INC = '${BF_FFTW3}/include'
 BF_FFTW3_LIB = 'fftw3'
 BF_FFTW3_LIBPATH = '${BF_FFTW3}/lib'
@ -112,7 +112,7 @@ BF_BULLET_LIB = 'extern_bullet'
 BF_WINTAB = LIBDIR + '/wintab'
 BF_WINTAB_INC = '${BF_WINTAB}/INCLUDE'

-BF_FREETYPE = LIBDIR + '/gcc/freetype'
+BF_FREETYPE = LIBDIR + '/freetype'
 BF_FREETYPE_INC = '${BF_FREETYPE}/include ${BF_FREETYPE}/include/freetype2'
 BF_FREETYPE_LIB = 'freetype'
 BF_FREETYPE_LIBPATH = '${BF_FREETYPE}/lib'
@ -145,7 +145,7 @@ BF_COLLADA = '#source/blender/collada'
 BF_COLLADA_INC = '${BF_COLLADA}'
 BF_COLLADA_LIB = 'bf_collada'

-BF_OPENCOLLADA = LIBDIR + '/gcc/opencollada'
+BF_OPENCOLLADA = LIBDIR + '/opencollada'
 BF_OPENCOLLADA_INC = '${BF_OPENCOLLADA}/include'
 BF_OPENCOLLADA_LIB = 'OpenCOLLADAStreamWriter OpenCOLLADASaxFrameworkLoader OpenCOLLADAFramework OpenCOLLADABaseUtils GeneratedSaxParser UTF MathMLSolver expat pcre buffer ftoa'
 BF_OPENCOLLADA_LIBPATH = '${BF_OPENCOLLADA}/lib'
@ -154,7 +154,7 @@ BF_OPENCOLLADA_LIBPATH = '${BF_OPENCOLLADA}/lib'
 WITH_BF_CYCLES = True 

 WITH_BF_OIIO = True
-BF_OIIO = LIBDIR + '/gcc/openimageio'
+BF_OIIO = LIBDIR + '/openimageio'
 BF_OIIO_INC = BF_OIIO + '/include'
 BF_OIIO_LIB = 'OpenImageIO'
 BF_OIIO_LIBPATH = BF_OIIO + '/lib'
@ -163,7 +163,7 @@ WITH_BF_BOOST = True
 BF_BOOST = LIBDIR + '/boost'
 BF_BOOST_INC = BF_BOOST + '/include'
 BF_BOOST_LIB = 'boost_date_time-mgw46-mt-s-1_47 boost_filesystem-mgw46-mt-s-1_47 boost_regex-mgw46-mt-s-1_47 boost_system-mgw46-mt-s-1_47 boost_thread-mgw46-mt-s-1_47'
-BF_BOOST_LIBPATH = BF_BOOST + '/lib/gcc'
+BF_BOOST_LIBPATH = BF_BOOST + '/lib'

 #Ray trace optimization
 WITH_BF_RAYOPTIMIZATION = True
--- a/build_files/scons/config/win64-mingw-config.py
+++ b/build_files/scons/config/win64-mingw-config.py
@ -0,0 +1,207 @@
+LCGDIR = '#../lib/mingw64'
+LIBDIR = "${LCGDIR}"
+
+BF_PYTHON = LIBDIR + '/python'
+BF_PYTHON_VERSION = '3.2'
+WITH_BF_STATICPYTHON = False
+BF_PYTHON_INC = '${BF_PYTHON}/include/python${BF_PYTHON_VERSION}'
+BF_PYTHON_BINARY = 'python'
+BF_PYTHON_LIB = 'python${BF_PYTHON_VERSION[0]}${BF_PYTHON_VERSION[2]}mw'
+BF_PYTHON_DLL = 'python32'
+BF_PYTHON_LIBPATH = '${BF_PYTHON}/lib'
+
+WITH_BF_OPENAL = True
+BF_OPENAL = LIBDIR + '/openal'
+BF_OPENAL_INC = '${BF_OPENAL}/include'
+BF_OPENAL_LIB = 'wrap_oal'
+BF_OPENAL_LIBPATH = '${BF_OPENAL}/lib'
+
+WITH_BF_FFMPEG = False # TODO: FFmpeg gives linking errors, need to compile with MinGW-w64?
+BF_FFMPEG_LIB = 'avformat-53 avcodec-53 avdevice-53 avutil-51 swscale-2'
+BF_FFMPEG_LIBPATH = LIBDIR + '/ffmpeg/lib'
+BF_FFMPEG_INC =  LIBDIR + '/ffmpeg/include'
+BF_FFMPEG_DLL = '${BF_FFMPEG_LIBPATH}/avformat-53.dll ${BF_FFMPEG_LIBPATH}/avcodec-53.dll ${BF_FFMPEG_LIBPATH}/avdevice-53.dll ${BF_FFMPEG_LIBPATH}/avutil-51.dll ${BF_FFMPEG_LIBPATH}/swscale-2.dll'
+
+WITH_BF_JACK = False
+BF_JACK = LIBDIR + '/jack'
+BF_JACK_INC = '${BF_JACK}/include'
+BF_JACK_LIB = 'libjack'
+BF_JACK_LIBPATH = '${BF_JACK}/lib'
+
+WITH_BF_SNDFILE = False
+BF_SNDFILE = LIBDIR + '/sndfile'
+BF_SNDFILE_INC = '${BF_SNDFILE}/include'
+BF_SNDFILE_LIB = 'libsndfile-1'
+BF_SNDFILE_LIBPATH = '${BF_SNDFILE}/lib'
+
+WITH_BF_SDL = True
+BF_SDL = LIBDIR + '/sdl'
+BF_SDL_INC = '${BF_SDL}/include'
+BF_SDL_LIB = 'SDL'
+BF_SDL_LIBPATH = '${BF_SDL}/lib'
+
+BF_PTHREADS = '' # Part of MinGW-w64
+BF_PTHREADS_INC = ''
+BF_PTHREADS_LIB = ''
+BF_PTHREADS_LIBPATH = ''
+
+WITH_BF_OPENEXR = True
+WITH_BF_STATICOPENEXR = False
+BF_OPENEXR = LIBDIR + '/openexr'
+BF_OPENEXR_INC = '${BF_OPENEXR}/include ${BF_OPENEXR}/include/OpenEXR'
+BF_OPENEXR_LIB = 'Half IlmImf Imath IlmThread Iex'
+BF_OPENEXR_LIBPATH = '${BF_OPENEXR}/lib'
+
+WITH_BF_DDS = True
+
+WITH_BF_JPEG = True
+BF_JPEG = LIBDIR + '/jpeg'
+BF_JPEG_INC = '${BF_JPEG}/include'
+BF_JPEG_LIB = 'jpeg'
+BF_JPEG_LIBPATH = '${BF_JPEG}/lib'
+
+WITH_BF_PNG = True
+BF_PNG = LIBDIR + '/png'
+BF_PNG_INC = '${BF_PNG}/include'
+BF_PNG_LIB = 'png'
+BF_PNG_LIBPATH = '${BF_PNG}/lib'
+
+WITH_BF_TIFF = True
+BF_TIFF = LIBDIR + '/tiff'
+BF_TIFF_INC = '${BF_TIFF}/include'
+BF_TIFF_LIB = 'tiff'
+BF_TIFF_LIBPATH = '${BF_TIFF}/lib'
+
+WITH_BF_ZLIB = True
+BF_ZLIB = LIBDIR + '/zlib'
+BF_ZLIB_INC = '${BF_ZLIB}/include'
+BF_ZLIB_LIB = 'z'
+BF_ZLIB_LIBPATH = '${BF_ZLIB}/lib'
+
+WITH_BF_INTERNATIONAL = True
+
+BF_GETTEXT = LIBDIR + '/gettext'
+BF_GETTEXT_INC = '${BF_GETTEXT}/include'
+BF_GETTEXT_LIB = 'intl'
+BF_GETTEXT_LIBPATH = '${BF_GETTEXT}/lib'
+
+WITH_BF_OPENJPEG = True
+BF_OPENJPEG = '#extern/libopenjpeg'
+BF_OPENJPEG_LIB = ''
+BF_OPENJPEG_INC = '${BF_OPENJPEG}'
+BF_OPENJPEG_LIBPATH='${BF_OPENJPEG}/lib'
+
+WITH_BF_FFTW3 = True
+BF_FFTW3 = LIBDIR + '/fftw3'
+BF_FFTW3_INC = '${BF_FFTW3}/include'
+BF_FFTW3_LIB = 'fftw3'
+BF_FFTW3_LIBPATH = '${BF_FFTW3}/lib'
+
+WITH_BF_GAMEENGINE = True
+WITH_BF_OCEANSIM = True
+WITH_BF_PLAYER = True
+WITH_BF_LIBMV = True
+
+WITH_BF_BULLET = True
+BF_BULLET = '#extern/bullet2/src'
+BF_BULLET_INC = '${BF_BULLET}'
+BF_BULLET_LIB = 'extern_bullet'
+
+BF_WINTAB = LIBDIR + '/wintab'
+BF_WINTAB_INC = '${BF_WINTAB}/INCLUDE'
+
+# enable freetype2 support for text objects
+BF_FREETYPE = LIBDIR + '/freetype'
+BF_FREETYPE_INC = '${BF_FREETYPE}/include ${BF_FREETYPE}/include/freetype2/'
+BF_FREETYPE_LIB = 'freetype'
+BF_FREETYPE_LIBPATH = '${BF_FREETYPE}/lib'
+
+WITH_BF_QUICKTIME = False
+
+WITH_BF_ICONV = True
+BF_ICONV = LIBDIR + "/iconv"
+BF_ICONV_INC = '${BF_ICONV}/include'
+BF_ICONV_LIB = 'iconv'
+BF_ICONV_LIBPATH = '${BF_ICONV}/lib'
+
+WITH_BF_REDCODE = False
+BF_REDCODE_INC = '#extern'
+
+# Mesa libs should go here if you're using them as well.
+WITH_BF_STATICOPENGL = False
+BF_OPENGL = 'C:\\MingW'
+BF_OPENGL_INC = '${BF_OPENGL}/include'
+BF_OPENGL_LIBINC = '${BF_OPENGL}/lib'
+BF_OPENGL_LIB = 'opengl32 glu32'
+BF_OPENGL_LIB_STATIC = [ '${BF_OPENGL}/lib/libGL.a', '${BF_OPENGL}/lib/libGLU.a',
+             '${BF_OPENGL}/lib/libXmu.a', '${BF_OPENGL}/lib/libXext.a',
+             '${BF_OPENGL}/lib/libX11.a', '${BF_OPENGL}/lib/libXi.a' ]
+
+WITH_BF_COLLADA = False # TODO: Compile Collada with MinGW-w64
+BF_COLLADA = '#source/blender/collada'
+BF_COLLADA_INC = '${BF_COLLADA}'
+BF_COLLADA_LIB = 'bf_collada'
+
+BF_OPENCOLLADA = LIBDIR + '/opencollada'
+BF_OPENCOLLADA_INC = '${BF_OPENCOLLADA}/include'
+BF_OPENCOLLADA_LIB = 'OpenCOLLADAStreamWriter OpenCOLLADASaxFrameworkLoader OpenCOLLADAFramework OpenCOLLADABaseUtils GeneratedSaxParser UTF MathMLSolver expat pcre buffer ftoa'
+BF_OPENCOLLADA_LIBPATH = '${BF_OPENCOLLADA}/lib'
+
+#Cycles
+WITH_BF_CYCLES = True
+WITH_BF_CYCLES_CUDA_BINARIES = False
+BF_CYCLES_CUDA_NVCC = "" # Path to the NVIDIA CUDA compiler
+BF_CYCLES_CUDA_BINARIES_ARCH = ['sm_13', 'sm_20', 'sm_21']
+
+WITH_BF_OIIO = True
+BF_OIIO = LIBDIR + '/openimageio'
+BF_OIIO_INC = '${BF_OIIO}/include'
+BF_OIIO_LIB = 'OpenImageIO'
+BF_OIIO_LIBPATH = '${BF_OIIO}/lib'
+
+WITH_BF_BOOST = True
+BF_BOOST = LIBDIR + '/boost'
+BF_BOOST_INC = BF_BOOST + '/include'
+BF_BOOST_LIB = 'boost_date_time-mgw47-mt-s-1_49 boost_date_time-mgw47-mt-sd-1_49 boost_filesystem-mgw47-mt-s-1_49 boost_filesystem-mgw47-mt-sd-1_49 boost_regex-mgw47-mt-s-1_49 boost_regex-mgw47-mt-sd-1_49 boost_system-mgw47-mt-s-1_49 boost_system-mgw47-mt-sd-1_49 boost_thread-mgw47-mt-s-1_49 boost_thread-mgw47-mt-sd-1_49'
+BF_BOOST_LIBPATH = BF_BOOST + '/lib'
+
+#Ray trace optimization
+WITH_BF_RAYOPTIMIZATION = True
+BF_RAYOPTIMIZATION_SSE_FLAGS = ['-mmmx', '-msse', '-msse2', '-ftree-vectorize']
+
+WITH_BF_OPENMP = True
+
+##
+CC = 'gcc'
+CXX = 'g++'
+
+CCFLAGS = [ '-pipe', '-funsigned-char', '-fno-strict-aliasing' ]
+CXXFLAGS = [ '-fpermissive' ]
+
+CPPFLAGS = ['-DWIN32', '-DMS_WIN64', '-DFREE_WINDOWS', '-DFREE_WINDOWS64', '-D_LARGEFILE_SOURCE', '-D_FILE_OFFSET_BITS=64', '-D_LARGEFILE64_SOURCE', '-DBOOST_ALL_NO_LIB', '-DBOOST_THREAD_USE_LIB', '-DGLEW_STATIC', '-D_SSIZE_T_']
+REL_CFLAGS = []
+REL_CXXFLAGS = []
+REL_CCFLAGS = ['-DNDEBUG', '-O2', '-ftree-vectorize', '-mmmx', '-msse', '-msse2']
+
+C_WARN = ['-Wno-char-subscripts', '-Wdeclaration-after-statement', '-Wstrict-prototypes']
+
+CC_WARN = [ '-Wall' ]
+
+LLIBS = ['-lshell32', '-lshfolder', '-lgdi32', '-lmsvcrt', '-lwinmm', '-lmingw32', '-lm', '-lws2_32', '-lz', '-lstdc++','-lole32','-luuid', '-lwsock32', '-lpsapi', '-lpthread']
+
+PLATFORM_LINKFLAGS = ['-Xlinker', '--stack=2097152']
+
+## DISABLED, causes linking errors!
+## for redistribution, so users don't need MinGW installed
+# PLATFORM_LINKFLAGS += ["-static-libgcc", "-static-libstdc++"]
+
+BF_DEBUG = False
+BF_DEBUG_CCFLAGS= ['-g', '-D_DEBUG']
+
+BF_PROFILE_CCFLAGS = ['-pg', '-g']
+BF_PROFILE_LINKFLAGS = ['-pg']
+BF_PROFILE_FLAGS = BF_PROFILE_CCFLAGS
+BF_PROFILE = False
+
+BF_BUILDDIR = '..\\build\\win64-mingw'
+BF_INSTALLDIR='..\\install\\win64-mingw'
--- a/build_files/scons/tools/Blender.py
+++ b/build_files/scons/tools/Blender.py
@ -182,12 +182,12 @@ def setup_staticlibs(lenv):
    if lenv['WITH_BF_SNDFILE'] and lenv['WITH_BF_STATICSNDFILE']:
        statlibs += Split(lenv['BF_SNDFILE_LIB_STATIC'])

-    if lenv['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+    if lenv['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
        libincs += Split(lenv['BF_PTHREADS_LIBPATH'])

    if lenv['WITH_BF_COLLADA']:
        libincs += Split(lenv['BF_OPENCOLLADA_LIBPATH'])
-        if lenv['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+        if lenv['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
            libincs += Split(lenv['BF_PCRE_LIBPATH'])
            libincs += Split(lenv['BF_EXPAT_LIBPATH'])

@ -206,7 +206,7 @@ def setup_staticlibs(lenv):
            statlibs += Split(lenv['BF_BOOST_LIB_STATIC'])

    # setting this last so any overriding of manually libs could be handled
-    if lenv['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'win64-vc', 'linuxcross'):
+    if lenv['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'win64-vc', 'linuxcross', 'win64-mingw'):
        libincs.append('/usr/lib')

    if lenv['WITH_BF_JEMALLOC']:
@ -228,7 +228,7 @@ def setup_syslibs(lenv):
    if not lenv['WITH_BF_FREETYPE_STATIC']:
        syslibs += Split(lenv['BF_FREETYPE_LIB'])
    if lenv['WITH_BF_PYTHON'] and not lenv['WITH_BF_STATICPYTHON']:
-        if lenv['BF_DEBUG'] and lenv['OURPLATFORM'] in ('win32-vc', 'win64-vc', 'win32-mingw'):
+        if lenv['BF_DEBUG'] and lenv['OURPLATFORM'] in ('win32-vc', 'win64-vc', 'win32-mingw', 'win64-mingw'):
            syslibs.append(lenv['BF_PYTHON_LIB']+'_d')
        else:
            syslibs.append(lenv['BF_PYTHON_LIB'])
@ -268,7 +268,7 @@ def setup_syslibs(lenv):
        syslibs += Split(lenv['BF_SDL_LIB'])
    if not lenv['WITH_BF_STATICOPENGL']:
        syslibs += Split(lenv['BF_OPENGL_LIB'])
-    if lenv['OURPLATFORM'] in ('win32-vc', 'win32-mingw','linuxcross', 'win64-vc'):
+    if lenv['OURPLATFORM'] in ('win32-vc', 'win32-mingw','linuxcross', 'win64-vc', 'win64-mingw'):
        syslibs += Split(lenv['BF_PTHREADS_LIB'])
    if lenv['WITH_BF_COLLADA']:
        syslibs.append(lenv['BF_PCRE_LIB'])
@ -341,7 +341,7 @@ def creator(env):
        if env['BF_DEBUG']:
            defs.append('_DEBUG')

-    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
        incs.append(env['BF_PTHREADS_INC'])
        incs.append('#/intern/utfconv')

@ -731,7 +731,7 @@ class BlenderEnvironment(SConsEnvironment):
        if not self or not libname or not source:
            print bc.FAIL+'Cannot continue.  Missing argument for BlenderRes '+libname+bc.ENDC
            self.Exit()
-        if self['OURPLATFORM'] not in ('win32-vc','win32-mingw','linuxcross', 'win64-vc'):
+        if self['OURPLATFORM'] not in ('win32-vc','win32-mingw','linuxcross', 'win64-vc', 'win64-mingw'):
            print bc.FAIL+'BlenderRes is for windows only!'+bc.END
            self.Exit()
        
--- a/build_files/scons/tools/btools.py
+++ b/build_files/scons/tools/btools.py
@ -617,12 +617,18 @@ def buildslave(target=None, source=None, env=None):
    Builder for buildbot integration. Used by buildslaves of http://builder.blender.org only.
    """

-    if env['OURPLATFORM'] in ('win32-vc', 'win64-vc', 'win32-mingw', 'darwin'):
+    if env['OURPLATFORM'] in ('win32-vc', 'win64-vc', 'win32-mingw', 'darwin', 'win64-mingw'):
        extension = '.zip'
    else:
        extension = '.tar.bz2'

+    if env['OURPLATFORM'] == 'win32-mingw':
+        platform = 'mingw32'
+    elif env['OURPLATFORM'] == 'win64-mingw':
+        platform = 'mingw64'
+    else:
        platform = env['OURPLATFORM'].split('-')[0]
+
    if platform == 'linux':
        import platform

@ -662,15 +668,13 @@ def NSIS_print(target, source, env):
 def NSIS_Installer(target=None, source=None, env=None):
    print "="*35

-    if env['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'win64-vc'):
+    if env['OURPLATFORM'] not in ('win32-vc', 'win32-mingw', 'win64-vc', 'win64-mingw'):
        print "NSIS installer is only available on Windows."
        Exit()
-    if env['OURPLATFORM'] == 'win32-vc':
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw'):
        bitness = '32'
-    elif env['OURPLATFORM'] == 'win64-vc':
+    elif env['OURPLATFORM'] in ('win64-vc', 'win64-mingw'):
        bitness = '64'
-    else:
-        bitness = '-mingw'

    start_dir = os.getcwd()
    rel_dir = os.path.join(start_dir,'release','windows','installer')
@ -762,7 +766,7 @@ def NSIS_Installer(target=None, source=None, env=None):
    cmdline = "makensis " + "\""+tmpnsi+"\""

    startupinfo = subprocess.STARTUPINFO()
-    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+    #startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    proc = subprocess.Popen(cmdline, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        stderr=subprocess.PIPE, startupinfo=startupinfo, shell = True)
    data, err = proc.communicate()
--- a/doc/manpage/blender.1
+++ b/doc/manpage/blender.1
@ -1,4 +1,13 @@
-.TH "BLENDER" "1" "April 05, 2012" "Blender Blender 2\&.62 (sub 3)"
+.TH "BLENDER" "1" "April 26, 2012" "Blender Blender 2\&.63 (sub 0)"

 .SH NAME
 blender \- a 3D modelling and rendering package
@ -15,7 +24,7 @@ Use Blender to create TV commercials, to make technical visualizations, business
 http://www.blender.org
 .SH OPTIONS

-Blender 2.62 (sub 3)
+Blender 2.63 (sub 0)
 Usage: blender [args ...] [file] [args ...]
 .br
 .SS "Render Options:"
@ -220,6 +229,12 @@ Enable floating point exceptions
 Enable debug messages from FFmpeg library
 .br

+.TP
+.B \-\-debug\-libmv
+.br
+Enable debug messages from libmv library
+.br
+
 .IP

 .TP
@ -406,6 +421,7 @@ Arguments are executed in the order they are given. eg
  \fIBLENDER_SYSTEM_DATAFILES\fR Directory for system wide data files.
  \fIBLENDER_SYSTEM_PYTHON\fR Directory for system python libraries.
  \fITMP\fR or \fITMPDIR\fR Store temporary files here.
+  \fISDL_AUDIODRIVER\fR LibSDL audio driver \- alsa, esd, dma.
  \fIPYTHONHOME\fR Path to the python directory, eg. /usr/lib/python.
 .br
 .br
--- a/doc/python_api/examples/bpy.types.Mesh.py
+++ b/doc/python_api/examples/bpy.types.Mesh.py
@ -0,0 +1,41 @@
+"""
+Mesh Data
+++++++++
+
+The mesh data is accessed in object mode and intended for compact storage,
+for more flexible mesh editing from python see :mod:`bmesh`.
+
+Blender stores 4 main arrays to define mesh geometry.
+
+* :class:`Mesh.vertices` (3 points in space)
+* :class:`Mesh.edges` (reference 2 vertices)
+* :class:`Mesh.loops` (reference a single vertex and edge)
+* :class:`Mesh.polygons` (reference a range of loops)
+
+
+Each polygon references a slice in the loop array; this way, polygons do not store vertices or corner data such as UV's directly,
+only a reference to the loops that the polygon uses.

+:class:`Mesh.loops`, :class:`Mesh.uv_layers` and :class:`Mesh.vertex_colors` are all aligned so the same polygon loop
+indices can be used to find the UV's and vertex colors as well as the vertices.
+
+To compare mesh API options see: :ref:`NGons and Tessellation Faces <info_gotcha_mesh_faces>`
+
+
+This example script prints the vertices and UV's for each polygon, assumes the active object is a mesh with UVs.
+"""
+
+import bpy

+# Mesh datablock of the active object (the example assumes it is a mesh with UVs).
+me = bpy.context.object.data
+# Per-loop UV data of the active UV layer; indexed with the same loop indices as 'me.loops'.
+uv_layer = me.uv_layers.active.data

+for poly in me.polygons:
+    print("Polygon index: %d, length: %d" % (poly.index, poly.loop_total))

+    # range is used here to show how the polygons reference loops,
+    # for convenience 'poly.loop_indices' can be used instead.
+    for loop_index in range(poly.loop_start, poly.loop_start + poly.loop_total):
+        print("    Vertex: %d" % me.loops[loop_index].vertex_index)
+        print("    UV: %r" % uv_layer[loop_index].uv)
+    
--- a/doc/python_api/rst/include__bmesh.rst
+++ b/doc/python_api/rst/include__bmesh.rst
@ -32,12 +32,11 @@ For an overview of BMesh data types and how they reference each other see:

 .. warning::

-   TODO Items Are
+   TODO items are...

   * add access to BMesh **walkers**
-   * add a way to re-tessellate an editmode bmesh.
-   * add deform vert custom-data access.
-
+   * add api for calling BMesh operators (unrelated to bpy.ops)
+   * add custom-data manipulation functions add/remove/rename.

 Example Script
 --------------
--- a/doc/python_api/rst/info_gotcha.rst
+++ b/doc/python_api/rst/info_gotcha.rst
@ -132,6 +132,8 @@ write useful tools in python which are also fast to execute while in edit-mode.
 For the time being this limitation just has to be worked around but we're aware its frustrating needs to be addressed.


+.. _info_gotcha_mesh_faces:
+
 NGons and Tessellation Faces
 ============================

--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@ -230,10 +230,10 @@ if not ARGS.partial:

 else:
    # can manually edit this too:
-    FILTER_BPY_OPS = ("import.scene", )  # allow
-    FILTER_BPY_TYPES = ("bpy_struct", "Operator", "ID")  # allow
+    #FILTER_BPY_OPS = ("import.scene", )  # allow
+    #FILTER_BPY_TYPES = ("bpy_struct", "Operator", "ID")  # allow
    EXCLUDE_INFO_DOCS = True
-    EXCLUDE_MODULES = (
+    EXCLUDE_MODULES = [
        "aud",
        "bge",
        "bge.constraints",
@ -262,7 +262,7 @@ else:
        "mathutils.geometry",
        "mathutils.noise",
        "Freestyle",
-    )
+        ]

    # ------
    # Filter
@ -270,7 +270,18 @@ else:
    # TODO, support bpy.ops and bpy.types filtering
    import fnmatch
    m = None
-    EXCLUDE_MODULES = tuple([m for m in EXCLUDE_MODULES if not fnmatch.fnmatchcase(m, ARGS.partial)])
+    EXCLUDE_MODULES = [m for m in EXCLUDE_MODULES if not fnmatch.fnmatchcase(m, ARGS.partial)]
+
+    # special support for bpy.types.XXX
+    FILTER_BPY_OPS = tuple([m[8:] for m in ARGS.partial.split(":") if m.startswith("bpy.ops.")])
+    if FILTER_BPY_OPS:
+        EXCLUDE_MODULES.remove("bpy.ops")
+
+    FILTER_BPY_TYPES = tuple([m[10:] for m in ARGS.partial.split(":") if m.startswith("bpy.types.")])
+    if FILTER_BPY_TYPES:
+        EXCLUDE_MODULES.remove("bpy.types")
+
+    print(FILTER_BPY_TYPES)

    EXCLUDE_INFO_DOCS = (not fnmatch.fnmatchcase("info", ARGS.partial))

--- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
+++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btQuantizedBvh.h
@ -78,8 +78,10 @@ ATTRIBUTE_ALIGNED16	(struct) btQuantizedBvhNode
 	int	getTriangleIndex() const
 	{
 		btAssert(isLeafNode());
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
 		// Get only the lower bits where the triangle index is stored
-		return (m_escapeIndexOrTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		return (m_escapeIndexOrTriangleIndex&~(y));
 	}
 	int	getPartId() const
 	{
--- a/extern/bullet2/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
+++ b/extern/bullet2/src/BulletSoftBody/btSoftBodyConcaveCollisionAlgorithm.h
@ -45,7 +45,9 @@ struct btTriIndex
 	int	getTriangleIndex() const
 	{
 		// Get only the lower bits where the triangle index is stored
-		return (m_PartIdTriangleIndex&~((~0)<<(31-MAX_NUM_PARTS_IN_BITS)));
+		unsigned int x = 0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+		return (m_PartIdTriangleIndex&~(y));
 	}
 	int	getPartId() const
 	{
--- a/extern/bullet2/src/SConscript
+++ b/extern/bullet2/src/SConscript
@ -11,7 +11,7 @@ if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
    defs += ' WIN32 NDEBUG _WINDOWS'
    #cflags += ['/MT', '/W3', '/GX', '/O2', '/Op']
    cflags += ['/MT', '/W3', '/GX', '/Og', '/Ot', '/Ob1', '/Op', '/G6', '/O3', '/EHcs']
-elif env['OURPLATFORM'] in ('win32-mingw', 'linuxcross'):
+elif env['OURPLATFORM'] in ('win32-mingw', 'linuxcross', 'win64-mingw'):
    defs += ' NDEBUG'
    cflags += ['-O2']
 elif env['OURPLATFORM'] in ('linux', 'freebsd4', 'freebsd5'):
--- a/extern/carve/SConscript
+++ b/extern/carve/SConscript
@ -14,7 +14,7 @@ incs = ['include']
 if env['WITH_BF_BOOST']:
    if env['OURPLATFORM'] not in ('win32-vc', 'win64-vc'):
        # Boost is setting as preferred collections library in the Carve code when using MSVC compiler
-        if env['OURPLATFORM'] != 'win32-mingw':
+        if env['OURPLATFORM'] not in ('win32-mingw', 'win64-mingw'):
            defs.append('HAVE_BOOST_UNORDERED_COLLECTIONS')

    defs.append('CARVE_SYSTEM_BOOST')
--- a/extern/carve/bundle.sh
+++ b/extern/carve/bundle.sh
@ -114,7 +114,7 @@ incs = ['include']
 if env['WITH_BF_BOOST']:
    if env['OURPLATFORM'] not in ('win32-vc', 'win64-vc'):
        # Boost is setting as preferred collections library in the Carve code when using MSVC compiler
-        if env['OURPLATFORM'] != 'win32-mingw':
+        if env['OURPLATFORM'] not in ('win32-mingw', 'win64-mingw'):
            defs.append('HAVE_BOOST_UNORDERED_COLLECTIONS')

    defs.append('CARVE_SYSTEM_BOOST')
--- a/extern/carve/patches/mingw_w64.patch
+++ b/extern/carve/patches/mingw_w64.patch
@ -0,0 +1,13 @@
+Index: bundle.sh
+===================================================================
+--- bundle.sh	(revision 45912)
+++ bundle.sh	(working copy)
+@@ -114,7 +114,7 @@
+ if env['WITH_BF_BOOST']:
+     if env['OURPLATFORM'] not in ('win32-vc', 'win64-vc'):
+         # Boost is setting as preferred collections library in the Carve code when using MSVC compiler
+-        if env['OURPLATFORM'] != 'win32-mingw':
+        if env['OURPLATFORM'] not in ('win32-mingw', 'win64-mingw'):
+             defs.append('HAVE_BOOST_UNORDERED_COLLECTIONS')
+ 
+     defs.append('CARVE_SYSTEM_BOOST')
--- a/extern/libmv/CMakeLists.txt
+++ b/extern/libmv/CMakeLists.txt
@ -69,6 +69,7 @@ set(SRC
 	libmv/simple_pipeline/detect.cc
 	libmv/simple_pipeline/initialize_reconstruction.cc
 	libmv/simple_pipeline/intersect.cc
+	libmv/simple_pipeline/modal_solver.cc
 	libmv/simple_pipeline/pipeline.cc
 	libmv/simple_pipeline/reconstruction.cc
 	libmv/simple_pipeline/resect.cc
@ -126,6 +127,7 @@ set(SRC
 	libmv/simple_pipeline/detect.h
 	libmv/simple_pipeline/initialize_reconstruction.h
 	libmv/simple_pipeline/intersect.h
+	libmv/simple_pipeline/modal_solver.h
 	libmv/simple_pipeline/pipeline.h
 	libmv/simple_pipeline/reconstruction.h
 	libmv/simple_pipeline/resect.h
--- a/extern/libmv/ChangeLog
+++ b/extern/libmv/ChangeLog
@ -1,3 +1,16 @@
+commit a44312a7beb2963b8e3bf8015c516d2eff40cc3d
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Thu Apr 12 13:56:02 2012 +0600
+
+    Added solver for modal camera motion, currently supports only tripod solving
+    
+    This solver is intended to deal with such camera motions as tripod and panning,
+    where it's impossible to reconstruct exact position of markers in 3d view.
+    
+    It projects markers onto sphere and uses rigid registration of rotation to
+    find rotation angles which makes bundles from previous and current frame be
+    as closest as it's possible.
+
 commit fa3842e472e3b9c789e47bf6d8f592aa40a84f16
 Author: Sergey Sharybin <sergey.vfx@gmail.com>
 Date:   Thu Apr 12 12:32:48 2012 +0600
@ -520,9 +533,3 @@ Author: Matthias Fauconneau <matthias.fauconneau@gmail.com>
 Date:   Fri Aug 19 16:04:37 2011 +0200

    MSVC compatibility: heap allocate pattern, explicit float cast.
-
-commit 702658d2f8616964a6eeb3743fd85e97ac7ff09d
-Author: Matthias Fauconneau <matthias.fauconneau@gmail.com>
-Date:   Fri Aug 19 14:59:24 2011 +0200
-
-    Expose regularization parameters (areaPenalty and conditionPenalty) in API.
--- a/extern/libmv/SConscript
+++ b/extern/libmv/SConscript
@ -34,7 +34,7 @@ incs = '. ../Eigen3'
 incs += ' ' + env['BF_PNG_INC']
 incs += ' ' + env['BF_ZLIB_INC']

-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
    incs += ' ./third_party/glog/src/windows ./third_party/glog/src/windows/glog'
    if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
        incs += ' ./third_party/msinttypes'
--- a/extern/libmv/bundle.sh
+++ b/extern/libmv/bundle.sh
@ -248,7 +248,7 @@ incs = '. ../Eigen3'
 incs += ' ' + env['BF_PNG_INC']
 incs += ' ' + env['BF_ZLIB_INC']

-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
    incs += ' ./third_party/glog/src/windows ./third_party/glog/src/windows/glog'
    if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
        incs += ' ./third_party/msinttypes'
--- a/extern/libmv/files.txt
+++ b/extern/libmv/files.txt
@ -42,6 +42,8 @@ libmv/simple_pipeline/initialize_reconstruction.cc
 libmv/simple_pipeline/initialize_reconstruction.h
 libmv/simple_pipeline/intersect.cc
 libmv/simple_pipeline/intersect.h
+libmv/simple_pipeline/modal_solver.cc
+libmv/simple_pipeline/modal_solver.h
 libmv/simple_pipeline/pipeline.cc
 libmv/simple_pipeline/pipeline.h
 libmv/simple_pipeline/reconstruction.cc
--- a/extern/libmv/libmv-capi.cpp
+++ b/extern/libmv/libmv-capi.cpp
@ -54,6 +54,7 @@
 #include "libmv/simple_pipeline/pipeline.h"
 #include "libmv/simple_pipeline/camera_intrinsics.h"
 #include "libmv/simple_pipeline/rigid_registration.h"
+#include "libmv/simple_pipeline/modal_solver.h"

 #include <stdlib.h>
 #include <assert.h>
@ -384,6 +385,31 @@ int libmv_refineParametersAreValid(int parameters) {
 	                       LIBMV_REFINE_RADIAL_DISTORTION_K1));
 }

+void libmv_solveRefineIntrinsics(libmv::Tracks *tracks, libmv::CameraIntrinsics *intrinsics,
+			libmv::EuclideanReconstruction *reconstruction, int refine_intrinsics,
+			reconstruct_progress_update_cb progress_update_callback, void *callback_customdata)
+{
+	/* only a few combinations are supported but trust the caller */
+	int libmv_refine_flags = 0;
+
+	if (refine_intrinsics & LIBMV_REFINE_FOCAL_LENGTH) {
+		libmv_refine_flags |= libmv::BUNDLE_FOCAL_LENGTH;
+	}
+	if (refine_intrinsics & LIBMV_REFINE_PRINCIPAL_POINT) {
+		libmv_refine_flags |= libmv::BUNDLE_PRINCIPAL_POINT;
+	}
+	if (refine_intrinsics & LIBMV_REFINE_RADIAL_DISTORTION_K1) {
+		libmv_refine_flags |= libmv::BUNDLE_RADIAL_K1;
+	}
+	if (refine_intrinsics & LIBMV_REFINE_RADIAL_DISTORTION_K2) {
+		libmv_refine_flags |= libmv::BUNDLE_RADIAL_K2;
+	}
+
+	progress_update_callback(callback_customdata, 1.0, "Refining solution");
+
+	libmv::EuclideanBundleCommonIntrinsics(*(libmv::Tracks *)tracks, libmv_refine_flags,
+		reconstruction, intrinsics);
+}

 libmv_Reconstruction *libmv_solveReconstruction(libmv_Tracks *tracks, int keyframe1, int keyframe2,
 			int refine_intrinsics, double focal_length, double principal_x, double principal_y, double k1, double k2, double k3,
@ -423,26 +449,45 @@ libmv_Reconstruction *libmv_solveReconstruction(libmv_Tracks *tracks, int keyfra
 	libmv::EuclideanCompleteReconstruction(normalized_tracks, reconstruction, &update_callback);

 	if (refine_intrinsics) {
-		/* only a few combinations are supported but trust the caller */
-		int libmv_refine_flags = 0;
-		if (refine_intrinsics & LIBMV_REFINE_FOCAL_LENGTH) {
-			libmv_refine_flags |= libmv::BUNDLE_FOCAL_LENGTH;
-		}
-		if (refine_intrinsics & LIBMV_REFINE_PRINCIPAL_POINT) {
-			libmv_refine_flags |= libmv::BUNDLE_PRINCIPAL_POINT;
-		}
-		if (refine_intrinsics & LIBMV_REFINE_RADIAL_DISTORTION_K1) {
-			libmv_refine_flags |= libmv::BUNDLE_RADIAL_K1;
-		}
-		if (refine_intrinsics & LIBMV_REFINE_RADIAL_DISTORTION_K2) {
-			libmv_refine_flags |= libmv::BUNDLE_RADIAL_K2;
+		libmv_solveRefineIntrinsics((libmv::Tracks *)tracks, intrinsics, reconstruction,
+			refine_intrinsics, progress_update_callback, callback_customdata);
 	}

-		progress_update_callback(callback_customdata, 1.0, "Refining solution");
-		libmv::EuclideanBundleCommonIntrinsics(*(libmv::Tracks *)tracks, libmv_refine_flags,
-			reconstruction, intrinsics);
+	progress_update_callback(callback_customdata, 1.0, "Finishing solution");
+	libmv_reconstruction->tracks = *(libmv::Tracks *)tracks;
+	libmv_reconstruction->error = libmv::EuclideanReprojectionError(*(libmv::Tracks *)tracks, *reconstruction, *intrinsics);
+
+	return (libmv_Reconstruction *)libmv_reconstruction;
+}
+
+struct libmv_Reconstruction *libmv_solveModal(struct libmv_Tracks *tracks, double focal_length,
+			double principal_x, double principal_y, double k1, double k2, double k3,
+			reconstruct_progress_update_cb progress_update_callback, void *callback_customdata)
+{
+	/* Invert the camera intrinsics. */
+	libmv::vector<libmv::Marker> markers = ((libmv::Tracks*)tracks)->AllMarkers();
+	libmv_Reconstruction *libmv_reconstruction = new libmv_Reconstruction();
+	libmv::EuclideanReconstruction *reconstruction = &libmv_reconstruction->reconstruction;
+	libmv::CameraIntrinsics *intrinsics = &libmv_reconstruction->intrinsics;
+
+	ReconstructUpdateCallback update_callback =
+		ReconstructUpdateCallback(progress_update_callback, callback_customdata);
+
+	intrinsics->SetFocalLength(focal_length, focal_length);
+	intrinsics->SetPrincipalPoint(principal_x, principal_y);
+	intrinsics->SetRadialDistortion(k1, k2, k3);
+
+	for (int i = 0; i < markers.size(); ++i) {
+		intrinsics->InvertIntrinsics(markers[i].x,
+			markers[i].y,
+			&(markers[i].x),
+			&(markers[i].y));
 	}

+	libmv::Tracks normalized_tracks(markers);
+
+	libmv::ModalSolver(normalized_tracks, reconstruction, &update_callback);
+
 	progress_update_callback(callback_customdata, 1.0, "Finishing solution");
 	libmv_reconstruction->tracks = *(libmv::Tracks *)tracks;
 	libmv_reconstruction->error = libmv::EuclideanReprojectionError(*(libmv::Tracks *)tracks, *reconstruction, *intrinsics);
--- a/extern/libmv/libmv-capi.h
+++ b/extern/libmv/libmv-capi.h
@ -68,6 +68,9 @@ int libmv_refineParametersAreValid(int parameters);
 struct libmv_Reconstruction *libmv_solveReconstruction(struct libmv_Tracks *tracks, int keyframe1, int keyframe2,
 			int refine_intrinsics, double focal_length, double principal_x, double principal_y, double k1, double k2, double k3,
 			reconstruct_progress_update_cb progress_update_callback, void *callback_customdata);
+struct libmv_Reconstruction *libmv_solveModal(struct libmv_Tracks *tracks, double focal_length,
+			double principal_x, double principal_y, double k1, double k2, double k3,
+			reconstruct_progress_update_cb progress_update_callback, void *callback_customdata);
 int libmv_reporojectionPointForTrack(struct libmv_Reconstruction *libmv_reconstruction, int track, double pos[3]);
 double libmv_reporojectionErrorForTrack(struct libmv_Reconstruction *libmv_reconstruction, int track);
 double libmv_reporojectionErrorForImage(struct libmv_Reconstruction *libmv_reconstruction, int image);
--- a/extern/libmv/libmv/numeric/numeric.h
+++ b/extern/libmv/libmv/numeric/numeric.h
@ -33,7 +33,7 @@
 #include <Eigen/QR>
 #include <Eigen/SVD>

-#if _WIN32 || __APPLE__ || __FreeBSD__
+#if (defined(_WIN32) || defined(__APPLE__) || defined(__FreeBSD__)) && !defined(__MINGW64__)
  void static sincos (double x, double *sinx, double *cosx) {
    *sinx = sin(x);
    *cosx = cos(x);
--- a/extern/libmv/libmv/simple_pipeline/modal_solver.cc
+++ b/extern/libmv/libmv/simple_pipeline/modal_solver.cc
@ -0,0 +1,126 @@
+// Copyright (c) 2012 libmv authors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+
+#include <cstdio>
+
+#include "libmv/logging/logging.h"
+#include "libmv/simple_pipeline/modal_solver.h"
+#include "libmv/simple_pipeline/rigid_registration.h"
+
+#ifdef _MSC_VER
+#  define snprintf _snprintf
+#endif
+
+namespace libmv {
+
+static void ProjectMarkerOnSphere(Marker &marker, Vec3 &X) {
+  X(0) = marker.x;
+  X(1) = marker.y;
+  X(2) = 1.0;
+
+  X *= 5.0 / X.norm();
+}
+
+static void ModalSolverLogProress(ProgressUpdateCallback *update_callback,
+    double progress)
+{
+  if (update_callback) {
+    char message[256];
+
+    snprintf(message, sizeof(message), "Solving progress %d%%", (int)(progress * 100));
+
+    update_callback->invoke(progress, message);
+  }
+}
+
+// Solves a rotation-only (tripod-like) camera motion: for every image a
+// rotation is estimated by rigid registration against the points inserted so
+// far, a camera with that rotation and zero translation is inserted, and
+// tracks that have no point yet are projected onto a sphere and inserted.
+void ModalSolver(Tracks &tracks,
+                 EuclideanReconstruction *reconstruction,
+                 ProgressUpdateCallback *update_callback) {
+  int max_image = tracks.MaxImage();
+  int max_track = tracks.MaxTrack();

+  LG << "Max image: " << max_image;
+  LG << "Max track: " << max_track;

+  Mat3 R = Mat3::Identity();

+  for (int image = 0; image <= max_image; ++image) {
+    vector<Marker> all_markers = tracks.MarkersInImage(image);

+    // With a single image (max_image == 0) the ratio would be 0/0 (NaN),
+    // so report completion directly in that case.
+    double progress = max_image > 0 ? (double) image / max_image : 1.0;
+    ModalSolverLogProress(update_callback, progress);

+    // Skip empty frames without doing anything
+    if (all_markers.size() == 0) {
+      LG << "Skipping frame: " << image;
+      continue;
+    }

+    vector<Vec3> points, reference_points;

+    // Construct pairs of markers from current and previous image,
+    // to reproject them and find rigid transformation between
+    // previous and current image
+    for (int track = 0; track <= max_track; ++track) {
+      EuclideanPoint *point = reconstruction->PointForTrack(track);

+      if (point) {
+        Marker marker = tracks.MarkerInImageForTrack(image, track);

+        // Only use the marker when it really belongs to this image
+        if (marker.image == image) {
+          Vec3 X;

+          LG << "Use track " << track << " for rigid registration between image " <<
+            image - 1 << " and " << image;

+          ProjectMarkerOnSphere(marker, X);

+          points.push_back(point->X);
+          reference_points.push_back(X);
+        }
+      }
+    }

+    if (points.size()) {
+      // Find rigid delta transformation to current image
+      RigidRegistration(reference_points, points, R);
+    }

+    reconstruction->InsertCamera(image, R, Vec3::Zero());

+    // Review if there's new tracks for which position might be reconstructed
+    for (int track = 0; track <= max_track; ++track) {
+      if (!reconstruction->PointForTrack(track)) {
+        Marker marker = tracks.MarkerInImageForTrack(image, track);

+        if (marker.image == image) {
+          // New track appeared on this image, project its position onto sphere

+          LG << "Projecting track " << track << " at image " << image;

+          Vec3 X;
+          ProjectMarkerOnSphere(marker, X);
+          reconstruction->InsertPoint(track, R.inverse() * X);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace libmv
--- a/extern/libmv/libmv/simple_pipeline/modal_solver.h
+++ b/extern/libmv/libmv/simple_pipeline/modal_solver.h
@ -0,0 +1,48 @@
+// Copyright (c) 2012 libmv authors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+
+#ifndef LIBMV_SIMPLE_PIPELINE_MODAL_SOLVER_H_
+#define LIBMV_SIMPLE_PIPELINE_MODAL_SOLVER_H_
+
+#include "libmv/simple_pipeline/tracks.h"
+#include "libmv/simple_pipeline/reconstruction.h"
+#include "libmv/simple_pipeline/callbacks.h"
+
+namespace libmv {
+
+/*!
+    This solver handles camera motions such as tripod rotation, reconstructing
+    only the camera motion itself. Bundles are not properly reconstructed; they
+    are just projected onto a sphere.

+    Markers from the tracks object are used for reconstruction, and the algorithm
+    assumes their positions are already undistorted and in normalized
+    space.
+
+    Reconstructed cameras and projected bundles would be added to reconstruction
+    object.
+*/
+void ModalSolver(Tracks &tracks,
+                 EuclideanReconstruction *reconstruction,
+                 ProgressUpdateCallback *update_callback = NULL);
+
+}  // namespace libmv
+
+#endif  // LIBMV_SIMPLE_PIPELINE_MODAL_SOLVER_H_
--- a/extern/libmv/patches/mingw_w64_support.patch
+++ b/extern/libmv/patches/mingw_w64_support.patch
@ -0,0 +1,58 @@
+Index: bundle.sh
+===================================================================
+--- bundle.sh	(revision 45912)
+++ bundle.sh	(working copy)
+@@ -248,7 +248,7 @@
+ incs += ' ' + env['BF_PNG_INC']
+ incs += ' ' + env['BF_ZLIB_INC']
+ 
+-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
+     incs += ' ./third_party/glog/src/windows ./third_party/glog/src/windows/glog'
+     if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
+         incs += ' ./third_party/msinttypes'
+Index: libmv/numeric/numeric.h
+===================================================================
+--- libmv/numeric/numeric.h	(revision 45912)
+++ libmv/numeric/numeric.h	(working copy)
+@@ -33,7 +33,7 @@
+ #include <Eigen/QR>
+ #include <Eigen/SVD>
+ 
+-#if _WIN32 || __APPLE__ || __FreeBSD__
+#if (defined(_WIN32) || defined(__APPLE__) || defined(__FreeBSD__)) && !defined(__MINGW64__)
+   void static sincos (double x, double *sinx, double *cosx) {
+     *sinx = sin(x);
+     *cosx = cos(x);
+Index: third_party/glog/src/windows/port.cc
+===================================================================
+--- third_party/glog/src/windows/port.cc	(revision 45912)
+++ third_party/glog/src/windows/port.cc	(working copy)
+@@ -55,6 +55,8 @@
+   return _vsnprintf(str, size-1, format, ap);
+ }
+ 
+// MinGW64 defines
+#ifndef __MINGW64__
+ int snprintf(char *str, size_t size, const char *format, ...) {
+   va_list ap;
+   va_start(ap, format);
+@@ -62,3 +64,4 @@
+   va_end(ap);
+   return r;
+ }
+#endif
+Index: third_party/glog/src/windows/port.h
+===================================================================
+--- third_party/glog/src/windows/port.h	(revision 45912)
+++ third_party/glog/src/windows/port.h	(working copy)
+@@ -120,7 +120,9 @@
+ #define DEFAULT_TEMPLATE_ROOTDIR  ".."
+ 
+ // ----------------------------------- SYSTEM/PROCESS
+#ifndef __MINGW64__
+ typedef int pid_t;
+#endif
+ #define getpid  _getpid
+ 
+ // ----------------------------------- THREADS
--- a/extern/libmv/third_party/glog/src/windows/port.cc
+++ b/extern/libmv/third_party/glog/src/windows/port.cc
@ -55,6 +55,8 @@ int safe_vsnprintf(char *str, size_t size, const char *format, va_list ap) {
  return _vsnprintf(str, size-1, format, ap);
 }

+// MinGW64 defines
+#ifndef __MINGW64__
 int snprintf(char *str, size_t size, const char *format, ...) {
  va_list ap;
  va_start(ap, format);
@ -62,3 +64,4 @@ int snprintf(char *str, size_t size, const char *format, ...) {
  va_end(ap);
  return r;
 }
+#endif
--- a/extern/libmv/third_party/glog/src/windows/port.h
+++ b/extern/libmv/third_party/glog/src/windows/port.h
@ -120,7 +120,9 @@ extern int safe_vsnprintf(char *str, size_t size,
 #define DEFAULT_TEMPLATE_ROOTDIR  ".."

 // ----------------------------------- SYSTEM/PROCESS
+#ifndef __MINGW64__
 typedef int pid_t;
+#endif
 #define getpid  _getpid

 // ----------------------------------- THREADS
--- a/intern/SConscript
+++ b/intern/SConscript
@ -36,5 +36,5 @@ if NEW_CSG == 'false':
 else:
    SConscript(['csg/SConscript'])

-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'win64-mingw', 'linuxcross', 'win64-vc'):
    SConscript(['utfconv/SConscript'])
--- a/intern/audaspace/SConscript
+++ b/intern/audaspace/SConscript
@ -41,7 +41,7 @@ if env['WITH_BF_PYTHON']:
    incs += ' Python ' + env['BF_PYTHON_INC']
    defs.append('WITH_PYTHON')

-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
    incs += ' ' + env['BF_PTHREADS_INC']

 env.BlenderLib ('bf_intern_audaspace', sources, Split(incs), defs, libtype=['intern','player'], priority = [25,215] )
--- a/intern/audaspace/intern/AUD_C-API.cpp
+++ b/intern/audaspace/intern/AUD_C-API.cpp
@ -41,6 +41,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <cmath>
+#include <sstream>

 #include "AUD_NULLDevice.h"
 #include "AUD_I3DDevice.h"
@ -1236,6 +1237,47 @@ const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int lengt
 	}
 }

+// Mixes the sound down into one mono file per channel of the requested specs.
+// Each file is named after `filename` with "_<channel>" (1-based) inserted
+// before the extension, or appended when the name has no extension.
+// Returns NULL on success or the message of the caught AUD_Exception.
+const char* AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate)
+{
+	try
+	{
+		// NOTE(review): the dynamic_cast result is used unchecked; presumably
+		// callers always pass a sequencer sound -- confirm.
+		AUD_SequencerFactory* f = dynamic_cast<AUD_SequencerFactory*>(sound->get());

+		f->setSpecs(specs.specs);

+		std::vector<AUD_Reference<AUD_IWriter> > writers;

+		int channels = specs.channels;
+		specs.channels = AUD_CHANNELS_MONO;

+		// Split the name once, outside the loop: a '.' only counts as the
+		// start of an extension when it comes after the last path separator.
+		std::string fn = filename;
+		size_t index = fn.find_last_of('.');
+		size_t index_separator = fn.find_last_of("/\\");
+		if((index == std::string::npos) ||
+				((index_separator != std::string::npos) && (index < index_separator)))
+			index = fn.length();
+		const std::string base = fn.substr(0, index);
+		const std::string extension = fn.substr(index);

+		for(int i = 0; i < channels; i++)
+		{
+			std::stringstream stream;
+			stream << base << "_" << (i + 1) << extension;
+			writers.push_back(AUD_FileWriter::createWriter(stream.str(), specs, format, codec, bitrate));
+		}

+		AUD_Reference<AUD_IReader> reader = f->createQualityReader();
+		reader->seek(start);
+		AUD_FileWriter::writeReader(reader, writers, length, buffersize);

+		return NULL;
+	}
+	catch(AUD_Exception& e)
+	{
+		return e.str;
+	}
+}
+
 AUD_Device* AUD_openMixdownDevice(AUD_DeviceSpecs specs, AUD_Sound* sequencer, float volume, float start)
 {
 	try
--- a/intern/audaspace/intern/AUD_C-API.h
+++ b/intern/audaspace/intern/AUD_C-API.h
@ -709,6 +709,21 @@ extern void* AUD_getSet(void* set);
 */
 extern const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate);

+/**
+ * Mixes a sound down into multiple files, one mono file per channel.
+ * \param sound The sound scene to mix down.
+ * \param start The start frame.
+ * \param length The count of frames to write.
+ * \param buffersize How many samples should be written at once.
+ * \param filename The base file name; for each channel an underscore and the
+ *                 1-based channel number are inserted before the file
+ *                 extension, or appended if the name has no extension.
+ * \param specs The file's audio specification.
+ * \param format The file's container format.
+ * \param codec The codec used for encoding the audio data.
+ * \param bitrate The bitrate for encoding.
+ * \return An error message or NULL in case of success.
+ */
+extern const char* AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate);
+
 /**
 * Opens a read device and prepares it for mixdown of the sound scene.
 * \param specs Output audio specifications.
--- a/intern/audaspace/intern/AUD_FileWriter.cpp
+++ b/intern/audaspace/intern/AUD_FileWriter.cpp
@ -93,3 +93,39 @@ void AUD_FileWriter::writeReader(AUD_Reference<AUD_IReader> reader, AUD_Referenc
 		writer->write(len, buf);
 	}
 }
+
+// Reads interleaved samples from the reader block by block, splits them per
+// channel, clamps every sample to [-1, 1] and hands channel c to writers[c].
+// A length of 0 means "write until the reader signals end of stream".
+void AUD_FileWriter::writeReader(AUD_Reference<AUD_IReader> reader, std::vector<AUD_Reference<AUD_IWriter> >& writers, unsigned int length, unsigned int buffersize)
+{
+	// interleaved block as delivered by the reader
+	AUD_Buffer interleaved(buffersize * AUD_SAMPLE_SIZE(reader->getSpecs()));
+	// a single de-interleaved channel of that block
+	AUD_Buffer separated(buffersize * sizeof(sample_t));
+	sample_t* in = interleaved.getBuffer();
+	sample_t* out = separated.getBuffer();
+
+	int len;
+	bool eos = false;
+	int channels = reader->getSpecs().channels;
+	unsigned int pos = 0;
+
+	while(((pos < length) || (length <= 0)) && !eos)
+	{
+		// request a full block, or only the remainder when a length is given
+		len = buffersize;
+		if((len > length - pos) && (length > 0))
+			len = length - pos;
+		reader->read(len, eos, in);
+
+		for(int channel = 0; channel < channels; channel++)
+		{
+			for(int i = 0; i < len; i++)
+			{
+				sample_t value = in[i * channels + channel];
+
+				// clamping!
+				if(value > 1)
+					value = 1;
+				else if(value < -1)
+					value = -1;
+
+				out[i] = value;
+			}
+
+			writers[channel]->write(len, out);
+		}
+
+		pos += len;
+	}
+}
--- a/intern/audaspace/intern/AUD_FileWriter.h
+++ b/intern/audaspace/intern/AUD_FileWriter.h
@ -31,6 +31,7 @@
 #define __AUD_FILEWRITER_H__

 #include <string>
+#include <vector>

 #include "AUD_Reference.h"

@ -68,6 +69,15 @@ public:
 	 * \param buffersize How many samples should be transfered at once.
 	 */
 	static void writeReader(AUD_Reference<AUD_IReader> reader, AUD_Reference<AUD_IWriter> writer, unsigned int length, unsigned int buffersize);
+
+	/**
+	 * Writes a reader to several writers, one writer per channel.
+	 * Channel c of the reader is written to writers[c], so the vector must
+	 * hold at least as many writers as the reader has channels.
+	 * \param reader The reader to read from.
+	 * \param writers The writers to write to.
+	 * \param length How many samples should be transferred.
+	 * \param buffersize How many samples should be transferred at once.
+	 */
+	static void writeReader(AUD_Reference<AUD_IReader> reader, std::vector<AUD_Reference<AUD_IWriter> >& writers, unsigned int length, unsigned int buffersize);
 };

 #endif //__AUD_FILEWRITER_H__
--- a/intern/audaspace/intern/AUD_Reference.h
+++ b/intern/audaspace/intern/AUD_Reference.h
@ -31,6 +31,7 @@

 #include <map>
 #include <cstddef>
+#include <pthread.h>

 // #define MEM_DEBUG

@ -49,8 +50,13 @@ private:
 	 * Saves the reference counts.
 	 */
 	static std::map<void*, unsigned int> m_references;
+	static pthread_mutex_t m_mutex;
+	static bool m_mutex_initialised;

 public:
+
+	static pthread_mutex_t* getMutex();
+
 	/**
 	 * Reference increment.
 	 * \param reference The reference.
@ -108,6 +114,7 @@ public:
 	template <class U>
 	AUD_Reference(U* reference)
 	{
+		pthread_mutex_lock(AUD_ReferenceHandler::getMutex());
 		m_original = reference;
 		m_reference = dynamic_cast<T*>(reference);
 		AUD_ReferenceHandler::incref(m_original);
@ -115,6 +122,7 @@ public:
 		if(m_reference != NULL)
 			std::cerr << "+" << typeid(*m_reference).name() << std::endl;
 #endif
+		pthread_mutex_unlock(AUD_ReferenceHandler::getMutex());
 	}

 	AUD_Reference()
@ -129,6 +137,7 @@ public:
 	 */
 	AUD_Reference(const AUD_Reference& ref)
 	{
+		pthread_mutex_lock(AUD_ReferenceHandler::getMutex());
 		m_original = ref.m_original;
 		m_reference = ref.m_reference;
 		AUD_ReferenceHandler::incref(m_original);
@ -136,11 +145,13 @@ public:
 		if(m_reference != NULL)
 			std::cerr << "+" << typeid(*m_reference).name() << std::endl;
 #endif
+		pthread_mutex_unlock(AUD_ReferenceHandler::getMutex());
 	}

 	template <class U>
 	explicit AUD_Reference(const AUD_Reference<U>& ref)
 	{
+		pthread_mutex_lock(AUD_ReferenceHandler::getMutex());
 		m_original = ref.get();
 		m_reference = dynamic_cast<T*>(ref.get());
 		AUD_ReferenceHandler::incref(m_original);
@ -148,6 +159,7 @@ public:
 		if(m_reference != NULL)
 			std::cerr << "+" << typeid(*m_reference).name() << std::endl;
 #endif
+		pthread_mutex_unlock(AUD_ReferenceHandler::getMutex());
 	}

 	/**
@ -156,12 +168,14 @@ public:
 	 */
 	~AUD_Reference()
 	{
+		pthread_mutex_lock(AUD_ReferenceHandler::getMutex());
 #ifdef MEM_DEBUG
 		if(m_reference != NULL)
 			std::cerr << "-" << typeid(*m_reference).name() << std::endl;
 #endif
 		if(AUD_ReferenceHandler::decref(m_original))
 			delete m_reference;
+		pthread_mutex_unlock(AUD_ReferenceHandler::getMutex());
 	}

 	/**
@ -173,6 +187,8 @@ public:
 		if(&ref == this)
 			return *this;

+		pthread_mutex_lock(AUD_ReferenceHandler::getMutex());
+
 #ifdef MEM_DEBUG
 		if(m_reference != NULL)
 			std::cerr << "-" << typeid(*m_reference).name() << std::endl;
@ -188,6 +204,8 @@ public:
 			std::cerr << "+" << typeid(*m_reference).name() << std::endl;
 #endif

+		pthread_mutex_unlock(AUD_ReferenceHandler::getMutex());
+
 		return *this;
 	}

--- a/intern/audaspace/intern/AUD_ReferenceHandler.cpp
+++ b/intern/audaspace/intern/AUD_ReferenceHandler.cpp
@ -29,3 +29,24 @@
 #include "AUD_Reference.h"

 std::map<void*, unsigned int> AUD_ReferenceHandler::m_references;
+pthread_mutex_t AUD_ReferenceHandler::m_mutex;
+bool AUD_ReferenceHandler::m_mutex_initialised = false;
+
+/**
+ * Returns the recursive mutex that guards the reference counts.
+ *
+ * The mutex is created on first use. Creation goes through pthread_once so
+ * that two threads calling this concurrently cannot both initialise the
+ * mutex, which an unsynchronised "initialised" flag would allow.
+ */
+pthread_mutex_t *AUD_ReferenceHandler::getMutex()
+{
+	// statically initialised, so it is valid before any dynamic initialiser runs
+	static pthread_once_t once = PTHREAD_ONCE_INIT;
+
+	// local class of this member function: it has the same access to the
+	// private static members as getMutex() itself
+	struct Initialiser
+	{
+		static void run()
+		{
+			pthread_mutexattr_t attr;
+			pthread_mutexattr_init(&attr);
+			// recursive: a thread already holding the lock may lock it again
+			pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+
+			pthread_mutex_init(&AUD_ReferenceHandler::m_mutex, &attr);
+
+			pthread_mutexattr_destroy(&attr);
+
+			// kept up to date because the flag is still declared in the class
+			AUD_ReferenceHandler::m_mutex_initialised = true;
+		}
+	};
+
+	pthread_once(&once, Initialiser::run);
+
+	return &m_mutex;
+}
+
--- a/intern/boolop/SConscript
+++ b/intern/boolop/SConscript
@ -18,13 +18,13 @@ else:
    if env['WITH_BF_BOOST']:
        if env['OURPLATFORM'] not in ('win32-vc', 'win64-vc'):
            # Boost is setting as preferred collections library in the Carve code when using MSVC compiler
-            if env['OURPLATFORM'] != 'win32-mingw':
+            if env['OURPLATFORM'] not in ('win32-mingw', 'win64-mingw'):
                defs.append('HAVE_BOOST_UNORDERED_COLLECTIONS')

        defs.append('CARVE_SYSTEM_BOOST')
        incs +=  ' ' + env['BF_BOOST_INC']

-if (env['OURPLATFORM'] == 'win32-mingw'):
+if (env['OURPLATFORM'] in ('win32-mingw', 'win64-mingw')):
    env.BlenderLib ('bf_intern_bop', sources, Split(incs) , [], libtype='intern', priority = 5 )
 else:
    env.BlenderLib ('bf_intern_bop', sources, Split(incs) , defs, libtype='intern', priority = 5 )
--- a/intern/container/CTR_Map.h
+++ b/intern/container/CTR_Map.h
@ -63,7 +63,7 @@ public:
 		for (int i = 0; i < m_num_buckets; ++i) {
 			m_buckets[i] = 0;

-			for(Entry *entry = map.m_buckets[i]; entry; entry=entry->m_next)
+			for (Entry *entry = map.m_buckets[i]; entry; entry=entry->m_next)
 				insert(entry->m_key, entry->m_value);
 		}
 	}
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@ -39,7 +39,7 @@ if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
 else:
    cxxflags.append('-ffast-math'.split())

-if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc'):
+if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
    incs.append(env['BF_PTHREADS_INC'])

 # optimized kernel
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@ -85,10 +85,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                description="Leave out caustics, resulting in a darker image with less noise",
                default=False,
                )
-        cls.blur_caustics = FloatProperty(
-                name="Blur Caustics",
-                description="Blur caustics to reduce noise",
-                min=0.0, max=1.0,
+        cls.blur_glossy = FloatProperty(
+                name="Filter Glossy",
+                description="Adaptively blur glossy shaders after blurry bounces, to reduce noise at the cost of accuracy",
+                min=0.0, max=10.0,
                default=0.0,
                )

--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@ -87,11 +87,11 @@ class CyclesRender_PT_integrator(CyclesButtonsPanel, Panel):
        sub.prop(cscene, "diffuse_bounces", text="Diffuse")
        sub.prop(cscene, "glossy_bounces", text="Glossy")
        sub.prop(cscene, "transmission_bounces", text="Transmission")
-        sub.prop(cscene, "no_caustics")

-        #row = col.row()
-        #row.prop(cscene, "blur_caustics")
-        #row.active = not cscene.no_caustics
+        col.separator()
+
+        col.prop(cscene, "no_caustics")
+        col.prop(cscene, "blur_glossy")


 class CyclesRender_PT_film(CyclesButtonsPanel, Panel):
@ -178,10 +178,7 @@ class CyclesRender_PT_layers(CyclesButtonsPanel, Panel):

        col = split.column()
        col.prop(scene, "layers", text="Scene")
-        col.label(text="Material:")
-        col.prop(rl, "material_override", text="")
-
-        col.prop(rl, "use_sky", "Use Environment")
+        col.prop(rl, "layers_exclude", text="Exclude")

        col = split.column()
        col.prop(rl, "layers", text="Layer")
@ -190,6 +187,16 @@ class CyclesRender_PT_layers(CyclesButtonsPanel, Panel):

        split = layout.split()

+        col = split.column()
+        col.label(text="Material:")
+        col.prop(rl, "material_override", text="")
+
+        col = split.column()
+        col.prop(rl, "samples")
+        col.prop(rl, "use_sky", "Use Environment")
+
+        split = layout.split()
+
        col = split.column()
        col.label(text="Passes:")
        col.prop(rl, "use_pass_combined")
@ -783,6 +790,31 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
            layout.template_color_ramp(mapping, "color_ramp", expand=True)


+# Scene-context panel exposing the render "simplify" settings to Cycles.
+class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
+    bl_label = "Simplify"
+    bl_context = "scene"
+    COMPAT_ENGINES = {'CYCLES'}
+
+    def draw_header(self, context):
+        # unlabeled toggle for simplification in the panel header
+        self.layout.prop(context.scene.render, "use_simplify", text="")
+
+    def draw(self, context):
+        layout = self.layout
+        render = context.scene.render
+
+        # grey the panel out while simplification is switched off
+        layout.active = render.use_simplify
+
+        # one column per setting, left to right
+        columns = layout.split()
+        settings = (
+            ("simplify_subdivision", "Subdivision"),
+            ("simplify_child_particles", "Child Particles"),
+        )
+        for identifier, label in settings:
+            columns.column().prop(render, identifier, text=label)
+
+
 def draw_device(self, context):
    scene = context.scene
    layout = self.layout
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@ -227,6 +227,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool holdout, bool object_updated)
 	/* test if we can instance or if the object is modified */
 	BL::ID b_ob_data = b_ob.data();
 	BL::ID key = (object_is_modified(b_ob) || holdout)? b_ob: b_ob_data;
+	BL::Material material_override = render_layer.material_override;

 	/* find shader indices */
 	vector<uint> used_shaders;
@ -246,6 +247,8 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool holdout, bool object_updated)
 	if(used_shaders.size() == 0) {
 		if(holdout)
 			used_shaders.push_back(scene->default_holdout);
+		else if(material_override)
+			find_shader(material_override, used_shaders, scene->default_surface);
 		else
 			used_shaders.push_back(scene->default_surface);
 	}
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@ -218,12 +218,13 @@ void BlenderSession::render()
 		scene->film->passes = passes;
 		scene->film->tag_update(scene);

-		/* update session */
-		session->reset(buffer_params, session_params.samples);
-
 		/* update scene */
 		sync->sync_data(b_v3d, b_iter->name().c_str());

+		/* update session */
+		int samples = sync->get_layer_samples();
+		session->reset(buffer_params, (samples == 0)? session_params.samples: samples);
+
 		/* render */
 		session->start();
 		session->wait();
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@ -153,6 +153,8 @@ void BlenderSync::sync_integrator()
 	integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");

 	integrator->no_caustics = get_boolean(cscene, "no_caustics");
+	integrator->filter_glossy = get_float(cscene, "blur_glossy");
+
 	integrator->seed = get_int(cscene, "seed");

 	integrator->layer_flag = render_layer.layer;
@ -208,6 +210,7 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
 			render_layer.holdout_layer = 0;
 			render_layer.material_override = PointerRNA_NULL;
 			render_layer.use_background = true;
+			render_layer.samples = 0;
 			return;
 		}
 	}
@ -220,12 +223,13 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
 	for(r.layers.begin(b_rlay); b_rlay != r.layers.end(); ++b_rlay) {
 		if((!layer && first_layer) || (layer && b_rlay->name() == layer)) {
 			render_layer.name = b_rlay->name();
-			render_layer.scene_layer = get_layer(b_scene.layers());
+			render_layer.scene_layer = get_layer(b_scene.layers()) & ~get_layer(b_rlay->layers_exclude());
 			render_layer.layer = get_layer(b_rlay->layers());
 			render_layer.holdout_layer = get_layer(b_rlay->layers_zmask());
 			render_layer.layer |= render_layer.holdout_layer;
 			render_layer.material_override = b_rlay->material_override();
 			render_layer.use_background = b_rlay->use_sky();
+			render_layer.samples = b_rlay->samples();
 		}

 		first_layer = false;
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@ -57,6 +57,7 @@ public:
 	void sync_data(BL::SpaceView3D b_v3d, const char *layer = 0);
 	void sync_camera(BL::Object b_override, int width, int height);
 	void sync_view(BL::SpaceView3D b_v3d, BL::RegionView3D b_rv3d, int width, int height);
+	int get_layer_samples() { return render_layer.samples; }

 	/* get parameters */
 	static SceneParams get_scene_params(BL::Scene b_scene, bool background);
@ -108,7 +109,8 @@ private:
 		RenderLayerInfo()
 		: scene_layer(0), layer(0), holdout_layer(0),
 		  material_override(PointerRNA_NULL),
-		  use_background(true)
+		  use_background(true),
+		  samples(0)
 		{}

 		string name;
@ -117,6 +119,7 @@ private:
 		uint holdout_layer;
 		BL::Material material_override;
 		bool use_background;
+		int samples;
 	} render_layer;
 };

--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@ -10,17 +10,21 @@ set(INC

 set(SRC
 	bvh.cpp
+	bvh_binning.cpp
 	bvh_build.cpp
 	bvh_node.cpp
 	bvh_sort.cpp
+	bvh_split.cpp
 )

 set(SRC_HEADERS
 	bvh.h
+	bvh_binning.h
 	bvh_build.h
 	bvh_node.h
 	bvh_params.h
 	bvh_sort.h
+	bvh_split.h
 )

 include_directories(${INC})
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@ -530,7 +530,7 @@ void RegularBVH::refit_nodes()
 {
 	assert(!params.top_level);

-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;
 	uint visibility = 0;
 	refit_node(0, (pack.is_leaf[0])? true: false, bbox, visibility);
 }
@ -572,7 +572,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 	}
 	else {
 		/* refit inner node, set bbox from children */
-		BoundBox bbox0, bbox1;
+		BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty;
 		uint visibility0 = 0, visibility1 = 0;

 		refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0);
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@ -0,0 +1,223 @@
+/*
+ * Adapted from code copyright 2009-2011 Intel Corporation
+ * Modifications Copyright 2012, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//#define __KERNEL_SSE__
+
+#include <stdlib.h>
+
+#include "bvh_binning.h"
+
+#include "util_algorithm.h"
+#include "util_boundbox.h"
+#include "util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* SSE replacements */
+
+/* No-op stand-ins for cache prefetch hints: the pointer is ignored and never
+ * dereferenced, so callers may pass addresses outside the array. */
+__forceinline void prefetch_L1 (const void* ptr) { }
+__forceinline void prefetch_L2 (const void* ptr) { }
+__forceinline void prefetch_L3 (const void* ptr) { }
+__forceinline void prefetch_NTA(const void* ptr) { }
+
+/* Returns component `src` of an int4.
+ * NOTE(review): the return type is float although the component is an int and
+ * the callers in this file store the result in an int - confirm this is intended. */
+template<size_t src> __forceinline float extract(const int4& b)
+{ return b[src]; }
+/* Returns a copy of `a` with component `dst` replaced by `b`. */
+template<size_t dst> __forceinline const float4 insert(const float4& a, const float b)
+{ float4 r = a; r[dst] = b; return r; }
+
+/* Returns the index (0 = x, 1 = y, 2 = z) of the smallest of the first three
+ * components of bestSAH; on ties the lowest index wins. The w component is
+ * ignored. Scalar replacement for the SSE form
+ * (int)__bsf(movemask(reduce_min(bestSAH) == bestSAH)). */
+__forceinline int get_best_dimension(const float4& bestSAH)
+{
+	const float sah[3] = {bestSAH.x, bestSAH.y, bestSAH.z};
+	const float smallest = min(sah[0], min(sah[1], sah[2]));
+
+	/* z is the fallback when neither x nor y matches the minimum */
+	for(int axis = 0; axis < 2; axis++) {
+		if(sah[axis] == smallest)
+			return axis;
+	}
+
+	return 2;
+}
+
+/* BVH Object Binning */
+
+/* Bins the primitive references of `job` by centroid along x, y and z and
+ * records the best object split found: its SAH cost in splitSAH, its axis in
+ * dim and its bin boundary in pos. The cost of keeping all primitives in one
+ * leaf is stored in leafSAH.
+ *
+ * job:   range (bounds, centroid bounds, start, size) to examine.
+ * prims: reference array that the range indexes into; it is only read here. */
+BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims)
+: BVHRange(job), splitSAH(FLT_MAX), dim(0), pos(0)
+{
+	/* compute number of bins to use and precompute scaling factor for binning */
+	/* bin count is 4 plus 5% of the primitive count, capped at MAX_BINS */
+	num_bins = min(size_t(MAX_BINS), size_t(4.0f + 0.05f*size()));
+	scale = rcp(cent_bounds().size()) * make_float3((float)num_bins);
+
+	/* initialize binning counter and bounds */
+	/* only entries [0..2] of the inner dimension are initialized and used below */
+	BoundBox bin_bounds[MAX_BINS][4];	/* bounds for every bin in every dimension */
+	int4 bin_count[MAX_BINS];			/* number of primitives mapped to bin */
+
+	for(size_t i = 0; i < num_bins; i++) {
+		bin_count[i] = make_int4(0);
+		bin_bounds[i][0] = bin_bounds[i][1] = bin_bounds[i][2] = BoundBox::empty;
+	}
+
+	/* map geometry to bins, unrolled once */
+	{
+		ssize_t i;
+
+		for(i = 0; i < ssize_t(size()) - 1; i += 2) {
+			prefetch_L2(&prims[start() + i + 8]);
+
+			/* map even and odd primitive to bin */
+			BVHReference prim0 = prims[start() + i + 0];
+			BVHReference prim1 = prims[start() + i + 1];
+
+			int4 bin0 = get_bin(prim0.bounds());
+			int4 bin1 = get_bin(prim1.bounds());
+
+			/* increase bounds for bins for even primitive */
+			int b00 = extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
+			int b01 = extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
+			int b02 = extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
+
+			/* increase bounds of bins for odd primitive */
+			int b10 = extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(prim1.bounds());
+			int b11 = extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(prim1.bounds());
+			int b12 = extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(prim1.bounds());
+		}
+
+		/* for uneven number of primitives */
+		if(i < ssize_t(size())) {
+			/* map primitive to bin */
+			BVHReference prim0 = prims[start() + i];
+			int4 bin0 = get_bin(prim0.bounds());
+
+			/* increase bounds of bins */
+			int b00 = extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
+			int b01 = extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
+			int b02 = extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
+		}
+	}
+
+	/* sweep from right to left and compute parallel prefix of merged bounds */
+	/* r_area[i] / r_count[i] describe bins i..num_bins-1; index 0 is never
+	 * written, and only components 0..2 of r_area are written */
+	float4 r_area[MAX_BINS];	/* area of bounds of primitives on the right */
+	float4 r_count[MAX_BINS];	/* number of primitives on the right */
+	int4 count = make_int4(0);
+
+	BoundBox bx = BoundBox::empty;
+	BoundBox by = BoundBox::empty;
+	BoundBox bz = BoundBox::empty;
+
+	for(size_t i = num_bins - 1; i > 0; i--) {
+		count = count + bin_count[i];
+		r_count[i] = blocks(count);
+
+		bx = merge(bx,bin_bounds[i][0]); r_area[i][0] = bx.half_area();
+		by = merge(by,bin_bounds[i][1]); r_area[i][1] = by.half_area();
+		bz = merge(bz,bin_bounds[i][2]); r_area[i][2] = bz.half_area();
+	}
+
+	/* sweep from left to right and compute SAH */
+	/* ii mirrors the loop index i in every component, so bestSplit records
+	 * the bin boundary at which each axis reached its lowest cost */
+	int4 ii = make_int4(1);
+	float4 bestSAH = make_float4(FLT_MAX);
+	int4 bestSplit = make_int4(-1);
+
+	count = make_int4(0);
+
+	bx = BoundBox::empty;
+	by = BoundBox::empty;
+	bz = BoundBox::empty;
+
+	for(size_t i = 1; i < num_bins; i++, ii += make_int4(1)) {
+		count = count + bin_count[i-1];
+
+		bx = merge(bx,bin_bounds[i-1][0]); float Ax = bx.half_area();
+		by = merge(by,bin_bounds[i-1][1]); float Ay = by.half_area();
+		bz = merge(bz,bin_bounds[i-1][2]); float Az = bz.half_area();
+
+		float4 lCount = blocks(count);
+		float4 lArea = make_float4(Ax,Ay,Az,Az);
+		/* cost of splitting between bin i-1 and bin i: left area * left blocks
+		 * plus right area * right blocks, per axis */
+		float4 sah = lArea*lCount + r_area[i]*r_count[i];
+
+		bestSplit = select(sah < bestSAH,ii,bestSplit);
+		bestSAH = min(sah,bestSAH);
+	}
+
+	/* axes along which the centroid bounds have no extent cannot separate the
+	 * primitives, so their cost is forced to FLT_MAX; the fourth component is
+	 * set to FLT_MAX as well */
+	int4 mask = float3_to_float4(cent_bounds().size()) <= make_float4(0.0f);
+	bestSAH = insert<3>(select(mask, make_float4(FLT_MAX), bestSAH), FLT_MAX);
+
+	/* find best dimension */
+	dim = get_best_dimension(bestSAH);
+	splitSAH = bestSAH[dim];
+	pos = bestSplit[dim];
+	leafSAH	= bounds().half_area() * blocks(size());
+}
+
+/* Partitions this range's references in place around the split chosen by the
+ * constructor (axis `dim`, bin boundary `pos`) and constructs the binning
+ * objects for the two halves.
+ *
+ * prims:   reference array; entries in [start(), start() + size()) are reordered.
+ * left_o:  receives the binning of the references whose centroid bin is < pos.
+ * right_o: receives the binning of the remaining references.
+ *
+ * If the partition leaves one side empty, the range is instead cut in the
+ * middle of the array. */
+void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const
+{
+	size_t N = size();
+
+	BoundBox lgeom_bounds = BoundBox::empty;
+	BoundBox rgeom_bounds = BoundBox::empty;
+	BoundBox lcent_bounds = BoundBox::empty;
+	BoundBox rcent_bounds = BoundBox::empty;
+
+	/* l is the next unclassified element, r the last one; everything before l
+	 * belongs to the left side, everything after r to the right side */
+	ssize_t l = 0, r = N-1;
+
+	while(l <= r) {
+		prefetch_L2(&prims[start() + l + 8]);
+		prefetch_L2(&prims[start() + r - 8]);
+
+		BVHReference prim = prims[start() + l];
+		float3 center = prim.bounds().center2();
+
+		if(get_bin(center)[dim] < pos) {
+			lgeom_bounds.grow(prim.bounds());
+			lcent_bounds.grow(center);
+			l++;
+		}
+		else {
+			/* move the element to the right side; the element swapped into
+			 * slot l is examined in the next iteration */
+			rgeom_bounds.grow(prim.bounds());
+			rcent_bounds.grow(center);
+			swap(prims[start()+l],prims[start()+r]);
+			r--;
+		}
+	}
+
+	/* finish */
+	/* l elements ended up on the left, N-1-r on the right */
+	if(l != 0 && N-1-r != 0) {
+		right_o = BVHObjectBinning(BVHRange(rgeom_bounds, rcent_bounds, start() + l, N-1-r), prims);
+		left_o  = BVHObjectBinning(BVHRange(lgeom_bounds, lcent_bounds, start(), l), prims);
+		return;
+	}
+
+	/* object median split if we did not make progress, can happen when all
+	   primitives have same centroid */
+	lgeom_bounds = BoundBox::empty;
+	rgeom_bounds = BoundBox::empty;
+	lcent_bounds = BoundBox::empty;
+	rcent_bounds = BoundBox::empty;
+
+	/* first N/2 references go left */
+	for(size_t i = 0; i < N/2; i++) {
+		lgeom_bounds.grow(prims[start()+i].bounds());
+		lcent_bounds.grow(prims[start()+i].bounds().center2());
+	}
+
+	/* the remaining N/2 + N%2 references go right */
+	for(size_t i = N/2; i < N; i++) {
+		rgeom_bounds.grow(prims[start()+i].bounds());
+		rcent_bounds.grow(prims[start()+i].bounds().center2());
+	}
+
+	right_o = BVHObjectBinning(BVHRange(rgeom_bounds, rcent_bounds, start() + N/2, N/2 + N%2), prims);
+	left_o  = BVHObjectBinning(BVHRange(lgeom_bounds, lcent_bounds, start(), N/2), prims);
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@ -0,0 +1,86 @@
+/*
+ * Adapted from code copyright 2009-2011 Intel Corporation
+ * Modifications Copyright 2012, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_BINNING_H__
+#define __BVH_BINNING_H__
+
+#include "bvh_params.h"
+
+#include "util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Single threaded object binner. Finds the split with the best SAH heuristic
+ * by testing for each dimension multiple partitionings for regular spaced
+ * partition locations. A partitioning for a partition location is computed,
+ * by putting primitives whose centroid is on the left and right of the split
+ * location to different sets. The SAH is evaluated by computing the number of
+ * blocks occupied by the primitives in the partitions. */
+
+/* A BVHRange extended with the result of binned SAH evaluation: the cost of
+ * the best object split (splitSAH), the cost of a leaf (leafSAH) and the
+ * split itself (dim, pos), which split() applies to the reference array. */
+class BVHObjectBinning : public BVHRange
+{
+public:
+	/* leaves every member uninitialized; the object is meant to be assigned to */
+	__forceinline BVHObjectBinning() {}
+	/* evaluates the binning for the range `job` over the references in `prims` */
+	BVHObjectBinning(const BVHRange& job, BVHReference *prims);
+
+	/* partitions the references in place and fills in both child binnings */
+	void split(BVHReference *prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const;
+
+	float splitSAH;	/* SAH cost of the best split */
+	float leafSAH;	/* SAH cost of creating a leaf */
+
+protected:
+	int dim;			/* best split dimension */
+	int pos;			/* best split position */
+	size_t num_bins;	/* actual number of bins to use */
+	float3 scale;		/* scaling factor to compute bin */
+
+	enum { MAX_BINS = 32 };
+	enum { LOG_BLOCK_SIZE = 2 };
+
+	/* computes the bin numbers for each dimension for a box. */
+	/* the result is clamped to [0, num_bins-1], so it is safe as an array index */
+	__forceinline int4 get_bin(const BoundBox& box) const
+	{
+		int4 a = make_int4((box.center2() - cent_bounds().min)*scale - make_float3(0.5f));
+		int4 mn = make_int4(0);
+		int4 mx = make_int4((int)num_bins-1);
+
+		return clamp(a, mn, mx);
+	}
+
+	/* computes the bin numbers for each dimension for a point. */
+	/* unlike the box overload this one does not clamp the result */
+	__forceinline int4 get_bin(const float3& c) const
+	{
+		return make_int4((c - cent_bounds().min)*scale - make_float3(0.5f));
+	}
+
+	/* compute the number of blocks occupied for each dimension. */
+	/* i.e. the count divided by (1 << LOG_BLOCK_SIZE), rounded up */
+	__forceinline float4 blocks(const int4& a) const
+	{
+		return make_float4((a + make_int4((1 << LOG_BLOCK_SIZE)-1)) >> LOG_BLOCK_SIZE);
+	}
+
+	/* compute the number of blocks occupied in one dimension. */
+	__forceinline int blocks(size_t a) const
+	{
+		return (int)((a+((1LL << LOG_BLOCK_SIZE)-1)) >> LOG_BLOCK_SIZE);
+	}
+};
+
+CCL_NAMESPACE_END
+
+#endif
+
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@ -15,22 +15,36 @@
 * limitations under the License.
 */

+#include "bvh_binning.h"
 #include "bvh_build.h"
 #include "bvh_node.h"
 #include "bvh_params.h"
-#include "bvh_sort.h"
+#include "bvh_split.h"

 #include "mesh.h"
 #include "object.h"
 #include "scene.h"

-#include "util_algorithm.h"
+#include "util_debug.h"
 #include "util_foreach.h"
 #include "util_progress.h"
 #include "util_time.h"

 CCL_NAMESPACE_BEGIN

+/* BVH Build Task */
+
+/* Work item for the build task pool: describes one subtree to build and the
+ * slot of the parent inner node that the finished subtree is stored in. */
+class BVHBuildTask : public Task {
+public:
+	/* note that the parameter order (range_, level_) differs from the member order */
+	BVHBuildTask(InnerNode *node_, int child_, BVHObjectBinning& range_, int level_)
+	: node(node_), child(child_), level(level_), range(range_) {}
+
+	InnerNode *node;	/* parent node whose children[child] receives the result */
+	int child;	/* index into node->children */
+	int level;	/* level passed on to build_node for this subtree */
+	BVHObjectBinning range;	/* own copy of the binned range to build from */
+};
+
 /* Constructor / Destructor */

 BVHBuild::BVHBuild(const vector<Object*>& objects_,
@ -41,10 +55,10 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_,
  prim_object(prim_object_),
  params(params_),
  progress(progress_),
-  progress_start_time(0.0)
+  progress_start_time(0.0),
+  task_pool(function_bind(&BVHBuild::thread_build_node, this, _1, _2))
 {
 	spatial_min_overlap = 0.0f;
-	progress_num_duplicates = 0;
 }

 BVHBuild::~BVHBuild()
@ -53,57 +67,63 @@ BVHBuild::~BVHBuild()

 /* Adding References */

-void BVHBuild::add_reference_mesh(NodeSpec& root, Mesh *mesh, int i)
+void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
 {
 	for(uint j = 0; j < mesh->triangles.size(); j++) {
 		Mesh::Triangle t = mesh->triangles[j];
-		Reference ref;
+		BoundBox bounds = BoundBox::empty;

 		for(int k = 0; k < 3; k++) {
 			float3 pt = mesh->verts[t.v[k]];
-			ref.bounds.grow(pt);
+			bounds.grow(pt);
 		}

-		if(ref.bounds.valid()) {
-			ref.prim_index = j;
-			ref.prim_object = i;
-
-			references.push_back(ref);
-			root.bounds.grow(ref.bounds);
+		if(bounds.valid()) {
+			references.push_back(BVHReference(bounds, j, i));
+			root.grow(bounds);
+			center.grow(bounds.center2());
 		}
 	}
 }

-void BVHBuild::add_reference_object(NodeSpec& root, Object *ob, int i)
+void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i)
 {
-	Reference ref;
-
-	ref.prim_index = -1;
-	ref.prim_object = i;
-	ref.bounds = ob->bounds;
-
-	references.push_back(ref);
-	root.bounds.grow(ref.bounds);
+	references.push_back(BVHReference(ob->bounds, -1, i));
+	root.grow(ob->bounds);
+	center.grow(ob->bounds.center2());
 }

-void BVHBuild::add_references(NodeSpec& root)
+void BVHBuild::add_references(BVHRange& root)
 {
-	/* init root spec */
-	root.num = 0;
-	root.bounds = BoundBox();
+	/* reserve space for references */
+	size_t num_alloc_references = 0;

-	/* add objects */
+	foreach(Object *ob, objects) {
+		if(params.top_level) {
+			if(ob->mesh->transform_applied)
+				num_alloc_references += ob->mesh->triangles.size();
+			else
+				num_alloc_references++;
+		}
+		else
+			num_alloc_references += ob->mesh->triangles.size();
+	}
+
+	references.reserve(num_alloc_references);
+
+	/* add references from objects */
+	BoundBox bounds = BoundBox::empty, center = BoundBox::empty;
 	int i = 0;

 	foreach(Object *ob, objects) {
 		if(params.top_level) {
 			if(ob->mesh->transform_applied)
-				add_reference_mesh(root, ob->mesh, i);
+				add_reference_mesh(bounds, center, ob->mesh, i);
 			else
-				add_reference_object(root, ob, i);
+				add_reference_object(bounds, center, ob, i);
 		}
 		else
-			add_reference_mesh(root, ob->mesh, i);
+			add_reference_mesh(bounds, center, ob->mesh, i);

 		i++;

@ -111,45 +131,78 @@ void BVHBuild::add_references(NodeSpec& root)
 	}

 	/* happens mostly on empty meshes */
-	if(!root.bounds.valid())
-		root.bounds.grow(make_float3(0.0f, 0.0f, 0.0f));
+	if(!bounds.valid())
+		bounds.grow(make_float3(0.0f, 0.0f, 0.0f));

-	root.num = references.size();
+	root = BVHRange(bounds, center, 0, references.size());
 }

 /* Build */

 BVHNode* BVHBuild::run()
 {
-	NodeSpec root;
+	BVHRange root;

 	/* add references */
 	add_references(root);

-	if(progress.get_cancel()) return NULL;
+	if(progress.get_cancel())
+		return NULL;

 	/* init spatial splits */
 	if(params.top_level) /* todo: get rid of this */
 		params.use_spatial_split = false;

-	spatial_min_overlap = root.bounds.area() * params.spatial_split_alpha;
+	spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
 	spatial_right_bounds.clear();
-	spatial_right_bounds.resize(max(root.num, (int)BVHParams::NUM_SPATIAL_BINS) - 1);
+	spatial_right_bounds.resize(max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1);

 	/* init progress updates */
-	progress_num_duplicates = 0;
 	progress_start_time = time_dt();
+	progress_count = 0;
+	progress_total = references.size();
+	progress_original_total = progress_total;
+
+	prim_index.resize(references.size());
+	prim_object.resize(references.size());

 	/* build recursively */
-	return build_node(root, 0, 0.0f, 1.0f);
+	BVHNode *rootnode;
+
+	if(params.use_spatial_split) {
+		/* singlethreaded spatial split build */
+		rootnode = build_node(root, 0);
+	}
+	else {
+		/* multithreaded binning build */
+		BVHObjectBinning rootbin(root, &references[0]);
+		rootnode = build_node(rootbin, 0);
+		task_pool.wait();
+	}
+
+	/* delete if we cancelled */
+	if(rootnode) {
+		if(progress.get_cancel()) {
+			rootnode->deleteSubtree();
+			rootnode = NULL;
+		}
+		else if(!params.use_spatial_split) {
+			/*rotate(rootnode, 4, 5);*/
+			rootnode->update_visibility();
+		}
+	}
+
+	return rootnode;
 }

-void BVHBuild::progress_update(float progress_start, float progress_end)
+void BVHBuild::progress_update()
 {
 	if(time_dt() - progress_start_time < 0.25f)
 		return;
 	
-	float duplicates = (float)progress_num_duplicates/(float)references.size();
+	double progress_start = (double)progress_count/(double)progress_total;
+	double duplicates = (double)(progress_total - progress_original_total)/(double)progress_total;
+
 	string msg = string_printf("Building BVH %.0f%%, duplicates %.0f%%",
 		progress_start*100.0f, duplicates*100.0f);

@ -157,83 +210,134 @@ void BVHBuild::progress_update(float progress_start, float progress_end)
 	progress_start_time = time_dt(); 
 }

-BVHNode* BVHBuild::build_node(const NodeSpec& spec, int level, float progress_start, float progress_end)
+void BVHBuild::thread_build_node(Task *task_, int thread_id)
 {
-	/* progress update */
-	progress_update(progress_start, progress_end);
-	if(progress.get_cancel()) return NULL;
+	if(progress.get_cancel())
+		return;

-	/* small enough or too deep => create leaf. */
-	if(spec.num <= params.min_leaf_size || level >= BVHParams::MAX_DEPTH)
-		return create_leaf_node(spec);
+	/* build nodes */
+	BVHBuildTask *task = (BVHBuildTask*)task_;
+	BVHNode *node = build_node(task->range, task->level);

-	/* find split candidates. */
-	float area = spec.bounds.area();
-	float leafSAH = area * params.triangle_cost(spec.num);
-	float nodeSAH = area * params.node_cost(2);
-	ObjectSplit object = find_object_split(spec, nodeSAH);
-	SpatialSplit spatial;
+	/* set child in inner node */
+	task->node->children[task->child] = node;

-	if(params.use_spatial_split && level < BVHParams::MAX_SPATIAL_DEPTH) {
-		BoundBox overlap = object.left_bounds;
-		overlap.intersect(object.right_bounds);
+	/* update progress */
+	if(task->range.size() < THREAD_TASK_SIZE) {
+		/*rotate(node, INT_MAX, 5);*/

-		if(overlap.area() >= spatial_min_overlap)
-			spatial = find_spatial_split(spec, nodeSAH);
+		thread_scoped_lock lock(build_mutex);
+
+		progress_count += task->range.size();
+		progress_update();
 	}
-
-	/* leaf SAH is the lowest => create leaf. */
-	float minSAH = min(min(leafSAH, object.sah), spatial.sah);
-
-	if(minSAH == leafSAH && spec.num <= params.max_leaf_size)
-		return create_leaf_node(spec);
-
-	/* perform split. */
-	NodeSpec left, right;
-
-	if(params.use_spatial_split && minSAH == spatial.sah)
-		do_spatial_split(left, right, spec, spatial);
-	if(!left.num || !right.num)
-		do_object_split(left, right, spec, object);
-
-	/* create inner node. */
-	progress_num_duplicates += left.num + right.num - spec.num;
-
-	float progress_mid = lerp(progress_start, progress_end, (float)right.num / (float)(left.num + right.num));
-
-	BVHNode* rightNode = build_node(right, level + 1, progress_start, progress_mid);
-	if(progress.get_cancel()) {
-		if(rightNode) rightNode->deleteSubtree();
-		return NULL;
-	}
-
-	BVHNode* leftNode = build_node(left, level + 1, progress_mid, progress_end);
-	if(progress.get_cancel()) {
-		if(leftNode) leftNode->deleteSubtree();
-		return NULL;
-	}
-
-	return new InnerNode(spec.bounds, leftNode, rightNode);
 }

-BVHNode *BVHBuild::create_object_leaf_nodes(const Reference *ref, int num)
+/* multithreaded binning builder */
+BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level)
+{
+	size_t size = range.size();
+	float leafSAH = params.sah_triangle_cost * range.leafSAH;
+	float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_triangle_cost * range.splitSAH;
+
+	/* make leaf node when threshold reached or SAH tells us */
+	if(params.small_enough_for_leaf(size, level) || (size <= params.max_leaf_size && leafSAH < splitSAH))
+		return create_leaf_node(range);
+
+	/* perform split */
+	BVHObjectBinning left, right;
+	range.split(&references[0], left, right);
+
+	/* create inner node. */
+	InnerNode *inner;
+
+	if(range.size() < THREAD_TASK_SIZE) {
+		/* local build */
+		BVHNode *leftnode = build_node(left, level + 1);
+		BVHNode *rightnode = build_node(right, level + 1);
+
+		inner = new InnerNode(range.bounds(), leftnode, rightnode);
+	}
+	else {
+		/* threaded build */
+		inner = new InnerNode(range.bounds());
+
+		task_pool.push(new BVHBuildTask(inner, 0, left, level + 1), true);
+		task_pool.push(new BVHBuildTask(inner, 1, right, level + 1), true);
+	}
+
+	return inner;
+}
+
+/* single threaded spatial split builder */
+BVHNode* BVHBuild::build_node(const BVHRange& range, int level)
+{
+	/* progress update */
+	progress_update();
+	if(progress.get_cancel())
+		return NULL;
+
+	/* small enough or too deep => create leaf. */
+	if(params.small_enough_for_leaf(range.size(), level)) {
+		progress_count += range.size();
+		return create_leaf_node(range);
+	}
+
+	/* splitting test */
+	BVHMixedSplit split(this, range, level);
+
+	if(split.no_split) {
+		progress_count += range.size();
+		return create_leaf_node(range);
+	}
+	
+	/* do split */
+	BVHRange left, right;
+	split.split(this, left, right, range);
+
+	progress_total += left.size() + right.size() - range.size();
+	size_t total = progress_total;
+
+	/* left node */
+	BVHNode *leftnode = build_node(left, level + 1);
+
+	/* right node (modify start for splits) */
+	right.set_start(right.start() + progress_total - total);
+	BVHNode *rightnode = build_node(right, level + 1);
+
+	/* inner node */
+	return new InnerNode(range.bounds(), leftnode, rightnode);
+}
+
+/* Create Nodes */
+
+BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, int num)
 {
 	if(num == 0) {
-		BoundBox bounds;
+		BoundBox bounds = BoundBox::empty;
 		return new LeafNode(bounds, 0, 0, 0);
 	}
 	else if(num == 1) {
-		prim_index.push_back(ref[0].prim_index);
-		prim_object.push_back(ref[0].prim_object);
-		uint visibility = objects[ref[0].prim_object]->visibility;
-		return new LeafNode(ref[0].bounds, visibility, prim_index.size()-1, prim_index.size());
+		if(start == prim_index.size()) {
+			assert(params.use_spatial_split);
+
+			prim_index.push_back(ref->prim_index());
+			prim_object.push_back(ref->prim_object());
+		}
+		else {
+			prim_index[start] = ref->prim_index();
+			prim_object[start] = ref->prim_object();
+		}
+
+		uint visibility = objects[ref->prim_object()]->visibility;
+		return new LeafNode(ref->bounds(), visibility, start, start+1);
 	}
 	else {
 		int mid = num/2;
-		BVHNode *leaf0 = create_object_leaf_nodes(ref, mid); 
-		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, num-mid); 
+		BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid); 
+		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid); 

-		BoundBox bounds;
+		BoundBox bounds = BoundBox::empty;
 		bounds.grow(leaf0->m_bounds);
 		bounds.grow(leaf1->m_bounds);

@ -241,310 +345,136 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const Reference *ref, int num)
 	}
 }

-BVHNode* BVHBuild::create_leaf_node(const NodeSpec& spec)
+BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
 {
 	vector<int>& p_index = prim_index;
 	vector<int>& p_object = prim_object;
-	BoundBox bounds;
-	int num = 0;
+	BoundBox bounds = BoundBox::empty;
+	int num = 0, ob_num = 0;
 	uint visibility = 0;

-	for(int i = 0; i < spec.num; i++) {
-		if(references.back().prim_index != -1) {
-			p_index.push_back(references.back().prim_index);
-			p_object.push_back(references.back().prim_object);
-			bounds.grow(references.back().bounds);
-			visibility |= objects[references.back().prim_object]->visibility;
-			references.pop_back();
+	for(int i = 0; i < range.size(); i++) {
+		BVHReference& ref = references[range.start() + i];
+
+		if(ref.prim_index() != -1) {
+			if(range.start() + num == prim_index.size()) {
+				assert(params.use_spatial_split);
+
+				p_index.push_back(ref.prim_index());
+				p_object.push_back(ref.prim_object());
+			}
+			else {
+				p_index[range.start() + num] = ref.prim_index();
+				p_object[range.start() + num] = ref.prim_object();
+			}
+
+			bounds.grow(ref.bounds());
+			visibility |= objects[ref.prim_object()]->visibility;
 			num++;
 		}
+		else {
+			if(ob_num < i)
+				references[range.start() + ob_num] = ref;
+			ob_num++;
+		}
 	}

 	BVHNode *leaf = NULL;
 	
 	if(num > 0) {
-		leaf = new LeafNode(bounds, visibility, p_index.size() - num, p_index.size());
+		leaf = new LeafNode(bounds, visibility, range.start(), range.start() + num);

-		if(num == spec.num)
+		if(num == range.size())
 			return leaf;
 	}

 	/* while there may be multiple triangles in a leaf, for object primitives
-	 * we want them to be the only one, so we  */
-	int ob_num = spec.num - num;
-	const Reference *ref = (ob_num)? &references.back() - (ob_num - 1): NULL;
-	BVHNode *oleaf = create_object_leaf_nodes(ref, ob_num);
-	for(int i = 0; i < ob_num; i++)
-		references.pop_back();
+	 * we want each to be the only one in its leaf, so we keep splitting */
+	const BVHReference *ref = (ob_num)? &references[range.start()]: NULL;
+	BVHNode *oleaf = create_object_leaf_nodes(ref, range.start() + num, ob_num);
 	
 	if(leaf)
-		return new InnerNode(spec.bounds, leaf, oleaf);
+		return new InnerNode(range.bounds(), leaf, oleaf);
 	else
 		return oleaf;
 }

-/* Object Split */
+/* Tree Rotations */

-BVHBuild::ObjectSplit BVHBuild::find_object_split(const NodeSpec& spec, float nodeSAH)
+void BVHBuild::rotate(BVHNode *node, int max_depth, int iterations)
 {
-	ObjectSplit split;
-	const Reference *ref_ptr = &references[references.size() - spec.num];
-
-	for(int dim = 0; dim < 3; dim++) {
-		/* sort references */
-		bvh_reference_sort(references.size() - spec.num, references.size(), &references[0], dim);
-
-		/* sweep right to left and determine bounds. */
-		BoundBox right_bounds;
-
-		for(int i = spec.num - 1; i > 0; i--) {
-			right_bounds.grow(ref_ptr[i].bounds);
-			spatial_right_bounds[i - 1] = right_bounds;
-		}
-
-		/* sweep left to right and select lowest SAH. */
-		BoundBox left_bounds;
-
-		for(int i = 1; i < spec.num; i++) {
-			left_bounds.grow(ref_ptr[i - 1].bounds);
-			right_bounds = spatial_right_bounds[i - 1];
-
-			float sah = nodeSAH +
-				left_bounds.area() * params.triangle_cost(i) +
-				right_bounds.area() * params.triangle_cost(spec.num - i);
-
-			if(sah < split.sah) {
-				split.sah = sah;
-				split.dim = dim;
-				split.num_left = i;
-				split.left_bounds = left_bounds;
-				split.right_bounds = right_bounds;
-			}
-		}
-	}
-
-	return split;
+	/* in tested scenes, this resulted in slightly slower raytracing, so disabled
+	 * it for now. could be implementation bug, or depend on the scene */
+	if(node)
+		for(int i = 0; i < iterations; i++)
+			rotate(node, max_depth);
 }

-void BVHBuild::do_object_split(NodeSpec& left, NodeSpec& right, const NodeSpec& spec, const ObjectSplit& split)
+void BVHBuild::rotate(BVHNode *node, int max_depth)
 {
-	/* sort references according to split */
-	int start = references.size() - spec.num;
-	int end = references.size(); /* todo: is this right? */
+	/* nothing to rotate if we reached a leaf node. */
+	if(node->is_leaf() || max_depth < 0)
+		return;
 	
-	bvh_reference_sort(start, end, &references[0], split.dim);
+	InnerNode *parent = (InnerNode*)node;

-	/* split node specs */
-	left.num = split.num_left;
-	left.bounds = split.left_bounds;
-	right.num = spec.num - split.num_left;
-	right.bounds = split.right_bounds;
-}
+	/* rotate all children first */
+	for(size_t c = 0; c < 2; c++)
+		rotate(parent->children[c], max_depth-1);

-/* Spatial Split */
+	/* compute current area of all children */
+	BoundBox bounds0 = parent->children[0]->m_bounds;
+	BoundBox bounds1 = parent->children[1]->m_bounds;

-BVHBuild::SpatialSplit BVHBuild::find_spatial_split(const NodeSpec& spec, float nodeSAH)
-{
-	/* initialize bins. */
-	float3 origin = spec.bounds.min;
-	float3 binSize = (spec.bounds.max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS);
-	float3 invBinSize = 1.0f / binSize;
+	float area0 = bounds0.half_area();
+	float area1 = bounds1.half_area();
+	float4 child_area = make_float4(area0, area1, 0.0f, 0.0f);

-	for(int dim = 0; dim < 3; dim++) {
-		for(int i = 0; i < BVHParams::NUM_SPATIAL_BINS; i++) {
-			SpatialBin& bin = spatial_bins[dim][i];
+	/* find best rotation. we pick a target grandchild under one child, and swap
+	 * it with the other child. we perform the best such swap. */
+	float best_cost = FLT_MAX;
+	int best_child = -1, bets_target = -1, best_other = -1;

-			bin.bounds = BoundBox();
-			bin.enter = 0;
-			bin.exit = 0;
-		}
-	}
+	for(size_t c = 0; c < 2; c++) {
+		/* ignore leaf nodes as we cannot descend into them */
+		if(parent->children[c]->is_leaf())
+			continue;

-	/* chop references into bins. */
-	for(unsigned int refIdx = references.size() - spec.num; refIdx < references.size(); refIdx++) {
-		const Reference& ref = references[refIdx];
-		float3 firstBinf = (ref.bounds.min - origin) * invBinSize;
-		float3 lastBinf = (ref.bounds.max - origin) * invBinSize;
-		int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z);
-		int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z);
+		InnerNode *child = (InnerNode*)parent->children[c];
+		BoundBox& other = (c == 0)? bounds1: bounds0;

-		firstBin = clamp(firstBin, 0, BVHParams::NUM_SPATIAL_BINS - 1);
-		lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1);
+		/* transpose child bounds */
+		BoundBox target0 = child->children[0]->m_bounds;
+		BoundBox target1 = child->children[1]->m_bounds;

-		for(int dim = 0; dim < 3; dim++) {
-			Reference currRef = ref;
+		/* compute cost for both possible swaps */
+		float cost0 = merge(other, target1).half_area() - child_area[c];
+		float cost1 = merge(target0, other).half_area() - child_area[c];

-			for(int i = firstBin[dim]; i < lastBin[dim]; i++) {
-				Reference leftRef, rightRef;
+		if(min(cost0,cost1) < best_cost) {
+			best_child = (int)c;
+			best_other = (int)(1-c);

-				split_reference(leftRef, rightRef, currRef, dim, origin[dim] + binSize[dim] * (float)(i + 1));
-				spatial_bins[dim][i].bounds.grow(leftRef.bounds);
-				currRef = rightRef;
-			}
-
-			spatial_bins[dim][lastBin[dim]].bounds.grow(currRef.bounds);
-			spatial_bins[dim][firstBin[dim]].enter++;
-			spatial_bins[dim][lastBin[dim]].exit++;
-		}
-	}
-
-	/* select best split plane. */
-	SpatialSplit split;
-
-	for(int dim = 0; dim < 3; dim++) {
-		/* sweep right to left and determine bounds. */
-		BoundBox right_bounds;
-
-		for(int i = BVHParams::NUM_SPATIAL_BINS - 1; i > 0; i--) {
-			right_bounds.grow(spatial_bins[dim][i].bounds);
-			spatial_right_bounds[i - 1] = right_bounds;
-		}
-
-		/* sweep left to right and select lowest SAH. */
-		BoundBox left_bounds;
-		int leftNum = 0;
-		int rightNum = spec.num;
-
-		for(int i = 1; i < BVHParams::NUM_SPATIAL_BINS; i++) {
-			left_bounds.grow(spatial_bins[dim][i - 1].bounds);
-			leftNum += spatial_bins[dim][i - 1].enter;
-			rightNum -= spatial_bins[dim][i - 1].exit;
-
-			float sah = nodeSAH +
-				left_bounds.area() * params.triangle_cost(leftNum) +
-				spatial_right_bounds[i - 1].area() * params.triangle_cost(rightNum);
-
-			if(sah < split.sah) {
-				split.sah = sah;
-				split.dim = dim;
-				split.pos = origin[dim] + binSize[dim] * (float)i;
-			}
-		}
-	}
-
-	return split;
-}
-
-void BVHBuild::do_spatial_split(NodeSpec& left, NodeSpec& right, const NodeSpec& spec, const SpatialSplit& split)
-{
-	/* Categorize references and compute bounds.
-	 *
-	 * Left-hand side:			[left_start, left_end[
-	 * Uncategorized/split:		[left_end, right_start[
-	 * Right-hand side:			[right_start, refs.size()[ */
-
-	vector<Reference>& refs = references;
-	int left_start = refs.size() - spec.num;
-	int left_end = left_start;
-	int right_start = refs.size();
-
-	left.bounds = right.bounds = BoundBox();
-
-	for(int i = left_end; i < right_start; i++) {
-		if(refs[i].bounds.max[split.dim] <= split.pos) {
-			/* entirely on the left-hand side */
-			left.bounds.grow(refs[i].bounds);
-			swap(refs[i], refs[left_end++]);
-		}
-		else if(refs[i].bounds.min[split.dim] >= split.pos) {
-			/* entirely on the right-hand side */
-			right.bounds.grow(refs[i].bounds);
-			swap(refs[i--], refs[--right_start]);
-		}
-	}
-
-	/* duplicate or unsplit references intersecting both sides. */
-	while(left_end < right_start) {
-		/* split reference. */
-		Reference lref, rref;
-
-		split_reference(lref, rref, refs[left_end], split.dim, split.pos);
-
-		/* compute SAH for duplicate/unsplit candidates. */
-		BoundBox lub = left.bounds;		// Unsplit to left:		new left-hand bounds.
-		BoundBox rub = right.bounds;	// Unsplit to right:	new right-hand bounds.
-		BoundBox ldb = left.bounds;		// Duplicate:			new left-hand bounds.
-		BoundBox rdb = right.bounds;	// Duplicate:			new right-hand bounds.
-
-		lub.grow(refs[left_end].bounds);
-		rub.grow(refs[left_end].bounds);
-		ldb.grow(lref.bounds);
-		rdb.grow(rref.bounds);
-
-		float lac = params.triangle_cost(left_end - left_start);
-		float rac = params.triangle_cost(refs.size() - right_start);
-		float lbc = params.triangle_cost(left_end - left_start + 1);
-		float rbc = params.triangle_cost(refs.size() - right_start + 1);
-
-		float unsplitLeftSAH = lub.area() * lbc + right.bounds.area() * rac;
-		float unsplitRightSAH = left.bounds.area() * lac + rub.area() * rbc;
-		float duplicateSAH = ldb.area() * lbc + rdb.area() * rbc;
-		float minSAH = min(min(unsplitLeftSAH, unsplitRightSAH), duplicateSAH);
-
-		if(minSAH == unsplitLeftSAH) {
-			/* unsplit to left */
-			left.bounds = lub;
-			left_end++;
-		}
-		else if(minSAH == unsplitRightSAH) {
-			/* unsplit to right */
-			right.bounds = rub;
-			swap(refs[left_end], refs[--right_start]);
+			if(cost0 < cost1) {
+				best_cost = cost0;
+				bets_target = 0;
 			}
 			else {
-			/* duplicate */
-			left.bounds = ldb;
-			right.bounds = rdb;
-			refs[left_end++] = lref;
-			refs.push_back(rref);
+				best_cost = cost1; /* target 1 was chosen, so record its cost (not cost0) */
+				bets_target = 1;
+			}
 		}
 	}

-	left.num = left_end - left_start;
-	right.num = refs.size() - right_start;
-}
+	/* if we did not find a swap that improves the SAH then do nothing */
+	if(best_cost >= 0)
+		return;

-void BVHBuild::split_reference(Reference& left, Reference& right, const Reference& ref, int dim, float pos)
-{
-	/* initialize references. */
-	left.prim_index = right.prim_index = ref.prim_index;
-	left.prim_object = right.prim_object = ref.prim_object;
-	left.bounds = right.bounds = BoundBox();
+	/* perform the best found tree rotation */
+	InnerNode *child = (InnerNode*)parent->children[best_child];

-	/* loop over vertices/edges. */
-	Object *ob = objects[ref.prim_object];
-	const Mesh *mesh = ob->mesh;
-	const int *inds = mesh->triangles[ref.prim_index].v;
-	const float3 *verts = &mesh->verts[0];
-	const float3* v1 = &verts[inds[2]];
-
-	for(int i = 0; i < 3; i++) {
-		const float3* v0 = v1;
-		int vindex = inds[i];
-		v1 = &verts[vindex];
-		float v0p = (*v0)[dim];
-		float v1p = (*v1)[dim];
-
-		/* insert vertex to the boxes it belongs to. */
-		if(v0p <= pos)
-			left.bounds.grow(*v0);
-
-		if(v0p >= pos)
-			right.bounds.grow(*v0);
-
-		/* edge intersects the plane => insert intersection to both boxes. */
-		if((v0p < pos && v1p > pos) || (v0p > pos && v1p < pos)) {
-			float3 t = lerp(*v0, *v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
-			left.bounds.grow(t);
-			right.bounds.grow(t);
-		}
-	}
-
-	/* intersect with original bounds. */
-	left.bounds.max[dim] = pos;
-	right.bounds.min[dim] = pos;
-	left.bounds.intersect(ref.bounds);
-	right.bounds.intersect(ref.bounds);
+	swap(parent->children[best_other], child->children[bets_target]);
+	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@ -21,8 +21,10 @@
 #include <float.h>

 #include "bvh.h"
+#include "bvh_binning.h"

 #include "util_boundbox.h"
+#include "util_task.h"
 #include "util_vector.h"

 CCL_NAMESPACE_BEGIN
@ -37,28 +39,7 @@ class Progress;
 class BVHBuild
 {
 public:
-	struct Reference
-	{
-		int prim_index;
-		int prim_object;
-		BoundBox bounds;
-
-		Reference()
-		{
-		}
-	};
-
-	struct NodeSpec
-	{
-		int num;
-		BoundBox bounds;
-
-		NodeSpec()
-		{
-			num = 0;
-		}
-	};
-
+	/* Constructor/Destructor */
 	BVHBuild(
 		const vector<Object*>& objects,
 		vector<int>& prim_index,
@ -70,63 +51,37 @@ public:
 	BVHNode *run();

 protected:
+	friend class BVHMixedSplit;
+	friend class BVHObjectSplit;
+	friend class BVHSpatialSplit;
+
 	/* adding references */
-	void add_reference_mesh(NodeSpec& root, Mesh *mesh, int i);
-	void add_reference_object(NodeSpec& root, Object *ob, int i);
-	void add_references(NodeSpec& root);
+	void add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
+	void add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i);
+	void add_references(BVHRange& root);

 	/* building */
-	BVHNode *build_node(const NodeSpec& spec, int level, float progress_start, float progress_end);
-	BVHNode *create_leaf_node(const NodeSpec& spec);
-	BVHNode *create_object_leaf_nodes(const Reference *ref, int num);
+	BVHNode *build_node(const BVHRange& range, int level);
+	BVHNode *build_node(const BVHObjectBinning& range, int level);
+	BVHNode *create_leaf_node(const BVHRange& range);
+	BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);

-	void progress_update(float progress_start, float progress_end);
+	/* threads */
+	enum { THREAD_TASK_SIZE = 4096 };
+	void thread_build_node(Task *task_, int thread_id);
+	thread_mutex build_mutex;

-	/* object splits */
-	struct ObjectSplit
-	{
-		float sah;
-		int dim;
-		int num_left;
-		BoundBox left_bounds;
-		BoundBox right_bounds;
+	/* progress */
+	void progress_update();

-		ObjectSplit()
-		: sah(FLT_MAX), dim(0), num_left(0)
-		{
-		}
-	};
-
-	ObjectSplit find_object_split(const NodeSpec& spec, float nodeSAH);
-	void do_object_split(NodeSpec& left, NodeSpec& right, const NodeSpec& spec, const ObjectSplit& split);
-
-	/* spatial splits */
-	struct SpatialSplit
-	{
-		float sah;
-		int dim;
-		float pos;
-
-		SpatialSplit()
-		: sah(FLT_MAX), dim(0), pos(0.0f)
-		{
-		}
-	};
-
-	struct SpatialBin
-	{
-		BoundBox bounds;
-		int enter;
-		int exit;
-	};
-
-	SpatialSplit find_spatial_split(const NodeSpec& spec, float nodeSAH);
-	void do_spatial_split(NodeSpec& left, NodeSpec& right, const NodeSpec& spec, const SpatialSplit& split);
-	void split_reference(Reference& left, Reference& right, const Reference& ref, int dim, float pos);
+	/* tree rotations */
+	void rotate(BVHNode *node, int max_depth);
+	void rotate(BVHNode *node, int max_depth, int iterations);

 	/* objects and primitive references */
 	vector<Object*> objects;
-	vector<Reference> references;
+	vector<BVHReference> references;
+	int num_original_references;

 	/* output primitive indexes and objects */
 	vector<int>& prim_index;
@ -138,12 +93,17 @@ protected:
 	/* progress reporting */
 	Progress& progress;
 	double progress_start_time;
-	int progress_num_duplicates;
+	size_t progress_count;
+	size_t progress_total;
+	size_t progress_original_total;

 	/* spatial splitting */
 	float spatial_min_overlap;
 	vector<BoundBox> spatial_right_bounds;
-	SpatialBin spatial_bins[3][BVHParams::NUM_SPATIAL_BINS];
+	BVHSpatialBin spatial_bins[3][BVHParams::NUM_SPATIAL_BINS];
+
+	/* threads */
+	TaskPool task_pool;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@ -24,6 +24,8 @@

 CCL_NAMESPACE_BEGIN

+/* BVH Node */
+
 int BVHNode::getSubtreeSize(BVH_STAT stat) const
 {
 	int cnt = 0;
@ -59,6 +61,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 void BVHNode::deleteSubtree()
 {
 	for(int i=0;i<num_children();i++)
+		if(get_child(i))
 			get_child(i)->deleteSubtree();

 	delete this;
@ -70,12 +73,27 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons

 	for(int i=0;i<num_children();i++) {
 		BVHNode *child = get_child(i);
-		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.area()/m_bounds.area());
+		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area());
 	}

 	return SAH;
 }

+uint BVHNode::update_visibility()
+{
+	if(!is_leaf() && m_visibility == 0) {
+		InnerNode *inner = (InnerNode*)this;
+		BVHNode *child0 = inner->children[0];
+		BVHNode *child1 = inner->children[1];
+
+		m_visibility = child0->update_visibility()|child1->update_visibility();
+	}
+
+	return m_visibility;
+}
+
+/* Inner Node */
+
 void InnerNode::print(int depth) const
 {
 	for(int i = 0; i < depth; i++)
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@ -49,8 +49,6 @@ public:
 	virtual int num_triangles() const { return 0; }
 	virtual void print(int depth = 0) const = 0;

-	float getArea() const { return m_bounds.area(); }
-
 	BoundBox m_bounds;
 	uint m_visibility;

@ -58,6 +56,8 @@ public:
 	int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const;
 	float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const;	
 	void deleteSubtree();
+
+	uint update_visibility();
 };

 class InnerNode : public BVHNode
@ -66,9 +66,21 @@ public:
 	InnerNode(const BoundBox& bounds, BVHNode* child0, BVHNode* child1)
 	{
 		m_bounds = bounds;
-		m_visibility = child0->m_visibility|child1->m_visibility;
 		children[0] = child0;
 		children[1] = child1;
+
+		if(child0 && child1)
+			m_visibility = child0->m_visibility|child1->m_visibility;
+		else
+			m_visibility = 0; /* happens on build cancel */
+	}
+
+	InnerNode(const BoundBox& bounds)
+	{
+		m_bounds = bounds;
+		m_visibility = 0;
+		children[0] = NULL;
+		children[1] = NULL;
 	}

 	bool is_leaf() const { return false; }
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@ -18,6 +18,8 @@
 #ifndef __BVH_PARAMS_H__
 #define __BVH_PARAMS_H__

+#include "util_boundbox.h"
+
 CCL_NAMESPACE_BEGIN

 /* BVH Parameters */
@ -73,14 +75,97 @@ public:
 	}

 	/* SAH costs */
-	float cost(int num_nodes, int num_tris) const
+	__forceinline float cost(int num_nodes, int num_tris) const
 	{ return node_cost(num_nodes) + triangle_cost(num_tris); }

-	float triangle_cost(int n) const
+	__forceinline float triangle_cost(int n) const
 	{ return n*sah_triangle_cost; }

-	float node_cost(int n) const
+	__forceinline float node_cost(int n) const
 	{ return n*sah_node_cost; }
+
+	__forceinline bool small_enough_for_leaf(int size, int level) const
+	{ return (size <= min_leaf_size || level >= MAX_DEPTH); } /* leaf if few enough primitives or depth limit reached */
+};
+
+/* BVH Reference
+ *
+ * Reference to a primitive. Primitive index and object are sneakily packed
+ * into BoundBox to reduce memory usage and align nicely */
+
+class BVHReference
+{
+public:
+	__forceinline BVHReference() {}
+
+	__forceinline BVHReference(const BoundBox& bounds_, int prim_index, int prim_object)
+	: rbounds(bounds_)
+	{
+		rbounds.min.w = __int_as_float(prim_index);
+		rbounds.max.w = __int_as_float(prim_object);
+	}
+
+	__forceinline const BoundBox& bounds() const { return rbounds; }
+	__forceinline int prim_index() const { return __float_as_int(rbounds.min.w); }
+	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
+
+protected:
+	BoundBox rbounds;
+};
+
+/* BVH Range
+ *
+ * Build range used during construction, to indicate the bounds and place in
+ * the reference array of a subset of primitives. Again uses trickery to pack
+ * integers into BoundBox for alignment purposes. */
+
+class BVHRange
+{
+public:
+	__forceinline BVHRange()
+	{
+		rbounds.min.w = __int_as_float(0);
+		rbounds.max.w = __int_as_float(0);
+	}
+
+	__forceinline BVHRange(const BoundBox& bounds_, int start_, int size_)
+	: rbounds(bounds_)
+	{
+		rbounds.min.w = __int_as_float(start_);
+		rbounds.max.w = __int_as_float(size_);
+	}
+
+	__forceinline BVHRange(const BoundBox& bounds_, const BoundBox& cbounds_, int start_, int size_)
+	: rbounds(bounds_), cbounds(cbounds_)
+	{
+		rbounds.min.w = __int_as_float(start_);
+		rbounds.max.w = __int_as_float(size_);
+	}
+
+	__forceinline void set_start(int start_) { rbounds.min.w = __int_as_float(start_); }
+
+	__forceinline const BoundBox& bounds() const { return rbounds; }
+	__forceinline const BoundBox& cent_bounds() const { return cbounds; }
+	__forceinline int start() const { return __float_as_int(rbounds.min.w); }
+	__forceinline int size() const { return __float_as_int(rbounds.max.w); }
+	__forceinline int end() const { return start() + size(); }
+
+protected:
+	BoundBox rbounds;
+	BoundBox cbounds;
+};
+
+/* BVH Spatial Bin */
+
+struct BVHSpatialBin
+{
+	BoundBox bounds;
+	int enter;
+	int exit;
+
+	__forceinline BVHSpatialBin()
+	{
+	}
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@ -32,23 +32,23 @@ public:
 		dim = dim_;
 	}

-	bool operator()(const BVHBuild::Reference& ra, const BVHBuild::Reference& rb)
+	bool operator()(const BVHReference& ra, const BVHReference& rb)
 	{
-		float ca = ra.bounds.min[dim] + ra.bounds.max[dim];
-		float cb = rb.bounds.min[dim] + rb.bounds.max[dim];
+		float ca = ra.bounds().min[dim] + ra.bounds().max[dim];
+		float cb = rb.bounds().min[dim] + rb.bounds().max[dim];

 		if(ca < cb) return true;
 		else if(ca > cb) return false;
-		else if(ra.prim_object < rb.prim_object) return true;
-		else if(ra.prim_object > rb.prim_object) return false;
-		else if(ra.prim_index < rb.prim_index) return true;
-		else if(ra.prim_index > rb.prim_index) return false;
+		else if(ra.prim_object() < rb.prim_object()) return true;
+		else if(ra.prim_object() > rb.prim_object()) return false;
+		else if(ra.prim_index() < rb.prim_index()) return true;
+		else if(ra.prim_index() > rb.prim_index()) return false;

 		return false;
 	}
 };

-void bvh_reference_sort(int start, int end, BVHBuild::Reference *data, int dim)
+void bvh_reference_sort(int start, int end, BVHReference *data, int dim)
 {
 	sort(data+start, data+end, BVHReferenceCompare(dim));
 }
--- a/intern/cycles/bvh/bvh_sort.h
+++ b/intern/cycles/bvh/bvh_sort.h
@ -20,7 +20,7 @@

 CCL_NAMESPACE_BEGIN

-void bvh_reference_sort(int start, int end, BVHBuild::Reference *data, int dim);
+void bvh_reference_sort(int start, int end, BVHReference *data, int dim);

 CCL_NAMESPACE_END

--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@ -0,0 +1,293 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh_build.h"
+#include "bvh_split.h"
+#include "bvh_sort.h"
+
+#include "mesh.h"
+#include "object.h"
+
+#include "util_algorithm.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Object Split */
+
+/* Evaluate SAH object splits along all three axes. For each axis the range is
+ * sorted with bvh_reference_sort and every partition point 1..size-1 is scored;
+ * the cheapest candidate over all axes is stored in the members. When the range
+ * holds fewer than two references no candidate exists and sah stays FLT_MAX. */
+BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, const BVHRange& range, float nodeSAH)
+: sah(FLT_MAX), dim(0), num_left(0), left_bounds(BoundBox::empty), right_bounds(BoundBox::empty)
+{
+	const int num_refs = range.size();
+	const BVHReference *sorted = &builder->references[range.start()];
+
+	for(int axis = 0; axis < 3; axis++) {
+		/* order the references of this range along the current axis */
+		bvh_reference_sort(range.start(), range.end(), &builder->references[0], axis);
+
+		/* backward sweep: cache the bounds of everything right of each split */
+		BoundBox suffix = BoundBox::empty;
+
+		for(int i = num_refs - 1; i > 0; i--) {
+			suffix.grow(sorted[i].bounds());
+			builder->spatial_right_bounds[i - 1] = suffix;
+		}
+
+		/* forward sweep: grow the left bounds and keep the cheapest split */
+		BoundBox prefix = BoundBox::empty;
+
+		for(int i = 1; i < num_refs; i++) {
+			prefix.grow(sorted[i - 1].bounds());
+
+			const BoundBox& rest = builder->spatial_right_bounds[i - 1];
+			float cost = nodeSAH +
+				prefix.safe_area() * builder->params.triangle_cost(i) +
+				rest.safe_area() * builder->params.triangle_cost(num_refs - i);
+
+			/* this->sah starts at FLT_MAX and always holds the best cost so far */
+			if(cost < this->sah) {
+				this->sah = cost;
+				this->dim = axis;
+				this->num_left = i;
+				this->left_bounds = prefix;
+				this->right_bounds = rest;
+			}
+		}
+	}
+}
+
+/* Apply the object split picked by the constructor: the first num_left
+ * references (in sorted order along dim) go to `left`, the rest to `right`. */
+void BVHObjectSplit::split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range)
+{
+	/* the constructor finished by sorting along axis 2, so restore the order of the chosen axis */
+	bvh_reference_sort(range.start(), range.end(), &builder->references[0], dim);
+
+	/* left covers [start, start + num_left[, right begins where left ends */
+	left = BVHRange(left_bounds, range.start(), num_left);
+	right = BVHRange(right_bounds, left.end(), range.size() - num_left);
+}
+
+/* Spatial Split */
+
+/* Find the best spatial split plane for `range`.
+ *
+ * The range bounds are cut into BVHParams::NUM_SPATIAL_BINS equal bins along
+ * each axis. Every reference is clipped against the bin boundaries it spans and
+ * its clipped bounds are accumulated per bin, together with enter/exit counters.
+ * The bin boundaries are then scored with the SAH; the cheapest plane ends up in
+ * sah/dim/pos. sah stays FLT_MAX when no plane was scored. */
+BVHSpatialSplit::BVHSpatialSplit(BVHBuild *builder, const BVHRange& range, float nodeSAH)
+: sah(FLT_MAX), dim(0), pos(0.0f)
+{
+	/* initialize bins. */
+	float3 origin = range.bounds().min;
+	float3 binSize = (range.bounds().max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS);
+
+	/* a range that is flat along an axis has a zero bin size there. dividing
+	 * would give inf, the products below would become 0*inf = NaN, and casting
+	 * NaN to int is undefined. use 0 as reciprocal instead, which puts every
+	 * reference into bin 0 of that axis. */
+	float3 invBinSize = make_float3(
+		(binSize.x > 0.0f)? 1.0f / binSize.x: 0.0f,
+		(binSize.y > 0.0f)? 1.0f / binSize.y: 0.0f,
+		(binSize.z > 0.0f)? 1.0f / binSize.z: 0.0f);
+
+	for(int dim = 0; dim < 3; dim++) {
+		for(int i = 0; i < BVHParams::NUM_SPATIAL_BINS; i++) {
+			BVHSpatialBin& bin = builder->spatial_bins[dim][i];
+
+			bin.bounds = BoundBox::empty;
+			bin.enter = 0;
+			bin.exit = 0;
+		}
+	}
+
+	/* chop references into bins. the index is signed to match range.start()/end(). */
+	for(int refIdx = range.start(); refIdx < range.end(); refIdx++) {
+		const BVHReference& ref = builder->references[refIdx];
+		float3 firstBinf = (ref.bounds().min - origin) * invBinSize;
+		float3 lastBinf = (ref.bounds().max - origin) * invBinSize;
+		int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z);
+		int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z);
+
+		/* keep indices inside the bin array and make sure lastBin >= firstBin */
+		firstBin = clamp(firstBin, 0, BVHParams::NUM_SPATIAL_BINS - 1);
+		lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1);
+
+		for(int dim = 0; dim < 3; dim++) {
+			BVHReference currRef = ref;
+
+			/* clip at each bin boundary the reference crosses; the left piece
+			 * belongs to bin i, the right piece carries on to the next bin */
+			for(int i = firstBin[dim]; i < lastBin[dim]; i++) {
+				BVHReference leftRef, rightRef;
+
+				split_reference(builder, leftRef, rightRef, currRef, dim, origin[dim] + binSize[dim] * (float)(i + 1));
+				builder->spatial_bins[dim][i].bounds.grow(leftRef.bounds());
+				currRef = rightRef;
+			}
+
+			builder->spatial_bins[dim][lastBin[dim]].bounds.grow(currRef.bounds());
+			builder->spatial_bins[dim][firstBin[dim]].enter++;
+			builder->spatial_bins[dim][lastBin[dim]].exit++;
+		}
+	}
+
+	/* select best split plane. */
+	for(int dim = 0; dim < 3; dim++) {
+		/* sweep right to left and determine bounds. */
+		BoundBox right_bounds = BoundBox::empty;
+
+		for(int i = BVHParams::NUM_SPATIAL_BINS - 1; i > 0; i--) {
+			right_bounds.grow(builder->spatial_bins[dim][i].bounds);
+			builder->spatial_right_bounds[i - 1] = right_bounds;
+		}
+
+		/* sweep left to right and select lowest SAH. */
+		BoundBox left_bounds = BoundBox::empty;
+		int leftNum = 0;
+		int rightNum = range.size();
+
+		for(int i = 1; i < BVHParams::NUM_SPATIAL_BINS; i++) {
+			left_bounds.grow(builder->spatial_bins[dim][i - 1].bounds);
+			/* references entering up to bin i-1 are on the left, those that
+			 * already exited are no longer on the right */
+			leftNum += builder->spatial_bins[dim][i - 1].enter;
+			rightNum -= builder->spatial_bins[dim][i - 1].exit;
+
+			float sah = nodeSAH +
+				left_bounds.safe_area() * builder->params.triangle_cost(leftNum) +
+				builder->spatial_right_bounds[i - 1].safe_area() * builder->params.triangle_cost(rightNum);
+
+			if(sah < this->sah) {
+				this->sah = sah;
+				this->dim = dim;
+				this->pos = origin[dim] + binSize[dim] * (float)i;
+			}
+		}
+	}
+}
+
+/* Partition the references of `range` at the plane (dim, pos) chosen by the
+ * constructor and return the two child ranges in `left` and `right`.
+ * References straddling the plane are either kept whole on one side or
+ * duplicated into both, whichever gives the lowest SAH estimate. */
+void BVHSpatialSplit::split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range)
+{
+	/* Categorize references and compute bounds.
+	 *
+	 * Left-hand side:			[left_start, left_end[
+	 * Uncategorized/split:		[left_end, right_start[
+	 * Right-hand side:			[right_start, right_end[
+	 *
+	 * right_end starts at range.end() and grows by one for every reference
+	 * that is duplicated below. */
+
+	vector<BVHReference>& refs = builder->references;
+	int left_start = range.start();
+	int left_end = left_start;
+	int right_start = range.end();
+	int right_end = range.end();
+	BoundBox left_bounds = BoundBox::empty;
+	BoundBox right_bounds = BoundBox::empty;
+
+	for(int i = left_end; i < right_start; i++) {
+		if(refs[i].bounds().max[this->dim] <= this->pos) {
+			/* entirely on the left-hand side */
+			left_bounds.grow(refs[i].bounds());
+			swap(refs[i], refs[left_end++]);
+		}
+		else if(refs[i].bounds().min[this->dim] >= this->pos) {
+			/* entirely on the right-hand side; i is decremented so the
+			 * unclassified reference swapped into slot i is examined next */
+			right_bounds.grow(refs[i].bounds());
+			swap(refs[i--], refs[--right_start]);
+		}
+		/* otherwise the reference straddles the plane and stays in the middle */
+	}
+
+	/* duplicate or unsplit references intersecting both sides. */
+	while(left_end < right_start) {
+		/* split reference. */
+		BVHReference lref, rref;
+
+		split_reference(builder, lref, rref, refs[left_end], this->dim, this->pos);
+
+		/* compute SAH for duplicate/unsplit candidates. */
+		BoundBox lub = left_bounds;		// Unsplit to left:		new left-hand bounds.
+		BoundBox rub = right_bounds;	// Unsplit to right:	new right-hand bounds.
+		BoundBox ldb = left_bounds;		// Duplicate:			new left-hand bounds.
+		BoundBox rdb = right_bounds;	// Duplicate:			new right-hand bounds.
+
+		lub.grow(refs[left_end].bounds());
+		rub.grow(refs[left_end].bounds());
+		ldb.grow(lref.bounds());
+		rdb.grow(rref.bounds());
+
+		/* lac/rac: cost with the current counts, lbc/rbc: cost with one more reference */
+		float lac = builder->params.triangle_cost(left_end - left_start);
+		float rac = builder->params.triangle_cost(right_end - right_start);
+		float lbc = builder->params.triangle_cost(left_end - left_start + 1);
+		float rbc = builder->params.triangle_cost(right_end - right_start + 1);
+
+		float unsplitLeftSAH = lub.safe_area() * lbc + right_bounds.safe_area() * rac;
+		float unsplitRightSAH = left_bounds.safe_area() * lac + rub.safe_area() * rbc;
+		float duplicateSAH = ldb.safe_area() * lbc + rdb.safe_area() * rbc;
+		float minSAH = min(min(unsplitLeftSAH, unsplitRightSAH), duplicateSAH);
+
+		/* on ties the order of the tests prefers unsplit-left, then unsplit-right */
+		if(minSAH == unsplitLeftSAH) {
+			/* unsplit to left */
+			left_bounds = lub;
+			left_end++;
+		}
+		else if(minSAH == unsplitRightSAH) {
+			/* unsplit to right */
+			right_bounds = rub;
+			swap(refs[left_end], refs[--right_start]);
+		}
+		else {
+			/* duplicate: the left half replaces the original in place, the
+			 * right half is inserted directly behind the right-hand side.
+			 * the insert shifts every reference stored after right_end by one.
+			 * NOTE(review): ranges past right_end held by the caller would
+			 * need adjusting for that shift - confirm against BVHBuild. */
+			left_bounds = ldb;
+			right_bounds = rdb;
+			refs[left_end++] = lref;
+			refs.insert(refs.begin() + right_end, rref);
+			right_end++;
+		}
+	}
+
+	left = BVHRange(left_bounds, left_start, left_end - left_start);
+	right = BVHRange(right_bounds, right_start, right_end - right_start);
+}
+
+/* Clip the triangle referenced by `ref` at the plane where coordinate `dim`
+ * equals `pos`. `left` and `right` receive references to the same primitive
+ * whose bounds cover only the part on their side of the plane. */
+void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVHReference& right, const BVHReference& ref, int dim, float pos)
+{
+	/* look up the vertex indices and vertex array of the triangle */
+	const Mesh *mesh = builder->objects[ref.prim_object()]->mesh;
+	const int *tri = mesh->triangles[ref.prim_index()].v;
+	const float3 *points = &mesh->verts[0];
+
+	/* bounds of the part of the triangle on either side of the plane */
+	BoundBox lbox = BoundBox::empty;
+	BoundBox rbox = BoundBox::empty;
+
+	/* walk the edges in the order (2,0), (0,1), (1,2) */
+	int prev = tri[2];
+
+	for(int k = 0; k < 3; k++) {
+		const int curr = tri[k];
+		const float3& a = points[prev];
+		const float3& b = points[curr];
+		const float ap = a[dim];
+		const float bp = b[dim];
+
+		/* the start vertex of the edge goes into each box whose side it lies
+		 * on; a vertex exactly on the plane goes into both */
+		if(ap <= pos)
+			lbox.grow(a);
+
+		if(ap >= pos)
+			rbox.grow(a);
+
+		/* an edge strictly crossing the plane adds its intersection point to both boxes */
+		const bool crosses = (ap < pos && bp > pos) || (ap > pos && bp < pos);
+
+		if(crosses) {
+			float3 hit = lerp(a, b, clamp((pos - ap) / (bp - ap), 0.0f, 1.0f));
+			lbox.grow(hit);
+			rbox.grow(hit);
+		}
+
+		prev = curr;
+	}
+
+	/* pin both boxes to the plane, then restrict them to the bounds of the reference being split */
+	lbox.max[dim] = pos;
+	rbox.min[dim] = pos;
+	lbox.intersect(ref.bounds());
+	rbox.intersect(ref.bounds());
+
+	/* set references */
+	left = BVHReference(lbox, ref.prim_index(), ref.prim_object());
+	right = BVHReference(rbox, ref.prim_index(), ref.prim_object());
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@ -0,0 +1,110 @@
+/*
+ * Adapted from code copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_SPLIT_H__
+#define __BVH_SPLIT_H__
+
+#include "bvh_build.h"
+#include "bvh_params.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHBuild;
+
+/* Object Split */
+
+/* Object split candidate: partitions the references of a range into two
+ * groups without clipping any of them. */
+class BVHObjectSplit
+{
+public:
+	/* SAH cost of the best candidate, FLT_MAX when none was evaluated */
+	float sah;
+	/* axis the references are sorted along for the best candidate */
+	int dim;
+	/* number of references (in sorted order) that go to the left child */
+	int num_left;
+	/* bounds of the left and right group of the best candidate */
+	BoundBox left_bounds;
+	BoundBox right_bounds;
+
+	/* default state matches "no candidate found", same as the init list of the
+	 * evaluating constructor and as BVHSpatialSplit's default constructor */
+	BVHObjectSplit() : sah(FLT_MAX), dim(0), num_left(0), left_bounds(BoundBox::empty), right_bounds(BoundBox::empty) {}
+	/* evaluate all object split candidates of `range` and store the cheapest */
+	BVHObjectSplit(BVHBuild *builder, const BVHRange& range, float nodeSAH);
+
+	/* apply the stored candidate, writing the child ranges to `left` and `right` */
+	void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range);
+};
+
+/* Spatial Split */
+
+/* Spatial split candidate: an axis-aligned plane through the range bounds.
+ * References crossing the plane may be clipped and end up in both children. */
+class BVHSpatialSplit
+{
+public:
+	/* SAH cost of the best plane, FLT_MAX when no plane was evaluated */
+	float sah;
+	/* axis of the best plane */
+	int dim;
+	/* position of the best plane along `dim` */
+	float pos;
+
+	/* "no candidate" state: sah is FLT_MAX */
+	BVHSpatialSplit() : sah(FLT_MAX), dim(0), pos(0.0f) {}
+	/* bin the references of `range` and store the cheapest split plane */
+	BVHSpatialSplit(BVHBuild *builder, const BVHRange& range, float nodeSAH);
+
+	/* partition the references at the stored plane, writing the child ranges to `left` and `right` */
+	void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range);
+	/* clip one reference at the plane (dim, pos) into a left and a right reference */
+	void split_reference(BVHBuild *builder, BVHReference& left, BVHReference& right, const BVHReference& ref, int dim, float pos);
+};
+
+/* Mixed Object-Spatial Split */
+
+/* Picks between creating a leaf, an object split and a spatial split for a
+ * range, based on their SAH costs. */
+class BVHMixedSplit
+{
+public:
+	BVHObjectSplit object;
+	BVHSpatialSplit spatial;
+
+	/* cost of turning the range into a leaf / fixed cost of an inner node */
+	float leafSAH;
+	float nodeSAH;
+	/* lowest of leafSAH, object.sah and spatial.sah */
+	float minSAH;
+
+	/* true when a leaf is cheapest and the range fits into a single leaf */
+	bool no_split;
+
+	__forceinline BVHMixedSplit(BVHBuild *builder, const BVHRange& range, int level)
+	{
+		/* find split candidates. */
+		float area = range.bounds().safe_area();
+
+		leafSAH = area * builder->params.triangle_cost(range.size());
+		nodeSAH = area * builder->params.node_cost(2);
+
+		object = BVHObjectSplit(builder, range, nodeSAH);
+
+		/* the spatial split is only evaluated near the top of the tree and when
+		 * the children of the object split overlap enough; otherwise `spatial`
+		 * keeps its default sah of FLT_MAX and never wins below */
+		if(builder->params.use_spatial_split && level < BVHParams::MAX_SPATIAL_DEPTH) {
+			BoundBox overlap = object.left_bounds;
+			overlap.intersect(object.right_bounds);
+
+			if(overlap.safe_area() >= builder->spatial_min_overlap)
+				spatial = BVHSpatialSplit(builder, range, nodeSAH);
+		}
+
+		/* leaf SAH is the lowest => create leaf. */
+		minSAH = min(min(leafSAH, object.sah), spatial.sah);
+		no_split = (minSAH == leafSAH && range.size() <= builder->params.max_leaf_size);
+	}
+
+	/* Perform the cheaper of the two splits. The spatial split is used when it
+	 * has the lowest cost; if it was not used, or left one side empty, the
+	 * object split is performed instead. */
+	__forceinline void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range)
+	{
+		/* track the outcome explicitly rather than inspecting `left`/`right`
+		 * up front: they are outputs and may hold anything on entry */
+		bool done = false;
+
+		if(builder->params.use_spatial_split && minSAH == spatial.sah) {
+			spatial.split(builder, left, right, range);
+			done = (left.size() && right.size());
+		}
+
+		if(!done)
+			object.split(builder, left, right, range);
+	}
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH_SPLIT_H__ */
+
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@ -58,15 +58,6 @@ void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size)
 	split(tasks, num);
 }

-void DeviceTask::split(ThreadQueue<DeviceTask>& queue, int num)
-{
-	list<DeviceTask> tasks;
-	split(tasks, num);
-
-	foreach(DeviceTask& task, tasks)
-		queue.push(task);
-}
-
 void DeviceTask::split(list<DeviceTask>& tasks, int num)
 {
 	if(type == SHADER) {
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -25,6 +25,7 @@

 #include "util_list.h"
 #include "util_string.h"
+#include "util_task.h"
 #include "util_thread.h"
 #include "util_types.h"
 #include "util_vector.h"
@ -66,7 +67,7 @@ public:

 /* Device Task */

-class DeviceTask {
+class DeviceTask : public Task {
 public:
 	typedef enum { PATH_TRACE, TONEMAP, SHADER } Type;
 	Type type;
@ -87,7 +88,6 @@ public:
 	DeviceTask(Type type = PATH_TRACE);

 	void split(list<DeviceTask>& tasks, int num);
-	void split(ThreadQueue<DeviceTask>& tasks, int num);
 	void split_max_size(list<DeviceTask>& tasks, int max_size);
 };

--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@ -40,35 +40,21 @@ CCL_NAMESPACE_BEGIN
 class CPUDevice : public Device
 {
 public:
-	vector<thread*> threads;
-	ThreadQueue<DeviceTask> tasks;
+	TaskPool task_pool;
 	KernelGlobals *kg;
 	
 	CPUDevice(int threads_num)
+	: task_pool(function_bind(&CPUDevice::thread_run, this, _1, _2))
 	{
 		kg = kernel_globals_create();

 		/* do now to avoid thread issues */
 		system_cpu_support_optimized();
-
-		if(threads_num == 0)
-			threads_num = system_cpu_thread_count();
-
-		threads.resize(threads_num);
-
-		for(size_t i = 0; i < threads.size(); i++)
-			threads[i] = new thread(function_bind(&CPUDevice::thread_run, this, i));
 	}

 	~CPUDevice()
 	{
-		tasks.stop();
-
-		foreach(thread *t, threads) {
-			t->join();
-			delete t;
-		}
-
+		task_pool.stop();
 		kernel_globals_free(kg);
 	}

@ -127,25 +113,21 @@ public:
 #endif
 	}

-	void thread_run(int t)
+	void thread_run(Task *task_, int thread_id)
 	{
-		DeviceTask task;
+		DeviceTask *task = (DeviceTask*)task_;

-		while(tasks.worker_wait_pop(task)) {
-			if(task.type == DeviceTask::PATH_TRACE)
-				thread_path_trace(task);
-			else if(task.type == DeviceTask::TONEMAP)
-				thread_tonemap(task);
-			else if(task.type == DeviceTask::SHADER)
-				thread_shader(task);
-
-			tasks.worker_done();
-		}
+		if(task->type == DeviceTask::PATH_TRACE)
+			thread_path_trace(*task);
+		else if(task->type == DeviceTask::TONEMAP)
+			thread_tonemap(*task);
+		else if(task->type == DeviceTask::SHADER)
+			thread_shader(*task);
 	}

 	void thread_path_trace(DeviceTask& task)
 	{
-		if(tasks.worker_cancel())
+		if(task_pool.cancelled())
 			return;

 #ifdef WITH_OSL
@ -160,7 +142,7 @@ public:
 					kernel_cpu_optimized_path_trace(kg, (float*)task.buffer, (unsigned int*)task.rng_state,
 						task.sample, x, y, task.offset, task.stride);

-				if(tasks.worker_cancel())
+				if(task_pool.cancelled())
 					break;
 			}
 		}
@ -172,7 +154,7 @@ public:
 					kernel_cpu_path_trace(kg, (float*)task.buffer, (unsigned int*)task.rng_state,
 						task.sample, x, y, task.offset, task.stride);

-				if(tasks.worker_cancel())
+				if(task_pool.cancelled())
 					break;
 			}
 		}
@ -214,7 +196,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_optimized_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

-				if(tasks.worker_cancel())
+				if(task_pool.cancelled())
 					break;
 			}
 		}
@ -224,7 +206,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

-				if(tasks.worker_cancel())
+				if(task_pool.cancelled())
 					break;
 			}
 		}
@ -239,17 +221,22 @@ public:
 	{
 		/* split task into smaller ones, more than number of threads for uneven
 		   workloads where some parts of the image render slower than others */
-		task.split(tasks, threads.size()*10);
+		list<DeviceTask> tasks;
+
+		task.split(tasks, TaskScheduler::num_threads()*10);
+
+		foreach(DeviceTask& task, tasks)
+			task_pool.push(new DeviceTask(task));
 	}

 	void task_wait()
 	{
-		tasks.wait_done();
+		task_pool.wait();
 	}

 	void task_cancel()
 	{
-		tasks.cancel();
+		task_pool.cancel();
 	}
 };

--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@ -257,13 +257,14 @@ public:

 	void task_add(DeviceTask& task)
 	{
-		ThreadQueue<DeviceTask> tasks;
+		list<DeviceTask> tasks;
 		task.split(tasks, devices.size());

 		foreach(SubDevice& sub, devices) {
-			DeviceTask subtask;
+			if(!tasks.empty()) {
+				DeviceTask subtask = tasks.front();
+				tasks.pop_front();

-			if(tasks.worker_wait_pop(subtask)) {
 				if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer];
 				if(task.rng_state) subtask.rng_state = sub.ptr_map[task.rng_state];
 				if(task.rgba) subtask.rgba = sub.ptr_map[task.rgba];
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@ -266,7 +266,7 @@ __device_inline void path_radiance_accum_background(PathRadiance *L, float3 thro
 #endif
 }

-__device_inline float3 path_radiance_sum(PathRadiance *L)
+__device_inline float3 path_radiance_sum(KernelGlobals *kg, PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@ -283,9 +283,14 @@ __device_inline float3 path_radiance_sum(PathRadiance *L)
 		L->indirect_glossy *= L->indirect;
 		L->indirect_transmission *= L->indirect;

-		return L->emission + L->background
+		float3 L_sum = L->emission
 			+ L->direct_diffuse + L->direct_glossy + L->direct_transmission
 			+ L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission;
+
+		if(!kernel_data.background.transparent)
+			L_sum += L->background;
+
+		return L_sum;
 	}
 	else
 		return L->emission;
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@ -223,6 +223,7 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R

 	path_radiance_init(&L, kernel_data.film.use_light_pass);

+	float min_ray_pdf = FLT_MAX;
 	float ray_pdf = 0.0f;
 	PathState state;
 	int rng_offset = PRNG_BASE_NUM;
@ -239,13 +240,17 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 			/* eval background shader if nothing hit */
 			if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
 				L_transparent += average(throughput);
+
+#ifdef __PASSES__
+				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+					break;
 			}
+
 #ifdef __BACKGROUND__
-			else {
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &ray, state.flag, ray_pdf);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-			}
 #endif

 			break;
@ -259,6 +264,18 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R

 		kernel_write_data_passes(kg, buffer, &L, &sd, sample, state.flag, throughput);

+		/* blurring of bsdf after bounces, for rays that have a small likelihood
+		   of following this particular path (diffuse, rough glossy) */
+		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+			float blur_pdf = kernel_data.integrator.filter_glossy*min_ray_pdf;
+
+			if(blur_pdf < 1.0f) {
+				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+				shader_bsdf_blur(kg, &sd, blur_roughness);
+			}
+		}
+
+		/* holdout */
 #ifdef __HOLDOUT__
 		if((sd.flag & SD_HOLDOUT) && (state.flag & PATH_RAY_CAMERA)) {
 			float3 holdout_weight = shader_holdout_eval(kg, &sd);
@ -378,8 +395,10 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 		path_radiance_bsdf_bounce(&L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label);

 		/* set labels */
-		if(!(label & LABEL_TRANSPARENT))
+		if(!(label & LABEL_TRANSPARENT)) {
 			ray_pdf = bsdf_pdf;
+			min_ray_pdf = fminf(bsdf_pdf, min_ray_pdf);
+		}

 		/* update path state */
 		path_state_next(kg, &state, label);
@ -394,7 +413,7 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 #endif
 	}

-	float3 L_sum = path_radiance_sum(&L);
+	float3 L_sum = path_radiance_sum(kg, &L);

 #ifdef __CLAMP_SAMPLE__
 	path_radiance_clamp(&L, &L_sum, kernel_data.integrator.sample_clamp);
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -516,6 +516,7 @@ typedef struct KernelIntegrator {

 	/* caustics */
 	int no_caustics;
+	float filter_glossy;

 	/* seed */
 	int seed;
@ -525,9 +526,6 @@ typedef struct KernelIntegrator {

 	/* clamp */
 	float sample_clamp;
-
-	/* padding */
-	int pad;
 } KernelIntegrator;

 typedef struct KernelBVH {
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@ -40,6 +40,15 @@ __device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, float *stack
 				data = sd->P;
 			break;
 		}
+		case NODE_TEXCO_NORMAL: {
+			if(sd->object != ~0) {
+				Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+				/* renormalize after the object transform, same as the bump_dy variant,
+				 * so all three evaluations return the same vector */
+				data = normalize(transform_direction(&tfm, sd->N));
+			}
+			else
+				data = sd->N;
+			break;
+		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

@ -85,6 +94,15 @@ __device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, floa
 				data = sd->P + sd->dP.dx;
 			break;
 		}
+		case NODE_TEXCO_NORMAL: {
+			if(sd->object != ~0) {
+				Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+				/* renormalize after the object transform, same as the bump_dy variant,
+				 * so all three evaluations return the same vector */
+				data = normalize(transform_direction(&tfm, sd->N));
+			}
+			else
+				data = sd->N;
+			break;
+		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

@ -133,6 +151,15 @@ __device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, floa
 				data = sd->P + sd->dP.dy;
 			break;
 		}
+		case NODE_TEXCO_NORMAL: {
+			if(sd->object != ~0) {
+				Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+				data = normalize(transform_direction(&tfm, sd->N));
+			}
+			else
+				data = sd->N;
+			break;
+		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;

--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
 /* Stack */

 /* SVM stack has a fixed size */
-#define SVM_STACK_SIZE 64
+#define SVM_STACK_SIZE 255
 /* SVM stack offsets with this value indicate that it's not on the stack */
 #define SVM_STACK_INVALID 255 

@ -119,6 +119,7 @@ typedef enum NodeLightPath {
 } NodeLightPath;

 typedef enum NodeTexCoord {
+	NODE_TEXCO_NORMAL,
 	NODE_TEXCO_OBJECT,
 	NODE_TEXCO_CAMERA,
 	NODE_TEXCO_WINDOW,
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@ -41,6 +41,7 @@ Integrator::Integrator()
 	transparent_shadows = false;

 	no_caustics = false;
+	filter_glossy = 0.0f;
 	seed = 0;
 	layer_flag = ~0;
 	sample_clamp = 0.0f;
@ -81,6 +82,8 @@ void Integrator::device_update(Device *device, DeviceScene *dscene)
 	kintegrator->transparent_shadows = transparent_shadows;

 	kintegrator->no_caustics = no_caustics;
+	kintegrator->filter_glossy = (filter_glossy == 0.0f)? FLT_MAX: 1.0f/filter_glossy;
+
 	kintegrator->seed = hash_int(seed);
 	kintegrator->layer_flag = layer_flag << PATH_RAY_LAYER_SHIFT;

@ -119,6 +122,7 @@ bool Integrator::modified(const Integrator& integrator)
 		transparent_probalistic == integrator.transparent_probalistic &&
 		transparent_shadows == integrator.transparent_shadows &&
 		no_caustics == integrator.no_caustics &&
+		filter_glossy == integrator.filter_glossy &&
 		layer_flag == integrator.layer_flag &&
 		seed == integrator.seed &&
 		sample_clamp == integrator.sample_clamp);
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@ -41,6 +41,7 @@ public:
 	bool transparent_shadows;

 	bool no_caustics;
+	float filter_glossy;

 	int seed;
 	int layer_flag;
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@ -43,6 +43,7 @@ Mesh::Mesh()
 	transform_applied = false;
 	transform_negative_scaled = false;
 	displacement_method = DISPLACE_BUMP;
+	bounds = BoundBox::empty;

 	bvh = NULL;

@ -96,7 +97,7 @@ void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_)

 void Mesh::compute_bounds()
 {
-	BoundBox bnds;
+	BoundBox bnds = BoundBox::empty;
 	size_t verts_size = verts.size();

 	for(size_t i = 0; i < verts_size; i++)
@ -697,6 +698,8 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 				progress.set_status(msg, "Building BVH");

 				mesh->compute_bvh(&scene->params, progress);
+
+				i++;
 			}

 			if(progress.get_cancel()) return;
@ -704,8 +707,6 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 			mesh->need_update = false;
 			mesh->need_update_rebuild = false;
 		}
-
-		i++;
 	}
 	
 	foreach(Shader *shader, scene->shaders)
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@ -1503,6 +1503,7 @@ TextureCoordinateNode::TextureCoordinateNode()
 {
 	add_input("Normal", SHADER_SOCKET_NORMAL, ShaderInput::NORMAL, true);
 	add_output("Generated", SHADER_SOCKET_POINT);
+	add_output("Normal", SHADER_SOCKET_NORMAL);
 	add_output("UV", SHADER_SOCKET_POINT);
 	add_output("Object", SHADER_SOCKET_POINT);
 	add_output("Camera", SHADER_SOCKET_POINT);
@ -1551,6 +1552,12 @@ void TextureCoordinateNode::compile(SVMCompiler& compiler)
 		}
 	}

+	out = output("Normal");
+	if(!out->links.empty()) {
+		compiler.stack_assign(out);
+		compiler.add_node(texco_node, NODE_TEXCO_NORMAL, out->stack_offset);
+	}
+
 	out = output("UV");
 	if(!out->links.empty()) {
 		int attr = compiler.attribute(Attribute::STD_UV);
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@ -37,6 +37,7 @@ Object::Object()
 	tfm = transform_identity();
 	visibility = ~0;
 	pass_id = 0;
+	bounds = BoundBox::empty;
 }

 Object::~Object()
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@ -27,6 +27,7 @@

 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_task.h"
 #include "util_time.h"

 CCL_NAMESPACE_BEGIN
@ -37,6 +38,8 @@ Session::Session(const SessionParams& params_)
 {
 	device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background);

+	TaskScheduler::init(params.threads);
+
 	device = Device::create(params.device, params.background, params.threads);
 	buffers = new RenderBuffers(device);
 	display = new DisplayBuffer(device);
@ -88,6 +91,8 @@ Session::~Session()
 	delete display;
 	delete scene;
 	delete device;
+
+	TaskScheduler::exit();
 }

 void Session::start()
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@ -93,7 +93,7 @@ void LinearQuadPatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u, float

 BoundBox LinearQuadPatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 4; i++)
 		bbox.grow(hull[i]);
@ -115,7 +115,7 @@ void LinearTrianglePatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u, f

 BoundBox LinearTrianglePatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 3; i++)
 		bbox.grow(hull[i]);
@ -132,7 +132,7 @@ void BicubicPatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u, float v)

 BoundBox BicubicPatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 16; i++)
 		bbox.grow(hull[i]);
@ -152,7 +152,7 @@ void BicubicTangentPatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u, f

 BoundBox BicubicTangentPatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 16; i++)
 		bbox.grow(hull[i]);
@ -205,7 +205,7 @@ void GregoryQuadPatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u, floa

 BoundBox GregoryQuadPatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 20; i++)
 		bbox.grow(hull[i]);
@ -276,7 +276,7 @@ void GregoryTrianglePatch::eval(float3 *P, float3 *dPdu, float3 *dPdv, float u,

 BoundBox GregoryTrianglePatch::bound()
 {
-	BoundBox bbox;
+	BoundBox bbox = BoundBox::empty;

 	for(int i = 0; i < 20; i++)
 		bbox.grow(hull[i]);
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@ -15,6 +15,7 @@ set(SRC
 	util_path.cpp
 	util_string.cpp
 	util_system.cpp
+	util_task.cpp
 	util_time.cpp
 	util_transform.cpp
 )
@ -50,6 +51,7 @@ set(SRC_HEADERS
 	util_set.h
 	util_string.h
 	util_system.h
+	util_task.h
 	util_thread.h
 	util_time.h
 	util_transform.h
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@ -23,6 +23,7 @@
 #include <float.h>

 #include "util_math.h"
+#include "util_string.h"
 #include "util_transform.h"
 #include "util_types.h"

@ -35,45 +36,81 @@ class BoundBox
 public:
 	float3 min, max;

-	BoundBox(void)
+	__forceinline BoundBox()
 	{
-		min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
-		max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
 	}

-	BoundBox(const float3& min_, const float3& max_)
+	__forceinline BoundBox(const float3& pt)
+	: min(pt), max(pt)
+	{
+	}
+
+	__forceinline BoundBox(const float3& min_, const float3& max_)
 	: min(min_), max(max_)
 	{
 	}

-	void grow(const float3& pt)  
+	static struct empty_t {} empty;
+
+	__forceinline BoundBox(empty_t)
+	: min(make_float3(FLT_MAX, FLT_MAX, FLT_MAX)), max(make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX))
+	{
+	}
+
+	__forceinline void grow(const float3& pt)  
 	{
 		min = ccl::min(min, pt);
 		max = ccl::max(max, pt);
 	}

-	void grow(const BoundBox& bbox)
+	__forceinline void grow(const BoundBox& bbox)
 	{
 		grow(bbox.min);
 		grow(bbox.max);
 	}

-	void intersect(const BoundBox& bbox) 
+	__forceinline void intersect(const BoundBox& bbox) 
 	{
 		min = ccl::max(min, bbox.min);
 		max = ccl::min(max, bbox.max);
 	}

-	float area(void) const
+	/* todo: avoid using this */
+	__forceinline float safe_area() const
 	{
-		if(!valid())
+		if(!((min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z)))
 			return 0.0f;

-		float3 d = max - min;
-		return dot(d, d)*2.0f;
+		return area();
 	}

-	bool valid(void) const
+	__forceinline float area() const
+	{
+		return half_area()*2.0f;
+	}
+
+	__forceinline float half_area() const
+	{
+		float3 d = max - min;
+		return (d.x*d.z + d.y*d.z + d.x*d.y);
+	}
+
+	__forceinline float3 center() const
+	{
+		return 0.5f*(min + max);
+	}
+
+	__forceinline float3 center2() const
+	{
+		return min + max;
+	}
+
+	__forceinline float3 size() const
+	{
+		return max - min;
+	}
+	
+	__forceinline bool valid() const
 	{
 		return (min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z) &&
 		       (isfinite(min.x) && isfinite(min.y) && isfinite(min.z)) &&
@ -82,7 +119,7 @@ public:

 	BoundBox transformed(const Transform *tfm)
 	{
-		BoundBox result;
+		BoundBox result = BoundBox::empty;

 		for(int i = 0; i < 8; i++) {
 			float3 p;
@ -98,6 +135,31 @@ public:
 	}
 };

+__forceinline BoundBox merge(const BoundBox& bbox, const float3& pt)
+{
+	return BoundBox(min(bbox.min, pt), max(bbox.max, pt));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b)
+{
+	return BoundBox(min(a.min, b.min), max(a.max, b.max));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b, const BoundBox& c, const BoundBox& d)
+{
+	return merge(merge(a, b), merge(c, d));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b)
+{
+	return BoundBox(max(a.min, b.min), min(a.max, b.max));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b, const BoundBox& c)
+{
+	return intersect(a, intersect(b, c));
+}
+
 CCL_NAMESPACE_END

 #endif /* __UTIL_BOUNDBOX_H__ */
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@ -182,93 +182,74 @@ __device_inline float average(const float2 a)

 __device_inline float2 operator-(const float2 a)
 {
-	float2 r = {-a.x, -a.y};
-	return r;
+	return make_float2(-a.x, -a.y);
 }

 __device_inline float2 operator*(const float2 a, const float2 b)
 {
-	float2 r = {a.x*b.x, a.y*b.y};
-	return r;
+	return make_float2(a.x*b.x, a.y*b.y);
 }

 __device_inline float2 operator*(const float2 a, float f)
 {
-	float2 r = {a.x*f, a.y*f};
-	return r;
+	return make_float2(a.x*f, a.y*f);
 }

 __device_inline float2 operator*(float f, const float2 a)
 {
-	float2 r = {a.x*f, a.y*f};
-	return r;
+	return make_float2(a.x*f, a.y*f);
 }

 __device_inline float2 operator/(float f, const float2 a)
 {
-	float2 r = {f/a.x, f/a.y};
-	return r;
+	return make_float2(f/a.x, f/a.y);
 }

 __device_inline float2 operator/(const float2 a, float f)
 {
 	float invf = 1.0f/f;
-	float2 r = {a.x*invf, a.y*invf};
-	return r;
+	return make_float2(a.x*invf, a.y*invf);
 }

 __device_inline float2 operator/(const float2 a, const float2 b)
 {
-	float2 r = {a.x/b.x, a.y/b.y};
-	return r;
+	return make_float2(a.x/b.x, a.y/b.y);
 }

 __device_inline float2 operator+(const float2 a, const float2 b)
 {
-	float2 r = {a.x+b.x, a.y+b.y};
-	return r;
+	return make_float2(a.x+b.x, a.y+b.y);
 }

 __device_inline float2 operator-(const float2 a, const float2 b)
 {
-	float2 r = {a.x-b.x, a.y-b.y};
-	return r;
+	return make_float2(a.x-b.x, a.y-b.y);
 }

 __device_inline float2 operator+=(float2& a, const float2 b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	return a;
+	return a = a + b;
 }

 __device_inline float2 operator*=(float2& a, const float2 b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	return a;
+	return a = a * b;
 }

 __device_inline float2 operator*=(float2& a, float f)
 {
-	a.x *= f;
-	a.y *= f;
-	return a;
+	return a = a * f;
 }

 __device_inline float2 operator/=(float2& a, const float2 b)
 {
-	a.x /= b.x;
-	a.y /= b.y;
-	return a;
+	return a = a / b;
 }

 __device_inline float2 operator/=(float2& a, float f)
 {
 	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	return a;
+	return a = a * invf;
 }


@ -314,14 +295,12 @@ __device_inline bool operator!=(const float2 a, const float2 b)

 __device_inline float2 min(float2 a, float2 b)
 {
-	float2 r = {min(a.x, b.x), min(a.y, b.y)};
-	return r;
+	return make_float2(min(a.x, b.x), min(a.y, b.y));
 }

 __device_inline float2 max(float2 a, float2 b)
 {
-	float2 r = {max(a.x, b.x), max(a.y, b.y)};
-	return r;
+	return make_float2(max(a.x, b.x), max(a.y, b.y));
 }

 __device_inline float2 clamp(float2 a, float2 mn, float2 mx)
@ -361,112 +340,78 @@ __device_inline float2 interp(float2 a, float2 b, float t)

 /* Float3 Vector */

-__device_inline bool is_zero(const float3 a)
-{
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
-}
-
-__device_inline float average(const float3 a)
-{
-	return (a.x + a.y + a.z)*(1.0f/3.0f);
-}
-
 #ifndef __KERNEL_OPENCL__

 __device_inline float3 operator-(const float3 a)
 {
-	float3 r = make_float3(-a.x, -a.y, -a.z);
-	return r;
+	return make_float3(-a.x, -a.y, -a.z);
 }

 __device_inline float3 operator*(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-	return r;
+	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
 }

 __device_inline float3 operator*(const float3 a, float f)
 {
-	float3 r = make_float3(a.x*f, a.y*f, a.z*f);
-	return r;
+	return make_float3(a.x*f, a.y*f, a.z*f);
 }

 __device_inline float3 operator*(float f, const float3 a)
 {
-	float3 r = make_float3(a.x*f, a.y*f, a.z*f);
-	return r;
+	return make_float3(a.x*f, a.y*f, a.z*f);
 }

 __device_inline float3 operator/(float f, const float3 a)
 {
-	float3 r = make_float3(f/a.x, f/a.y, f/a.z);
-	return r;
+	return make_float3(f/a.x, f/a.y, f/a.z);
 }

 __device_inline float3 operator/(const float3 a, float f)
 {
 	float invf = 1.0f/f;
-	float3 r = make_float3(a.x*invf, a.y*invf, a.z*invf);
-	return r;
+	return make_float3(a.x*invf, a.y*invf, a.z*invf);
 }

 __device_inline float3 operator/(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
-	return r;
+	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
 }

 __device_inline float3 operator+(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-	return r;
+	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
 }

 __device_inline float3 operator-(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
-	return r;
+	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
 }

 __device_inline float3 operator+=(float3& a, const float3 b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	a.z += b.z;
-	return a;
+	return a = a + b;
 }

 __device_inline float3 operator*=(float3& a, const float3 b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	a.z *= b.z;
-	return a;
+	return a = a * b;
 }

 __device_inline float3 operator*=(float3& a, float f)
 {
-	a.x *= f;
-	a.y *= f;
-	a.z *= f;
-	return a;
+	return a = a * f;
 }

 __device_inline float3 operator/=(float3& a, const float3 b)
 {
-	a.x /= b.x;
-	a.y /= b.y;
-	a.z /= b.z;
-	return a;
+	return a = a / b;
 }

 __device_inline float3 operator/=(float3& a, float f)
 {
 	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	a.z *= invf;
-	return a;
+	return a = a * invf;
 }

 __device_inline float dot(const float3 a, const float3 b)
@ -506,7 +451,11 @@ __device_inline float3 normalize_len(const float3 a, float *t)

 __device_inline bool operator==(const float3 a, const float3 b)
 {
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
+#else
 	return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
 }

 __device_inline bool operator!=(const float3 a, const float3 b)
@ -516,14 +465,20 @@ __device_inline bool operator!=(const float3 a, const float3 b)

 __device_inline float3 min(float3 a, float3 b)
 {
-	float3 r = make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_min_ps(a.m128, b.m128);
+#else
+	return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
 }

 __device_inline float3 max(float3 a, float3 b)
 {
-	float3 r = make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_max_ps(a.m128, b.m128);
+#else
+	return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
 }

 __device_inline float3 clamp(float3 a, float3 mn, float3 mx)
@ -533,7 +488,12 @@ __device_inline float3 clamp(float3 a, float3 mn, float3 mx)

 __device_inline float3 fabs(float3 a)
 {
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+	return _mm_and_ps(a.m128, mask);
+#else
 	return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
 }

 #endif
@ -555,6 +515,16 @@ __device_inline void print_float3(const char *label, const float3& a)
 	printf("%s: %.8f %.8f %.8f\n", label, a.x, a.y, a.z);
 }

+__device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 r = _mm_rcp_ps(a.m128);
+	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+
 #endif

 __device_inline float3 interp(float3 a, float3 b, float t)
@ -562,122 +532,257 @@ __device_inline float3 interp(float3 a, float3 b, float t)
 	return a + t*(b - a);
 }

+__device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float3(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+/* Sum of the x, y and z components of a. */
+__device_inline float reduce_add(const float3& a)
+{
+	/* the same scalar expression is used with and without __KERNEL_SSE__ */
+	return (a.x + a.y + a.z);
+}
+
+__device_inline float average(const float3 a)
+{
+	return reduce_add(a)*(1.0f/3.0f);
+}
+
 /* Float4 Vector */

+#ifdef __KERNEL_SSE__
+
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b)
+{
+	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
+{
+	return _mm_moveldup_ps(b);
+}
+
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+	return _mm_movehdup_ps(b);
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+	return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)));
+}
+
+#endif
+
 #ifndef __KERNEL_OPENCL__

-__device_inline bool is_zero(const float4& a)
-{
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-}
-
-__device_inline float average(const float4& a)
-{
-	return (a.x + a.y + a.z + a.w)*(1.0f/4.0f);
-}
-
 __device_inline float4 operator-(const float4& a)
 {
-	float4 r = {-a.x, -a.y, -a.z, -a.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+	return _mm_xor_ps(a.m128, mask);
+#else
+	return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
 }

 __device_inline float4 operator*(const float4& a, const float4& b)
 {
-	float4 r = {a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_mul_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
 }

 __device_inline float4 operator*(const float4& a, float f)
 {
-	float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
-	return r;
+#ifdef __KERNEL_SSE__
+	return a * make_float4(f);
+#else
+	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
 }

 __device_inline float4 operator*(float f, const float4& a)
 {
-	float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
-	return r;
+	return a * f;
+}
+
+__device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 r = _mm_rcp_ps(a.m128);
+	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
 }

 __device_inline float4 operator/(const float4& a, float f)
 {
-	float invf = 1.0f/f;
-	float4 r = {a.x*invf, a.y*invf, a.z*invf, a.w*invf};
-	return r;
+	return a * (1.0f/f);
 }

 __device_inline float4 operator/(const float4& a, const float4& b)
 {
-	float4 r = {a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return a * rcp(b);
+#else
+	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+
 }

 __device_inline float4 operator+(const float4& a, const float4& b)
 {
-	float4 r = {a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_add_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
 }

 __device_inline float4 operator-(const float4& a, const float4& b)
 {
-	float4 r = {a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_sub_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
 }

 __device_inline float4 operator+=(float4& a, const float4& b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	a.z += b.z;
-	a.w += b.w;
-	return a;
+	return a = a + b;
 }

 __device_inline float4 operator*=(float4& a, const float4& b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	a.z *= b.z;
-	a.w *= b.w;
-	return a;
+	return a = a * b;
 }

 __device_inline float4 operator/=(float4& a, float f)
 {
-	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	a.z *= invf;
-	a.w *= invf;
-	return a;
+	return a = a / f;
 }

-__device_inline float dot(const float4& a, const float4& b)
+__device_inline int4 operator<(const float4& a, const float4& b)
 {
-	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+__device_inline int4 operator>=(float4 a, float4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+__device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+__device_inline bool operator==(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
 }

 __device_inline float4 cross(const float4& a, const float4& b)
 {
-	float4 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f};
-	return r;
+#ifdef __KERNEL_SSE__
+	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+	return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
+#endif
 }

 __device_inline float4 min(float4 a, float4 b)
 {
+#ifdef __KERNEL_SSE__
+	return _mm_min_ps(a.m128, b.m128);
+#else
 	return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
 }

 __device_inline float4 max(float4 a, float4 b)
 {
+#ifdef __KERNEL_SSE__
+	return _mm_max_ps(a.m128, b.m128);
+#else
 	return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
 }

 #endif

 #ifndef __KERNEL_GPU__

+/* Per-lane select: lanes where mask is non-zero take a, the others take b. */
+__device_inline float4 select(const int4& mask, const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	/* blendv is sse4, and apparently broken on vs2008 */
+	/* build the bit mask with an integer compare: _mm_cvtepi32_ps converts
+	 * the value (1 -> 1.0f) instead of producing all-ones lanes, so using
+	 * it as a blend mask would mix bits of a and b. zero_lanes is all ones
+	 * exactly where mask == 0, matching the scalar path below */
+	__m128 zero_lanes = _mm_castsi128_ps(_mm_cmpeq_epi32(mask.m128, _mm_setzero_si128()));
+	return _mm_or_ps(_mm_andnot_ps(zero_lanes, a.m128), _mm_and_ps(zero_lanes, b.m128));
+#else
+	return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}
+
+__device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = min(shuffle<1,0,3,2>(a), a);
+	return min(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+__device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = max(shuffle<1,0,3,2>(a), a);
+	return max(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+#if 0
+__device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = shuffle<1,0,3,2>(a) + a;
+	return shuffle<2,3,0,1>(h) + h;
+#else
+	return make_float4((a.x + a.y) + (a.z + a.w));
+#endif
+}
+#endif
+
 __device_inline void print_float4(const char *label, const float4& a)
 {
 	printf("%s: %.8f %.8f %.8f %.8f\n", label, a.x, a.y, a.z, a.w);
@ -685,26 +790,77 @@ __device_inline void print_float4(const char *label, const float4& a)

 #endif

+#ifndef __KERNEL_OPENCL__
+
+__device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float4(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+__device_inline float reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = shuffle<1,0,3,2>(a) + a;
+	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
+#else
+	return ((a.x + a.y) + (a.z + a.w));
+#endif
+}
+
+__device_inline float average(const float4& a)
+{
+	return reduce_add(a) * 0.25f;
+}
+
+__device_inline float dot(const float4& a, const float4& b)
+{
+	return reduce_add(a * b);
+}
+
+#endif
+
 /* Int3 */

 #ifndef __KERNEL_OPENCL__

+__device_inline int3 min(int3 a, int3 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_min_epi32(a.m128, b.m128);
+#else
+	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
 __device_inline int3 max(int3 a, int3 b)
 {
-	int3 r = {max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_max_epi32(a.m128, b.m128);
+#else
+	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
 }

 __device_inline int3 clamp(const int3& a, int mn, int mx)
 {
-	int3 r = {clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
 }

 __device_inline int3 clamp(const int3& a, int3& mn, int mx)
 {
-	int3 r = {clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return min(max(a, mn), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
+#endif
 }

 #endif
@ -720,16 +876,63 @@ __device_inline void print_int3(const char *label, const int3& a)

 /* Int4 */

-#ifndef __KERNEL_OPENCL__
+#ifndef __KERNEL_GPU__

-__device_inline int4 operator>=(float4 a, float4 b)
+__device_inline int4 operator+(const int4& a, const int4& b)
 {
-	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#ifdef __KERNEL_SSE__
+	return _mm_add_epi32(a.m128, b.m128);
+#else
+	return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
 }

-#endif
+__device_inline int4 operator+=(int4& a, const int4& b)
+{
+	return a = a + b;
+}

-#ifndef __KERNEL_GPU__
+__device_inline int4 operator>>(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_srai_epi32(a.m128, i);
+#else
+	return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
+#endif
+}
+
+__device_inline int4 min(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_min_epi32(a.m128, b.m128);
+#else
+	return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
+}
+
+__device_inline int4 max(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_max_epi32(a.m128, b.m128);
+#else
+	return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
+}
+
+__device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
+{
+	return min(max(a, mn), mx);
+}
+
+/* Per-lane select: lanes where mask is non-zero take a, the others take b. */
+__device_inline int4 select(const int4& mask, const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	/* zero_lanes is all ones exactly where mask == 0. an integer compare is
+	 * used because _mm_cvtepi32_ps converts the value (1 -> 1.0f) and does
+	 * not give a usable bit mask for the blend */
+	__m128i zero_lanes = _mm_cmpeq_epi32(mask.m128, _mm_setzero_si128());
+	return _mm_or_si128(_mm_andnot_si128(zero_lanes, a.m128), _mm_and_si128(zero_lanes, b.m128));
+#else
+	return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}

 __device_inline void print_int4(const char *label, const int4& a)
 {
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@ -0,0 +1,223 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "util_debug.h"
+#include "util_foreach.h"
+#include "util_system.h"
+#include "util_task.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Task Pool */
+
+/* Create an empty pool: no tasks pushed, none done, not cancelled.
+ * run_ is kept as the callback that executes this pool's tasks. */
+TaskPool::TaskPool(const TaskRunFunction& run_)
+: run(run_), num(0), num_done(0), do_cancel(false)
+{
+}
+
+/* Destroying a pool drops any of its tasks still waiting in the queue. */
+TaskPool::~TaskPool()
+{
+	stop();
+}
+
+/* Hand a task of this pool to the shared scheduler queue; 'front' puts it
+ * at the head of the queue instead of the tail. */
+void TaskPool::push(Task *task, bool front)
+{
+	TaskScheduler::Entry new_entry = {task, this};
+
+	TaskScheduler::push(new_entry, front);
+}
+
+/* Block the calling thread until every task pushed to this pool has been
+ * counted as done (num_done == num). */
+void TaskPool::wait()
+{
+	thread_scoped_lock lock(done_mutex);
+
+	/* re-check the counters after every wakeup */
+	while(num_done != num)
+		done_cond.wait(lock);
+}
+
+/* Cancel this pool's work: queued tasks are removed from the scheduler, then
+ * do_cancel is raised (visible through cancelled()) while the call blocks
+ * until all tasks are accounted for. The flag is reset before returning. */
+void TaskPool::cancel()
+{
+	TaskScheduler::clear(this);
+
+	do_cancel = true;
+	wait();
+	do_cancel = false;
+}
+
+/* Remove this pool's queued tasks from the scheduler. The assert expects
+ * that no task of this pool is still unaccounted for afterwards. */
+void TaskPool::stop()
+{
+	TaskScheduler::clear(this);
+
+	assert(num_done == num);
+}
+
+/* Return the do_cancel flag, which is only set while cancel() is waiting. */
+bool TaskPool::cancelled()
+{
+	return do_cancel;
+}
+
+/* Record that 'done' tasks of this pool have finished or were removed from
+ * the queue, and wake up any thread blocked in wait(). */
+void TaskPool::done_increase(int done)
+{
+	/* hold done_mutex for the whole update: the assert then reads the
+	 * counters without racing other workers, and wait() can not observe
+	 * num_done == num and return (possibly letting the pool be destroyed)
+	 * before done_cond has been notified here */
+	thread_scoped_lock lock(done_mutex);
+
+	num_done += done;
+	assert(num_done <= num);
+
+	done_cond.notify_all();
+}
+
+/* Task Scheduler */
+
+thread_mutex TaskScheduler::mutex;
+int TaskScheduler::users = 0;
+vector<thread*> TaskScheduler::threads;
+volatile bool TaskScheduler::do_exit = false;
+
+list<TaskScheduler::Entry> TaskScheduler::queue;
+thread_mutex TaskScheduler::queue_mutex;
+thread_condition_variable TaskScheduler::queue_cond;
+
+/* Register one more user of the shared scheduler; the first user starts the
+ * worker threads. num_threads == 0 means use system_cpu_thread_count(). */
+void TaskScheduler::init(int num_threads)
+{
+	thread_scoped_lock lock(mutex);
+
+	/* multiple cycles instances can use this task scheduler, sharing the same
+	   threads, so we keep track of the number of users. */
+	if(users == 0) {
+		do_exit = false;
+
+		/* launch threads that will be waiting for work */
+		if(num_threads == 0)
+			num_threads = system_cpu_thread_count();
+
+		threads.resize(num_threads);
+
+		/* the loop index is bound as the thread_id argument of thread_run */
+		for(size_t i = 0; i < threads.size(); i++)
+			threads[i] = new thread(function_bind(&TaskScheduler::thread_run, i));
+	}
+	
+	users++;
+}
+
+/* Unregister one user of the shared scheduler; when the last user leaves,
+ * the worker threads are told to exit, joined and deleted. */
+void TaskScheduler::exit()
+{
+	thread_scoped_lock lock(mutex);
+
+	users--;
+
+	if(users == 0) {
+		/* stop all waiting threads. do_exit has to change under queue_mutex:
+		 * a worker that has just tested it in thread_wait_pop() but is not
+		 * yet blocked in wait() would otherwise miss the notification and
+		 * sleep forever, hanging the join() below. the lock is released
+		 * again before joining so the workers can leave thread_wait_pop() */
+		{
+			thread_scoped_lock queue_lock(queue_mutex);
+			do_exit = true;
+		}
+		queue_cond.notify_all();
+
+		/* delete threads */
+		foreach(thread *t, threads) {
+			t->join();
+			delete t;
+		}
+
+		threads.clear();
+	}
+}
+
+/* Wait for the next queue entry and copy it into 'entry'. Returns false only
+ * when the queue is empty and do_exit has been set, which tells the calling
+ * worker thread to stop. */
+bool TaskScheduler::thread_wait_pop(Entry& entry)
+{
+	thread_scoped_lock lock(queue_mutex);
+
+	/* sleep until there is work or the scheduler is exiting */
+	while(queue.empty() && !do_exit)
+		queue_cond.wait(lock);
+
+	if(queue.empty()) {
+		assert(do_exit);
+		return false;
+	}
+	
+	entry = queue.front();
+	queue.pop_front();
+
+	return true;
+}
+
+/* Main loop of a worker thread: executes queue entries until
+ * thread_wait_pop() returns false. thread_id is passed on to the pool's run
+ * callback. */
+void TaskScheduler::thread_run(int thread_id)
+{
+	Entry entry;
+
+	/* todo: test affinity/denormal mask */
+
+	/* keep popping off tasks */
+	while(thread_wait_pop(entry)) {
+		/* run task */
+		entry.pool->run(entry.task, thread_id);
+
+		/* delete task */
+		delete entry.task;
+
+		/* notify pool task was done */
+		entry.pool->done_increase(1);
+	}
+}
+
+/* Add an entry to the shared queue (at the head when 'front' is set), count
+ * it in the owning pool's num, and wake one worker thread. */
+void TaskScheduler::push(Entry& entry, bool front)
+{
+	/* add entry to queue; the scoped lock releases queue_mutex even if the
+	 * list insertion throws, unlike a manual lock()/unlock() pair */
+	thread_scoped_lock lock(queue_mutex);
+
+	if(front)
+		queue.push_front(entry);
+	else
+		queue.push_back(entry);
+	entry.pool->num++;
+
+	/* notify after releasing the lock, as before */
+	lock.unlock();
+
+	queue_cond.notify_one();
+}
+
+/* Drop every queued entry that belongs to 'pool', deleting its task, and
+ * report the number of dropped entries to the pool as done. */
+void TaskScheduler::clear(TaskPool *pool)
+{
+	thread_scoped_lock lock(queue_mutex);
+
+	int num_removed = 0;
+	list<Entry>::iterator iter = queue.begin();
+
+	while(iter != queue.end()) {
+		/* entries of other pools stay in the queue */
+		if(iter->pool != pool) {
+			++iter;
+			continue;
+		}
+
+		delete iter->task;
+		iter = queue.erase(iter);
+		num_removed++;
+	}
+
+	/* notify done */
+	pool->done_increase(num_removed);
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@ -0,0 +1,122 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __UTIL_TASK_H__
+#define __UTIL_TASK_H__
+
+#include "util_list.h"
+#include "util_thread.h"
+#include "util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Task;
+class TaskPool;
+class TaskScheduler;
+
+typedef boost::function<void(Task*,int)> TaskRunFunction;
+
+/* Task
+ *
+ * Base class for tasks to be executed in threads. */
+
+class Task
+{
+public:
+	/* nothing to set up in the base class */
+	Task() {}
+
+	/* virtual, so deleting through a Task pointer destroys the derived task */
+	virtual ~Task() {}
+};
+
+/* Task Pool
+ *
+ * Pool of tasks that will be executed by the central TaskScheduler. For each
+ * pool, we can wait for all tasks to be done, or cancel them before they are
+ * done.
+ *
+ * The run callback that actually executes the task may be created like this:
+ * function_bind(&MyClass::task_execute, this, _1, _2) */
+
+class TaskPool
+{
+public:
+	TaskPool(const TaskRunFunction& run);
+	~TaskPool();
+
+	/* queue a task; front = true inserts it at the head of the queue */
+	void push(Task *task, bool front = false);
+
+	void wait();		/* wait until all tasks are done */
+	void cancel();		/* cancel all tasks, keep worker threads running */
+	void stop();		/* remove this pool's queued tasks, worker threads keep running */
+
+	bool cancelled();	/* for worker threads, test if cancelled */
+
+protected:
+	friend class TaskScheduler;
+
+	/* add 'done' to num_done and wake up threads blocked in wait() */
+	void done_increase(int done);
+
+	/* callback invoked by the scheduler's worker threads for each task */
+	TaskRunFunction run;
+
+	thread_mutex done_mutex;
+	thread_condition_variable done_cond;
+
+	/* num: tasks pushed so far, num_done: tasks finished or removed */
+	volatile int num, num_done;
+	volatile bool do_cancel;
+};
+
+/* Task Scheduler
+ *
+ * Central scheduler that holds running threads ready to execute tasks. A single
+ * queue holds the tasks from all pools. */
+
+class TaskScheduler
+{
+public:
+	/* init/exit are counted per user: the first init starts the worker
+	 * threads and the last exit joins and deletes them */
+	static void init(int num_threads = 0);
+	static void exit();
+
+	static int num_threads() { return threads.size(); }
+
+protected:
+	friend class TaskPool;
+
+	/* queued task together with the pool it belongs to */
+	struct Entry {
+		Task *task;
+		TaskPool *pool;
+	};
+
+	/* mutex guards users and threads in init() and exit() */
+	static thread_mutex mutex;
+	static int users;
+	static vector<thread*> threads;
+	static volatile bool do_exit;
+
+	/* queue_mutex guards queue, queue_cond wakes workers waiting on it */
+	static list<Entry> queue;
+	static thread_mutex queue_mutex;
+	static thread_condition_variable queue_cond;
+
+	static void thread_run(int thread_id);
+	static bool thread_wait_pop(Entry& entry);
+
+	static void push(Entry& entry, bool front);
+	static void clear(TaskPool *pool);
+};
+
+CCL_NAMESPACE_END
+
+#endif
+
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@ -69,133 +69,6 @@ protected:
 	bool joined;
 };

-/* Thread Safe Queue to pass tasks from one thread to another. Tasks should be
- * pushed into the queue, while the worker thread waits to pop the next task
- * off the queue. Once all tasks are into the queue, calling stop() will stop
- * the worker threads from waiting for more tasks once all tasks are done. */
-
-template<typename T> class ThreadQueue
-{
-public:
-	ThreadQueue()
-	{
-		tot = 0;
-		tot_done = 0;
-		do_stop = false;
-		do_cancel = false;
-	}
-
-	/* Main thread functions */
-
-	/* push a task to be executed */
-	void push(const T& value)
-	{
-		thread_scoped_lock lock(queue_mutex);
-		queue.push(value);
-		tot++;
-		lock.unlock();
-
-		queue_cond.notify_one();
-	}
-
-	/* wait until all tasks are done */
-	void wait_done()
-	{
-		thread_scoped_lock lock(done_mutex);
-
-		while(tot_done != tot)
-			done_cond.wait(lock);
-	}
-
-	/* stop all worker threads */
-	void stop()
-	{
-		clear();
-		do_stop = true;
-		queue_cond.notify_all();
-	}
-
-	/* cancel all tasks, but keep worker threads running */
-	void cancel()
-	{
-		clear();
-		do_cancel = true;
-		wait_done();
-		do_cancel = false;
-	}
-
-	/* Worker thread functions
-     *
-	 * while(queue.worker_wait_pop(task)) {
-	 *		for(..) {
-	 *			... do work ...
-	 *
-	 *			if(queue.worker_cancel())
-	 *				break;
-	 *      }
-	 *		
-	 *		queue.worker_done();
-	 * }
-	 */
-
-	bool worker_wait_pop(T& value)
-	{
-		thread_scoped_lock lock(queue_mutex);
-
-		while(queue.empty() && !do_stop)
-			queue_cond.wait(lock);
-
-		if(queue.empty())
-			return false;
-		
-		value = queue.front();
-		queue.pop();
-
-		return true;
-	}
-
-	void worker_done()
-	{
-		thread_scoped_lock lock(done_mutex);
-		tot_done++;
-		lock.unlock();
-
-		assert(tot_done <= tot);
-
-		done_cond.notify_all();
-	}
-
-	bool worker_cancel()
-	{
-		return do_cancel;
-	}
-
-protected:
-	void clear()
-	{
-		thread_scoped_lock lock(queue_mutex);
-
-		while(!queue.empty()) {
-			thread_scoped_lock done_lock(done_mutex);
-			tot_done++;
-			done_lock.unlock();
-
-			queue.pop();
-		}
-
-		done_cond.notify_all();
-	}
-
-	std::queue<T> queue;
-	thread_mutex queue_mutex;
-	thread_mutex done_mutex;
-	thread_condition_variable queue_cond;
-	thread_condition_variable done_cond;
-	volatile bool do_stop;
-	volatile bool do_cancel;
-	volatile int tot, tot_done;
-};
-
 /* Thread Local Storage
 *
 * Boost implementation is a bit slow, and Mac OS X __thread is not supported
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@ -129,23 +129,26 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])

 Transform transform_inverse(const Transform& tfm)
 {
-	union { Transform T; float M[4][4]; } R, M;
+	Transform tfmR = transform_identity();
+	float M[4][4], R[4][4];

-	R.T = transform_identity();
-	M.T = tfm;
+	memcpy(R, &tfmR, sizeof(R));
+	memcpy(M, &tfm, sizeof(M));

-	if(!transform_matrix4_gj_inverse(R.M, M.M)) {
+	if(!transform_matrix4_gj_inverse(R, M)) {
 		/* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
 		   never be in this situation, but try to invert it anyway with tweak */
-		M.M[0][0] += 1e-8f;
-		M.M[1][1] += 1e-8f;
-		M.M[2][2] += 1e-8f;
+		M[0][0] += 1e-8f;
+		M[1][1] += 1e-8f;
+		M[2][2] += 1e-8f;

-		if(!transform_matrix4_gj_inverse(R.M, M.M))
+		if(!transform_matrix4_gj_inverse(R, M))
 			return transform_identity();
 	}

-	return R.T;
+	memcpy(&tfmR, R, sizeof(R));
+
+	return tfmR;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@ -36,23 +36,37 @@
 #define __shared
 #define __constant

-#ifdef __GNUC__
-#define __device_inline static inline __attribute__((always_inline))
-#else
+/* MSVC-style keywords only for non-GCC Windows compilers: MinGW defines
+ * _WIN32 as well, but needs the GCC attribute spellings below */
+#if defined(_WIN32) && !defined(__GNUC__)
 #define __device_inline static __forceinline
+#define __align(...) __declspec(align(__VA_ARGS__))
 #else
+#define __device_inline static inline __attribute__((always_inline))
+/* some MinGW headers already provide __forceinline, do not redefine it */
+#ifndef __forceinline
+#define __forceinline inline __attribute__((always_inline))
+#endif
+#define __align(...) __attribute__((aligned(__VA_ARGS__)))
 #endif

 #endif

+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __KERNEL_64_BIT__
+#endif
+
 /* SIMD Types */

-/* not needed yet, will be for qbvh
-#ifndef __KERNEL_GPU__
+/* not enabled, globally applying it just gives slowdown,
+ * but useful for testing. */
+//#define __KERNEL_SSE__
+#ifdef __KERNEL_SSE__

-#include <emmintrin.h>
-#include <xmmintrin.h>
+#include <xmmintrin.h> /* SSE 1 */
+#include <emmintrin.h> /* SSE 2 */
+#include <pmmintrin.h> /* SSE 3 */
+#include <tmmintrin.h> /* SSE 3 */
+#include <smmintrin.h> /* SSE 4 */

-#endif*/
+#endif

 #ifndef _WIN32
 #ifndef __KERNEL_GPU__
@ -97,6 +111,12 @@ typedef unsigned int uint32_t;
 typedef long long int64_t;
 typedef unsigned long long uint64_t;

+#ifdef __KERNEL_64_BIT__
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+
 #endif

 /* Generic Memory Pointer */
@ -108,89 +128,137 @@ typedef uint64_t device_ptr;
 struct uchar2 {
 	uchar x, y;

-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };

 struct uchar3 {
 	uchar x, y, z;

-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };

 struct uchar4 {
 	uchar x, y, z, w;

-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };

 struct int2 {
 	int x, y;

-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };

+#ifdef __KERNEL_SSE__
+struct __align(16) int3 {
+	union {
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int3() {}
+	__forceinline int3(const __m128i a) : m128(a) {}
+	__forceinline operator const __m128i&(void) const { return m128; }
+	__forceinline operator __m128i&(void) { return m128; }
+#else
 struct int3 {
-	int x, y, z;
+	int x, y, z, w;
+#endif

-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };

+#ifdef __KERNEL_SSE__
+struct __align(16) int4 {
+	union {
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int4() {}
+	__forceinline int4(const __m128i a) : m128(a) {}
+	__forceinline operator const __m128i&(void) const { return m128; }
+	__forceinline operator __m128i&(void) { return m128; }
+#else
 struct int4 {
 	int x, y, z, w;
+#endif

-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };

 struct uint2 {
 	uint x, y;

-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };

 struct uint3 {
 	uint x, y, z;

-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };

 struct uint4 {
 	uint x, y, z, w;

-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };

 struct float2 {
 	float x, y;

-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };

-struct float3 {
-	float x, y, z;
+#ifdef __KERNEL_SSE__
+struct __align(16) float3 {
+	union {
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};

-#ifdef WITH_OPENCL
-	float w;
+	__forceinline float3() {}
+	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline operator const __m128&(void) const { return m128; }
+	__forceinline operator __m128&(void) { return m128; }
+#else
+struct float3 {
+	float x, y, z, w;
 #endif

-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };

+#ifdef __KERNEL_SSE__
+struct __align(16) float4 {
+	union {
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};
+
+	__forceinline float4() {}
+	__forceinline float4(const __m128 a) : m128(a) {}
+	__forceinline operator const __m128&(void) const { return m128; }
+	__forceinline operator __m128&(void) { return m128; }
+#else
 struct float4 {
 	float x, y, z, w;
+#endif

-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };

 #endif
@ -201,87 +269,179 @@ struct float4 {
 * 
 * OpenCL does not support C++ class, so we use these instead. */

-__device uchar2 make_uchar2(uchar x, uchar y)
+__device_inline uchar2 make_uchar2(uchar x, uchar y)
 {
 	uchar2 a = {x, y};
 	return a;
 }

-__device uchar3 make_uchar3(uchar x, uchar y, uchar z)
+__device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
 {
 	uchar3 a = {x, y, z};
 	return a;
 }

-__device uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
+__device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
 {
 	uchar4 a = {x, y, z, w};
 	return a;
 }

-__device int2 make_int2(int x, int y)
+__device_inline int2 make_int2(int x, int y)
 {
 	int2 a = {x, y};
 	return a;
 }

-__device int3 make_int3(int x, int y, int z)
+__device_inline int3 make_int3(int x, int y, int z)
 {
-	int3 a = {x, y, z};
+#ifdef __KERNEL_SSE__
+	int3 a;
+	a.m128 = _mm_set_epi32(0, z, y, x);
+#else
+	int3 a = {x, y, z, 0};
+#endif
+
 	return a;
 }

-__device int4 make_int4(int x, int y, int z, int w)
+__device_inline int4 make_int4(int x, int y, int z, int w)
 {
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_set_epi32(w, z, y, x);
+#else
 	int4 a = {x, y, z, w};
+#endif
+
 	return a;
 }

-__device uint2 make_uint2(uint x, uint y)
+__device_inline uint2 make_uint2(uint x, uint y)
 {
 	uint2 a = {x, y};
 	return a;
 }

-__device uint3 make_uint3(uint x, uint y, uint z)
+__device_inline uint3 make_uint3(uint x, uint y, uint z)
 {
 	uint3 a = {x, y, z};
 	return a;
 }

-__device uint4 make_uint4(uint x, uint y, uint z, uint w)
+__device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
 {
 	uint4 a = {x, y, z, w};
 	return a;
 }

-__device float2 make_float2(float x, float y)
+__device_inline float2 make_float2(float x, float y)
 {
 	float2 a = {x, y};
 	return a;
 }

-__device float3 make_float3(float x, float y, float z)
+__device_inline float3 make_float3(float x, float y, float z)
 {
-#ifdef WITH_OPENCL
-	float3 a = {x, y, z, 0.0f};
+#ifdef __KERNEL_SSE__
+	float3 a;
+	a.m128 = _mm_set_ps(0.0f, z, y, x);
 #else
-	float3 a = {x, y, z};
+	float3 a = {x, y, z, 0.0f};
 #endif
+
 	return a;
 }

-__device float4 make_float4(float x, float y, float z, float w)
+__device_inline float4 make_float4(float x, float y, float z, float w)
 {
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_set_ps(w, z, y, x);
+#else
 	float4 a = {x, y, z, w};
+#endif
+
 	return a;
 }

-__device int align_up(int offset, int alignment)
+__device_inline int align_up(int offset, int alignment)
 {
 	return (offset + alignment - 1) & ~(alignment - 1);
 }

+__device_inline int3 make_int3(int i)
+{
+#ifdef __KERNEL_SSE__
+	int3 a;
+	a.m128 = _mm_set1_epi32(i);
+#else
+	int3 a = {i, i, i, i};
+#endif
+
+	return a;
+}
+
+__device_inline int4 make_int4(int i)
+{
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_set1_epi32(i);
+#else
+	int4 a = {i, i, i, i};
+#endif
+
+	return a;
+}
+
+__device_inline float3 make_float3(float f)
+{
+#ifdef __KERNEL_SSE__
+	float3 a;
+	a.m128 = _mm_set1_ps(f);
+#else
+	float3 a = {f, f, f, f};
+#endif
+
+	return a;
+}
+
+__device_inline float4 make_float4(float f)
+{
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_set1_ps(f);
+#else
+	float4 a = {f, f, f, f};
+#endif
+
+	return a;
+}
+
+__device_inline float4 make_float4(const int4& i)
+{
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_cvtepi32_ps(i.m128);
+#else
+	float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
+#endif
+
+	return a;
+}
+
+/* Convert the four float lanes of f (x, y, z, w) to ints, truncating toward zero in both builds. */
+__device_inline int4 make_int4(const float3& f)
+{
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_cvttps_epi32(f.m128); /* cvttps truncates like the (int) casts below; cvtps would round per MXCSR */
+#else
+	int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+	return a;
+}
+
 #endif

 CCL_NAMESPACE_END
--- a/intern/elbeem/intern/ntl_geometrymodel.cpp
+++ b/intern/elbeem/intern/ntl_geometrymodel.cpp
@ -174,7 +174,7 @@ int ntlGeometryObjModel::initModel(int numVertices, float *vertices, int numTria
 		anitimes.clear();
 		for(int frame=0; frame<channelSize; frame++) {
 			ntlSetVec3f averts; averts.mVerts.clear();
-			ntlSetVec3f anorms; averts.mVerts.clear();
+			ntlSetVec3f anorms; anorms.mVerts.clear();
 			int setsize = (3*numVertices+1);

 			ntlVec3Gfx p(0.),n(1.);
--- a/intern/elbeem/intern/ntl_world.cpp
+++ b/intern/elbeem/intern/ntl_world.cpp
@ -400,8 +400,8 @@ int ntlWorld::advanceSims(int framenum)
 	bool done = false;
 	bool allPanic = true;

-	// stop/quit, dont display/render
-	if(getElbeemState()==SIMWORLD_STOP) { 
+	// stop/quit (abort), don't display/render
+	if(!isSimworldOk()) { 
 		return 1;
 	}

@ -411,6 +411,9 @@ int ntlWorld::advanceSims(int framenum)
 	// time stopped? nothing else to do...
 	if( (*mpSims)[mFirstSim]->getFrameTime(framenum) <= 0.0 ){ 
 		done=true; allPanic=false; 
+
+		/* DG: Need to check for user cancel here (fix for [#30298]) */
+		(*mpSims)[mFirstSim]->checkCallerStatus(FLUIDSIM_CBSTATUS_STEP, 0);
 	}

 	int gstate = 0;
--- a/intern/elbeem/intern/solver_init.cpp
+++ b/intern/elbeem/intern/solver_init.cpp
@ -1453,9 +1453,7 @@ void LbmFsgrSolver::initMovingObstacles(bool staticInit) {
 			//errMsg("GEOACTT"," obj "<<obj->getName()<<" a:"<<active<<","<<wasActive<<"  s"<<sourceTime<<" t"<<targetTime <<" v"<<mObjectSpeeds[OId] );
 			// skip inactive in/out flows
 			if(ntype==CFInvalid){ errMsg("LbmFsgrSolver::initMovingObstacles","Invalid obj type "<<obj->getGeoInitType()); continue; }
-			/* DG: only inflows/outlfows could be activated/deactivated, test new code that everything can be activated
-			if((!active) && (otype&(CFMbndOutflow|CFMbndInflow)) ) continue; */
-			if((!active) /* && (otype&(CFMbndOutflow|CFMbndInflow)) */ ) continue;
+			if((!active) && (otype&(CFMbndOutflow|CFMbndInflow)) ) continue;

 			// copied from  recalculateObjectSpeeds
 			mObjectSpeeds[OId] = vec2L(mpParam->calculateLattVelocityFromRw( vec2P( (*mpGiObjects)[OId]->getInitialVelocity(mSimulationTime) )));
--- a/intern/elbeem/intern/solver_relax.h
+++ b/intern/elbeem/intern/solver_relax.h
@ -390,7 +390,7 @@
 #define  DEFAULT_STREAM  \
 	m[dC] = RAC(ccel,dC); \
 	 \
-	if((!nbored & CFBnd)) { \
+	if(((!nbored) & CFBnd)) { \
 	 \
 	m[dN ] = CSRC_N ; m[dS ] = CSRC_S ; \
 	m[dE ] = CSRC_E ; m[dW ] = CSRC_W ; \
--- a/intern/ghost/GHOST_Rect.h
+++ b/intern/ghost/GHOST_Rect.h
@ -241,8 +241,10 @@ inline void GHOST_Rect::wrapPoint(GHOST_TInt32 &x, GHOST_TInt32 &y, GHOST_TInt32
 	GHOST_TInt32 h= getHeight();

 	/* highly unlikely but avoid eternal loop */
-	if(w-ofs*2 <= 0 || h-ofs*2 <= 0)
+	if (w-ofs*2 <= 0 || h-ofs*2 <= 0) {
 		return;
+	}
+
 	while(x-ofs < m_l)		x+= w-(ofs*2);
 	while(y-ofs < m_t)		y+= h-(ofs*2);
 	while(x+ofs > m_r)		x-= w-(ofs*2);
--- a/intern/ghost/SConscript
+++ b/intern/ghost/SConscript
@ -49,7 +49,7 @@ elif window_system in ('linux', 'openbsd3', 'sunos5', 'freebsd7', 'freebsd8', 'f
    else:
        sources.remove('intern' + os.sep + 'GHOST_DropTargetX11.cpp')

-elif window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc'):
+elif window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc', 'win64-mingw'):
    for f in pf:
        try:
            sources.remove('intern' + os.sep + f + 'X11.cpp')
@ -98,7 +98,7 @@ if env['WITH_BF_3DMOUSE']:
 else:
    sources.remove('intern' + os.sep + 'GHOST_NDOFManager.cpp')
    try:
-        if window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc'):
+        if window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc', 'win64-mingw'):
            sources.remove('intern' + os.sep + 'GHOST_NDOFManagerWin32.cpp')
        elif window_system=='darwin':
            sources.remove('intern' + os.sep + 'GHOST_NDOFManagerCocoa.mm')
@ -108,7 +108,7 @@ else:
        pass


-if window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc'):
+if window_system in ('win32-vc', 'win32-mingw', 'cygwin', 'linuxcross', 'win64-vc', 'win64-mingw'):
    incs = env['BF_WINTAB_INC'] + ' ' + incs
    incs += ' ../utfconv'

--- a/intern/ghost/intern/GHOST_NDOFManagerX11.cpp
+++ b/intern/ghost/intern/GHOST_NDOFManagerX11.cpp
@ -75,9 +75,11 @@ bool GHOST_NDOFManagerX11::available()

 bool GHOST_NDOFManagerX11::processEvents()
 {
+	bool anyProcessed = false;
+
+	if (m_available) {
 		GHOST_TUns64 now = m_system.getMilliSeconds();

-	bool anyProcessed = false;
 		spnav_event e;
 		while (spnav_poll_event(&e)) {
 			switch (e.type) {
@ -97,6 +99,8 @@ bool GHOST_NDOFManagerX11::processEvents()
 			}
 			anyProcessed = true;
 		}
+	}
+
 	return anyProcessed;
 }

--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@ -815,9 +815,10 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const& raw)
 	// send motion. Mark as 'sent' so motion will always get dispatched.
 	eventSent = true;

-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(FREE_WINDOWS64)
 	// using Microsoft compiler & header files
-	// they invented the RawInput API, so this version is (probably) correct
+	// they invented the RawInput API, so this version is (probably) correct.
+	// MinGW64 also works fine with this
 	BYTE const* data = raw.data.hid.bRawData;
 	// struct RAWHID {
 	// DWORD dwSizeHid;
--- a/intern/guardedalloc/MEM_sys_types.h
+++ b/intern/guardedalloc/MEM_sys_types.h
@ -108,8 +108,10 @@ typedef uint64_t  u_int64_t;
 #include <inttypes.h>

 #elif defined(FREE_WINDOWS)
+#ifndef FREE_WINDOWS64
 /* define htonl here, there must be a syntax error in winsock2.h in MinGW */
 unsigned long __attribute__((__stdcall__)) htonl(unsigned long);
+#endif
 #include <stdint.h>

 #else
--- a/intern/guardedalloc/intern/mallocn.c
+++ b/intern/guardedalloc/intern/mallocn.c
@ -243,7 +243,7 @@ void *MEM_dupallocN(void *vmemh)
 		MemHead *memh= vmemh;
 		memh--;
 		
-		if(memh->mmap)
+		if (memh->mmap)
 			newp= MEM_mapallocN(memh->len, "dupli_mapalloc");
 		else
 			newp= MEM_mallocN(memh->len, "dupli_alloc");
@ -265,8 +265,8 @@ void *MEM_reallocN(void *vmemh, size_t len)
 		memh--;

 		newp= MEM_mallocN(len, memh->name);
-		if(newp) {
-			if(len < memh->len)
+		if (newp) {
+			if (len < memh->len)
 				memcpy(newp, vmemh, len);
 			else
 				memcpy(newp, vmemh, memh->len);
@ -311,14 +311,14 @@ void *MEM_mallocN(size_t len, const char *str)
 	
 	memh= (MemHead *)malloc(len+sizeof(MemHead)+sizeof(MemTail));

-	if(memh) {
+	if (memh) {
 		make_memhead_header(memh, len, str);
 		mem_unlock_thread();
-		if(malloc_debug_memset && len)
+		if (malloc_debug_memset && len)
 			memset(memh+1, 255, len);

 #ifdef DEBUG_MEMCOUNTER
-		if(_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
+		if (_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
 			memcount_raise(__func__);
 		memh->_count= _mallocn_count++;
 #endif
@ -339,11 +339,11 @@ void *MEM_callocN(size_t len, const char *str)

 	memh= (MemHead *)calloc(len+sizeof(MemHead)+sizeof(MemTail),1);

-	if(memh) {
+	if (memh) {
 		make_memhead_header(memh, len, str);
 		mem_unlock_thread();
 #ifdef DEBUG_MEMCOUNTER
-		if(_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
+		if (_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
 			memcount_raise(__func__);
 		memh->_count= _mallocn_count++;
 #endif
@ -366,14 +366,14 @@ void *MEM_mapallocN(size_t len, const char *str)
 	memh= mmap(NULL, len+sizeof(MemHead)+sizeof(MemTail),
 			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);

-	if(memh!=(MemHead *)-1) {
+	if (memh!=(MemHead *)-1) {
 		make_memhead_header(memh, len, str);
 		memh->mmap= 1;
 		mmap_in_use += len;
 		peak_mem = mmap_in_use > peak_mem ? mmap_in_use : peak_mem;
 		mem_unlock_thread();
 #ifdef DEBUG_MEMCOUNTER
-		if(_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
+		if (_mallocn_count==DEBUG_MEMCOUNTER_ERROR_VAL)
 			memcount_raise(__func__);
 		memh->_count= _mallocn_count++;
 #endif
@ -406,9 +406,9 @@ static int compare_len(const void *p1, const void *p2)
 	const MemPrintBlock *pb1= (const MemPrintBlock*)p1;
 	const MemPrintBlock *pb2= (const MemPrintBlock*)p2;

-	if(pb1->len < pb2->len)
+	if (pb1->len < pb2->len)
 		return 1;
-	else if(pb1->len == pb2->len)
+	else if (pb1->len == pb2->len)
 		return 0;
 	else
 		return -1;
@ -431,7 +431,7 @@ void MEM_printmemlist_stats(void)
 	membl = membase->first;
 	if (membl) membl = MEMNEXT(membl);

-	while(membl) {
+	while (membl) {
 		pb->name= membl->name;
 		pb->len= membl->len;
 		pb->items= 1;
@ -439,18 +439,18 @@ void MEM_printmemlist_stats(void)
 		totpb++;
 		pb++;

-		if(membl->next)
+		if (membl->next)
 			membl= MEMNEXT(membl->next);
 		else break;
 	}

 	/* sort by name and add together blocks with the same name */
 	qsort(printblock, totpb, sizeof(MemPrintBlock), compare_name);
-	for(a=0, b=0; a<totpb; a++) {
-		if(a == b) {
+	for (a = 0, b=0; a<totpb; a++) {
+		if (a == b) {
 			continue;
 		}
-		else if(strcmp(printblock[a].name, printblock[b].name) == 0) {
+		else if (strcmp(printblock[a].name, printblock[b].name) == 0) {
 			printblock[b].len += printblock[a].len;
 			printblock[b].items++;
 		}
@ -465,7 +465,7 @@ void MEM_printmemlist_stats(void)
 	qsort(printblock, totpb, sizeof(MemPrintBlock), compare_len);
 	printf("\ntotal memory len: %.3f MB\n", (double)mem_in_use/(double)(1024*1024));
 	printf(" ITEMS TOTAL-MiB AVERAGE-KiB TYPE\n");
-	for(a=0, pb=printblock; a<totpb; a++, pb++)
+	for (a = 0, pb=printblock; a<totpb; a++, pb++)
 		printf("%6d (%8.3f  %8.3f) %s\n", pb->items, (double)pb->len/(double)(1024*1024), (double)pb->len/1024.0/(double)pb->items, pb->name);

 	free(printblock);
@ -491,7 +491,7 @@ static void MEM_printmemlist_internal( int pydict )
 		print_error("# membase_debug.py\n");
 		print_error("membase = [\\\n");
 	}
-	while(membl) {
+	while (membl) {
 		if (pydict) {
 			fprintf(stderr, "{'len':" SIZET_FORMAT ", 'name':'''%s''', 'pointer':'%p'},\\\n", SIZET_ARG(membl->len), membl->name, (void *)(membl+1));
 		} else {
@ -501,7 +501,7 @@ static void MEM_printmemlist_internal( int pydict )
 			print_error("%s len: " SIZET_FORMAT " %p\n", membl->name, SIZET_ARG(membl->len), membl+1);
 #endif
 		}
-		if(membl->next)
+		if (membl->next)
 			membl= MEMNEXT(membl->next);
 		else break;
 	}
@ -536,9 +536,9 @@ void MEM_callbackmemlist(void (*func)(void*)) {
 	membl = membase->first;
 	if (membl) membl = MEMNEXT(membl);

-	while(membl) {
+	while (membl) {
 		func(membl+1);
-		if(membl->next)
+		if (membl->next)
 			membl= MEMNEXT(membl->next);
 		else break;
 	}
@ -554,13 +554,13 @@ short MEM_testN(void *vmemh) {
 	membl = membase->first;
 	if (membl) membl = MEMNEXT(membl);

-	while(membl) {
+	while (membl) {
 		if (vmemh == membl+1) {
 			mem_unlock_thread();
 			return 1;
 		}

-		if(membl->next)
+		if (membl->next)
 			membl= MEMNEXT(membl->next);
 		else break;
 	}
@ -585,13 +585,13 @@ short MEM_freeN(void *vmemh)		/* anders compileertie niet meer */
 	MemHead *memh= vmemh;
 	const char *name;

-	if (memh == NULL){
+	if (memh == NULL) {
 		MemorY_ErroR("free","attempt to free NULL pointer");
 		/* print_error(err_stream, "%d\n", (memh+4000)->tag1); */
 		return(-1);
 	}

-	if(sizeof(intptr_t)==8) {
+	if (sizeof(intptr_t)==8) {
 		if (((intptr_t) memh) & 0x7) {
 			MemorY_ErroR("free","attempt to free illegal pointer");
 			return(-1);
@ -605,7 +605,7 @@ short MEM_freeN(void *vmemh)		/* anders compileertie niet meer */
 	}
 	
 	memh--;
-	if(memh->tag1 == MEMFREE && memh->tag2 == MEMFREE) {
+	if (memh->tag1 == MEMFREE && memh->tag2 == MEMFREE) {
 		MemorY_ErroR(memh->name,"double free");
 		return(-1);
 	}
@ -613,7 +613,7 @@ short MEM_freeN(void *vmemh)		/* anders compileertie niet meer */
 	mem_lock_thread();
 	if ((memh->tag1 == MEMTAG1) && (memh->tag2 == MEMTAG2) && ((memh->len & 0x3) == 0)) {
 		memt = (MemTail *)(((char *) memh) + sizeof(MemHead) + memh->len);
-		if (memt->tag3 == MEMTAG3){
+		if (memt->tag3 == MEMTAG3) {
 			
 			memh->tag1 = MEMFREE;
 			memh->tag2 = MEMFREE;
@ -628,7 +628,7 @@ short MEM_freeN(void *vmemh)		/* anders compileertie niet meer */
 		error = 2;
 		MemorY_ErroR(memh->name,"end corrupt");
 		name = check_memlist(memh);
-		if (name != NULL){
+		if (name != NULL) {
 			if (name != memh->name) MemorY_ErroR(name,"is also corrupt");
 		}
 	} else{
@ -694,13 +694,13 @@ static void rem_memblock(MemHead *memh)
 	totblock--;
 	mem_in_use -= memh->len;

-	if(memh->mmap) {
+	if (memh->mmap) {
 		mmap_in_use -= memh->len;
 		if (munmap(memh, memh->len + sizeof(MemHead) + sizeof(MemTail)))
 			printf("Couldn't unmap memory %s\n", memh->name);
 	}
 	else {
-		if(malloc_debug_memset && memh->len)
+		if (malloc_debug_memset && memh->len)
 			memset(memh+1, 255, memh->len);
 		free(memh);
 	}
@ -723,7 +723,7 @@ static const char *check_memlist(MemHead *memh)
 	forw = membase->first;
 	if (forw) forw = MEMNEXT(forw);
 	forwok = NULL;
-	while(forw){
+	while (forw) {
 		if (forw->tag1 != MEMTAG1 || forw->tag2 != MEMTAG2) break;
 		forwok = forw;
 		if (forw->next) forw = MEMNEXT(forw->next);
@ -733,7 +733,7 @@ static const char *check_memlist(MemHead *memh)
 	back = (MemHead *) membase->last;
 	if (back) back = MEMNEXT(back);
 	backok = NULL;
-	while(back){
+	while (back) {
 		if (back->tag1 != MEMTAG1 || back->tag2 != MEMTAG2) break;
 		backok = back;
 		if (back->prev) back = MEMNEXT(back->prev);
@ -742,13 +742,13 @@ static const char *check_memlist(MemHead *memh)

 	if (forw != back) return ("MORE THAN 1 MEMORYBLOCK CORRUPT");

-	if (forw == NULL && back == NULL){
+	if (forw == NULL && back == NULL) {
 		/* no bad headers found, so go looking for the memblock itself */

 		forw = membase->first;
 		if (forw) forw = MEMNEXT(forw);
 		forwok = NULL;
-		while(forw){
+		while (forw) {
 			if (forw == memh) break;
 			if (forw->tag1 != MEMTAG1 || forw->tag2 != MEMTAG2) break;
 			forwok = forw;
@ -760,7 +760,7 @@ static const char *check_memlist(MemHead *memh)
 		back = (MemHead *) membase->last;
 		if (back) back = MEMNEXT(back);
 		backok = NULL;
-		while(back){
+		while (back) {
 			if (back == memh) break;
 			if (back->tag1 != MEMTAG1 || back->tag2 != MEMTAG2) break;
 			backok = back;
@ -772,10 +772,10 @@ static const char *check_memlist(MemHead *memh)
 	if (forwok) name = forwok->nextname;
 	else name = "No name found";

-	if (forw == memh){
+	if (forw == memh) {
 		/* to be on the safe side, this block is removed from the list */
-		if (forwok){
-			if (backok){
+		if (forwok) {
+			if (backok) {
 				forwok->next = (MemHead *)&backok->next;
 				backok->prev = (MemHead *)&forwok->next;
 				forwok->nextname = backok->name;
@ -785,7 +785,7 @@ static const char *check_memlist(MemHead *memh)
 /*  				membase->last = (struct Link *) &forwok->next; */
 			}
 		} else{
-			if (backok){
+			if (backok) {
 				backok->prev = NULL;
 				membase->first = &backok->next;
 			} else{
--- a/intern/mikktspace/mikktspace.c
+++ b/intern/mikktspace/mikktspace.c
--- a/Show More
+++ b/Show More