diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index 3140bf23376..c12751979cd 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -14,22 +14,26 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
- 
+
 #include "bvh_build.h"
 #include "bvh_sort.h"
 
 #include "util_algorithm.h"
 #include "util_debug.h"
+#include "util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* silly workaround for float extended precision that happens when compiling
+static const int BVH_SORT_THRESHOLD = 4096;  /* Below this count, sort in one thread. */
+
+/* Silly workaround for float extended precision that happens when compiling
  * on x86, due to one float staying in 80 bit precision register and the other
- * not, which causes the strictly weak ordering to break */
+ * not, which causes the strict weak ordering of the comparison to break.
+ */
 #if !defined(__i386__)
-#define NO_EXTENDED_PRECISION
+#  define NO_EXTENDED_PRECISION
 #else
-#define NO_EXTENDED_PRECISION volatile
+#  define NO_EXTENDED_PRECISION volatile
 #endif
 
 struct BVHReferenceCompare {
@@ -41,28 +45,148 @@ public:
 		dim = dim_;
 	}
 
-	bool operator()(const BVHReference& ra, const BVHReference& rb)
+	/* Three-way compare: by min+max of the bounds along dim, then by primitive
+	 * object, index and type.
+	 * Returns -1, 0 or 1, similar to the return value of strcmp().
+	 */
+	__forceinline int compare(const BVHReference& ra,
+	                          const BVHReference& rb) const
 	{
 		NO_EXTENDED_PRECISION float ca = ra.bounds().min[dim] + ra.bounds().max[dim];
 		NO_EXTENDED_PRECISION float cb = rb.bounds().min[dim] + rb.bounds().max[dim];
 
-		if(ca < cb) return true;
-		else if(ca > cb) return false;
-		else if(ra.prim_object() < rb.prim_object()) return true;
-		else if(ra.prim_object() > rb.prim_object()) return false;
-		else if(ra.prim_index() < rb.prim_index()) return true;
-		else if(ra.prim_index() > rb.prim_index()) return false;
-		else if(ra.prim_type() < rb.prim_type()) return true;
-		else if(ra.prim_type() > rb.prim_type()) return false;
+		if(ca < cb) return -1;
+		else if(ca > cb) return 1;
+		else if(ra.prim_object() < rb.prim_object()) return -1;
+		else if(ra.prim_object() > rb.prim_object()) return 1;
+		else if(ra.prim_index() < rb.prim_index()) return -1;
+		else if(ra.prim_index() > rb.prim_index()) return 1;
+		else if(ra.prim_type() < rb.prim_type()) return -1;
+		else if(ra.prim_type() > rb.prim_type()) return 1;
 
-		return false;
+		return 0;
+	}
+
+	bool operator()(const BVHReference& ra, const BVHReference& rb) const
+	{
+		return (compare(ra, rb) < 0);  /* "Less than": true when ra orders before rb. */
 	}
 };
 
+/* Forward declaration: BVHSortTask below binds this function as its job. */
+static void bvh_reference_sort_threaded(TaskPool *task_pool,
+                                        BVHReference *data,
+                                        const int job_start, const int job_end,
+                                        const BVHReferenceCompare& compare);
+
+/* Task which sorts the inclusive range [job_start, job_end] of data by
+ * calling bvh_reference_sort_threaded() with the arguments given here.
+ */
+class BVHSortTask : public Task {
+public:
+	BVHSortTask(TaskPool *task_pool, BVHReference *data,
+	            const int job_start, const int job_end,
+	            const BVHReferenceCompare& compare)
+	{
+		/* run is not declared here, presumably the callback member of Task. */
+		run = function_bind(bvh_reference_sort_threaded,
+		                    task_pool, data,
+		                    job_start, job_end,
+		                    compare);
+	}
+};
+
+/* Multi-threaded sort of the references in inclusive range [job_start, job_end].
+ *
+ * Ranges with fewer than BVH_SORT_THRESHOLD references are finished with
+ * sort() in the current thread, bigger ones get a single QSort partition
+ * step after which one part may be pushed to task_pool as a new BVHSortTask.
+ */
+static void bvh_reference_sort_threaded(TaskPool *task_pool, BVHReference *data,
+                                        const int job_start, const int job_end,
+                                        const BVHReferenceCompare& compare)
+{
+	int start = job_start, end = job_end;
+	bool have_work = (start < end);
+	while(have_work) {
+		/* Size of the range still owned by this thread, not of the initial job. */
+		const int count = end - start + 1;
+		if(count < BVH_SORT_THRESHOLD) {
+			/* Number of references low enough, faster to finish the job
+			 * in one thread rather than to spawn more threads.
+			 */
+			sort(data+start, data+end+1, compare);
+			break;
+		}
+		/* Single QSort step.
+		 * Use median-of-three method for the pivot point.
+		 */
+		int left = start, right = end;
+		int center = (left + right) >> 1;
+		if(compare.compare(data[left], data[center]) > 0) {
+			swap(data[left], data[center]);
+		}
+		if(compare.compare(data[left], data[right]) > 0) {
+			swap(data[left], data[right]);
+		}
+		if(compare.compare(data[center], data[right]) > 0) {
+			swap(data[center], data[right]);
+		}
+		swap(data[center], data[right - 1]);
+		BVHReference median = data[right - 1];
+		do {
+			while(compare.compare(data[left], median) < 0) {
+				++left;
+			}
+			while(compare.compare(data[right], median) > 0) {
+				--right;
+			}
+			if(left <= right) {
+				swap(data[left], data[right]);
+				++left;
+				--right;
+			}
+		} while(left <= right);
+		/* We only create one new task here to reduce downside effects of latency
+		 * in TaskScheduler: generally the current thread keeps working on the
+		 * left part of the array and a new task is created for the right part.
+		 * However, if there is nothing to be done in the left part, no task is
+		 * created and the current thread moves on to the right part instead.
+		 */
+		have_work = false;
+		if(left < end) {
+			if(start < right) {
+				task_pool->push(new BVHSortTask(task_pool, data, left, end, compare),
+				                true);
+			}
+			else {
+				start = left;
+				have_work = true;
+			}
+		}
+		if(start < right) {
+			end = right;
+			have_work = true;
+		}
+	}
+}
+
 void bvh_reference_sort(int start, int end, BVHReference *data, int dim)
 {
+	const int count = end - start;
 	BVHReferenceCompare compare(dim);
-	sort(data+start, data+end, compare);
+	if(count < BVH_SORT_THRESHOLD) {
+		/* It is important to not use any mutex if array is small enough,
+		 * otherwise we end up in situation when we're going to sleep far
+		 * too often.
+		 */
+		sort(data+start, data+end, compare);
+	}
+	else {
+		TaskPool task_pool;
+		bvh_reference_sort_threaded(&task_pool, data, start, end - 1, dim);
+		task_pool.wait_work();
+	}
 }
 
 CCL_NAMESPACE_END