Cycles: ensure any SSE data is allocated 16-byte aligned

This happens automatically on many platforms, but is not guaranteed everywhere.
This commit is contained in:
Brecht Van Lommel 2013-06-22 14:35:09 +00:00
parent 5da48f425f
commit 240fb6fa26
2 changed files with 43 additions and 12 deletions

@ -552,6 +552,30 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m12
} }
#endif #endif
#ifndef __KERNEL_GPU__
/* Allocate `size` bytes whose address is a multiple of `alignment`.
 *
 * `alignment` must be a non-zero power of two. The memory comes from malloc()
 * with extra room for the worst case padding plus one pointer. That pointer
 * slot sits immediately in front of the returned address and stores the
 * address malloc() returned, so free_aligned() can release the real block.
 *
 * Returns NULL if `alignment` is invalid, if the padded size would overflow
 * size_t, or if malloc() fails. The result must be released with
 * free_aligned(), not free(), since it is not the pointer malloc() returned. */
static inline void *malloc_aligned(size_t size, size_t alignment)
{
	/* the mask arithmetic below is only valid for powers of two */
	if(alignment == 0 || (alignment & (alignment - 1)) != 0)
		return NULL;

	/* extra room: back-pointer slot plus worst case alignment padding */
	size_t overhead = sizeof(void*) + alignment - 1;

	/* reject requests where size + overhead wraps around, which would
	 * silently allocate a block smaller than what was asked for */
	if(size > (size_t)-1 - overhead)
		return NULL;

	char *data = (char*)malloc(size + overhead);
	if(!data)
		return NULL;

	/* skip the back-pointer slot, then round up to the next aligned address */
	char *aligned = data + sizeof(void*);
	size_t misalign = (size_t)aligned & (alignment - 1);
	if(misalign)
		aligned += alignment - misalign;

	/* remember the real allocation right in front of the aligned block */
	*(((void**)aligned) - 1) = data;
	return aligned;
}
/* Release a block obtained from malloc_aligned().
 *
 * The pointer-sized slot directly in front of `ptr` holds the address that
 * was originally returned by malloc(); that address is what gets freed.
 * Passing NULL is allowed and does nothing. */
static inline void free_aligned(void *ptr)
{
	/* nothing to release, and no header slot to read */
	if(!ptr)
		return;

	/* step back one pointer to reach the stored malloc() address */
	void **header = (void**)ptr - 1;
	free(*header);
}
#endif
CCL_NAMESPACE_END CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_H__ */ #endif /* __UTIL_TYPES_H__ */

@ -24,18 +24,22 @@
#include <string.h> #include <string.h>
#include <vector> #include <vector>
#include "util_types.h"
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
using std::vector; using std::vector;
/* Array /* Array
* *
* Simplified version of vector, serving two purposes: * Simplified version of vector, serving multiple purposes:
* - somewhat faster in that it does not clear memory on resize/alloc, * - somewhat faster in that it does not clear memory on resize/alloc,
* this was actually showing up in profiles quite significantly * this was actually showing up in profiles quite significantly. it
* - if this is used, we are not tempted to use inefficient operations */ * also does not run any constructors/destructors
* - if this is used, we are not tempted to use inefficient operations
* - aligned allocation for SSE data types */
template<typename T> template<typename T, size_t alignment = 16>
class array class array
{ {
public: public:
@ -52,7 +56,7 @@ public:
datasize = 0; datasize = 0;
} }
else { else {
data = new T[newsize]; data = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
datasize = newsize; datasize = newsize;
} }
} }
@ -69,7 +73,7 @@ public:
datasize = 0; datasize = 0;
} }
else { else {
data = new T[from.datasize]; data = (T*)malloc_aligned(sizeof(T)*from.datasize, alignment);
memcpy(data, from.data, from.datasize*sizeof(T)); memcpy(data, from.data, from.datasize*sizeof(T));
datasize = from.datasize; datasize = from.datasize;
} }
@ -83,7 +87,10 @@ public:
data = NULL; data = NULL;
if(datasize > 0) { if(datasize > 0) {
data = new T[datasize]; data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
memcpy(data, &from[0], datasize*sizeof(T));
free_aligned(data);
data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
memcpy(data, &from[0], datasize*sizeof(T)); memcpy(data, &from[0], datasize*sizeof(T));
} }
@ -92,7 +99,7 @@ public:
~array() ~array()
{ {
delete [] data; free_aligned(data);
} }
void resize(size_t newsize) void resize(size_t newsize)
@ -100,10 +107,10 @@ public:
if(newsize == 0) { if(newsize == 0) {
clear(); clear();
} }
else { else if(newsize != datasize) {
T *newdata = new T[newsize]; T *newdata = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T)); memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
delete [] data; free_aligned(data);
data = newdata; data = newdata;
datasize = newsize; datasize = newsize;
@ -112,7 +119,7 @@ public:
void clear() void clear()
{ {
delete [] data; free_aligned(data);
data = NULL; data = NULL;
datasize = 0; datasize = 0;
} }