Cycles: ensure any SSE data is allocated 16-byte aligned; this happens automatically

on many platforms but is not assured everywhere.
This commit is contained in:
Brecht Van Lommel 2013-06-22 14:35:09 +00:00
parent 5da48f425f
commit 240fb6fa26
2 changed files with 43 additions and 12 deletions

@ -552,6 +552,30 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m12
}
#endif
#ifndef __KERNEL_GPU__
/* Allocate `size` bytes starting at an address that is a multiple of
 * `alignment`.
 *
 * A larger buffer is requested from malloc(), and the pointer malloc()
 * returned is stored in the void* slot immediately before the address handed
 * back to the caller, so the buffer must be released with free_aligned()
 * rather than free().
 *
 * size:      number of usable bytes requested.
 * alignment: required alignment in bytes, must be a non-zero power of two.
 *
 * Returns the aligned pointer, or NULL when the alignment is invalid, the
 * padded size would overflow size_t, or malloc() fails. */
static inline void *malloc_aligned(size_t size, size_t alignment)
{
	/* the rounding below relies on (alignment - 1) being a bit mask */
	if(alignment == 0 || (alignment & (alignment - 1)) != 0)
		return NULL;

	/* room for the bookkeeping pointer plus worst case alignment slack */
	const size_t padding = sizeof(void*) + alignment - 1;

	/* refuse sizes for which size + padding would wrap around */
	if(size > ~(size_t)0 - padding)
		return NULL;

	void *data = malloc(size + padding);
	if(data == NULL)
		return NULL;

	/* skip the bookkeeping slot, then round up to the next aligned address */
	char *aligned = (char*)data + sizeof(void*);
	const size_t misalignment = (size_t)aligned & (alignment - 1);
	if(misalignment != 0)
		aligned += alignment - misalignment;

	/* remember the real allocation so free_aligned() can release it */
	((void**)aligned)[-1] = data;
	return aligned;
}
/* Release a buffer obtained from malloc_aligned().
 *
 * The pointer that malloc() originally returned is read back from the void*
 * slot located directly before `ptr` and passed to free(). A NULL `ptr` is
 * ignored. */
static inline void free_aligned(void *ptr)
{
	if(ptr == NULL)
		return;

	/* slot just in front of the aligned address holds the real allocation */
	void **slot = (void**)ptr - 1;
	free(*slot);
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_H__ */

@ -24,18 +24,22 @@
#include <string.h>
#include <vector>
#include "util_types.h"
CCL_NAMESPACE_BEGIN
using std::vector;
/* Array
*
* Simplified version of vector, serving two purposes:
* Simplified version of vector, serving multiple purposes:
* - somewhat faster in that it does not clear memory on resize/alloc,
* this was actually showing up in profiles quite significantly
* - if this is used, we are not tempted to use inefficient operations */
* this was actually showing up in profiles quite significantly. it
* also does not run any constructors/destructors
* - if this is used, we are not tempted to use inefficient operations
* - aligned allocation for SSE data types */
template<typename T>
template<typename T, size_t alignment = 16>
class array
{
public:
@ -52,7 +56,7 @@ public:
datasize = 0;
}
else {
data = new T[newsize];
data = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
datasize = newsize;
}
}
@ -69,7 +73,7 @@ public:
datasize = 0;
}
else {
data = new T[from.datasize];
data = (T*)malloc_aligned(sizeof(T)*from.datasize, alignment);
memcpy(data, from.data, from.datasize*sizeof(T));
datasize = from.datasize;
}
@ -83,7 +87,10 @@ public:
data = NULL;
if(datasize > 0) {
data = new T[datasize];
data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
memcpy(data, &from[0], datasize*sizeof(T));
free_aligned(data);
data = (T*)malloc_aligned(sizeof(T)*datasize, alignment);
memcpy(data, &from[0], datasize*sizeof(T));
}
@ -92,7 +99,7 @@ public:
~array()
{
delete [] data;
free_aligned(data);
}
void resize(size_t newsize)
@ -100,10 +107,10 @@ public:
if(newsize == 0) {
clear();
}
else {
T *newdata = new T[newsize];
else if(newsize != datasize) {
T *newdata = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
delete [] data;
free_aligned(data);
data = newdata;
datasize = newsize;
@ -112,7 +119,7 @@ public:
void clear()
{
delete [] data;
free_aligned(data);
data = NULL;
datasize = 0;
}