forked from bartvdbraak/blender
WIP commit to be able to revert later (known bug: unstable without sse enabled - weird)
This commit is contained in:
parent
f28ab5de21
commit
4cb5470f82
@ -271,7 +271,7 @@ DO_INLINE float dot_lfvector(lfVector *fLongVectorA, lfVector *fLongVectorB, uns
|
||||
unsigned int i = 0;
|
||||
float temp = 0.0;
|
||||
// schedule(guided, 2)
|
||||
#pragma omp parallel for reduction(+: temp) schedule(static)
|
||||
#pragma omp parallel for reduction(+: temp) private(i) schedule(static)
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
temp += INPR(fLongVectorA[i], fLongVectorB[i]);
|
||||
@ -287,8 +287,34 @@ DO_INLINE void add_lfvector_lfvector(lfVector *to, lfVector *fLongVectorA, lfVec
|
||||
{
|
||||
VECADD(to[i], fLongVectorA[i], fLongVectorB[i]);
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
#ifdef __SSE3__
|
||||
DO_INLINE void add_lfvector(lfVector *to, lfVector *fLongVectorA, unsigned int verts) {
|
||||
__m128 v1, v2;
|
||||
unsigned int i = 0;
|
||||
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
v1 = _mm_load_ps(to[i]);
|
||||
v2 = _mm_load_ps(fLongVectorA[i]);
|
||||
|
||||
v1 = _mm_add_ps(v1, v2);
|
||||
|
||||
_mm_store_ps(to[i], v1);
|
||||
}
|
||||
}
|
||||
#else */
|
||||
DO_INLINE void add_lfvector(lfVector *to, lfVector *fLongVectorA, unsigned int verts) {
|
||||
unsigned int i = 0;
|
||||
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
VECADD(to[i], to[i], fLongVectorA[i]);
|
||||
}
|
||||
}
|
||||
// #endif
|
||||
|
||||
/* A = B + C * float --> for big vector */
|
||||
DO_INLINE void add_lfvector_lfvectorS(lfVector *to, lfVector *fLongVectorA, lfVector *fLongVectorB, float bS, unsigned int verts)
|
||||
{
|
||||
@ -300,6 +326,76 @@ DO_INLINE void add_lfvector_lfvectorS(lfVector *to, lfVector *fLongVectorA, lfVe
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/* A = A + B * float --> for big vector */
|
||||
// tested
|
||||
/*
|
||||
#ifdef __SSE3__
|
||||
DO_INLINE void add_lfvectorS(lfVector *to, lfVector *fLongVectorA, float bS, unsigned int verts) {
|
||||
__m128 v1, v2, v3;
|
||||
unsigned int i = 0;
|
||||
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
v1 = _mm_load_ps(to[i]);
|
||||
v2 = _mm_load_ps(fLongVectorA[i]);
|
||||
v3 = _mm_set1_ps(bS);
|
||||
|
||||
v2 = _mm_mul_ps(v2, v3);
|
||||
v1 = _mm_add_ps(v1, v2);
|
||||
|
||||
_mm_store_ps(to[i], v1);
|
||||
}
|
||||
}
|
||||
#else */
|
||||
DO_INLINE void add_lfvectorS(lfVector *to, lfVector *fLongVectorA, float bS, unsigned int verts) {
|
||||
unsigned int i = 0;
|
||||
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
VECADDS(to[i], to[i], fLongVectorA[i], bS);
|
||||
}
|
||||
}
|
||||
// #endif
|
||||
|
||||
|
||||
// tested
|
||||
/*
|
||||
#ifdef __SSE3__
|
||||
DO_INLINE float add_lfvectorS_dot(lfVector *to, lfVector *fLongVectorA, float bS, unsigned int verts) {
|
||||
register __m128 v1, v2, v3, v4;
|
||||
unsigned int i = 0;
|
||||
float temp;
|
||||
|
||||
v4 = _mm_setzero_ps();
|
||||
// #pragma omp parallel for reduction(+: v4) private(i, v1, v2, v3) schedule(static)
|
||||
for(i = 0; i < verts; i++)
|
||||
{
|
||||
v1 = _mm_load_ps(to[i]);
|
||||
v2 = _mm_load_ps(fLongVectorA[i]);
|
||||
v3 = _mm_set1_ps(bS);
|
||||
|
||||
v2 = _mm_mul_ps(v2, v3);
|
||||
v1 = _mm_add_ps(v1, v2);
|
||||
|
||||
_mm_stream_ps(to[i], v1);
|
||||
|
||||
v4 = _mm_add_ps(v4, _mm_mul_ps(v1,v1));
|
||||
}
|
||||
|
||||
v4 = _mm_hadd_ps(v4, v4);
|
||||
v4 = _mm_hadd_ps(v4, v4);
|
||||
_mm_store_ss(&temp, v4);
|
||||
|
||||
return temp;
|
||||
}
|
||||
#else */
|
||||
DO_INLINE float add_lfvectorS_dot(lfVector *to, lfVector *fLongVectorA, float bS, unsigned int verts) {
|
||||
add_lfvectorS(to, fLongVectorA, bS, verts);
|
||||
return dot_lfvector(to, to, verts);
|
||||
}
|
||||
// #endif
|
||||
|
||||
/* A = B * float + C * float --> for big vector */
|
||||
DO_INLINE void add_lfvectorS_lfvectorS(lfVector *to, lfVector *fLongVectorA, float aS, lfVector *fLongVectorB, float bS, unsigned int verts)
|
||||
{
|
||||
@ -412,8 +508,9 @@ DO_INLINE void mul_fvector_fmatrix(float *to, float *from, float matrix[3][4])
|
||||
|
||||
/* 3x3 matrix multiplied by a vector */
|
||||
/* STATUS: verified */
|
||||
/*
|
||||
#ifdef __SSE3__
|
||||
DO_INLINE void mul_fmatrix_fvector(float *to, float matrix[3][4], float *from) {
|
||||
DO_INLINE void mul_fmatrix_fvector(float to[4], float matrix[3][4], float from[4]) {
|
||||
__m128 v1, v2, v3, v4;
|
||||
|
||||
v1 = _mm_load_ps(&matrix[0][0]);
|
||||
@ -431,14 +528,18 @@ DO_INLINE void mul_fmatrix_fvector(float *to, float matrix[3][4], float *from) {
|
||||
|
||||
_mm_store_ps(to, v4);
|
||||
}
|
||||
#else
|
||||
DO_INLINE void mul_fmatrix_fvector(float *to, float matrix[3][4], float *from)
|
||||
#else */
|
||||
DO_INLINE void mul_fmatrix_fvector(float to[4], float matrix[3][4], float from[4])
|
||||
{
|
||||
to[0] = INPR(matrix[0],from);
|
||||
to[1] = INPR(matrix[1],from);
|
||||
to[2] = INPR(matrix[2],from);
|
||||
float temp[3] = {0,0,0};
|
||||
|
||||
temp[0] = INPR(matrix[0],from);
|
||||
temp[1] = INPR(matrix[1],from);
|
||||
temp[2] = INPR(matrix[2],from);
|
||||
|
||||
VECCOPY(to, temp);
|
||||
}
|
||||
#endif
|
||||
// #endif
|
||||
|
||||
/* 3x3 matrix multiplied by a 3x3 matrix */
|
||||
/* STATUS: verified */
|
||||
@ -538,9 +639,9 @@ DO_INLINE void mulsub_fmatrix_fmatrix(float to[3][3], float matrixA[3][3], float
|
||||
*/
|
||||
/* 3x3 matrix multiplied+added by a vector */
|
||||
/* STATUS: verified */
|
||||
|
||||
/*
|
||||
#ifdef __SSE3__
|
||||
DO_INLINE void muladd_fmatrix_fvector(float to[3], float matrix[3][4], float from[3]) {
|
||||
DO_INLINE void muladd_fmatrix_fvector(float to[4], float matrix[3][4], float from[4]) {
|
||||
__m128 v1, v2, v3, v4;
|
||||
|
||||
v1 = _mm_load_ps(&matrix[0][0]);
|
||||
@ -561,8 +662,8 @@ DO_INLINE void muladd_fmatrix_fvector(float to[3], float matrix[3][4], float fro
|
||||
|
||||
_mm_store_ps(to, v4);
|
||||
}
|
||||
#else
|
||||
DO_INLINE void muladd_fmatrix_fvector(float to[3], float matrix[3][4], float from[3])
|
||||
#else */
|
||||
DO_INLINE void muladd_fmatrix_fvector(float to[4], float matrix[3][4], float from[4])
|
||||
{
|
||||
float temp[3] = { 0,0,0 };
|
||||
|
||||
@ -572,7 +673,7 @@ DO_INLINE void muladd_fmatrix_fvector(float to[3], float matrix[3][4], float fro
|
||||
|
||||
VECADD(to, to, temp);
|
||||
}
|
||||
#endif
|
||||
// #endif
|
||||
|
||||
/* 3x3 matrix multiplied+sub'ed by a vector */
|
||||
/*
|
||||
@ -660,27 +761,51 @@ DO_INLINE void mul_bfmatrix_S(fmatrix3x3 *matrix, float scalar)
|
||||
/* STATUS: verified */
|
||||
void mul_bfmatrix_lfvector( lfVector *to, fmatrix3x3 *from, lfVector *fLongVector)
|
||||
{
|
||||
unsigned int i = 0;
|
||||
float *tflongvector;
|
||||
unsigned int i = 0, numverts = from[0].vcount;
|
||||
// lfVector *tflongvector = create_lfvector(numverts);
|
||||
float temp[4]={0,0,0,0};
|
||||
|
||||
zero_lfvector(to, from[0].vcount);
|
||||
zero_lfvector(to, numverts);
|
||||
/*
|
||||
#pragma omp parallel sections private(i)
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
for(i = numverts; i < numverts+from[0].scount; i++)
|
||||
{
|
||||
muladd_fmatrix_fvector(to[from[i].c], from[i].m, fLongVector[from[i].r]);
|
||||
|
||||
}
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
for(i = 0; i < numverts+from[0].scount; i++)
|
||||
{
|
||||
muladd_fmatrix_fvector(tflongvector[from[i].r], from[i].m, fLongVector[from[i].c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
add_lfvector(to, tflongvector, numverts);
|
||||
|
||||
del_lfvector(tflongvector);
|
||||
*/
|
||||
// alternative NON OpenMP code
|
||||
/* process diagonal elements */
|
||||
|
||||
for(i = 0; i < from[0].vcount; i++)
|
||||
{
|
||||
mul_fmatrix_fvector(to[from[i].r], from[i].m, fLongVector[from[i].c]);
|
||||
}
|
||||
|
||||
|
||||
/* process off-diagonal entries (every off-diagonal entry needs to be symmetric) */
|
||||
// TODO: pragma below is wrong, correct it!
|
||||
// #pragma omp parallel for shared(to,from, fLongVector) private(i)
|
||||
|
||||
for(i = from[0].vcount; i < from[0].vcount+from[0].scount; i++)
|
||||
{
|
||||
muladd_fmatrix_fvector(to[from[i].c], from[i].m, fLongVector[from[i].r]);
|
||||
muladd_fmatrix_fvector(to[from[i].r], from[i].m, fLongVector[from[i].c]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* SPARSE SYMMETRIC add big matrix with big matrix: A = B + C*/
|
||||
@ -860,7 +985,7 @@ int implicit_init (Object *ob, ClothModifierData *clmd)
|
||||
|
||||
return 1;
|
||||
}
|
||||
int implicit_free (ClothModifierData *clmd)
|
||||
int implicit_free (ClothModifierData *clmd)
|
||||
{
|
||||
Implicit_Data *id;
|
||||
Cloth *cloth;
|
||||
@ -1010,12 +1135,13 @@ int cg_filtered(lfVector *ldV, fmatrix3x3 *lA, lfVector *lB, lfVector *z, fmatr
|
||||
|
||||
// X = X + d*a;
|
||||
add_lfvector_lfvectorS(ldV, ldV, d, a, numverts);
|
||||
|
||||
s_prev = s;
|
||||
|
||||
// r = r - q*a;
|
||||
sub_lfvector_lfvectorS(r, r, q, a, numverts);
|
||||
|
||||
s_prev = s;
|
||||
add_lfvector_lfvectorS(r, r, q, -a, numverts);
|
||||
s = dot_lfvector(r, r, numverts);
|
||||
// s = add_lfvectorS_dot(r, q, -a, numverts);
|
||||
|
||||
//d = r+d*(s/s_prev);
|
||||
add_lfvector_lfvectorS(d, r, d, (s/s_prev), numverts);
|
||||
@ -1069,7 +1195,7 @@ int cg_filtered_pre(lfVector *dv, fmatrix3x3 *lA, lfVector *lB, lfVector *z, fma
|
||||
BuildPPinv(lA, P, Pinv);
|
||||
|
||||
filter(dv, S);
|
||||
add_lfvector_lfvector(dv, dv, z, numverts);
|
||||
add_lfvector(dv, z, numverts);
|
||||
|
||||
mul_bfmatrix_lfvector(r, lA, dv);
|
||||
sub_lfvector_lfvector(r, lB, r, numverts);
|
||||
@ -1091,9 +1217,9 @@ int cg_filtered_pre(lfVector *dv, fmatrix3x3 *lA, lfVector *lB, lfVector *z, fma
|
||||
|
||||
alpha = deltaNew / dot_lfvector(p, s, numverts);
|
||||
|
||||
add_lfvector_lfvectorS(dv, dv, p, alpha, numverts);
|
||||
add_lfvectorS(dv, p, alpha, numverts);
|
||||
|
||||
sub_lfvector_lfvectorS(r, r, s, alpha, numverts);
|
||||
add_lfvectorS(r, s, -alpha, numverts);
|
||||
|
||||
mul_bfmatrix_lfvector(h, Pinv, r);
|
||||
filter(h, S);
|
||||
@ -1257,7 +1383,7 @@ DO_INLINE void cloth_calc_spring_force(ClothModifierData *clmd, ClothSpring *s,
|
||||
|
||||
// DG: My formula to handle bending for the AIMEX scheme
|
||||
// multiply with 1000 because of numerical problems
|
||||
if( ((k*1000.0)*dt*dt) < -0.18 )
|
||||
// if( ((k*1000.0)*dt*dt) < -0.18 )
|
||||
{
|
||||
dfdx_spring_type2(s->dfdx, dir,length,L,clmd->sim_parms->bending, cb);
|
||||
clmd->sim_parms->flags |= CLOTH_SIMSETTINGS_FLAG_BIG_FORCE;
|
||||
@ -1392,7 +1518,7 @@ void cloth_calc_force(ClothModifierData *clmd, lfVector *lF, lfVector *lX, lfVec
|
||||
float speed[3] = {0.0f, 0.0f,0.0f};
|
||||
float force[3]= {0.0f, 0.0f, 0.0f};
|
||||
|
||||
#pragma omp parallel for private (i) shared(lF)
|
||||
#pragma omp parallel for private (i) shared(lF) schedule(static)
|
||||
for(i = 0; i < cloth->numverts; i++)
|
||||
{
|
||||
float vertexnormal[3]={0,0,0};
|
||||
@ -1467,8 +1593,8 @@ void simulate_implicit_euler(lfVector *Vnew, lfVector *lX, lfVector *lV, lfVecto
|
||||
|
||||
add_lfvectorS_lfvectorS(B, lF, dt, dFdXmV, (dt*dt), numverts);
|
||||
|
||||
cg_filtered(dV, A, B, z, S); // conjugate gradient algorithm to solve Ax=b
|
||||
// cg_filtered_pre(dV, A, B, z, S, P, Pinv);
|
||||
// cg_filtered(dV, A, B, z, S); // conjugate gradient algorithm to solve Ax=b
|
||||
cg_filtered_pre(dV, A, B, z, S, P, Pinv);
|
||||
|
||||
// advance velocities
|
||||
add_lfvector_lfvector(Vnew, lV, dV, numverts);
|
||||
|
Loading…
Reference in New Issue
Block a user