/* * Copyright (c) 2005 Erwin Coumans http://www.erwincoumans.com * * Permission to use, copy, modify, distribute and sell this software * and its documentation for any purpose is hereby granted without fee, * provided that the above copyright notice appear in all copies. * Erwin Coumans makes no representations about the suitability * of this software for any purpose. * It is provided "as is" without express or implied warranty. */ #include "SorLcp.h" #ifdef USE_SOR_SOLVER // SOR LCP taken from ode quickstep, // todo: write own successive overrelaxation gauss-seidel, or jacobi iterative solver #include "SimdScalar.h" #include "Dynamics/RigidBody.h" #include #include //FLT_MAX #ifdef WIN32 #include #endif #include #include #ifdef WIN32 #include #else #include #endif #include "Dynamics/BU_Joint.h" #include "ContactSolverInfo.h" //////////////////////////////////////////////////////////////////// //math stuff typedef SimdScalar dVector4[4]; typedef SimdScalar dMatrix3[4*3]; #define dInfinity FLT_MAX #define dRecip(x) ((float)(1.0f/(x))) /* reciprocal */ #define dMULTIPLY0_331NEW(A,op,B,C) \ {\ float tmp[3];\ tmp[0] = C.getX();\ tmp[1] = C.getY();\ tmp[2] = C.getZ();\ dMULTIPLYOP0_331(A,op,B,tmp);\ } #define dMULTIPLY0_331(A,B,C) dMULTIPLYOP0_331(A,=,B,C) #define dMULTIPLYOP0_331(A,op,B,C) \ (A)[0] op dDOT1((B),(C)); \ (A)[1] op dDOT1((B+4),(C)); \ (A)[2] op dDOT1((B+8),(C)); #define dAASSERT ASSERT #define dIASSERT ASSERT #define REAL float #define dDOTpq(a,b,p,q) ((a)[0]*(b)[0] + (a)[p]*(b)[q] + (a)[2*(p)]*(b)[2*(q)]) SimdScalar dDOT1 (const SimdScalar *a, const SimdScalar *b) { return dDOTpq(a,b,1,1); } #define dDOT14(a,b) dDOTpq(a,b,1,4) #define dCROSS(a,op,b,c) \ (a)[0] op ((b)[1]*(c)[2] - (b)[2]*(c)[1]); \ (a)[1] op ((b)[2]*(c)[0] - (b)[0]*(c)[2]); \ (a)[2] op ((b)[0]*(c)[1] - (b)[1]*(c)[0]); #define dMULTIPLYOP2_333(A,op,B,C) \ (A)[0] op dDOT1((B),(C)); \ (A)[1] op dDOT1((B),(C+4)); \ (A)[2] op dDOT1((B),(C+8)); \ (A)[4] op dDOT1((B+4),(C)); \ (A)[5] op dDOT1((B+4),(C+4)); \ (A)[6] op dDOT1((B+4),(C+8)); \ (A)[8] op dDOT1((B+8),(C)); \ (A)[9] op dDOT1((B+8),(C+4)); \ (A)[10] op dDOT1((B+8),(C+8)); #define dMULTIPLYOP0_333(A,op,B,C) \ (A)[0] op dDOT14((B),(C)); \ (A)[1] op dDOT14((B),(C+1)); \ (A)[2] op dDOT14((B),(C+2)); \ (A)[4] op dDOT14((B+4),(C)); \ (A)[5] op dDOT14((B+4),(C+1)); \ (A)[6] op dDOT14((B+4),(C+2)); \ (A)[8] op dDOT14((B+8),(C)); \ (A)[9] op dDOT14((B+8),(C+1)); \ (A)[10] op dDOT14((B+8),(C+2)); #define dMULTIPLY2_333(A,B,C) dMULTIPLYOP2_333(A,=,B,C) #define dMULTIPLY0_333(A,B,C) dMULTIPLYOP0_333(A,=,B,C) #define dMULTIPLYADD0_331(A,B,C) dMULTIPLYOP0_331(A,+=,B,C) //////////////////////////////////////////////////////////////////// #define EFFICIENT_ALIGNMENT 16 #define dEFFICIENT_SIZE(x) ((((x)-1)|(EFFICIENT_ALIGNMENT-1))+1) /* alloca aligned to the EFFICIENT_ALIGNMENT. note that this can waste * up to 15 bytes per allocation, depending on what alloca() returns. */ #define dALLOCA16(n) \ ((char*)dEFFICIENT_SIZE(((size_t)(alloca((n)+(EFFICIENT_ALIGNMENT-1)))))) ///////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////// #ifdef DEBUG #define ANSI_FTOL 1 extern "C" { __declspec(naked) void _ftol2() { __asm { #if ANSI_FTOL fnstcw WORD PTR [esp-2] mov ax, WORD PTR [esp-2] OR AX, 0C00h mov WORD PTR [esp-4], ax fldcw WORD PTR [esp-4] fistp QWORD PTR [esp-12] fldcw WORD PTR [esp-2] mov eax, DWORD PTR [esp-12] mov edx, DWORD PTR [esp-8] #else fistp DWORD PTR [esp-12] mov eax, DWORD PTR [esp-12] mov ecx, DWORD PTR [esp-8] #endif ret } } } #endif //DEBUG #define ALLOCA dALLOCA16 typedef const SimdScalar *dRealPtr; typedef SimdScalar *dRealMutablePtr; #define dRealArray(name,n) SimdScalar name[n]; #define dRealAllocaArray(name,n) SimdScalar *name = (SimdScalar*) ALLOCA ((n)*sizeof(SimdScalar)); void dSetZero1 (SimdScalar *a, int n) { dAASSERT (a && n >= 0); while (n > 0) { *(a++) = 0; n--; } } void dSetValue1 (SimdScalar *a, int n, SimdScalar value) { dAASSERT (a && n >= 0); while (n > 0) { *(a++) = value; n--; } } //*************************************************************************** // configuration // for the SOR and CG methods: // uncomment the following line to use warm starting. this definitely // help for motor-driven joints. unfortunately it appears to hurt // with high-friction contacts using the SOR method. use with care //#define WARM_STARTING 1 // for the SOR method: // uncomment the following line to randomly reorder constraint rows // during the solution. depending on the situation, this can help a lot // or hardly at all, but it doesn't seem to hurt. //#define RANDOMLY_REORDER_CONSTRAINTS 1 //*************************************************************************** // various common computations involving the matrix J // compute iMJ = inv(M)*J' static void compute_invM_JT (int m, dRealMutablePtr J, dRealMutablePtr iMJ, int *jb, RigidBody * const *body, dRealPtr invI) { int i,j; dRealMutablePtr iMJ_ptr = iMJ; dRealMutablePtr J_ptr = J; for (i=0; igetInvMass(); for (j=0; j<3; j++) iMJ_ptr[j] = k*J_ptr[j]; dMULTIPLY0_331 (iMJ_ptr + 3, invI + 12*b1, J_ptr + 3); if (b2 >= 0) { k = body[b2]->getInvMass(); for (j=0; j<3; j++) iMJ_ptr[j+6] = k*J_ptr[j+6]; dMULTIPLY0_331 (iMJ_ptr + 9, invI + 12*b2, J_ptr + 9); } J_ptr += 12; iMJ_ptr += 12; } } static void multiply_invM_JTSpecial (int m, int nb, dRealMutablePtr iMJ, int *jb, dRealMutablePtr in, dRealMutablePtr out,int onlyBody1,int onlyBody2) { int i,j; dRealMutablePtr out_ptr1 = out + onlyBody1*6; for (j=0; j<6; j++) out_ptr1[j] = 0; if (onlyBody2 >= 0) { out_ptr1 = out + onlyBody2*6; for (j=0; j<6; j++) out_ptr1[j] = 0; } dRealPtr iMJ_ptr = iMJ; for (i=0; i= 0) { out_ptr = out + b2*6; for (j=0; j<6; j++) out_ptr[j] += iMJ_ptr[j] * in[i]; } } iMJ_ptr += 6; } } // compute out = inv(M)*J'*in. static void multiply_invM_JT (int m, int nb, dRealMutablePtr iMJ, int *jb, dRealMutablePtr in, dRealMutablePtr out) { int i,j; dSetZero1 (out,6*nb); dRealPtr iMJ_ptr = iMJ; for (i=0; i= 0) { out_ptr = out + b2*6; for (j=0; j<6; j++) out_ptr[j] += iMJ_ptr[j] * in[i]; } iMJ_ptr += 6; } } // compute out = J*in. static void multiply_J (int m, dRealMutablePtr J, int *jb, dRealMutablePtr in, dRealMutablePtr out) { int i,j; dRealPtr J_ptr = J; for (i=0; i= 0) { in_ptr = in + b2*6; for (j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j]; } J_ptr += 6; out[i] = sum; } } //*************************************************************************** // SOR-LCP method // nb is the number of bodies in the body array. // J is an m*12 matrix of constraint rows // jb is an array of first and second body numbers for each constraint row // invI is the global frame inverse inertia for each body (stacked 3x3 matrices) // // this returns lambda and fc (the constraint force). // note: fc is returned as inv(M)*J'*lambda, the constraint force is actually J'*lambda // // b, lo and hi are modified on exit struct IndexError { SimdScalar error; // error to sort on int findex; int index; // row index }; static unsigned long seed2 = 0; unsigned long dRand2() { seed2 = (1664525L*seed2 + 1013904223L) & 0xffffffff; return seed2; } int dRandInt2 (int n) { float a = float(n) / 4294967296.0f; return (int) (float(dRand2()) * a); } static void SOR_LCP (int m, int nb, dRealMutablePtr J, int *jb, RigidBody * const *body, dRealPtr invI, dRealMutablePtr lambda, dRealMutablePtr invMforce, dRealMutablePtr rhs, dRealMutablePtr lo, dRealMutablePtr hi, dRealPtr cfm, int *findex, int numiter,float overRelax) { const int num_iterations = numiter; const float sor_w = overRelax; // SOR over-relaxation parameter int i,j; #ifdef WARM_STARTING // for warm starting, this seems to be necessary to prevent // jerkiness in motor-driven joints. i have no idea why this works. for (i=0; i= 0) { for (j=6; j<12; j++) sum += iMJ_ptr[j] * J_ptr[j]; } iMJ_ptr += 12; J_ptr += 12; Ad[i] = sor_w / (sum + cfm[i]); } // scale J and b by Ad J_ptr = J; for (i=0; i= 0) order[j++].index = i; dIASSERT (j==m); #endif for (int iteration=0; iteration < num_iterations; iteration++) { #ifdef REORDER_CONSTRAINTS // constraints with findex < 0 always come first. if (iteration < 2) { // for the first two iterations, solve the constraints in // the given order for (i=0; i v2) ? v1 : v2; if (max > 0) { //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max; order[i].error = dFabs(lambda[i]-last_lambda[i]); } else { order[i].error = dInfinity; } order[i].findex = findex[i]; order[i].index = i; } } qsort (order,m,sizeof(IndexError),&compare_index_error); #endif #ifdef RANDOMLY_REORDER_CONSTRAINTS if ((iteration & 7) == 0) { for (i=1; i= 0) { hi[index] = fabsf (hicopy[index] * lambda[findex[index]]); lo[index] = -hi[index]; } int b1 = jb[index*2]; int b2 = jb[index*2+1]; float delta = rhs[index] - lambda[index]*Ad[index]; dRealMutablePtr fc_ptr = invMforce + 6*b1; // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case delta -=fc_ptr[0] * J_ptr[0] + fc_ptr[1] * J_ptr[1] + fc_ptr[2] * J_ptr[2] + fc_ptr[3] * J_ptr[3] + fc_ptr[4] * J_ptr[4] + fc_ptr[5] * J_ptr[5]; // @@@ potential optimization: handle 1-body constraints in a separate // loop to avoid the cost of test & jump? if (b2 >= 0) { fc_ptr = invMforce + 6*b2; delta -=fc_ptr[0] * J_ptr[6] + fc_ptr[1] * J_ptr[7] + fc_ptr[2] * J_ptr[8] + fc_ptr[3] * J_ptr[9] + fc_ptr[4] * J_ptr[10] + fc_ptr[5] * J_ptr[11]; } // compute lambda and clamp it to [lo,hi]. // @@@ potential optimization: does SSE have clamping instructions // to save test+jump penalties here? float new_lambda = lambda[index] + delta; if (new_lambda < lo[index]) { delta = lo[index]-lambda[index]; lambda[index] = lo[index]; } else if (new_lambda > hi[index]) { delta = hi[index]-lambda[index]; lambda[index] = hi[index]; } else { lambda[index] = new_lambda; } //@@@ a trick that may or may not help //float ramp = (1-((float)(iteration+1)/(float)num_iterations)); //delta *= ramp; // update invMforce. // @@@ potential optimization: SIMD for this and the b2 >= 0 case fc_ptr = invMforce + 6*b1; fc_ptr[0] += delta * iMJ_ptr[0]; fc_ptr[1] += delta * iMJ_ptr[1]; fc_ptr[2] += delta * iMJ_ptr[2]; fc_ptr[3] += delta * iMJ_ptr[3]; fc_ptr[4] += delta * iMJ_ptr[4]; fc_ptr[5] += delta * iMJ_ptr[5]; // @@@ potential optimization: handle 1-body constraints in a separate // loop to avoid the cost of test & jump? if (b2 >= 0) { fc_ptr = invMforce + 6*b2; fc_ptr[0] += delta * iMJ_ptr[6]; fc_ptr[1] += delta * iMJ_ptr[7]; fc_ptr[2] += delta * iMJ_ptr[8]; fc_ptr[3] += delta * iMJ_ptr[9]; fc_ptr[4] += delta * iMJ_ptr[10]; fc_ptr[5] += delta * iMJ_ptr[11]; } } } } void SolveInternal1 (float global_cfm, float global_erp, RigidBody * const *body, int nb, BU_Joint * const *_joint, int nj, const ContactSolverInfo& solverInfo) { int numIter = 30; float sOr = 1.3f; int i,j; SimdScalar stepsize1 = dRecip(solverInfo.m_timeStep); // number all bodies in the body list - set their tag values for (i=0; im_odeTag = i; // make a local copy of the joint array, because we might want to modify it. // (the "BU_Joint *const*" declaration says we're allowed to modify the joints // but not the joint array, because the caller might need it unchanged). //@@@ do we really need to do this? we'll be sorting constraint rows individually, not joints BU_Joint **joint = (BU_Joint**) alloca (nj * sizeof(BU_Joint*)); memcpy (joint,_joint,nj * sizeof(BU_Joint*)); // for all bodies, compute the inertia tensor and its inverse in the global // frame, and compute the rotational force and add it to the torque // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body. dRealAllocaArray (I,3*4*nb); dRealAllocaArray (invI,3*4*nb); /* for (i=0; im_I,body[i]->m_R); // compute inverse inertia tensor in global frame dMULTIPLY2_333 (tmp,body[i]->m_invI,body[i]->m_R); dMULTIPLY0_333 (invI+i*12,body[i]->m_R,tmp); // compute rotational force dCROSS (body[i]->m_tacc,-=,body[i]->getAngularVelocity(),tmp); } */ for (i=0; im_I,body[i]->m_R); dMULTIPLY0_333 (I+i*12,body[i]->m_R,tmp); // compute inverse inertia tensor in global frame dMULTIPLY2_333 (tmp,body[i]->m_invI,body[i]->m_R); dMULTIPLY0_333 (invI+i*12,body[i]->m_R,tmp); // compute rotational force dMULTIPLY0_331 (tmp,I+i*12,body[i]->getAngularVelocity()); //dCROSS (body[i]->tacc,-=,body[i]->avel,tmp); dCROSS (body[i]->m_tacc,-=,body[i]->getAngularVelocity(),tmp); } // get joint information (m = total constraint dimension, nub = number of unbounded variables). // joints with m=0 are inactive and are removed from the joints array // entirely, so that the code that follows does not consider them. //@@@ do we really need to save all the info1's BU_Joint::Info1 *info = (BU_Joint::Info1*) alloca (nj*sizeof(BU_Joint::Info1)); for (i=0, j=0; jGetInfo1 (info+i); dIASSERT (info[i].m >= 0 && info[i].m <= 6 && info[i].nub >= 0 && info[i].nub <= info[i].m); if (info[i].m > 0) { joint[i] = joint[j]; i++; } } nj = i; // create the row offset array int m = 0; int *ofs = (int*) alloca (nj*sizeof(int)); for (i=0; i 0) { // create a constraint equation right hand side vector `c', a constraint // force mixing vector `cfm', and LCP low and high bound vectors, and an // 'findex' vector. dRealAllocaArray (c,m); dRealAllocaArray (cfm,m); dRealAllocaArray (lo,m); dRealAllocaArray (hi,m); int *findex = (int*) alloca (m*sizeof(int)); dSetZero1 (c,m); dSetValue1 (cfm,m,global_cfm); dSetValue1 (lo,m,-dInfinity); dSetValue1 (hi,m, dInfinity); for (i=0; iGetInfo2 (&Jinfo); if (Jinfo.c[0] > solverInfo.m_maxErrorReduction) Jinfo.c[0] = solverInfo.m_maxErrorReduction; // adjust returned findex values for global index numbering for (j=0; j= 0) findex[ofs[i] + j] += ofs[i]; } } // create an array of body numbers for each joint row int *jb_ptr = jb; for (i=0; inode[0].body) ? (joint[i]->node[0].body->m_odeTag) : -1; int b2 = (joint[i]->node[1].body) ? (joint[i]->node[1].body->m_odeTag) : -1; for (j=0; jgetInvMass(); for (j=0; j<3; j++) tmp1[i*6+j] = body[i]->m_facc[j] * body_invMass + body[i]->getLinearVelocity()[j] * stepsize1; dMULTIPLY0_331NEW (tmp1 + i*6 + 3,=,invI + i*12,body[i]->m_tacc); for (j=0; j<3; j++) tmp1[i*6+3+j] += body[i]->getAngularVelocity()[j] * stepsize1; } // put J*tmp1 into rhs dRealAllocaArray (rhs,m); multiply_J (m,J,jb,tmp1,rhs); // complete rhs for (i=0; ilambda,info[i].m * sizeof(SimdScalar)); } #endif // solve the LCP problem and get lambda and invM*constraint_force dRealAllocaArray (cforce,nb*6); SOR_LCP (m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,findex,numIter,sOr); #ifdef WARM_STARTING // save lambda for the next iteration //@@@ note that this doesn't work for contact joints yet, as they are // recreated every iteration for (i=0; ilambda,lambda+ofs[i],info[i].m * sizeof(SimdScalar)); } #endif // note that the SOR method overwrites rhs and J at this point, so // they should not be used again. // add stepsize * cforce to the body velocity for (i=0; igetLinearVelocity(); SimdVector3 angvel = body[i]->getAngularVelocity(); for (j=0; j<3; j++) linvel[j] += solverInfo.m_timeStep* cforce[i*6+j]; for (j=0; j<3; j++) angvel[j] += solverInfo.m_timeStep* cforce[i*6+3+j]; body[i]->setLinearVelocity(linvel); body[i]->setAngularVelocity(angvel); } } // compute the velocity update: // add stepsize * invM * fe to the body velocity for (i=0; igetInvMass(); SimdVector3 linvel = body[i]->getLinearVelocity(); SimdVector3 angvel = body[i]->getAngularVelocity(); for (j=0; j<3; j++) { linvel[j] += solverInfo.m_timeStep * body_invMass * body[i]->m_facc[j]; } for (j=0; j<3; j++) { body[i]->m_tacc[j] *= solverInfo.m_timeStep; } dMULTIPLY0_331NEW(angvel,+=,invI + i*12,body[i]->m_tacc); body[i]->setAngularVelocity(angvel); } } #endif //USE_SOR_SOLVER