// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#ifndef __SOLVER_BLOCK_TGS_CUH__
#define __SOLVER_BLOCK_TGS_CUH__

#include "common/PxPhysXCommonConfig.h"
// NOTE(review): the next two directives reached this copy of the file without their header
// names (text inside angle brackets was stripped). They are kept exactly as found; restore
// the header names from version control.
#include
#include
#include "PxgSolverBody.h"
//#include "PxgSolverConstraint1D.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "DyThresholdTable.h"
#include "PxgFrictionPatch.h"
#include "foundation/PxUtilities.h"
#include "PxgConstraintWriteBack.h"
#include "PxgSolverFlags.h"
#include "stdio.h"
#include "assert.h"
#include "PxgIntrinsics.h"
#include "solverResidual.cuh"
#include "DyCpuGpu1dConstraint.h"
#include "solverBlockCommon.cuh"
#include "constraintPrepShared.cuh"

using namespace physx;

// Component-wise warp shuffle of a float4: every component of the returned value is read
// from lane `index` via __shfl_sync(syncMask, ...).
static __forceinline__ __device__ float4 shfl(const PxU32 syncMask, const float4 f, const PxU32 index)
{
	float4 ret;
	ret.x = __shfl_sync(syncMask, f.x, index);
	ret.y = __shfl_sync(syncMask, f.y, index);
	ret.z = __shfl_sync(syncMask, f.z, index);
	ret.w = __shfl_sync(syncMask, f.w, index);
	return ret;
}

//Loads a set of TxIData inertia tensors quickly using shuffles
//
// Each lane passes the body it wants in bodyIndex. On return, lane t (t < nbToLoad) holds
// datas[bodyIndex of lane t] in `out`; lanes at or beyond nbToLoad receive zeros.
//
// How it works: a PxgSolverTxIData is read as 4 consecutive float4 values. Lanes 4e..4e+3
// each read one float4 "quarter" (lane & 3) of the entries requested by lanes e, e+8, e+16
// and e+24, so the four lanes cover one whole entry per read. Two rounds of selects and
// shuffles then move the four quarters of each entry to the lane that requested it.
//
// The straightforward equivalent for the loaded lanes is:
//     if (threadIndexInWarp < nbToLoad) out = datas[bodyIndex];
//
// syncMask          : mask handed to every __shfl_sync; the lanes it names must all call
//                     this function.
// datas             : array indexed by bodyIndex.
// bodyIndex         : this lane's entry index (read by other lanes through shuffles).
// nbToLoad          : number of leading lanes that need a result.
//                     NOTE(review): presumably uniform across the warp - confirm at call sites.
// threadIndexInWarp : lane id, expected in [0, 32).
// out               : receives the entry for this lane.
static __device__ void loadTxInertia(const PxU32 syncMask, const PxgSolverTxIData* datas, const PxU32 bodyIndex, const PxU32 nbToLoad, const PxU32 threadIndexInWarp, PxgSolverTxIData& out)
{
	// Staging registers for the 4 passes. Zero-initialised so that the shuffles below never
	// forward an indeterminate value from a lane that skipped a load.
	float4 data0 = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 data1 = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 data2 = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 data3 = make_float4(0.f, 0.f, 0.f, 0.f);

	//There are 4x float4 in a PxgSolverTxIData
	const PxU32 elementIndex = threadIndexInWarp / 4;
	const PxU32 idxAnd3 = threadIndexInWarp & 3;

	// Fetch the body indices requested by lanes e, e+8, e+16 and e+24.
	const PxU32 index0 = __shfl_sync(syncMask, bodyIndex, elementIndex);
	const PxU32 index1 = __shfl_sync(syncMask, bodyIndex, elementIndex + 8);
	const PxU32 index2 = __shfl_sync(syncMask, bodyIndex, elementIndex + 16);
	const PxU32 index3 = __shfl_sync(syncMask, bodyIndex, elementIndex + 24);

	// The bounds are nested because (e + 8k) < nbToLoad implies every smaller k is in range too.
	if (elementIndex < nbToLoad)
	{
		data0 = reinterpret_cast<const float4*>(&datas[index0])[idxAnd3];

		if ((elementIndex + 8) < nbToLoad)
		{
			data1 = reinterpret_cast<const float4*>(&datas[index1])[idxAnd3];

			if ((elementIndex + 16) < nbToLoad)
			{
				data2 = reinterpret_cast<const float4*>(&datas[index2])[idxAnd3];

				if ((elementIndex + 24) < nbToLoad)
				{
					data3 = reinterpret_cast<const float4*>(&datas[index3])[idxAnd3];
				}
			}
		}
	}

	//OK. We have our 4 vectors...now we have to shuffle them
	//Thread 0 has the first float4 for T0, T8, T16, T24
	//Thread 1 has the 2nd float4 for T0, T8, T16, T24 ...
	//Thread 4 has the first float4 T1, T9, T17, T25...

	// Rotate the per-pass registers by the lane's quarter index so that, in shuffle round k,
	// the source lane exposes the pass that the destination lane needs.
	float4 swapVal0 = idxAnd3 == 0 ? data0 : idxAnd3 == 1 ? data1 : idxAnd3 == 2 ? data2 : data3;
	float4 swapVal1 = idxAnd3 == 0 ? data3 : idxAnd3 == 1 ? data0 : idxAnd3 == 2 ? data1 : data2;
	float4 swapVal2 = idxAnd3 == 0 ? data2 : idxAnd3 == 1 ? data3 : idxAnd3 == 2 ? data0 : data1;
	float4 swapVal3 = idxAnd3 == 0 ? data1 : idxAnd3 == 1 ? data2 : idxAnd3 == 2 ? data3 : data0;

	// First of the 4 lanes that loaded this lane's entry, and the pass (0..3) it was loaded in.
	const PxU32 threadReadBase = ((threadIndexInWarp * 4) & 31);
	const PxU32 offset = threadIndexInWarp / 8;

	float4 val0 = shfl(syncMask, swapVal0, threadReadBase + offset);
	float4 val1 = shfl(syncMask, swapVal1, threadReadBase + ((offset + 1) & 3));
	float4 val2 = shfl(syncMask, swapVal2, threadReadBase + ((offset + 2) & 3));
	float4 val3 = shfl(syncMask, swapVal3, threadReadBase + ((offset + 3) & 3));

	// val_k holds quarter ((offset + k) & 3); undo that rotation to get quarters 0..3 in order.
	float4 shuffled0 = offset == 0 ? val0 : offset == 1 ? val3 : offset == 2 ? val2 : val1;
	float4 shuffled1 = offset == 0 ? val1 : offset == 1 ? val0 : offset == 2 ? val3 : val2;
	float4 shuffled2 = offset == 0 ? val2 : offset == 1 ? val1 : offset == 2 ? val0 : val3;
	float4 shuffled3 = offset == 0 ? val3 : offset == 1 ? val2 : offset == 2 ? val1 : val0;

	//Now copy into the structure...
	// The 16 floats are consumed in order: q (4), p (3), then the 3x3 sqrtInvInertia by columns.
	out.deltaBody2World.q.x = shuffled0.x;
	out.deltaBody2World.q.y = shuffled0.y;
	out.deltaBody2World.q.z = shuffled0.z;
	out.deltaBody2World.q.w = shuffled0.w;

	out.deltaBody2World.p.x = shuffled1.x;
	out.deltaBody2World.p.y = shuffled1.y;
	out.deltaBody2World.p.z = shuffled1.z;

	out.sqrtInvInertia.column0.x = shuffled1.w;
	out.sqrtInvInertia.column0.y = shuffled2.x;
	out.sqrtInvInertia.column0.z = shuffled2.y;

	out.sqrtInvInertia.column1.x = shuffled2.z;
	out.sqrtInvInertia.column1.y = shuffled2.w;
	out.sqrtInvInertia.column1.z = shuffled3.x;

	out.sqrtInvInertia.column2.x = shuffled3.y;
	out.sqrtInvInertia.column2.y = shuffled3.z;
	out.sqrtInvInertia.column2.z = shuffled3.w;
}

// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting idea.
// Solves the normal, friction and (optional) torsional-friction rows of one contact constraint
// for the constraint slot `threadIndex` of `batch`, updating the two bodies' velocities.
//
// batch            : supplies mConstraintBatchIndex (header index), startConstraintIndex
//                    (first contact point) and startFrictionIndex (first friction row).
// b0LinVel..b1AngVel : in/out linear and angular velocities of body 0 and body 1.
// b0LinDelta..b1AngDelta : per-body linear/angular deltas used to advance the stored
//                    separation / friction error before solving.
// threadIndex      : slot used to index every per-constraint array in the block structures.
// contactHeaders, frictionHeaders, contactPoints, frictionPoints : block-format constraint data;
//                    applied forces and the friction `broken` flag are written back into them.
// elapsedTime      : multiplies the target velocity when correcting separation / friction error.
// minPen           : lower bound applied to the updated separation.
// error            : optional residual accumulator; skipped when null.
// ref0, ref1       : scale factors applied to body 0 / body 1 inverse mass, angular dominance
//                    and response terms (the per-sub-timestep mass-splitting factors).
//
// The loops are software-pipelined: data for the next row is loaded into the next_* variables
// before the current row is solved. The last normal row prefetches friction row 0, and each
// friction pair prefetches row i + 2 (which may be the torsional row). Statement order matters.
//
// Friction rows are solved in pairs; an odd total means the last row is a torsional anchor.
// The whole friction section, torsional row included, is skipped when there is no full pair.
static __device__ void solveContactBlockTGS(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel,
	const PxVec3& b0LinDelta, const PxVec3& b0AngDelta, const PxVec3& b1LinDelta, const PxVec3& b1AngDelta, const PxU32 threadIndex,
	const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeaders,
	PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints, PxgTGSBlockSolverContactFriction* PX_RESTRICT frictionPoints,
	const PxReal elapsedTime, const PxReal minPen, PxgErrorAccumulator* error, PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	using namespace physx;

	PxVec3 linVel0(b0LinVel.x, b0LinVel.y, b0LinVel.z);
	PxVec3 linVel1(b1LinVel.x, b1LinVel.y, b1LinVel.z);
	PxVec3 angVel0(b0AngVel.x, b0AngVel.y, b0AngVel.z);
	PxVec3 angVel1(b1AngVel.x, b1AngVel.y, b1AngVel.z);

	// Sum of the final normal forces; bounds the friction impulses below.
	float accumulatedNormalImpulse = 0.f;

	{
		const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
		PxgTGSBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];

		const uint numNormalConstr = Pxldcs(contactHeader->numNormalConstr[threadIndex]);
		const uint numFrictionConstrTotal = Pxldcs(frictionHeader->numFrictionConstr[threadIndex]);
		// Round down to an even count: rows solved as pairs. A leftover row is the torsional one.
		const uint numFrictionConstr = numFrictionConstrTotal & (~1);

		float accumDeltaF = 0.f;

		if (numNormalConstr)
		{
			const PxReal maxPenBias = Pxldcs(contactHeader->maxPenBias[threadIndex]);
			PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];

			const float4 invMass0_1_angDom0_1 = Pxldcs(contactHeader->invMass0_1_angDom0_1[threadIndex]);
			const float invMassA = ref0 * invMass0_1_angDom0_1.x;
			const float invMassB = ref1 * invMass0_1_angDom0_1.y;
			const float angDom0 = ref0 * invMass0_1_angDom0_1.z;
			const float angDom1 = ref1 * invMass0_1_angDom0_1.w;
			const float sumInvMass = invMassA + invMassB;

			const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
			const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

			const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
			const float p8 = contactHeader->p8[threadIndex]; // using previous p8 value in the prep step.
			const PxU8 flags = contactHeader->flags[threadIndex];

			const PxVec3 delLinVel0 = normal * invMassA;
			const PxVec3 delLinVel1 = normal * invMassB;

			//Bring forward a read event
			const float staticFrictionCof = normal_staticFriction.w;

			const PxVec3 relMotion = b0LinDelta - b1LinDelta;
			const float deltaV = normal.dot(relMotion);

			// Relative linear velocity along the normal; kept up to date incrementally in the loop.
			float relVel1 = (linVel0 - linVel1).dot(normal);

			// Prime the pipeline with contact point 0.
			float4 next_raXn_extraCoeff = Pxldcs(contacts->raXn_extraCoeff[threadIndex]);
			float4 next_rbXn_targetVelW = Pxldcs(contacts->rbXn_targetVelW[threadIndex]);
			float next_appliedForce = Pxldcs(contacts->appliedForce[threadIndex]);
			float next_error = Pxldcs(contacts->separation[threadIndex]);
			float next_maxImpulse = Pxldcs(contacts->maxImpulse[threadIndex]);
			float next_biasCoefficient = Pxldcs(contacts->biasCoefficient[threadIndex]);
			float next_resp0 = ref0 * Pxldcs(contacts->resp0[threadIndex]);
			float next_resp1 = ref1 * Pxldcs(contacts->resp1[threadIndex]);
			float next_unitResponse = next_resp0 + next_resp1;
			float next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
			float next_velMultiplier = next_recipResponse;

			// A negative restitutionXdt selects the compliant-contact coefficient path.
			if (restitutionXdt < 0.f)
			{
				computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse, next_recipResponse,
					next_raXn_extraCoeff.w, next_velMultiplier, next_biasCoefficient);
			}

			{
				for (uint i = 0; i < numNormalConstr; i++)
				{
					PxgTGSBlockSolverContactPoint& c = contacts[i];

					// Latch the prefetched values for this row.
					const float4 raXn_contactCoeff = next_raXn_extraCoeff;
					const float velMultiplier = next_velMultiplier;
					const float4 rbXn_targetVelW = next_rbXn_targetVelW;
					const float appliedForce = next_appliedForce;
					const float separation = next_error;
					const float maxImpulse = next_maxImpulse;
					const float biasCoefficient = next_biasCoefficient;
					const float recipResponse = next_recipResponse;

					if ((i + 1) < numNormalConstr)
					{
						// Prefetch the next contact point.
						const PxgTGSBlockSolverContactPoint& nextC = contacts[i + 1];
						next_raXn_extraCoeff = Pxldcs(nextC.raXn_extraCoeff[threadIndex]);
						next_rbXn_targetVelW = Pxldcs(nextC.rbXn_targetVelW[threadIndex]);
						next_appliedForce = Pxldcs(nextC.appliedForce[threadIndex]);
						next_error = Pxldcs(nextC.separation[threadIndex]);
						next_maxImpulse = Pxldcs(nextC.maxImpulse[threadIndex]);
						next_biasCoefficient = Pxldcs(nextC.biasCoefficient[threadIndex]);
						next_resp0 = ref0 * Pxldcs(nextC.resp0[threadIndex]);
						next_resp1 = ref1 * Pxldcs(nextC.resp1[threadIndex]);
						next_unitResponse = next_resp0 + next_resp1;
						next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
						next_velMultiplier = next_recipResponse;

						if (restitutionXdt < 0.f)
						{
							computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse, next_recipResponse,
								next_raXn_extraCoeff.w, next_velMultiplier, next_biasCoefficient);
						}
					}
					else if (numFrictionConstr)
					{
						// Last contact point: prefetch friction row 0 instead.
						next_raXn_extraCoeff = Pxldcs(frictions[0].raXn_error[threadIndex]);
						next_rbXn_targetVelW = Pxldcs(frictions[0].rbXn_targetVelW[threadIndex]);
						next_error = next_raXn_extraCoeff.w;
						next_appliedForce = Pxldcs(frictions[0].appliedForce[threadIndex]);
						next_resp0 = ref0 * Pxldcs(frictions[0].resp0[threadIndex]);
						next_resp1 = ref1 * Pxldcs(frictions[0].resp1[threadIndex]);
						const float next_resp = next_resp0 + next_resp1;
						next_velMultiplier = (next_resp > 0.f) ? (p8 / next_resp) : 0.f;
					}

					const PxVec3 raXn = PxVec3(raXn_contactCoeff.x, raXn_contactCoeff.y, raXn_contactCoeff.z);
					const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
					const float targetVel = rbXn_targetVelW.w;

					//Compute the normal velocity of the constraint.
					const PxReal v0 = angVel0.dot(raXn);
					const PxReal v1 = angVel1.dot(rbXn);
					const float normalVel = relVel1 + (v0 - v1);

					const float sep = PxMax(minPen, separation + deltaV + b0AngDelta.dot(raXn) - b1AngDelta.dot(rbXn));

					//How much the target vel should have changed the position of this constraint...
					const PxReal tVelErr = targetVel * elapsedTime;
					const PxReal biasedErr = recipResponse * fminf(-maxPenBias, biasCoefficient * (sep - tVelErr));

					//KS - clamp the maximum force
					const float tempDeltaF = biasedErr - (normalVel - targetVel) * velMultiplier;
					const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
					const float _newForce = appliedForce + _deltaF;
					const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
					const float deltaF = newForce - appliedForce;

					// Linear velocity change is deferred (accumDeltaF); relVel1 tracks it meanwhile.
					accumDeltaF += deltaF;
					relVel1 += sumInvMass * deltaF;
					angVel0 += raXn * (deltaF * angDom0);
					angVel1 -= rbXn * (deltaF * angDom1);

					if (error)
						error->accumulateErrorLocal(deltaF, velMultiplier);

					Pxstcs(&c.appliedForce[threadIndex], newForce);

					accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
				}
			}

			// Apply the accumulated normal impulse to the linear velocities in one step.
			linVel0 += delLinVel0 * accumDeltaF;
			linVel1 -= delLinVel1 * accumDeltaF;

			if (numFrictionConstr)
			{
				const float biasCoefficient = Pxldcg(frictionHeader->biasCoefficient[threadIndex]);
				const float dynamicFrictionCof = Pxldcg(frictionHeader->dynamicFriction[threadIndex]);
				const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
				const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;

				PxU32 broken = 0;

				const float4 frictionNormal0 = Pxldcg(frictionHeader->frictionNormals[0][threadIndex]);
				const float4 frictionNormal1 = Pxldcg(frictionHeader->frictionNormals[1][threadIndex]);
				const PxVec3 normal0 = PxVec3(frictionNormal0.x, frictionNormal0.y, frictionNormal0.z);
				const PxVec3 normal1 = PxVec3(frictionNormal1.x, frictionNormal1.y, frictionNormal1.z);

				const PxVec3 delLinVel00 = normal0 * invMassA;
				const PxVec3 delLinVel10 = normal0 * invMassB;
				const PxVec3 delLinVel01 = normal1 * invMassA;
				const PxVec3 delLinVel11 = normal1 * invMassB;

				float relDelta0 = relMotion.dot(normal0);
				float relDelta1 = relMotion.dot(normal1);

				for (uint i = 0; i < numFrictionConstr; i += 2)
				{
					PxgTGSBlockSolverContactFriction& f0 = frictions[i];
					PxgTGSBlockSolverContactFriction& f1 = frictions[i + 1];

					// Second row of the pair is read directly ...
					const float4 raXn_extraCoeff1 = f1.raXn_error[threadIndex];
					const float4 rbXn_targetVelW1 = f1.rbXn_targetVelW[threadIndex];
					const float initialError1 = raXn_extraCoeff1.w;
					const float appliedForce1 = f1.appliedForce[threadIndex];
					const float targetVel1 = rbXn_targetVelW1.w;

					// ... while the first row comes from the prefetched values.
					const float4 raXn_extraCoeff0 = next_raXn_extraCoeff;
					const float4 rbXn_targetVelW0 = next_rbXn_targetVelW;
					const float initialError0 = next_error;
					const float appliedForce0 = next_appliedForce;
					const float targetVel0 = rbXn_targetVelW0.w;

					const float f1_resp0 = ref0 * f1.resp0[threadIndex];
					const float f1_resp1 = ref1 * f1.resp1[threadIndex];
					const float f1_resp = f1_resp0 + f1_resp1;
					const float velMultiplier1 = (f1_resp > 0.f) ? (p8 / f1_resp) : 0.f;
					const float velMultiplier0 = next_velMultiplier;

					// Compared against the total so the torsional row is prefetched as well.
					if ((i + 2) < numFrictionConstrTotal)
					{
						next_raXn_extraCoeff = Pxldcs(frictions[i + 2].raXn_error[threadIndex]);
						next_rbXn_targetVelW = Pxldcs(frictions[i + 2].rbXn_targetVelW[threadIndex]);
						next_error = next_raXn_extraCoeff.w;
						next_appliedForce = Pxldcs(frictions[i + 2].appliedForce[threadIndex]);
						next_resp0 = ref0 * Pxldcs(frictions[i + 2].resp0[threadIndex]);
						next_resp1 = ref1 * Pxldcs(frictions[i + 2].resp1[threadIndex]);
						const float next_resp = next_resp0 + next_resp1;
						next_velMultiplier = (next_resp > 0.f) ? (p8 / next_resp) : 0.f;
					}

					const PxVec3 raXn0 = PxVec3(raXn_extraCoeff0.x, raXn_extraCoeff0.y, raXn_extraCoeff0.z);
					const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);

					const PxReal v00 = angVel0.dot(raXn0) + linVel0.dot(normal0);
					const PxReal v10 = angVel1.dot(rbXn0) + linVel1.dot(normal0);
					const float normalVel0 = v00 - v10;

					const float error0 = initialError0 - targetVel0 * elapsedTime + raXn0.dot(b0AngDelta) - rbXn0.dot(b1AngDelta) + relDelta0;
					const float bias0 = error0 * biasCoefficient;
					const float tmp10 = appliedForce0 - (bias0 - targetVel0) * velMultiplier0;
					const float totalImpulse0 = tmp10 - normalVel0 * velMultiplier0;

					const PxVec3 raXn1 = PxVec3(raXn_extraCoeff1.x, raXn_extraCoeff1.y, raXn_extraCoeff1.z);
					const PxVec3 rbXn1 = PxVec3(rbXn_targetVelW1.x, rbXn_targetVelW1.y, rbXn_targetVelW1.z);

					const PxReal v01 = angVel0.dot(raXn1) + linVel0.dot(normal1);
					const PxReal v11 = angVel1.dot(rbXn1) + linVel1.dot(normal1);
					const float normalVel1 = v01 - v11;

					const float error1 = initialError1 - targetVel1 * elapsedTime + raXn1.dot(b0AngDelta) - rbXn1.dot(b1AngDelta) + relDelta1;
					const float bias1 = error1 * biasCoefficient;
					const float tmp11 = appliedForce1 - (bias1 - targetVel1) * velMultiplier1;
					const float totalImpulse1 = tmp11 - normalVel1 * velMultiplier1;

					// Both axes are clamped together: beyond the static limit, the 2D impulse is
					// scaled back to the dynamic limit.
					const float totalImpulse = PxSqrt(totalImpulse0 * totalImpulse0 + totalImpulse1 * totalImpulse1);
					const bool clamp = totalImpulse > maxFrictionImpulse;
					const float ratio = clamp ? fminf(maxDynFrictionImpulse, totalImpulse) / totalImpulse : 1.f;

					const float newAppliedForce0 = totalImpulse0 * ratio;
					const float newAppliedForce1 = totalImpulse1 * ratio;

					float deltaF0 = newAppliedForce0 - appliedForce0;
					float deltaF1 = newAppliedForce1 - appliedForce1;

					if (error)
						error->accumulateErrorLocal(deltaF0, deltaF1, velMultiplier0, velMultiplier1);

					linVel0 += delLinVel00 * deltaF0;
					linVel1 -= delLinVel10 * deltaF0;
					angVel0 += raXn0 * (deltaF0 * angDom0);
					angVel1 -= rbXn0 * (deltaF0 * angDom1);

					linVel0 += delLinVel01 * deltaF1;
					linVel1 -= delLinVel11 * deltaF1;
					angVel0 += raXn1 * (deltaF1 * angDom0);
					angVel1 -= rbXn1 * (deltaF1 * angDom1);

					Pxstcs(&f0.appliedForce[threadIndex], newAppliedForce0);
					Pxstcs(&f1.appliedForce[threadIndex], newAppliedForce1);

					broken = broken | clamp;
				}

				if (numFrictionConstr < numFrictionConstrTotal)
				{
					const PxReal frictionScale = frictionHeader->torsionalFrictionScale[threadIndex];

					//We have a torsional friction anchor, solve this...
					PxgTGSBlockSolverContactFriction& f0 = frictions[numFrictionConstr];

					const float4 raXn_extraCoeff0 = next_raXn_extraCoeff;
					const PxVec3 raXn0 = PxVec3(raXn_extraCoeff0.x, raXn_extraCoeff0.y, raXn_extraCoeff0.z);
					const float velMultiplier0 = next_velMultiplier;

					const float4 rbXn_targetVelW0 = next_rbXn_targetVelW;
					const float appliedForce0 = next_appliedForce;
					const float targetVel0 = rbXn_targetVelW0.w;
					const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);

					// Angular terms only: this row has no linear component.
					const PxReal v00 = angVel0.dot(raXn0);
					const PxReal v10 = angVel1.dot(rbXn0);
					const float normalVel0 = v00 - v10;

					const float tmp10 = appliedForce0 - (-targetVel0) * velMultiplier0;
					const float totalImpulse = tmp10 - normalVel0 * velMultiplier0;

					const bool clamp = PxAbs(totalImpulse) > (maxFrictionImpulse * frictionScale);
					const PxReal totalClamped = PxClamp(totalImpulse, -maxDynFrictionImpulse * frictionScale, maxDynFrictionImpulse * frictionScale);
					const PxReal newAppliedForce = clamp ? totalClamped : totalImpulse;

					float deltaF = newAppliedForce - appliedForce0;

					if (error)
						error->accumulateErrorLocal(deltaF, velMultiplier0);

					angVel0 += raXn0 * (deltaF * angDom0);
					angVel1 -= rbXn0 * (deltaF * angDom1);

					Pxstcs(&f0.appliedForce[threadIndex], newAppliedForce);
					broken = broken | clamp;
				}

				// Non-zero when any friction row had to be clamped in this call.
				Pxstcs(&frictionHeader->broken[threadIndex], broken);
			}
		}
	}

	// Write back
	b0LinVel = PxVec3(linVel0.x, linVel0.y, linVel0.z);
	b0AngVel = PxVec3(angVel0.x, angVel0.y, angVel0.z);
	b1LinVel = PxVec3(linVel1.x, linVel1.y, linVel1.z);
	b1AngVel = PxVec3(angVel1.x, angVel1.y, angVel1.z);
}

// A light version of the function "solveContactBlockTGS" to quickly check if there is any active contact.
// TODO: Make this even lighter.
//
// Evaluates the same normal-row impulse formula as solveContactBlockTGS for each contact point
// and returns true as soon as one of them would change its applied force by more than 1.0e-8.
// Nothing is written: velocities are taken by const reference and no applied force is stored,
// so every point is tested against the incoming velocities. Friction rows are not examined,
// and unlike solveContactBlockTGS no ref0/ref1 scaling is applied to the response terms.
//
// Returns false when the constraint has no normal rows or no row exceeds the threshold.
static __device__ bool checkActiveContactBlockTGS(const PxgBlockConstraintBatch& batch, const PxVec3& linVel0, const PxVec3& angVel0, const PxVec3& linVel1, const PxVec3& angVel1,
	const PxVec3& b0LinDelta, const PxVec3& b0AngDelta, const PxVec3& b1LinDelta, const PxVec3& b1AngDelta, const PxU32 threadIndex,
	const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints,
	const PxReal elapsedTime, const PxReal minPen)
{
	using namespace physx;

	const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];

	const uint numNormalConstr = Pxldcs(contactHeader->numNormalConstr[threadIndex]);
	if (!numNormalConstr)
		return false;

	const PxReal maxPenBias = Pxldcs(contactHeader->maxPenBias[threadIndex]);
	PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];

	const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
	const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

	const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
	const PxU8 flags = contactHeader->flags[threadIndex];

	const PxVec3 relMotion = b0LinDelta - b1LinDelta;
	const float deltaV = normal.dot(relMotion);

	// Constant here: no impulse is applied, so the relative normal velocity never changes.
	const float relVel1 = (linVel0 - linVel1).dot(normal);

	// Prime the pipeline with contact point 0.
	float4 next_raXn_extraCoeff = Pxldcs(contacts->raXn_extraCoeff[threadIndex]);
	float4 next_rbXn_targetVelW = Pxldcs(contacts->rbXn_targetVelW[threadIndex]);
	float next_appliedForce = Pxldcs(contacts->appliedForce[threadIndex]);
	float next_error = Pxldcs(contacts->separation[threadIndex]);
	float next_maxImpulse = Pxldcs(contacts->maxImpulse[threadIndex]);
	float next_biasCoefficient = Pxldcs(contacts->biasCoefficient[threadIndex]);
	float next_resp0 = Pxldcs(contacts->resp0[threadIndex]);
	float next_resp1 = Pxldcs(contacts->resp1[threadIndex]);
	float next_unitResponse = next_resp0 + next_resp1;
	float next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
	float next_velMultiplier = next_recipResponse;

	if (restitutionXdt < 0.f)
	{
		computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse, next_recipResponse,
			next_raXn_extraCoeff.w, next_velMultiplier, next_biasCoefficient);
	}

	for (uint i = 0; i < numNormalConstr; i++)
	{
		// Latch the prefetched values for this row.
		const float4 raXn_contactCoeff = next_raXn_extraCoeff;
		const float velMultiplier = next_velMultiplier;
		const float4 rbXn_targetVelW = next_rbXn_targetVelW;
		const float appliedForce = next_appliedForce;
		const float separation = next_error;
		const float maxImpulse = next_maxImpulse;
		const float biasCoefficient = next_biasCoefficient;
		const float recipResponse = next_recipResponse;

		if ((i + 1) < numNormalConstr)
		{
			// Prefetch the next contact point.
			const PxgTGSBlockSolverContactPoint& nextC = contacts[i + 1];
			next_raXn_extraCoeff = Pxldcs(nextC.raXn_extraCoeff[threadIndex]);
			next_rbXn_targetVelW = Pxldcs(nextC.rbXn_targetVelW[threadIndex]);
			next_appliedForce = Pxldcs(nextC.appliedForce[threadIndex]);
			next_error = Pxldcs(nextC.separation[threadIndex]);
			next_maxImpulse = Pxldcs(nextC.maxImpulse[threadIndex]);
			next_biasCoefficient = Pxldcs(nextC.biasCoefficient[threadIndex]);
			next_resp0 = Pxldcs(nextC.resp0[threadIndex]);
			next_resp1 = Pxldcs(nextC.resp1[threadIndex]);
			next_unitResponse = next_resp0 + next_resp1;
			next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
			next_velMultiplier = next_recipResponse;

			if (restitutionXdt < 0.f)
			{
				computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse, next_recipResponse,
					next_raXn_extraCoeff.w, next_velMultiplier, next_biasCoefficient);
			}
		}

		const PxVec3 raXn = PxVec3(raXn_contactCoeff.x, raXn_contactCoeff.y, raXn_contactCoeff.z);
		const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
		const float targetVel = rbXn_targetVelW.w;

		//Compute the normal velocity of the constraint.
		const PxReal v0 = angVel0.dot(raXn);
		const PxReal v1 = angVel1.dot(rbXn);
		const float normalVel = relVel1 + (v0 - v1);

		const float sep = PxMax(minPen, separation + deltaV + b0AngDelta.dot(raXn) - b1AngDelta.dot(rbXn));

		//How much the target vel should have changed the position of this constraint...
		const PxReal tVelErr = targetVel * elapsedTime;
		const PxReal biasedErr = recipResponse * fminf(-maxPenBias, biasCoefficient * (sep - tVelErr));

		//KS - clamp the maximum force
		const float tempDeltaF = biasedErr - (normalVel - targetVel) * velMultiplier;
		const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
		const float _newForce = appliedForce + _deltaF;
		const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
		const float deltaF = newForce - appliedForce;

		// active contact
		if (PxAbs(deltaF) > 1.0e-8f)
		{
			return true;
		}
	}

	return false;
}

// NOTE(review): the text from here to the end of writeBackContactBlockTGS reached this copy of
// the file damaged: spans between '<' and '>' were stripped, which removed loop conditions, the
// closing "#endif" and the whole writeBackContactBlockTGS signature. It is reproduced unchanged
// below because the missing parts cannot be reconstructed from what is left; restore both
// functions from version control.
static __device__ void concludeContactBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* contactHeaders, PxgTGSBlockSolverFrictionHeader* frictionHeaders, PxgTGSBlockSolverContactPoint* contactPoints, PxgTGSBlockSolverContactFriction* frictions) { #if 0 using namespace physx; { const PxgTGSBlockSolverContactHeader* contactHeader = &contactHeaders[batch.mConstraintBatchIndex]; PxgTGSBlockSolverFrictionHeader* frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex]; const uint32_t numNormalConstr = contactHeader->numNormalConstr[threadIndex]; //const uint32_t numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex]; PxgTGSBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex]; for(uint32_t i=0;ibiasCoefficient[threadIndex] = 0.f; //PxgTGSBlockSolverContactFriction* frictionConstr = &frictions[batch.startFrictionIndex]; //for(uint32_t i=0;iforceWritebackOffset[threadIndex]; forceThreshold = contactHeader->flags[threadIndex] & 
PxgSolverContactFlags::eHAS_FORCE_THRESHOLDS; const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex]; const PxU32 numNormalConstr = contactHeader->numNormalConstr[threadIndex]; if(forceWritebackOffset!=0xFFFFFFFF) { PxReal* vForceWriteback = &forcewritebackBuffer[forceWritebackOffset]; PxgTGSBlockSolverContactPoint* c = &contactPoints[batch.startConstraintIndex]; for(PxU32 i=0; igetAppliedForce()); *vForceWriteback++ = appliedForce; normalForce += appliedForce; } } writeBackContactBlockFriction(threadIndex, numFrictionConstr, frictionHeader, frictionPatchBlock, frictions + batch.startFrictionIndex, frictionPatches); if(numFrictionConstr && frictionHeader->broken[threadIndex]) { frictionPatchBlock.broken[threadIndex] = 1; } } float reportThreshold0 = bd0.reportThreshold; float reportThreshold1 = bd1.reportThreshold; if((forceThreshold && normalForce !=0 && (reportThreshold0 < PX_MAX_REAL || reportThreshold1 < PX_MAX_REAL))) { //ToDo : support PxgThresholdStreamElement Dy::ThresholdStreamElement elt; elt.normalForce = normalForce; elt.threshold = PxMin(reportThreshold0, reportThreshold1); elt.nodeIndexA = bd0.islandNodeIndex; elt.nodeIndexB = bd1.islandNodeIndex; elt.shapeInteraction = batch.shapeInteraction[threadIndex]; PxOrder(elt.nodeIndexA, elt.nodeIndexB); assert(elt.nodeIndexA < elt.nodeIndexB); PxI32 index = atomicAdd(sharedThresholdStreamIndex, 1); //KS - force a 16-byte coalesced write //((float4*)thresholdStream)[index] = *((float4*)&elt); thresholdStream[index] = elt; } }

// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting idea.
// Solves every 1D constraint row of one block batch for the lane selected by threadIndex.
//
// The four body velocities (b0LinVel, b0AngVel, b1LinVel, b1AngVel) are read on entry, advanced row by
// row and written back on exit. For each row the accumulated impulse is stored to appliedForce and, if
// residualReportingEnabled is set, the row residual is stored as well.
//
// linDelta0/1, angDelta0/1 and iData0/1.deltaBody2World.q are used to re-evaluate the rA/rB offsets stored
// in the header and the resolved geometric error; elapsedTime is forwarded to
// Dy::computeResolvedGeometricErrorTGS. ref0/ref1 scale the inverse mass and the inverse inertia scale of
// body 0/1 (the default of 1 leaves them unscaled).
//
// Please refer to solve1DStep() in DyTGSContactPrep.cpp for a description of the parameters and some of the logic.
static __device__ void solve1DBlockTGS(const PxgBlockConstraintBatch& batch,
	PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel,
	const PxVec3& linDelta0, const PxVec3& angDelta0, const PxVec3& linDelta1, const PxVec3& angDelta1,
	const PxU32 threadIndex,
	const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers,
	PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
	const PxgSolverTxIData& iData0, const PxgSolverTxIData& iData1,
	const PxReal elapsedTime, bool residualReportingEnabled,
	PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	using namespace physx;

	const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT hdr = headers + batch.mConstraintBatchIndex;
	PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rows = rowsCon + batch.startConstraintIndex;

	// Local working copies of the velocities; the caller's values are only overwritten at the very end.
	PxVec3 vLin0 = b0LinVel;
	PxVec3 vLin1 = b1LinVel;
	PxVec3 vAng0 = b0AngVel;
	PxVec3 vAng1 = b1AngVel;

	// xyz = stored rA / rB offset, w = inverse mass of the respective body.
	const float4 rA_invMass0 = hdr->rAWorld_invMass0D0[threadIndex];
	const float4 rB_invMass1 = hdr->rBWorld_invMass1D1[threadIndex];

	const float scaledInvMass0 = ref0 * rA_invMass0.w;
	const float scaledInvMass1 = ref1 * rB_invMass1.w;

	const PxVec3 rAStored(rA_invMass0.x, rA_invMass0.y, rA_invMass0.z);
	const PxVec3 rBStored(rB_invMass1.x, rB_invMass1.y, rB_invMass1.z);

	// Rotate the stored offsets by the accumulated delta rotation of each body.
	const PxVec3 rA = iData0.deltaBody2World.q.rotate(rAStored);
	const PxVec3 rB = iData1.deltaBody2World.q.rotate(rBStored);

	const PxVec3 rAMotion = (rA + linDelta0) - rAStored;
	const PxVec3 rBMotion = (rB + linDelta1) - rBStored;

	const float scaledInvInertia0 = ref0 * hdr->invInertiaScale0[threadIndex];
	const float scaledInvInertia1 = ref1 * hdr->invInertiaScale1[threadIndex];

	const uchar4 packedCounts = hdr->rowCounts_breakable_orthoAxisCount[threadIndex];
	const PxU32 numRows = packedCounts.x;

	// Orthogonalization data: body-0 axes carry the reciprocal response in w, body-1 axes carry the error in w.
	const float4 orthoA0 = hdr->angOrthoAxis0_recipResponseW[0][threadIndex];
	const float4 orthoA1 = hdr->angOrthoAxis0_recipResponseW[1][threadIndex];
	const float4 orthoA2 = hdr->angOrthoAxis0_recipResponseW[2][threadIndex];

	const float4 orthoB0 = hdr->angOrthoAxis1_ErrorW[0][threadIndex];
	const float4 orthoB1 = hdr->angOrthoAxis1_ErrorW[1][threadIndex];
	const float4 orthoB2 = hdr->angOrthoAxis1_ErrorW[2][threadIndex];

	const float orthoRecipResp0 = orthoA0.w;
	const float orthoRecipResp1 = orthoA1.w;
	const float orthoRecipResp2 = orthoA2.w;

	const PxVec3 axisA0(orthoA0.x, orthoA0.y, orthoA0.z);
	const PxVec3 axisA1(orthoA1.x, orthoA1.y, orthoA1.z);
	const PxVec3 axisA2(orthoA2.x, orthoA2.y, orthoA2.z);

	const PxVec3 axisB0(orthoB0.x, orthoB0.y, orthoB0.z);
	const PxVec3 axisB1(orthoB1.x, orthoB1.y, orthoB1.z);
	const PxVec3 axisB2(orthoB2.x, orthoB2.y, orthoB2.z);

	// Stored ortho errors, updated by the angular deltas accumulated so far.
	const PxReal orthoErr0 = orthoB0.w + (axisA0.dot(angDelta0) - axisB0.dot(angDelta1));
	const PxReal orthoErr1 = orthoB1.w + (axisA1.dot(angDelta0) - axisB1.dot(angDelta1));
	const PxReal orthoErr2 = orthoB2.w + (axisA2.dot(angDelta0) - axisB2.dot(angDelta1));

	for (PxU32 rowIdx = 0; rowIdx != numRows; ++rowIdx)
	{
		PxgTGSBlockSolverConstraint1DCon& row = rows[rowIdx];

		// Each float4 packs an axis in xyz and one scalar solver constant in w.
		const float4 lin0_initBias = row.lin0XYZ_initBiasOrCoeff0[threadIndex];
		const float4 lin1_biasScale = row.lin1XYZ_biasScaleOrCoeff1[threadIndex];
		const float4 ang0_velMul = row.ang0XYZ_velMultiplierOrCoeff2[threadIndex];
		const float4 ang1_velTarget = row.ang1XYZ_velTargetOrCoeff3[threadIndex];

		const PxVec3 linAxis0(lin0_initBias.x, lin0_initBias.y, lin0_initBias.z);
		const PxVec3 linAxis1(lin1_biasScale.x, lin1_biasScale.y, lin1_biasScale.z);

		PxReal initBias = lin0_initBias.w;
		const PxReal biasScale = lin1_biasScale.w;
		const PxReal velMultiplier = ang0_velMul.w;
		const PxReal targetVel = ang1_velTarget.w;

		// Angular axes are rebuilt from the stored part plus the cross product with the rotated offsets.
		const PxVec3 angAxis0 = PxVec3(ang0_velMul.x, ang0_velMul.y, ang0_velMul.z) + rA.cross(linAxis0);
		const PxVec3 angAxis1 = PxVec3(ang1_velTarget.x, ang1_velTarget.y, ang1_velTarget.z) + rB.cross(linAxis1);

		const PxU32 rowFlags = row.flags[threadIndex];
		const PxReal maxBias = row.maxBias[threadIndex];
		const PxReal minBias = Dy::computeMinBiasTGS(rowFlags, maxBias);

		PxVec3 angResp0 = iData0.sqrtInvInertia * angAxis0;
		PxVec3 angResp1 = iData1.sqrtInvInertia * angAxis1;

		//KS - TODO - orthogonalization here...
		if (rowFlags & DY_SC_FLAG_ORTHO_TARGET)
		{
			// Project this row onto the three ortho axes and remove that part from the row's angular terms.
			const PxReal proj0 = (angResp0.dot(axisA0) + angResp1.dot(axisB0)) * orthoRecipResp0;
			const PxReal proj1 = (angResp0.dot(axisA1) + angResp1.dot(axisB1)) * orthoRecipResp1;
			const PxReal proj2 = (angResp0.dot(axisA2) + angResp1.dot(axisB2)) * orthoRecipResp2;

			const PxVec3 removed0 = axisA0 * proj0 + axisA1 * proj1 + axisA2 * proj2;
			const PxVec3 removed1 = axisB0 * proj0 + axisB1 * proj1 + axisB2 * proj2;

			angResp0 -= removed0;
			angResp1 -= removed1;

			const PxReal orthoBasisError = biasScale * (orthoErr0 * proj0 + orthoErr1 * proj1 + orthoErr2 * proj2);
			initBias -= orthoBasisError;
		}

		// If biasScale is zero, initBias must also be zero.
		// biasScale is enforced to zero before the velocity iteration (e.g., in conclude1DBlockTGS), but not initBias.
		// Thus, it is explicitly reset to zero here for the velocity iteration.
		if (biasScale == 0.f)
		{
			initBias = 0.f;
		}

		const bool isSpring = (rowFlags & DY_SC_FLAG_SPRING) != 0;

		const PxReal angularErrorScale = row.angularErrorScale[threadIndex];
		const PxReal errorDelta = Dy::computeResolvedGeometricErrorTGS(rAMotion, rBMotion, linAxis0, linAxis1,
			angDelta0, angDelta1, angResp0, angResp1, angularErrorScale, isSpring, targetVel, elapsedTime);

		// The row response is recomputed here from the (scaled) inverse mass and inverse inertia scale.
		const PxReal resp0 = scaledInvMass0 * linAxis0.dot(linAxis0) + angResp0.dot(angResp0) * scaledInvInertia0;
		const PxReal resp1 = scaledInvMass1 * linAxis1.dot(linAxis1) + angResp1.dot(angResp1) * scaledInvInertia1;
		//KS - may be a - required here...
		const PxReal response = resp0 + resp1;

		PxReal recipResponse = 0.f;
		if (response > 0.f)
		{
			recipResponse = 1.f / response;
		}

		const PxReal unclampedBias = initBias + errorDelta * biasScale;
		const PxReal bias = PxClamp(unclampedBias, minBias, maxBias);
		const PxReal biasPlusTarget = bias + targetVel;

		// Spring rows use the stored constants directly; all other rows are scaled by the reciprocal response.
		PxReal vMul;
		PxReal constant;
		if (isSpring)
		{
			vMul = velMultiplier;
			constant = biasPlusTarget;
		}
		else
		{
			vMul = recipResponse * velMultiplier;
			constant = recipResponse * biasPlusTarget;
		}

		const float appliedForce = row.appliedForce[threadIndex];
		const float maxImpulse = row.maxImpulse[threadIndex];
		const float minImpulse = row.minImpulse[threadIndex];

		// Relative velocity along the row.
		const float v0 = vLin0.dot(linAxis0) + vAng0.dot(angResp0);
		const float v1 = vLin1.dot(linAxis1) + vAng1.dot(angResp1);
		const float normalVel = v0 - v1;

		// Clamp the accumulated impulse to [minImpulse, maxImpulse] and apply only the difference.
		const float unclampedForce = appliedForce + (vMul * normalVel + constant);
		const float lowerClamped = fmaxf(minImpulse, unclampedForce);
		const float clampedForce = fminf(maxImpulse, lowerClamped);
		const float deltaF = clampedForce - appliedForce;

		row.appliedForce[threadIndex] = clampedForce;
		if (residualReportingEnabled)
		{
			row.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
		}

		vLin0 += linAxis0 * (deltaF * scaledInvMass0);
		vLin1 -= linAxis1 * (deltaF * scaledInvMass1);
		vAng0 += angResp0 * deltaF * scaledInvInertia0;
		vAng1 -= angResp1 * deltaF * scaledInvInertia1;
	}

	b0LinVel = vLin0;
	b0AngVel = vAng0;
	b1LinVel = vLin1;
	b1AngVel = vAng1;
}

// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
// Solves every 1D constraint row of one block batch for the lane selected by threadIndex, using the
// per-row responses stored in artiResponse to update the velocities.
//
// vel0/vel1 are read on entry and written back on exit (top = angular part, bottom = linear part).
// motion0/motion1 and deltaQ0/deltaQ1 are used to re-evaluate the rA/rB offsets stored in the header and
// the resolved geometric error; elapsedTime is forwarded to Dy::computeResolvedGeometricErrorTGS.
// For each row the accumulated impulse is stored to appliedForce and, if residualReportingEnabled is
// set, the row residual is stored as well.
//
// impluse0/impluse1 receive the summed row impulses. Note the layout differs from vel0/vel1: here
// top = linear sum scaled by the inverse mass, bottom = angular sum scaled by the inverse inertia scale.
// ref0/ref1 scale the inverse mass, inverse inertia scale, row responses and response deltas of
// body 0/1 (the default of 1 leaves them unscaled).
static __device__ PX_FORCE_INLINE void solveExt1DBlockTGS(const PxgBlockConstraintBatch& batch,
	Cm::UnAlignedSpatialVector& vel0,
	Cm::UnAlignedSpatialVector& vel1,
	const Cm::UnAlignedSpatialVector& motion0,
	const Cm::UnAlignedSpatialVector& motion1,
	const PxU32 threadIndex,
	const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers,
	PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
	PxgArticulationBlockResponse* PX_RESTRICT artiResponse,
	const PxQuat& deltaQ0, const PxQuat& deltaQ1,
	const PxReal elapsedTime,
	Cm::UnAlignedSpatialVector& impluse0,
	Cm::UnAlignedSpatialVector& impluse1,
	bool residualReportingEnabled,
	PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	using namespace physx;

	const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT hdr = headers + batch.mConstraintBatchIndex;
	PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rows = rowsCon + batch.startConstraintIndex;
	const PxgArticulationBlockResponse* PX_RESTRICT rowResponses = artiResponse + batch.mArticulationResponseIndex;

	// Local working copies of the velocities; the caller's values are only overwritten at the very end.
	PxVec3 vLin0 = vel0.bottom;
	PxVec3 vLin1 = vel1.bottom;
	PxVec3 vAng0 = vel0.top;
	PxVec3 vAng1 = vel1.top;

	// xyz = stored rA / rB offset, w = inverse mass of the respective body.
	const float4 rA_invMass0 = hdr->rAWorld_invMass0D0[threadIndex];
	const float4 rB_invMass1 = hdr->rBWorld_invMass1D1[threadIndex];

	const float scaledInvMass0 = ref0 * rA_invMass0.w;
	const float scaledInvMass1 = ref1 * rB_invMass1.w;

	const float scaledInvInertia0 = ref0 * hdr->invInertiaScale0[threadIndex];
	const float scaledInvInertia1 = ref1 * hdr->invInertiaScale1[threadIndex];

	const PxVec3 rAStored(rA_invMass0.x, rA_invMass0.y, rA_invMass0.z);
	const PxVec3 rBStored(rB_invMass1.x, rB_invMass1.y, rB_invMass1.z);

	// Rotate the stored offsets by the delta rotation of each body.
	const PxVec3 rA = deltaQ0.rotate(rAStored);
	const PxVec3 rB = deltaQ1.rotate(rBStored);

	const PxVec3 rAMotion = (rA + motion0.bottom) - rAStored;
	const PxVec3 rBMotion = (rB + motion1.bottom) - rBStored;

	const uchar4 packedCounts = hdr->rowCounts_breakable_orthoAxisCount[threadIndex];
	const PxU32 numRows = packedCounts.x;

	const PxReal cfm = hdr->cfm[threadIndex];

	// Running sums of (row axis * deltaF), scaled and written to impluse0/impluse1 after the loop.
	PxVec3 linSum0(0.f, 0.f, 0.f);
	PxVec3 linSum1(0.f, 0.f, 0.f);
	PxVec3 angSum0(0.f, 0.f, 0.f);
	PxVec3 angSum1(0.f, 0.f, 0.f);

	for (PxU32 rowIdx = 0; rowIdx != numRows; ++rowIdx)
	{
		PxgTGSBlockSolverConstraint1DCon& row = rows[rowIdx];
		const PxgArticulationBlockResponse& rowResponse = rowResponses[rowIdx];

		// Each float4 packs an axis in xyz and one scalar coefficient in w.
		const float4 lin0_c0 = row.lin0XYZ_initBiasOrCoeff0[threadIndex];
		const float4 lin1_c1 = row.lin1XYZ_biasScaleOrCoeff1[threadIndex];
		const float4 ang0_c2 = row.ang0XYZ_velMultiplierOrCoeff2[threadIndex];
		const float4 ang1_c3 = row.ang1XYZ_velTargetOrCoeff3[threadIndex];

		const PxVec3 linAxis0(lin0_c0.x, lin0_c0.y, lin0_c0.z);
		const PxVec3 linAxis1(lin1_c1.x, lin1_c1.y, lin1_c1.z);
		const PxVec3 angAxis0(ang0_c2.x, ang0_c2.y, ang0_c2.z);
		const PxVec3 angAxis1(ang1_c3.x, ang1_c3.y, ang1_c3.z);

		const PxU32 rowFlags = row.flags[threadIndex];
		const bool isSpring = (rowFlags & DY_SC_FLAG_SPRING) != 0;
		const bool isAccelerationSpring = (rowFlags & DY_SC_FLAG_ACCELERATION_SPRING) != 0;

		const PxReal scaledResp0 = ref0 * row.resp0[threadIndex];
		const PxReal scaledResp1 = ref1 * row.resp1[threadIndex];

		// The header's cfm term is added on top of the summed row responses.
		float unitResponse = (scaledResp0 + scaledResp1) + cfm;

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_ARTICULATION_MIN_RESPONSE;
		const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);

		const PxReal coeff0 = lin0_c0.w;
		const PxReal coeff1 = lin1_c1.w;
		const PxReal coeff2 = ang0_c2.w;
		const PxReal coeff3 = ang1_c3.w;

		const PxReal geometricError = row.geometricError[threadIndex];

		// The solver constants are derived from the packed coefficients on every call.
		PxReal biasScale, velMultiplier, initBias, targetVel;
		Dy::compute1dConstraintSolverConstantsTGS(isSpring, isAccelerationSpring, geometricError,
			unitResponse, recipResponse, coeff0, coeff1, coeff2, coeff3,
			biasScale, velMultiplier, initBias, targetVel);

		// If biasScale is zero, initBias must also be zero.
		// biasScale is enforced to zero before the velocity iteration (e.g., in conclude1DBlockTGS), but not initBias.
		// Thus, it is explicitly reset to zero here for the velocity iteration.
		if (biasScale == 0.f)
		{
			initBias = 0.f;
		}

		const PxReal angularErrorScale = row.angularErrorScale[threadIndex];
		const PxReal errorDelta = Dy::computeResolvedGeometricErrorTGS(rAMotion, rBMotion, linAxis0, linAxis1,
			motion0.top, motion1.top, angAxis0, angAxis1, angularErrorScale, isSpring, targetVel, elapsedTime);

		const PxReal maxBias = row.maxBias[threadIndex];
		const float appliedForce = row.appliedForce[threadIndex];

		const PxReal unclampedBias = initBias + errorDelta * biasScale;
		const PxReal minBias = Dy::computeMinBiasTGS(rowFlags, maxBias);
		const PxReal bias = PxClamp(unclampedBias, minBias, maxBias);
		const PxReal biasPlusTarget = bias + targetVel;

		// Spring rows use the computed constants directly; all other rows are scaled by the reciprocal response.
		PxReal vMul;
		PxReal constant;
		if (isSpring)
		{
			vMul = velMultiplier;
			constant = biasPlusTarget;
		}
		else
		{
			vMul = recipResponse * velMultiplier;
			constant = recipResponse * biasPlusTarget;
		}

		const float maxImpulse = row.maxImpulse[threadIndex];
		const float minImpulse = row.minImpulse[threadIndex];

		// Relative velocity along the row.
		const float v0 = vLin0.dot(linAxis0) + vAng0.dot(angAxis0);
		const float v1 = vLin1.dot(linAxis1) + vAng1.dot(angAxis1);
		const float normalVel = v0 - v1;

		// Clamp the accumulated impulse to [minImpulse, maxImpulse] and apply only the difference.
		const float unclampedForce = appliedForce + (vMul * normalVel + constant);
		const float lowerClamped = fmaxf(minImpulse, unclampedForce);
		const float clampedForce = fminf(maxImpulse, lowerClamped);
		const float deltaF = clampedForce - appliedForce;

		row.appliedForce[threadIndex] = clampedForce;
		if (residualReportingEnabled)
		{
			row.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
		}

		linSum0 = linAxis0 * deltaF + linSum0;
		angSum0 = angAxis0 * deltaF + angSum0;
		linSum1 = linAxis1 * deltaF + linSum1;
		angSum1 = angAxis1 * deltaF + angSum1;

		// Velocity change per unit impulse for this row, read from the stored response and scaled by ref0/ref1.
		const PxVec3 dLinA = ref0 * PxVec3(rowResponse.deltaRALin_x[threadIndex], rowResponse.deltaRALin_y[threadIndex], rowResponse.deltaRALin_z[threadIndex]);
		const PxVec3 dAngA = ref0 * PxVec3(rowResponse.deltaRAAng_x[threadIndex], rowResponse.deltaRAAng_y[threadIndex], rowResponse.deltaRAAng_z[threadIndex]);
		const PxVec3 dLinB = ref1 * PxVec3(rowResponse.deltaRBLin_x[threadIndex], rowResponse.deltaRBLin_y[threadIndex], rowResponse.deltaRBLin_z[threadIndex]);
		const PxVec3 dAngB = ref1 * PxVec3(rowResponse.deltaRBAng_x[threadIndex], rowResponse.deltaRBAng_y[threadIndex], rowResponse.deltaRBAng_z[threadIndex]);

		vLin0 = dLinA * deltaF + vLin0;
		vAng0 = dAngA * deltaF + vAng0;
		vLin1 = dLinB * deltaF + vLin1;
		vAng1 = dAngB * deltaF + vAng1;
	}

	vel0.top = vAng0;
	vel0.bottom = vLin0;
	vel1.top = vAng1;
	vel1.bottom = vLin1;

	// Linear sums go to .top and angular sums to .bottom (the opposite of the vel0/vel1 layout above).
	impluse0.top = linSum0 * scaledInvMass0;
	impluse0.bottom = angSum0 * scaledInvInertia0;
	impluse1.top = linSum1 * scaledInvMass1;
	impluse1.bottom = angSum1 * scaledInvInertia1;
}

static __device__ void conclude1DBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rows) { using namespace physx; const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex]; PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT base = &rows[batch.startConstraintIndex]; const uchar4 rowCount = header->rowCounts_breakable_orthoAxisCount[threadIndex]; for (PxU32 i = 0; iwriteBackOffset[threadIndex]; const uchar4 rowCounts_breakable_orthoAxisCount = header->rowCounts_breakable_orthoAxisCount[threadIndex]; const PxU8 breakable = rowCounts_breakable_orthoAxisCount.y; const PxU32 numRows = rowCounts_breakable_orthoAxisCount.x; if (forceWritebackOffset != 0xFFFFFFFF) { PxgConstraintWriteback& writeback = constraintWriteBacks[forceWritebackOffset]; PxVec3 linVel(0), angVel(0); PxReal constraintErrorSq = 0.0f; for (PxU32 i = 0; i < numRows; ++i) { PxgTGSBlockSolverConstraint1DCon& con = conBase[i]; if (con.flags[threadIndex] & DY_SC_FLAG_OUTPUT_FORCE) { const float4 lin0XYZ_ErrorW = con.lin0XYZ_initBiasOrCoeff0[threadIndex]; const float4 ang0WriteBack_VMW = con.ang0XYZ_velMultiplierOrCoeff2[threadIndex]; const PxVec3 lin0(lin0XYZ_ErrorW.x, lin0XYZ_ErrorW.y, lin0XYZ_ErrorW.z); const PxVec3 ang0WriteBack(ang0WriteBack_VMW.x, ang0WriteBack_VMW.y, ang0WriteBack_VMW.z); const PxReal appliedForce = con.appliedForce[threadIndex]; linVel += lin0 * appliedForce; angVel += ang0WriteBack *appliedForce; } PxReal err = con.residual[threadIndex]; constraintErrorSq += err * err; } const float4 body0WorldOffset_InvMass0D0 = header->rAWorld_invMass0D0[threadIndex]; const PxVec3 body0WorldOffset(body0WorldOffset_InvMass0D0.x, body0WorldOffset_InvMass0D0.y, body0WorldOffset_InvMass0D0.z); angVel -= body0WorldOffset.cross(linVel); const 
PxU32 broken = breakable ? PxU32((linVel.magnitude() > header->linBreakImpulse[threadIndex]) || (angVel.magnitude() > header->angBreakImpulse[threadIndex])) : 0; writeback.angularImpulse_residual = make_float4(angVel.x, angVel.y, angVel.z, constraintErrorSq); writeback.linearImpulse_broken = make_float4(linVel.x, linVel.y, linVel.z, broken ? -0.0f : 0.0f); } } #endif