Files
XCEngine/engine/third_party/physx/source/gpusolver/src/CUDA/solverBlockTGS.cuh

1181 lines
51 KiB
Plaintext

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SOLVER_BLOCK_TGS_CUH__
#define __SOLVER_BLOCK_TGS_CUH__
#include "common/PxPhysXCommonConfig.h"
#include <cuda.h>
#include <sm_35_intrinsics.h>
#include "PxgSolverBody.h"
//#include "PxgSolverConstraint1D.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "DyThresholdTable.h"
#include "PxgFrictionPatch.h"
#include "foundation/PxUtilities.h"
#include "PxgConstraintWriteBack.h"
#include "PxgSolverFlags.h"
#include "stdio.h"
#include "assert.h"
#include "PxgIntrinsics.h"
#include "solverResidual.cuh"
#include "DyCpuGpu1dConstraint.h"
#include "solverBlockCommon.cuh"
#include "constraintPrepShared.cuh"
using namespace physx;
static __forceinline__ __device__ float4 shfl(const PxU32 syncMask, const float4 f, const PxU32 index)
{
float4 ret;
ret.x = __shfl_sync(syncMask, f.x, index);
ret.y = __shfl_sync(syncMask, f.y, index);
ret.z = __shfl_sync(syncMask, f.z, index);
ret.w = __shfl_sync(syncMask, f.w, index);
return ret;
}
//Loads a set of TxIData inertia tensors quickly using shuffles
static __device__ void loadTxInertia(const PxU32 syncMask, const PxgSolverTxIData* datas, const PxU32 bodyIndex, const PxU32 nbToLoad, const PxU32 threadIndexInWarp, PxgSolverTxIData& out)
{
if (1)
{
{
float4 data0, data1, data2, data3;
//There are 4x float4 in a PxgSolverTxIData
const PxU32 elementIndex = threadIndexInWarp / 4;
const PxU32 index0 = __shfl_sync(syncMask, bodyIndex, elementIndex);
const PxU32 index1 = __shfl_sync(syncMask, bodyIndex, elementIndex + 8);
const PxU32 index2 = __shfl_sync(syncMask, bodyIndex, elementIndex + 16);
const PxU32 index3 = __shfl_sync(syncMask, bodyIndex, elementIndex + 24);
if (elementIndex < nbToLoad)
{
data0 = reinterpret_cast<const float4*>(&datas[index0])[threadIndexInWarp & 3];
if ((elementIndex + 8) < nbToLoad)
{
data1 = reinterpret_cast<const float4*>(&datas[index1])[threadIndexInWarp & 3];
if ((elementIndex + 16) < nbToLoad)
{
data2 = reinterpret_cast<const float4*>(&datas[index2])[threadIndexInWarp & 3];
if ((elementIndex + 24) < nbToLoad)
{
data3 = reinterpret_cast<const float4*>(&datas[index3])[threadIndexInWarp & 3];
}
}
}
}
//OK. We have our 4 vectors...now we have to shuffle them
//Thread 0 has the first float4 for T0, T8, T16, T24
//Thread 1 has the 2nd float4 for T0, T8, T16, T24 ...
//Thread 4 has the first float4 T1, T9, T17, T25...
const PxU32 idxAnd3 = threadIndexInWarp & 3;
float4 swapVal0 = idxAnd3 == 0 ? data0 : idxAnd3 == 1 ? data1 : idxAnd3 == 2 ? data2 : data3;
float4 swapVal1 = idxAnd3 == 0 ? data3 : idxAnd3 == 1 ? data0 : idxAnd3 == 2 ? data1 : data2;
float4 swapVal2 = idxAnd3 == 0 ? data2 : idxAnd3 == 1 ? data3 : idxAnd3 == 2 ? data0 : data1;
float4 swapVal3 = idxAnd3 == 0 ? data1 : idxAnd3 == 1 ? data2 : idxAnd3 == 2 ? data3 : data0;
const PxU32 threadReadBase = ((threadIndexInWarp * 4) & 31);
const PxU32 offset = threadIndexInWarp / 8;
float4 val0 = shfl(syncMask, swapVal0, threadReadBase + offset);
float4 val1 = shfl(syncMask, swapVal1, threadReadBase + ((offset + 1) & 3));
float4 val2 = shfl(syncMask, swapVal2, threadReadBase + ((offset + 2) & 3));
float4 val3 = shfl(syncMask, swapVal3, threadReadBase + ((offset + 3) & 3));
float4 shuffled0 = offset == 0 ? val0 : offset == 1 ? val3 : offset == 2 ? val2 : val1;
float4 shuffled1 = offset == 0 ? val1 : offset == 1 ? val0 : offset == 2 ? val3 : val2;
float4 shuffled2 = offset == 0 ? val2 : offset == 1 ? val1 : offset == 2 ? val0 : val3;
float4 shuffled3 = offset == 0 ? val3 : offset == 1 ? val2 : offset == 2 ? val1 : val0;
//Now copy into the structure...
out.deltaBody2World.q.x = shuffled0.x; out.deltaBody2World.q.y = shuffled0.y; out.deltaBody2World.q.z = shuffled0.z; out.deltaBody2World.q.w = shuffled0.w;
out.deltaBody2World.p.x = shuffled1.x; out.deltaBody2World.p.y = shuffled1.y; out.deltaBody2World.p.z = shuffled1.z;
out.sqrtInvInertia.column0.x = shuffled1.w; out.sqrtInvInertia.column0.y = shuffled2.x; out.sqrtInvInertia.column0.z = shuffled2.y;
out.sqrtInvInertia.column1.x = shuffled2.z; out.sqrtInvInertia.column1.y = shuffled2.w; out.sqrtInvInertia.column1.z = shuffled3.x;
out.sqrtInvInertia.column2.x = shuffled3.y; out.sqrtInvInertia.column2.y = shuffled3.z; out.sqrtInvInertia.column2.z = shuffled3.w;
}
}
else
{
if (threadIndexInWarp < nbToLoad)
out = datas[bodyIndex];
}
}
// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting idea.
static __device__ void solveContactBlockTGS(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel,
const PxVec3& b0LinDelta, const PxVec3& b0AngDelta, const PxVec3& b1LinDelta, const PxVec3& b1AngDelta,
const PxU32 threadIndex, const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders,
PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints,
PxgTGSBlockSolverContactFriction* PX_RESTRICT frictionPoints,
const PxReal elapsedTime, const PxReal minPen, PxgErrorAccumulator* error,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
PxVec3 linVel0(b0LinVel.x, b0LinVel.y, b0LinVel.z);
PxVec3 linVel1(b1LinVel.x, b1LinVel.y, b1LinVel.z);
PxVec3 angVel0(b0AngVel.x, b0AngVel.y, b0AngVel.z);
PxVec3 angVel1(b1AngVel.x, b1AngVel.y, b1AngVel.z);
float accumulatedNormalImpulse = 0.f;
{
const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
PxgTGSBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];
const uint numNormalConstr = Pxldcs(contactHeader->numNormalConstr[threadIndex]);
const uint numFrictionConstrTotal = Pxldcs(frictionHeader->numFrictionConstr[threadIndex]);
const uint numFrictionConstr = numFrictionConstrTotal & (~1);
float accumDeltaF = 0.f;
if (numNormalConstr)
{
const PxReal maxPenBias = Pxldcs(contactHeader->maxPenBias[threadIndex]);
PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
const float4 invMass0_1_angDom0_1 = Pxldcs(contactHeader->invMass0_1_angDom0_1[threadIndex]);
const float invMassA = ref0 * invMass0_1_angDom0_1.x;
const float invMassB = ref1 * invMass0_1_angDom0_1.y;
const float angDom0 = ref0 * invMass0_1_angDom0_1.z;
const float angDom1 = ref1 * invMass0_1_angDom0_1.w;
const float sumInvMass = invMassA + invMassB;
const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
const float p8 = contactHeader->p8[threadIndex]; // using previous p8 value in the prep step.
const PxU8 flags = contactHeader->flags[threadIndex];
const PxVec3 delLinVel0 = normal * invMassA;
const PxVec3 delLinVel1 = normal * invMassB;
//Bring forward a read event
const float staticFrictionCof = normal_staticFriction.w;
const PxVec3 relMotion = b0LinDelta - b1LinDelta;
const float deltaV = normal.dot(relMotion);
float relVel1 = (linVel0 - linVel1).dot(normal);
float4 next_raXn_extraCoeff = Pxldcs(contacts->raXn_extraCoeff[threadIndex]);
float4 next_rbXn_targetVelW = Pxldcs(contacts->rbXn_targetVelW[threadIndex]);
float next_appliedForce = Pxldcs(contacts->appliedForce[threadIndex]);
float next_error = Pxldcs(contacts->separation[threadIndex]);
float next_maxImpulse = Pxldcs(contacts->maxImpulse[threadIndex]);
float next_biasCoefficient = Pxldcs(contacts->biasCoefficient[threadIndex]);
float next_resp0 = ref0 * Pxldcs(contacts->resp0[threadIndex]);
float next_resp1 = ref1 * Pxldcs(contacts->resp1[threadIndex]);
float next_unitResponse = next_resp0 + next_resp1;
float next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
float next_velMultiplier = next_recipResponse;
if (restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse,
next_recipResponse, next_raXn_extraCoeff.w, next_velMultiplier,
next_biasCoefficient);
}
{
for (uint i = 0; i < numNormalConstr; i++)
{
PxgTGSBlockSolverContactPoint& c = contacts[i];
const float4 raXn_contactCoeff = next_raXn_extraCoeff;
const float velMultiplier = next_velMultiplier;
const float4 rbXn_targetVelW = next_rbXn_targetVelW;
const float appliedForce = next_appliedForce;
const float separation = next_error;
const float maxImpulse = next_maxImpulse;
const float biasCoefficient = next_biasCoefficient;
const float recipResponse = next_recipResponse;
if ((i + 1) < numNormalConstr)
{
const PxgTGSBlockSolverContactPoint& nextC = contacts[i + 1];
next_raXn_extraCoeff = Pxldcs(nextC.raXn_extraCoeff[threadIndex]);
next_rbXn_targetVelW = Pxldcs(nextC.rbXn_targetVelW[threadIndex]);
next_appliedForce = Pxldcs(nextC.appliedForce[threadIndex]);
next_error = Pxldcs(nextC.separation[threadIndex]);
next_maxImpulse = Pxldcs(nextC.maxImpulse[threadIndex]);
next_biasCoefficient = Pxldcs(nextC.biasCoefficient[threadIndex]);
next_resp0 = ref0 * Pxldcs(nextC.resp0[threadIndex]);
next_resp1 = ref1 * Pxldcs(nextC.resp1[threadIndex]);
next_unitResponse = next_resp0 + next_resp1;
next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
next_velMultiplier = next_recipResponse;
if (restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse,
next_recipResponse, next_raXn_extraCoeff.w, next_velMultiplier,
next_biasCoefficient);
}
}
else if (numFrictionConstr)
{
next_raXn_extraCoeff = Pxldcs(frictions[0].raXn_error[threadIndex]);
next_rbXn_targetVelW = Pxldcs(frictions[0].rbXn_targetVelW[threadIndex]);
next_error = next_raXn_extraCoeff.w;
next_appliedForce = Pxldcs(frictions[0].appliedForce[threadIndex]);
next_resp0 = ref0 * Pxldcs(frictions[0].resp0[threadIndex]);
next_resp1 = ref1 * Pxldcs(frictions[0].resp1[threadIndex]);
const float next_resp = next_resp0 + next_resp1;
next_velMultiplier = (next_resp > 0.f) ? (p8 / next_resp) : 0.f;
}
const PxVec3 raXn = PxVec3(raXn_contactCoeff.x, raXn_contactCoeff.y, raXn_contactCoeff.z);
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
const float targetVel = rbXn_targetVelW.w;
//Compute the normal velocity of the constraint.
const PxReal v0 = angVel0.dot(raXn);
const PxReal v1 = angVel1.dot(rbXn);
const float normalVel = relVel1 + (v0 - v1);
const float sep = PxMax(minPen, separation + deltaV + b0AngDelta.dot(raXn) - b1AngDelta.dot(rbXn));
//How much the target vel should have changed the position of this constraint...
const PxReal tVelErr = targetVel * elapsedTime;
const PxReal biasedErr = recipResponse * fminf(-maxPenBias, biasCoefficient * (sep - tVelErr));
//KS - clamp the maximum force
const float tempDeltaF = biasedErr - (normalVel - targetVel) * velMultiplier;
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
const float _newForce = appliedForce + _deltaF;
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
const float deltaF = newForce - appliedForce;
accumDeltaF += deltaF;
relVel1 += sumInvMass * deltaF;
angVel0 += raXn * (deltaF * angDom0);
angVel1 -= rbXn * (deltaF * angDom1);
if(error)
error->accumulateErrorLocal(deltaF, velMultiplier);
Pxstcs(&c.appliedForce[threadIndex], newForce);
accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
}
}
linVel0 += delLinVel0 * accumDeltaF;
linVel1 -= delLinVel1 * accumDeltaF;
if (numFrictionConstr)
{
const float biasCoefficient = Pxldcg(frictionHeader->biasCoefficient[threadIndex]);
const float dynamicFrictionCof = Pxldcg(frictionHeader->dynamicFriction[threadIndex]);
const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
PxU32 broken = 0;
const float4 frictionNormal0 = Pxldcg(frictionHeader->frictionNormals[0][threadIndex]);
const float4 frictionNormal1 = Pxldcg(frictionHeader->frictionNormals[1][threadIndex]);
const PxVec3 normal0 = PxVec3(frictionNormal0.x, frictionNormal0.y, frictionNormal0.z);
const PxVec3 normal1 = PxVec3(frictionNormal1.x, frictionNormal1.y, frictionNormal1.z);
const PxVec3 delLinVel00 = normal0 * invMassA;
const PxVec3 delLinVel10 = normal0 * invMassB;
const PxVec3 delLinVel01 = normal1 * invMassA;
const PxVec3 delLinVel11 = normal1 * invMassB;
float relDelta0 = relMotion.dot(normal0);
float relDelta1 = relMotion.dot(normal1);
for (uint i = 0; i < numFrictionConstr; i += 2)
{
PxgTGSBlockSolverContactFriction& f0 = frictions[i];
PxgTGSBlockSolverContactFriction& f1 = frictions[i + 1];
const float4 raXn_extraCoeff1 = f1.raXn_error[threadIndex];
const float4 rbXn_targetVelW1 = f1.rbXn_targetVelW[threadIndex];
const float initialError1 = raXn_extraCoeff1.w;
const float appliedForce1 = f1.appliedForce[threadIndex];
const float targetVel1 = rbXn_targetVelW1.w;
const float4 raXn_extraCoeff0 = next_raXn_extraCoeff;
const float4 rbXn_targetVelW0 = next_rbXn_targetVelW;
const float initialError0 = next_error;
const float appliedForce0 = next_appliedForce;
const float targetVel0 = rbXn_targetVelW0.w;
const float f1_resp0 = ref0 * f1.resp0[threadIndex];
const float f1_resp1 = ref1 * f1.resp1[threadIndex];
const float f1_resp = f1_resp0 + f1_resp1;
const float velMultiplier1 = (f1_resp > 0.f) ? (p8 / f1_resp) : 0.f;
const float velMultiplier0 = next_velMultiplier;
if ((i + 2) < numFrictionConstrTotal)
{
next_raXn_extraCoeff = Pxldcs(frictions[i + 2].raXn_error[threadIndex]);
next_rbXn_targetVelW = Pxldcs(frictions[i + 2].rbXn_targetVelW[threadIndex]);
next_error = next_raXn_extraCoeff.w;
next_appliedForce = Pxldcs(frictions[i + 2].appliedForce[threadIndex]);
next_resp0 = ref0 * Pxldcs(frictions[i + 2].resp0[threadIndex]);
next_resp1 = ref1 * Pxldcs(frictions[i + 2].resp1[threadIndex]);
const float next_resp = next_resp0 + next_resp1;
next_velMultiplier = (next_resp > 0.f) ? (p8 / next_resp) : 0.f;
}
const PxVec3 raXn0 = PxVec3(raXn_extraCoeff0.x, raXn_extraCoeff0.y, raXn_extraCoeff0.z);
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
const PxReal v00 = angVel0.dot(raXn0) + linVel0.dot(normal0);
const PxReal v10 = angVel1.dot(rbXn0) + linVel1.dot(normal0);
const float normalVel0 = v00 - v10;
const float error0 = initialError0 - targetVel0 * elapsedTime +
raXn0.dot(b0AngDelta) - rbXn0.dot(b1AngDelta) + relDelta0;
const float bias0 = error0 * biasCoefficient;
const float tmp10 = appliedForce0 - (bias0 - targetVel0) * velMultiplier0;
const float totalImpulse0 = tmp10 - normalVel0 * velMultiplier0;
const PxVec3 raXn1 = PxVec3(raXn_extraCoeff1.x, raXn_extraCoeff1.y, raXn_extraCoeff1.z);
const PxVec3 rbXn1 = PxVec3(rbXn_targetVelW1.x, rbXn_targetVelW1.y, rbXn_targetVelW1.z);
const PxReal v01 = angVel0.dot(raXn1) + linVel0.dot(normal1);
const PxReal v11 = angVel1.dot(rbXn1) + linVel1.dot(normal1);
const float normalVel1 = v01 - v11;
const float error1 = initialError1 - targetVel1 * elapsedTime +
raXn1.dot(b0AngDelta) - rbXn1.dot(b1AngDelta) + relDelta1;
const float bias1 = error1 * biasCoefficient;
const float tmp11 = appliedForce1 - (bias1 - targetVel1) * velMultiplier1;
const float totalImpulse1 = tmp11 - normalVel1 * velMultiplier1;
const float totalImpulse = PxSqrt(totalImpulse0 * totalImpulse0 + totalImpulse1 * totalImpulse1);
const bool clamp = totalImpulse > maxFrictionImpulse;
const float ratio = clamp ? fminf(maxDynFrictionImpulse, totalImpulse) / totalImpulse : 1.f;
const float newAppliedForce0 = totalImpulse0 * ratio;
const float newAppliedForce1 = totalImpulse1 * ratio;
float deltaF0 = newAppliedForce0 - appliedForce0;
float deltaF1 = newAppliedForce1 - appliedForce1;
if (error)
error->accumulateErrorLocal(deltaF0, deltaF1, velMultiplier0, velMultiplier1);
linVel0 += delLinVel00 * deltaF0;
linVel1 -= delLinVel10 * deltaF0;
angVel0 += raXn0 * (deltaF0 * angDom0);
angVel1 -= rbXn0 * (deltaF0 * angDom1);
linVel0 += delLinVel01 * deltaF1;
linVel1 -= delLinVel11 * deltaF1;
angVel0 += raXn1 * (deltaF1 * angDom0);
angVel1 -= rbXn1 * (deltaF1 * angDom1);
Pxstcs(&f0.appliedForce[threadIndex], newAppliedForce0);
Pxstcs(&f1.appliedForce[threadIndex], newAppliedForce1);
broken = broken | clamp;
}
if (numFrictionConstr < numFrictionConstrTotal)
{
const PxReal frictionScale = frictionHeader->torsionalFrictionScale[threadIndex];
//We have a torsional friction anchor, solve this...
PxgTGSBlockSolverContactFriction& f0 = frictions[numFrictionConstr];
const float4 raXn_extraCoeff0 = next_raXn_extraCoeff;
const PxVec3 raXn0 = PxVec3(raXn_extraCoeff0.x, raXn_extraCoeff0.y, raXn_extraCoeff0.z);
const float velMultiplier0 = next_velMultiplier;
const float4 rbXn_targetVelW0 = next_rbXn_targetVelW;
const float appliedForce0 = next_appliedForce;
const float targetVel0 = rbXn_targetVelW0.w;
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
const PxReal v00 = angVel0.dot(raXn0);
const PxReal v10 = angVel1.dot(rbXn0);
const float normalVel0 = v00 - v10;
const float tmp10 = appliedForce0 - (-targetVel0) * velMultiplier0;
const float totalImpulse = tmp10 - normalVel0 * velMultiplier0;
const bool clamp = PxAbs(totalImpulse) > (maxFrictionImpulse * frictionScale);
const PxReal totalClamped = PxClamp(totalImpulse, -maxDynFrictionImpulse * frictionScale, maxDynFrictionImpulse * frictionScale);
const PxReal newAppliedForce = clamp ? totalClamped : totalImpulse;
float deltaF = newAppliedForce - appliedForce0;
if (error)
error->accumulateErrorLocal(deltaF, velMultiplier0);
angVel0 += raXn0 * (deltaF * angDom0);
angVel1 -= rbXn0 * (deltaF * angDom1);
Pxstcs(&f0.appliedForce[threadIndex], newAppliedForce);
broken = broken | clamp;
}
Pxstcs(&frictionHeader->broken[threadIndex], broken);
}
}
}
// Write back
b0LinVel = PxVec3(linVel0.x, linVel0.y, linVel0.z);
b0AngVel = PxVec3(angVel0.x, angVel0.y, angVel0.z);
b1LinVel = PxVec3(linVel1.x, linVel1.y, linVel1.z);
b1AngVel = PxVec3(angVel1.x, angVel1.y, angVel1.z);
}
// A light version of the function "solveContactBlockTGS" to quickly check if there is any active contact.
// TODO: Make this even lighter.
static __device__ bool checkActiveContactBlockTGS(const PxgBlockConstraintBatch& batch, const PxVec3& linVel0, const PxVec3& angVel0,
const PxVec3& linVel1, const PxVec3& angVel1, const PxVec3& b0LinDelta, const PxVec3& b0AngDelta, const PxVec3& b1LinDelta,
const PxVec3& b1AngDelta, const PxU32 threadIndex, const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders,
PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints, const PxReal elapsedTime, const PxReal minPen)
{
using namespace physx;
{
const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
const uint numNormalConstr = Pxldcs(contactHeader->numNormalConstr[threadIndex]);
if (numNormalConstr)
{
const PxReal maxPenBias = Pxldcs(contactHeader->maxPenBias[threadIndex]);
PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
const float4 invMass0_1_angDom0_1 = Pxldcs(contactHeader->invMass0_1_angDom0_1[threadIndex]);
const float invMassA = invMass0_1_angDom0_1.x;
const float invMassB = invMass0_1_angDom0_1.y;
const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
const PxU8 flags = contactHeader->flags[threadIndex];
const PxVec3 delLinVel0 = normal * invMassA;
const PxVec3 delLinVel1 = normal * invMassB;
//Bring forward a read event
const PxVec3 relMotion = b0LinDelta - b1LinDelta;
const float deltaV = normal.dot(relMotion);
float relVel1 = (linVel0 - linVel1).dot(normal);
float4 next_raXn_extraCoeff = Pxldcs(contacts->raXn_extraCoeff[threadIndex]);
float4 next_rbXn_targetVelW = Pxldcs(contacts->rbXn_targetVelW[threadIndex]);
float next_appliedForce = Pxldcs(contacts->appliedForce[threadIndex]);
float next_error = Pxldcs(contacts->separation[threadIndex]);
float next_maxImpulse = Pxldcs(contacts->maxImpulse[threadIndex]);
float next_biasCoefficient = Pxldcs(contacts->biasCoefficient[threadIndex]);
float next_resp0 = Pxldcs(contacts->resp0[threadIndex]);
float next_resp1 = Pxldcs(contacts->resp1[threadIndex]);
float next_unitResponse = next_resp0 + next_resp1;
float next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
float next_velMultiplier = next_recipResponse;
if (restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse,
next_recipResponse, next_raXn_extraCoeff.w, next_velMultiplier,
next_biasCoefficient);
}
{
for (uint i = 0; i < numNormalConstr; i++)
{
const float4 raXn_contactCoeff = next_raXn_extraCoeff;
const float velMultiplier = next_velMultiplier;
const float4 rbXn_targetVelW = next_rbXn_targetVelW;
const float appliedForce = next_appliedForce;
const float separation = next_error;
const float maxImpulse = next_maxImpulse;
const float biasCoefficient = next_biasCoefficient;
const float recipResponse = next_recipResponse;
if ((i + 1) < numNormalConstr)
{
const PxgTGSBlockSolverContactPoint& nextC = contacts[i + 1];
next_raXn_extraCoeff = Pxldcs(nextC.raXn_extraCoeff[threadIndex]);
next_rbXn_targetVelW = Pxldcs(nextC.rbXn_targetVelW[threadIndex]);
next_appliedForce = Pxldcs(nextC.appliedForce[threadIndex]);
next_error = Pxldcs(nextC.separation[threadIndex]);
next_maxImpulse = Pxldcs(nextC.maxImpulse[threadIndex]);
next_biasCoefficient = Pxldcs(nextC.biasCoefficient[threadIndex]);
next_resp0 = Pxldcs(nextC.resp0[threadIndex]);
next_resp1 = Pxldcs(nextC.resp1[threadIndex]);
next_unitResponse = next_resp0 + next_resp1;
next_recipResponse = (next_unitResponse > 0.f) ? (1.f / next_unitResponse) : 0.f;
next_velMultiplier = next_recipResponse;
if (restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, next_unitResponse,
next_recipResponse, next_raXn_extraCoeff.w, next_velMultiplier,
next_biasCoefficient);
}
}
const PxVec3 raXn = PxVec3(raXn_contactCoeff.x, raXn_contactCoeff.y, raXn_contactCoeff.z);
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
const float targetVel = rbXn_targetVelW.w;
//Compute the normal velocity of the constraint.
const PxReal v0 = angVel0.dot(raXn);
const PxReal v1 = angVel1.dot(rbXn);
const float normalVel = relVel1 + (v0 - v1);
const float sep = PxMax(minPen, separation + deltaV + b0AngDelta.dot(raXn) - b1AngDelta.dot(rbXn));
//How much the target vel should have changed the position of this constraint...
const PxReal tVelErr = targetVel * elapsedTime;
const PxReal biasedErr = recipResponse * fminf(-maxPenBias, biasCoefficient * (sep - tVelErr));
//KS - clamp the maximum force
const float tempDeltaF = biasedErr - (normalVel - targetVel) * velMultiplier;
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
const float _newForce = appliedForce + _deltaF;
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
const float deltaF = newForce - appliedForce;
// active contact
if (PxAbs(deltaF) > 1.0e-8f)
{
return true;
}
}
}
}
}
return false;
}
static __device__ void concludeContactBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* contactHeaders, PxgTGSBlockSolverFrictionHeader* frictionHeaders,
PxgTGSBlockSolverContactPoint* contactPoints, PxgTGSBlockSolverContactFriction* frictions)
{
#if 0
using namespace physx;
{
const PxgTGSBlockSolverContactHeader* contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
PxgTGSBlockSolverFrictionHeader* frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
const uint32_t numNormalConstr = contactHeader->numNormalConstr[threadIndex];
//const uint32_t numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];
PxgTGSBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex];
for(uint32_t i=0;i<numNormalConstr;i++)
{
//contactPoints[i].setScaledBias(fmaxf(contactPoints[i].getScaledBias(), 0.f));
contacts[i].biasCoefficeint[threadIndex] = 0.f;
}
frictionHeader->biasCoefficient[threadIndex] = 0.f;
//PxgTGSBlockSolverContactFriction* frictionConstr = &frictions[batch.startFrictionIndex];
//for(uint32_t i=0;i<numFrictionConstr;i++)
//{
// //frictionConstr[i].setBias(0.f);
// frictionConstr[i].bias[threadIndex] = 0.f;
//}
}
#endif
}
static __device__ void writeBackContactBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex,
const PxgSolverBodyData* bodies, Dy::ThresholdStreamElement* thresholdStream,
PxI32* sharedThresholdStreamIndex, PxgTGSBlockSolverContactHeader* contactHeaders, PxgTGSBlockSolverFrictionHeader* frictionHeaders,
PxgTGSBlockSolverContactPoint* contactPoints, PxgTGSBlockSolverContactFriction* frictions,
PxF32* forcewritebackBuffer, PxgBlockFrictionPatch& frictionPatchBlock,
PxgFrictionPatchGPU* frictionPatches)
{
const PxU32 bodyAIndex = batch.bodyAIndex[threadIndex];
const PxU32 bodyBIndex = batch.bodyBIndex[threadIndex];
const PxgSolverBodyData& bd0 = bodies[bodyAIndex];
const PxgSolverBodyData& bd1 = bodies[bodyBIndex];
bool forceThreshold = false;
float normalForce = 0.f;
{
const PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
const PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
PxU32 forceWritebackOffset = contactHeader->forceWritebackOffset[threadIndex];
forceThreshold = contactHeader->flags[threadIndex] & PxgSolverContactFlags::eHAS_FORCE_THRESHOLDS;
const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];
const PxU32 numNormalConstr = contactHeader->numNormalConstr[threadIndex];
if(forceWritebackOffset!=0xFFFFFFFF)
{
PxReal* vForceWriteback = &forcewritebackBuffer[forceWritebackOffset];
PxgTGSBlockSolverContactPoint* c = &contactPoints[batch.startConstraintIndex];
for(PxU32 i=0; i<numNormalConstr; i++)
{
const PxReal appliedForce = c[i].appliedForce[threadIndex];//FStore(c->getAppliedForce());
*vForceWriteback++ = appliedForce;
normalForce += appliedForce;
}
}
writeBackContactBlockFriction(threadIndex, numFrictionConstr, frictionHeader,
frictionPatchBlock, frictions + batch.startFrictionIndex, frictionPatches);
if(numFrictionConstr && frictionHeader->broken[threadIndex])
{
frictionPatchBlock.broken[threadIndex] = 1;
}
}
float reportThreshold0 = bd0.reportThreshold;
float reportThreshold1 = bd1.reportThreshold;
if((forceThreshold && normalForce !=0 && (reportThreshold0 < PX_MAX_REAL || reportThreshold1 < PX_MAX_REAL)))
{
//ToDo : support PxgThresholdStreamElement
Dy::ThresholdStreamElement elt;
elt.normalForce = normalForce;
elt.threshold = PxMin<float>(reportThreshold0, reportThreshold1);
elt.nodeIndexA = bd0.islandNodeIndex;
elt.nodeIndexB = bd1.islandNodeIndex;
elt.shapeInteraction = batch.shapeInteraction[threadIndex];
PxOrder(elt.nodeIndexA, elt.nodeIndexB);
assert(elt.nodeIndexA < elt.nodeIndexB);
PxI32 index = atomicAdd(sharedThresholdStreamIndex, 1);
//KS - force a 16-byte coalesced write
//((float4*)thresholdStream)[index] = *((float4*)&elt);
thresholdStream[index] = elt;
}
}
// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting idea.
static __device__ void solve1DBlockTGS(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel,
const PxVec3& linDelta0, const PxVec3& angDelta0, const PxVec3& linDelta1, const PxVec3& angDelta1, const PxU32 threadIndex,
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
const PxgSolverTxIData& iData0, const PxgSolverTxIData& iData1, const PxReal elapsedTime, bool residualReportingEnabled,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
//
// please refer to solve1DStep() in DyTGSContactPrep.cpp for a description of the parameters and some of the logic
//
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
PxVec3 linVel0(b0LinVel.x, b0LinVel.y, b0LinVel.z);
PxVec3 linVel1(b1LinVel.x, b1LinVel.y, b1LinVel.z);
PxVec3 angVel0(b0AngVel.x, b0AngVel.y, b0AngVel.z);
PxVec3 angVel1(b1AngVel.x, b1AngVel.y, b1AngVel.z);
const float4 raInvMass0 = header->rAWorld_invMass0D0[threadIndex];
const float4 rbInvMass1 = header->rBWorld_invMass1D1[threadIndex];
float invMass0 = ref0 * raInvMass0.w; //FLoad(header->invMass0D0);
float invMass1 = ref1 * rbInvMass1.w; //FLoad(header->invMass1D1);
const PxVec3 raPrev(raInvMass0.x, raInvMass0.y, raInvMass0.z);
const PxVec3 rbPrev(rbInvMass1.x, rbInvMass1.y, rbInvMass1.z);
const PxVec3 ra = iData0.deltaBody2World.q.rotate(raPrev);
const PxVec3 rb = iData1.deltaBody2World.q.rotate(rbPrev);
const PxVec3 raMotion = (ra + linDelta0) - raPrev;
const PxVec3 rbMotion = (rb + linDelta1) - rbPrev;
float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];
const uchar4 rowCounts_breakable_orthoAxisCount = header->rowCounts_breakable_orthoAxisCount[threadIndex];
const PxU32 rowCount = rowCounts_breakable_orthoAxisCount.x;
const float4 ang0Ortho0_recipResponseW = header->angOrthoAxis0_recipResponseW[0][threadIndex];
const float4 ang0Ortho1_recipResponseW = header->angOrthoAxis0_recipResponseW[1][threadIndex];
const float4 ang0Ortho2_recipResponseW = header->angOrthoAxis0_recipResponseW[2][threadIndex];
const float4 ang1OrthoAxis0_ErrorW = header->angOrthoAxis1_ErrorW[0][threadIndex];
const float4 ang1OrthoAxis1_ErrorW = header->angOrthoAxis1_ErrorW[1][threadIndex];
const float4 ang1OrthoAxis2_ErrorW = header->angOrthoAxis1_ErrorW[2][threadIndex];
const float recipResponse0 = ang0Ortho0_recipResponseW.w;
const float recipResponse1 = ang0Ortho1_recipResponseW.w;
const float recipResponse2 = ang0Ortho2_recipResponseW.w;
const PxVec3 ang0Ortho0(ang0Ortho0_recipResponseW.x, ang0Ortho0_recipResponseW.y, ang0Ortho0_recipResponseW.z);
const PxVec3 ang0Ortho1(ang0Ortho1_recipResponseW.x, ang0Ortho1_recipResponseW.y, ang0Ortho1_recipResponseW.z);
const PxVec3 ang0Ortho2(ang0Ortho2_recipResponseW.x, ang0Ortho2_recipResponseW.y, ang0Ortho2_recipResponseW.z);
const PxVec3 ang1Ortho0(ang1OrthoAxis0_ErrorW.x, ang1OrthoAxis0_ErrorW.y, ang1OrthoAxis0_ErrorW.z);
const PxVec3 ang1Ortho1(ang1OrthoAxis1_ErrorW.x, ang1OrthoAxis1_ErrorW.y, ang1OrthoAxis1_ErrorW.z);
const PxVec3 ang1Ortho2(ang1OrthoAxis2_ErrorW.x, ang1OrthoAxis2_ErrorW.y, ang1OrthoAxis2_ErrorW.z);
PxReal error0 = ang1OrthoAxis0_ErrorW.w + (ang0Ortho0.dot(angDelta0) - ang1Ortho0.dot(angDelta1));
PxReal error1 = ang1OrthoAxis1_ErrorW.w + (ang0Ortho1.dot(angDelta0) - ang1Ortho1.dot(angDelta1));
PxReal error2 = ang1OrthoAxis2_ErrorW.w + (ang0Ortho2.dot(angDelta0) - ang1Ortho2.dot(angDelta1));
for (PxU32 i = 0; i < rowCount; ++i)
{
PxgTGSBlockSolverConstraint1DCon& ccon = baseCon[i];
const float4 _clinVel0_coeff0W = ccon.lin0XYZ_initBiasOrCoeff0[threadIndex];
const float4 _clinVel1_coeff1W = ccon.lin1XYZ_biasScaleOrCoeff1[threadIndex];
const float4 _cangVel0_coeff2W = ccon.ang0XYZ_velMultiplierOrCoeff2[threadIndex];
const float4 _cangVel1_coeff3W = ccon.ang1XYZ_velTargetOrCoeff3[threadIndex];
const PxVec3 clinVel0(_clinVel0_coeff0W.x, _clinVel0_coeff0W.y, _clinVel0_coeff0W.z);
const PxVec3 clinVel1(_clinVel1_coeff1W.x, _clinVel1_coeff1W.y, _clinVel1_coeff1W.z);
const PxVec3 cangVel0_(_cangVel0_coeff2W.x, _cangVel0_coeff2W.y, _cangVel0_coeff2W.z);
const PxVec3 cangVel1_(_cangVel1_coeff3W.x, _cangVel1_coeff3W.y, _cangVel1_coeff3W.z);
PxReal initBias = _clinVel0_coeff0W.w;
const PxReal biasScale = _clinVel1_coeff1W.w;
const PxReal velMultiplier = _cangVel0_coeff2W.w;
const PxReal targetVel = _cangVel1_coeff3W.w;
const PxVec3 cangVel0 = cangVel0_ + ra.cross(clinVel0);
const PxVec3 cangVel1 = cangVel1_ + rb.cross(clinVel1);
const PxU32 flags = ccon.flags[threadIndex];
const PxReal maxBias = ccon.maxBias[threadIndex];
const PxReal minBias = Dy::computeMinBiasTGS(flags, maxBias);
PxVec3 raXnI = iData0.sqrtInvInertia * cangVel0;
PxVec3 rbXnI = iData1.sqrtInvInertia * cangVel1;
//KS - TODO - orthogonalization here...
if (flags & DY_SC_FLAG_ORTHO_TARGET)
{
const PxReal proj0 = (raXnI.dot(ang0Ortho0) +
rbXnI.dot(ang1Ortho0)) * recipResponse0;
const PxReal proj1 = (raXnI.dot(ang0Ortho1) +
rbXnI.dot(ang1Ortho1)) * recipResponse1;
const PxReal proj2 = (raXnI.dot(ang0Ortho2) +
rbXnI.dot(ang1Ortho2)) * recipResponse2;
const PxVec3 delta0 = ang0Ortho0 * proj0 + ang0Ortho1 * proj1 + ang0Ortho2 * proj2;
const PxVec3 delta1 = ang1Ortho0 * proj0 + ang1Ortho1 * proj1 + ang1Ortho2 * proj2;
raXnI = raXnI - delta0;
rbXnI = rbXnI - delta1;
const PxReal orthoBasisError = biasScale * (error0 * proj0 + error1 * proj1 + error2 * proj2);
initBias = initBias - orthoBasisError;
}
// If biasScale is zero, initBias must also be zero.
// biasScale is enforced to zero before the velocity itertaion (e.g., in conclude1DBlockTGS), but not initBias.
// Thus, explicitly resetting it to zero here for velocity iteration.
initBias = (biasScale == 0.f) ? 0.f : initBias;
const bool isSpringConstraint = (flags & DY_SC_FLAG_SPRING);
const PxReal errorDelta = Dy::computeResolvedGeometricErrorTGS(raMotion, rbMotion, clinVel0, clinVel1,
angDelta0, angDelta1, raXnI, rbXnI,
ccon.angularErrorScale[threadIndex],
isSpringConstraint, targetVel, elapsedTime);
const PxReal resp0 = invMass0 * clinVel0.dot(clinVel0) + raXnI.dot(raXnI) * invInertiaScale0;
const PxReal resp1 = invMass1 * clinVel1.dot(clinVel1) + rbXnI.dot(rbXnI) * invInertiaScale1;
//KS - may be a - required here...
const PxReal response = resp0 + resp1;
const PxReal recipResponse = response > 0.f ? 1.f / response : 0.f;
const PxReal vMul = isSpringConstraint ? velMultiplier : recipResponse * velMultiplier;
const PxReal unclampedBias = initBias + errorDelta * biasScale;
const PxReal bias = PxClamp(unclampedBias, minBias, maxBias);
const PxReal constant = isSpringConstraint ? bias + targetVel : recipResponse * (bias + targetVel);
const float appliedForce = ccon.appliedForce[threadIndex];//FLoad(c.appliedForce);
//const FloatV targetVel = FLoad(c.targetVelocity);
const float maxImpulse = ccon.maxImpulse[threadIndex];//FLoad(c.maxImpulse);
const float minImpulse = ccon.minImpulse[threadIndex];//FLoad(c.minImpulse);
//const PxVec3 v0 = linVel0.multiply(clinVel0) + angVel0.multiply(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
//const PxVec3 v1 = linVel1.multiply(clinVel1) + angVel1.multiply(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));
const float v0 = linVel0.dot(clinVel0) + angVel0.dot(raXnI);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
const float v1 = linVel1.dot(clinVel1) + angVel1.dot(rbXnI);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));
const float normalVel = v0 - v1;
const float unclampedForce = appliedForce + (vMul * normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);
ccon.appliedForce[threadIndex] = clampedForce; // FStore(clampedForce);
if(residualReportingEnabled)
ccon.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
linVel0 = linVel0 + clinVel0 * (deltaF * invMass0);//V3ScaleAdd(clinVel0, FMul(deltaF, invMass0), linVel0);
linVel1 = linVel1 - clinVel1 * (deltaF * invMass1);//V3NegScaleSub(clinVel1, FMul(deltaF, invMass1), linVel1);
angVel0 = angVel0 + raXnI * deltaF * invInertiaScale0;//V3ScaleAdd(cangVel0, deltaF, angVel0);
angVel1 = angVel1 - rbXnI * deltaF * invInertiaScale1;//V3NegScaleSub(cangVel1, deltaF, angVel1);
}
b0LinVel = PxVec3(linVel0.x, linVel0.y, linVel0.z);
b0AngVel = PxVec3(angVel0.x, angVel0.y, angVel0.z);
b1LinVel = PxVec3(linVel1.x, linVel1.y, linVel1.z);
b1AngVel = PxVec3(angVel1.x, angVel1.y, angVel1.z);
}
// Using the same logic as the previous implementation, but mass-splitting is additionally performed per sub-timestep.
// Required data is packed and stored differently from the previous implementation to split mass at each sub-timestep;
// i.e., mass-related terms are computed at every sub-timestep.
static __device__ PX_FORCE_INLINE void solveExt1DBlockTGS(const PxgBlockConstraintBatch& batch,
Cm::UnAlignedSpatialVector& vel0,
Cm::UnAlignedSpatialVector& vel1,
const Cm::UnAlignedSpatialVector& motion0,
const Cm::UnAlignedSpatialVector& motion1,
const PxU32 threadIndex,
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers,
PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
PxgArticulationBlockResponse* PX_RESTRICT artiResponse,
const PxQuat& deltaQ0, const PxQuat& deltaQ1,
const PxReal elapsedTime,
Cm::UnAlignedSpatialVector& impluse0,
Cm::UnAlignedSpatialVector& impluse1,
bool residualReportingEnabled,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
const PxgArticulationBlockResponse* PX_RESTRICT responses = &artiResponse[batch.mArticulationResponseIndex];
PxVec3 linVel0 = vel0.bottom;
PxVec3 linVel1 = vel1.bottom;
PxVec3 angVel0 = vel0.top;
PxVec3 angVel1 = vel1.top;
const float4 raInvMass0 = header->rAWorld_invMass0D0[threadIndex];
const float4 rbInvMass1 = header->rBWorld_invMass1D1[threadIndex];
float invMass0 = ref0 * raInvMass0.w; //FLoad(header->invMass0D0);
float invMass1 = ref1 * rbInvMass1.w; //FLoad(header->invMass1D1);
float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];
const PxVec3 raPrev(raInvMass0.x, raInvMass0.y, raInvMass0.z);
const PxVec3 rbPrev(rbInvMass1.x, rbInvMass1.y, rbInvMass1.z);
const PxVec3 ra = deltaQ0.rotate(raPrev);
const PxVec3 rb = deltaQ1.rotate(rbPrev);
const PxVec3 raMotion = (ra + motion0.bottom) - raPrev;
const PxVec3 rbMotion = (rb + motion1.bottom) - rbPrev;
const uchar4 rowCounts_breakable_orthoAxisCount = header->rowCounts_breakable_orthoAxisCount[threadIndex];
const PxU32 rowCount = rowCounts_breakable_orthoAxisCount.x;
const PxReal cfm = header->cfm[threadIndex];
PxVec3 li0(0.f, 0.f, 0.f);
PxVec3 li1(0.f, 0.f, 0.f);
PxVec3 ai0(0.f, 0.f, 0.f);
PxVec3 ai1(0.f, 0.f, 0.f);
for (PxU32 i = 0; i < rowCount; ++i)
{
PxgTGSBlockSolverConstraint1DCon& ccon = baseCon[i];
const PxgArticulationBlockResponse& response = responses[i];
const float4 _clinVel0_coeff0W = ccon.lin0XYZ_initBiasOrCoeff0[threadIndex];//V3LoadA(c.lin0);
const float4 _clinVel1_coeff1W = ccon.lin1XYZ_biasScaleOrCoeff1[threadIndex];//V3LoadA(c.lin1);
const float4 _cangVel0_coeff2W = ccon.ang0XYZ_velMultiplierOrCoeff2[threadIndex];//V3LoadA(c.ang0);
const float4 _cangVel1_coeff3W = ccon.ang1XYZ_velTargetOrCoeff3[threadIndex];//V3LoadA(c.ang1);
const PxVec3 clinVel0(_clinVel0_coeff0W.x, _clinVel0_coeff0W.y, _clinVel0_coeff0W.z);
const PxVec3 clinVel1(_clinVel1_coeff1W.x, _clinVel1_coeff1W.y, _clinVel1_coeff1W.z);
const PxVec3 cangVel0(_cangVel0_coeff2W.x, _cangVel0_coeff2W.y, _cangVel0_coeff2W.z);
const PxVec3 cangVel1(_cangVel1_coeff3W.x, _cangVel1_coeff3W.y, _cangVel1_coeff3W.z);
const PxU32 flags = ccon.flags[threadIndex];
const bool isSpringConstraint = (flags & DY_SC_FLAG_SPRING);
const PxReal resp0 = ref0 * ccon.resp0[threadIndex];
const PxReal resp1 = ref1 * ccon.resp1[threadIndex];
float unitResponse = resp0 + resp1;
unitResponse += cfm;
//https://omniverse-jirasw.nvidia.com/browse/PX-4383
const PxReal minRowResponse = DY_ARTICULATION_MIN_RESPONSE;
const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);
const bool isAccelerationSpring = (flags & DY_SC_FLAG_ACCELERATION_SPRING);
const PxReal coeff0 = _clinVel0_coeff0W.w;
const PxReal coeff1 = _clinVel1_coeff1W.w;
const PxReal coeff2 = _cangVel0_coeff2W.w;
const PxReal coeff3 = _cangVel1_coeff3W.w;
const PxReal geometricError = ccon.geometricError[threadIndex];
PxReal biasScale, velMultiplier, initBias, targetVel;
Dy::compute1dConstraintSolverConstantsTGS(isSpringConstraint, isAccelerationSpring, geometricError,
unitResponse, recipResponse, coeff0, coeff1, coeff2,
coeff3, biasScale, velMultiplier, initBias, targetVel);
// If biasScale is zero, initBias must also be zero.
// biasScale is enforced to zero before the velocity itertaion (e.g., in conclude1DBlockTGS), but not initBias.
// Thus, explicitly resetting it to zero here for velocity iteration.
initBias = (biasScale == 0.f) ? 0.f : initBias;
const PxReal errorDelta = Dy::computeResolvedGeometricErrorTGS(raMotion, rbMotion, clinVel0, clinVel1,
motion0.top, motion1.top, cangVel0, cangVel1,
ccon.angularErrorScale[threadIndex],
isSpringConstraint, targetVel, elapsedTime);
const PxReal maxBias = ccon.maxBias[threadIndex];
const PxReal vMul = isSpringConstraint ? velMultiplier : recipResponse * velMultiplier;
const float appliedForce = ccon.appliedForce[threadIndex];
const PxReal unclampedBias = initBias + errorDelta * biasScale;
const PxReal minBias = Dy::computeMinBiasTGS(flags, maxBias);
const PxReal bias = PxClamp(unclampedBias, minBias, maxBias);
const PxReal constant = isSpringConstraint ? (bias + targetVel) : recipResponse * (bias + targetVel);
const float maxImpulse = ccon.maxImpulse[threadIndex];//FLoad(c.maxImpulse);
const float minImpulse = ccon.minImpulse[threadIndex];//FLoad(c.minImpulse);
const float v0 = linVel0.dot(clinVel0) + angVel0.dot(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
const float v1 = linVel1.dot(clinVel1) + angVel1.dot(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));
const float normalVel = v0 - v1;
const float unclampedForce = appliedForce + (vMul * normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);
ccon.appliedForce[threadIndex] = clampedForce; // FStore(clampedForce);
if(residualReportingEnabled)
ccon.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
li0 = clinVel0 * deltaF + li0;
ai0 = cangVel0 * deltaF + ai0;
li1 = clinVel1 * deltaF + li1;
ai1 = cangVel1 * deltaF + ai1;
PxVec3 linVa = ref0 * PxVec3(response.deltaRALin_x[threadIndex], response.deltaRALin_y[threadIndex], response.deltaRALin_z[threadIndex]);
PxVec3 angVa = ref0 * PxVec3(response.deltaRAAng_x[threadIndex], response.deltaRAAng_y[threadIndex], response.deltaRAAng_z[threadIndex]);
PxVec3 linVb = ref1 * PxVec3(response.deltaRBLin_x[threadIndex], response.deltaRBLin_y[threadIndex], response.deltaRBLin_z[threadIndex]);
PxVec3 angVb = ref1 * PxVec3(response.deltaRBAng_x[threadIndex], response.deltaRBAng_y[threadIndex], response.deltaRBAng_z[threadIndex]);
linVel0 = linVa * deltaF + linVel0;
angVel0 = angVa * deltaF + angVel0;
linVel1 = linVb * deltaF + linVel1;
angVel1 = angVb * deltaF + angVel1;
}
vel0.top = angVel0; vel0.bottom = linVel0;
vel1.top = angVel1; vel1.bottom = linVel1;
impluse0.top = li0 * invMass0; impluse0.bottom = ai0 * invInertiaScale0;
impluse1.top = li1 * invMass1; impluse1.bottom = ai1 * invInertiaScale1;
}
static __device__ void conclude1DBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rows)
{
using namespace physx;
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT base = &rows[batch.startConstraintIndex];
const uchar4 rowCount = header->rowCounts_breakable_orthoAxisCount[threadIndex];
for (PxU32 i = 0; i<rowCount.x; i++)
{
PxgTGSBlockSolverConstraint1DCon& c = base[i];
// Previous data, biasScale, initBias, velTarget, and velMultiplier are repacked using other coefficients.
// See "PxgTGSBlockSolverConstraint1DCon" for details.
if(!(c.flags[threadIndex] & DY_SC_FLAG_KEEP_BIAS))
{
c.lin1XYZ_biasScaleOrCoeff1[threadIndex].w = 0.f; // setting biasScale to zero. Make sure to enforce
// initialBias to be zero, when biasScale is zero.
}
if(c.flags[threadIndex] & DY_SC_FLAG_SPRING)
{
// see CPU version for an explanation
c.lin0XYZ_initBiasOrCoeff0[threadIndex].w = 0.f;
c.lin1XYZ_biasScaleOrCoeff1[threadIndex].w = 0.f;
c.ang0XYZ_velMultiplierOrCoeff2[threadIndex].w = 0.0f;
c.ang1XYZ_velTargetOrCoeff3[threadIndex].w = 0.0f;
}
}
}
static __device__ void writeBack1DBlockTGS(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT headers,
PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT rowsCon, PxgConstraintWriteback* constraintWriteBacks)
{
const PxgTGSBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgTGSBlockSolverConstraint1DCon* conBase = &rowsCon[batch.startConstraintIndex];
PxU32 forceWritebackOffset = header->writeBackOffset[threadIndex];
const uchar4 rowCounts_breakable_orthoAxisCount = header->rowCounts_breakable_orthoAxisCount[threadIndex];
const PxU8 breakable = rowCounts_breakable_orthoAxisCount.y;
const PxU32 numRows = rowCounts_breakable_orthoAxisCount.x;
if (forceWritebackOffset != 0xFFFFFFFF)
{
PxgConstraintWriteback& writeback = constraintWriteBacks[forceWritebackOffset];
PxVec3 linVel(0), angVel(0);
PxReal constraintErrorSq = 0.0f;
for (PxU32 i = 0; i < numRows; ++i)
{
PxgTGSBlockSolverConstraint1DCon& con = conBase[i];
if (con.flags[threadIndex] & DY_SC_FLAG_OUTPUT_FORCE)
{
const float4 lin0XYZ_ErrorW = con.lin0XYZ_initBiasOrCoeff0[threadIndex];
const float4 ang0WriteBack_VMW = con.ang0XYZ_velMultiplierOrCoeff2[threadIndex];
const PxVec3 lin0(lin0XYZ_ErrorW.x, lin0XYZ_ErrorW.y, lin0XYZ_ErrorW.z);
const PxVec3 ang0WriteBack(ang0WriteBack_VMW.x, ang0WriteBack_VMW.y, ang0WriteBack_VMW.z);
const PxReal appliedForce = con.appliedForce[threadIndex];
linVel += lin0 * appliedForce;
angVel += ang0WriteBack *appliedForce;
}
PxReal err = con.residual[threadIndex];
constraintErrorSq += err * err;
}
const float4 body0WorldOffset_InvMass0D0 = header->rAWorld_invMass0D0[threadIndex];
const PxVec3 body0WorldOffset(body0WorldOffset_InvMass0D0.x, body0WorldOffset_InvMass0D0.y, body0WorldOffset_InvMass0D0.z);
angVel -= body0WorldOffset.cross(linVel);
const PxU32 broken = breakable ? PxU32((linVel.magnitude() > header->linBreakImpulse[threadIndex]) || (angVel.magnitude() > header->angBreakImpulse[threadIndex])) : 0;
writeback.angularImpulse_residual = make_float4(angVel.x, angVel.y, angVel.z, constraintErrorSq);
writeback.linearImpulse_broken = make_float4(linVel.x, linVel.y, linVel.z, broken ? -0.0f : 0.0f);
}
}
#endif