feat(physics): wire physx sdk into build

2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions
--- a/engine/third_party/physx/source/gpuarticulation/src/CUDA/articulationDynamic.cuh
+++ b/engine/third_party/physx/source/gpuarticulation/src/CUDA/articulationDynamic.cuh
@@ -0,0 +1,449 @@
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef	__ARTI_DYNAMIC_CUH__
+#define	__ARTI_DYNAMIC_CUH__
+
+#include "PxgArticulation.h"
+#include "PxgArticulationLink.h"
+#include "DyFeatherstoneArticulationUtils.h"
+#include "DyFeatherstoneArticulation.h"
+
+//This function stores Q-stZ to mDeferredQstZ
+static __device__ Cm::UnAlignedSpatialVector propagateImpulseW_0(const PxVec3& childToParent,
+	PxgArticulationBlockDofData* PX_RESTRICT dofData, const Cm::UnAlignedSpatialVector& Z,
+	const PxU32 dofCount, const PxU32 threadIndexInWarp,
+	const PxReal* PX_RESTRICT jointForce = NULL, const PxReal jointForceMultiplier = 1.0f)
+{
+	Cm::UnAlignedSpatialVector temp = Z;
+	Cm::UnAlignedSpatialVector sas[3];
+	Cm::UnAlignedSpatialVector isInvD[3];
+	PxReal jf[3];
+
+// the split into two separate loops is an optimization that allows dispatching the loads as early as possible.
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			sas[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
+			isInvD[ind] = loadSpatialVector(dofData[ind].mIsInvDW, threadIndexInWarp);
+			jf[ind] = (jointForce ? jointForce[ind] * jointForceMultiplier : 0.0f);
+		}
+	}
+
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			const PxReal stZ = jf[ind] - sas[ind].innerProduct(Z);
+			dofData[ind].mDeferredQstZ[threadIndexInWarp] += stZ;
+			temp += isInvD[ind] * stZ;
+		}
+	}
+
+	//parent space's spatial zero acceleration impulse
+	return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, temp);
+}
+
+static __device__ Cm::UnAlignedSpatialVector propagateImpulseWTemp(const PxVec3& childToParent,
+	PxgArticulationBlockDofData* PX_RESTRICT dofData, const Cm::UnAlignedSpatialVector& Z,
+	const PxU32 dofCount, const PxU32 threadIndexInWarp)
+{
+	Cm::UnAlignedSpatialVector temp = Z;
+
+	assert(dofCount<=3);
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if(ind<dofCount)
+		{
+			const Cm::UnAlignedSpatialVector sa = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
+			const Cm::UnAlignedSpatialVector isInvD = loadSpatialVector(dofData[ind].mIsInvDW, threadIndexInWarp);
+			const PxReal stZ = -sa.innerProduct(Z);
+			dofData[ind].mTmpQstZ[threadIndexInWarp] += stZ;
+
+			temp += isInvD * stZ;
+		}
+	}
+
+	//parent space's spatial zero acceleration impulse
+	return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, temp);
+}
+
+static __device__ Cm::UnAlignedSpatialVector propagateImpulseW_1(
+	const PxVec3& childToParent,
+	const PxgArticulationBlockDofData* PX_RESTRICT dofData, 
+	const Cm::UnAlignedSpatialVector& Z,
+	const PxReal* jointDofImpulses, const PxU32 dofCount,
+	const PxU32 threadIndexInWarp, 
+	PxReal* qstZ)
+{
+	Cm::UnAlignedSpatialVector temp = Z;
+
+	assert(dofCount<=3);
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if(ind<dofCount)
+		{
+			const Cm::UnAlignedSpatialVector sa = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
+			const Cm::UnAlignedSpatialVector isInvD = loadSpatialVector(dofData[ind].mIsInvDW, threadIndexInWarp);
+			const PxReal jointDofImpulse = jointDofImpulses ? jointDofImpulses[ind] : 0.0f;
+			const PxReal QMinusSTZ = jointDofImpulse - sa.innerProduct(Z);
+			qstZ[ind] += QMinusSTZ;
+
+			temp += isInvD * QMinusSTZ;
+		}
+	}
+
+	//parent space's spatial zero acceleration impulse
+	return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, temp);
+}
+
+static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p, const float3* invStIsT,
+	const Cm::UnAlignedSpatialVector* motionMatrix, const Cm::UnAlignedSpatialVector& hDeltaV, const PxU32 dofCount,
+	const Cm::UnAlignedSpatialVector* IsW, const PxReal* qstZ)
+{
+	Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
+
+	//Convert parent velocity change into an impulse
+	PxReal tJointDelta[3] = { 0.f, 0.f, 0.f };
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			//stI * pAcceleration
+			const PxReal temp = IsW[ind].innerProduct(pDeltaV);
+
+			tJointDelta[ind] = (qstZ[ind] - temp);
+		}
+	}
+
+	assert(dofCount<=3);
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if(ind<dofCount)
+		{
+			const float3 iStIsTi = invStIsT[ind];
+
+			const PxReal jDelta = iStIsTi.x * tJointDelta[0]
+								+ iStIsTi.y * tJointDelta[1]
+								+ iStIsTi.z * tJointDelta[2];
+
+			pDeltaV += motionMatrix[ind] * jDelta;
+		}
+	}
+
+	return pDeltaV;
+}
+
+static __device__ Cm::UnAlignedSpatialVector computeSpatialJointDelta(
+	const PxgArticulationBlockDofData* PX_RESTRICT dofData,
+	const PxReal* PX_RESTRICT QSTZMinusISDotTranslatedParentDeltaV, PxReal* PX_RESTRICT jointDeltaDofSpeeds, const PxU32 dofCount, 
+	const PxU32 threadIndexInWarp)
+{
+	Cm::UnAlignedSpatialVector sas[3];
+
+	// the split into two separate loops is an optimization that allows dispatching the loads as early as possible.
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			sas[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
+		}
+	}
+
+	Cm::UnAlignedSpatialVector jointSpatialDeltaV(PxVec3(0.f), PxVec3(0.f));
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			const float iStIsTi_x = dofData[ind].mInvStIsT_x[threadIndexInWarp];
+			const float iStIsTi_y = dofData[ind].mInvStIsT_y[threadIndexInWarp];
+			const float iStIsTi_z = dofData[ind].mInvStIsT_z[threadIndexInWarp];
+
+			const PxReal jDelta = iStIsTi_x * QSTZMinusISDotTranslatedParentDeltaV[0]
+								+ iStIsTi_y * QSTZMinusISDotTranslatedParentDeltaV[1]
+								+ iStIsTi_z * QSTZMinusISDotTranslatedParentDeltaV[2];
+
+			if(jointDeltaDofSpeeds)
+				jointDeltaDofSpeeds[ind] = jDelta;
+
+			jointSpatialDeltaV += sas[ind] * jDelta;
+		}
+	}
+
+	return jointSpatialDeltaV;
+}
+
+//This function use mDeferredQstZ
+static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p,
+	const PxgArticulationBlockDofData* PX_RESTRICT dofData,
+	const Cm::UnAlignedSpatialVector& hDeltaV,
+	const PxU32 dofCount, PxReal* jointDeltaDofSpeeds, const PxU32 threadIndexInWarp)
+{
+	const Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
+
+	//[(Q - S^T *Z)] - [(I*S).innerProduct(translated(parentDeltaV))]
+	PxReal QSTZMinusISDotTranslatedParentDeltaV[3] = { 0.f, 0.f, 0.f };
+	Cm::UnAlignedSpatialVector IsW;
+#pragma unroll(3)
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			IsW = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
+			//stI * pAcceleration
+			const PxReal temp = IsW.innerProduct(pDeltaV);
+
+			QSTZMinusISDotTranslatedParentDeltaV[ind] = (dofData[ind].mDeferredQstZ[threadIndexInWarp] - temp);
+		}
+	}
+
+	const Cm::UnAlignedSpatialVector jointSpatialDeltaV = computeSpatialJointDelta(dofData, QSTZMinusISDotTranslatedParentDeltaV, jointDeltaDofSpeeds, dofCount, threadIndexInWarp);
+
+	return pDeltaV + jointSpatialDeltaV;
+}
+
+//This function use mTmpQstZ
+static __device__ Cm::UnAlignedSpatialVector propagateAccelerationWTemp(const PxVec3& c2p,
+	const PxgArticulationBlockDofData* PX_RESTRICT dofData,
+	const Cm::UnAlignedSpatialVector& hDeltaV,
+	const PxU32 dofCount, const PxU32 threadIndexInWarp)
+{
+	const Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
+
+	//[(Q - S^T *Z)] - [(I*S).innerProduct(translated(parentDeltaV))]
+	PxReal QSTZMinusISDotTransaltedParentDeltaV[3] = { 0.f, 0.f, 0.f };
+
+#pragma unroll(3)
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			const Cm::UnAlignedSpatialVector IsW = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
+			//stI * pAcceleration
+			const PxReal temp = IsW.innerProduct(pDeltaV);
+
+			QSTZMinusISDotTransaltedParentDeltaV[ind] = (dofData[ind].mTmpQstZ[threadIndexInWarp] - temp);
+		}
+	}
+
+	const Cm::UnAlignedSpatialVector jointSpatialDeltaV = computeSpatialJointDelta(dofData, QSTZMinusISDotTransaltedParentDeltaV, NULL, dofCount, threadIndexInWarp);
+
+	return pDeltaV + jointSpatialDeltaV;
+}
+
+//This function use qstZ as input
+static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p,
+	const PxgArticulationBlockDofData* PX_RESTRICT dofData,
+	const Cm::UnAlignedSpatialVector& hDeltaV,
+	const PxU32 dofCount, const PxU32 threadIndexInWarp,
+	const PxReal* qstZ)
+{
+	const Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
+
+	//[(Q - S^T *Z)] - [(I*S).innerProduct(translated(parentDeltaV))]
+	PxReal QSTZMinusISDotTransaltedParentDeltaV[3] = { 0.f, 0.f, 0.f };
+	Cm::UnAlignedSpatialVector IsW;
+#pragma unroll(3)
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			IsW = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
+			//stI * pAcceleration
+			const PxReal temp = IsW.innerProduct(pDeltaV);
+
+			QSTZMinusISDotTransaltedParentDeltaV[ind] = (qstZ[ind] - temp);
+		}
+	}
+
+	const Cm::UnAlignedSpatialVector jointSpatialDeltaV = computeSpatialJointDelta(dofData, QSTZMinusISDotTransaltedParentDeltaV, NULL, dofCount, threadIndexInWarp);
+
+	return pDeltaV + jointSpatialDeltaV;
+}
+
+static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(
+	PxgArticulationBlockLinkData& linkData,
+	PxgArticulationBlockDofData* dofData,
+	const PxU32 dofCount,
+	const Cm::UnAlignedSpatialVector& hDeltaV,
+	const PxU32 threadIndexInWarp)
+{
+	const float c2px = linkData.mRw_x[threadIndexInWarp];
+	const float c2py = linkData.mRw_y[threadIndexInWarp];
+	const float c2pz = linkData.mRw_z[threadIndexInWarp];
+
+	float3 invStIsT[3];
+	PxReal tJointDelta[3] = { 0.f, 0.f, 0.f };
+	Cm::UnAlignedSpatialVector isW[3];
+	Cm::UnAlignedSpatialVector mMotionMatrix[3];
+	PxReal jVel[3];
+
+// the split into three separate loops is an optimization that allows dispatching the loads as early as possible.
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			isW[ind] = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
+			tJointDelta[ind] = (dofData[ind].mDeferredQstZ[threadIndexInWarp]);
+			jVel[ind] = dofData[ind].mJointVelocities[threadIndexInWarp];
+			invStIsT[ind] = make_float3(dofData[ind].mInvStIsT_x[threadIndexInWarp], dofData[ind].mInvStIsT_y[threadIndexInWarp], dofData[ind].mInvStIsT_z[threadIndexInWarp]);
+			mMotionMatrix[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
+		}
+	}
+
+	Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(PxVec3(-c2px, -c2py, -c2pz), hDeltaV); //parent velocity change
+	
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			tJointDelta[ind] -= isW[ind].innerProduct(pDeltaV);
+		}
+	}
+
+#pragma unroll 3
+	for (PxU32 ind = 0; ind < 3; ++ind)
+	{
+		if (ind < dofCount)
+		{
+			const PxReal jDelta = invStIsT[ind].x * tJointDelta[0] + invStIsT[ind].y * tJointDelta[1]
+				+ invStIsT[ind].z * tJointDelta[2];
+
+			dofData[ind].mJointVelocities[threadIndexInWarp] = jVel[ind] + jDelta;
+
+			pDeltaV += mMotionMatrix[ind] * jDelta;
+		}
+	}
+
+	return pDeltaV;
+}
+
+// There is a another version of this function in forwardDynamic2.cu which additionally writes
+// the link velocity to a global buffer
+static void __device__ PxcFsFlushVelocity(PxgArticulationBlockData& articulation,
+	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
+	PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
+	PxU32 linkCount, bool fixBase, const PxU32 threadIndexInWarp)
+{
+	Cm::UnAlignedSpatialVector deltaV = Cm::UnAlignedSpatialVector::Zero();
+	Cm::UnAlignedSpatialVector deferredZ = -loadSpatialVector(articulation.mRootDeferredZ, threadIndexInWarp);
+	if (!fixBase)
+	{
+		//ArticulationLink& link = links[0];
+
+		// PT: preload data
+		const Cm::UnAlignedSpatialVector motionVelocity0 = loadSpatialVector(artiLinks[0].mMotionVelocity, threadIndexInWarp);
+		const Cm::UnAlignedSpatialVector solverSpatialDeltaVel0 = loadSpatialVector(artiLinks[0].mSolverSpatialDeltaVel, threadIndexInWarp);
+
+		Dy::SpatialMatrix invInertia;
+		loadSpatialMatrix(articulation.mInvSpatialArticulatedInertia, threadIndexInWarp, invInertia);
+		//deltaV = invInertia * (-loadSpatialVector(artiLinks[0].mDeferredZ, threadIndexInWarp));
+
+		deltaV = invInertia * deferredZ;
+
+		//motionVelocities[0] += deltaV[0];
+		storeSpatialVector(artiLinks[0].mMotionVelocity, motionVelocity0 + deltaV, threadIndexInWarp);
+		//storeSpatialVector(artiLinks[0].mDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
+		storeSpatialVector(articulation.mRootDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
+		storeSpatialVector(artiLinks[0].mSolverSpatialDeltaVel, solverSpatialDeltaVel0 + deltaV, threadIndexInWarp);	
+	}
+
+	storeSpatialVector(artiLinks[0].mScratchDeltaV, deltaV, threadIndexInWarp);
+	addSpatialVector(artiLinks[0].mConstraintForces, deferredZ, threadIndexInWarp);
+
+	storeSpatialVector(articulation.mCommonLinkDeltaVelocity, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
+
+	PxgArticulationBlockDofData* dofs = artiDofs;
+
+	if (linkCount > 1)
+	{
+		Cm::UnAlignedSpatialVector nextMotionV = loadSpatialVector(artiLinks[1].mMotionVelocity, threadIndexInWarp);
+		PxU32 nextNbDofs = artiLinks[1].mDofs[threadIndexInWarp];
+		PxU32 nextParent = artiLinks[1].mParents[threadIndexInWarp];
+		//Cm::UnAlignedSpatialVector nextDeferredZ = loadSpatialVector(artiLinks[1].mDeferredZ, threadIndexInWarp);
+
+		for (PxU32 i = 1; i < linkCount; i++)
+		{
+			PxgArticulationBlockLinkData& tLink = artiLinks[i];
+			const PxU32 nbDofs = nextNbDofs;
+			const PxU32 parent = nextParent;
+
+			const Cm::UnAlignedSpatialVector preloadedConstraintForces = loadSpatialVector(tLink.mConstraintForces, threadIndexInWarp);
+			const Cm::UnAlignedSpatialVector preloadedSolverSpatialDeltaVel = loadSpatialVector(tLink.mSolverSpatialDeltaVel, threadIndexInWarp);
+
+			Cm::UnAlignedSpatialVector motionV = nextMotionV;
+			//const Cm::UnAlignedSpatialVector deferredZ = nextDeferredZ;
+
+			//storeSpatialVector(tLink.mDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
+
+			if ((i + 1) < linkCount)
+			{
+				nextMotionV = loadSpatialVector(artiLinks[i + 1].mMotionVelocity, threadIndexInWarp);
+				nextNbDofs = artiLinks[i + 1].mDofs[threadIndexInWarp];
+				nextParent = artiLinks[i + 1].mParents[threadIndexInWarp];
+				//nextDeferredZ = loadSpatialVector(artiLinks[i + 1].mDeferredZ, threadIndexInWarp);
+			}
+
+			if (parent != (i - 1))
+				deltaV = loadSpatialVector(artiLinks[parent].mScratchDeltaV, threadIndexInWarp);
+
+			deltaV = propagateAccelerationW(tLink, dofs, nbDofs, deltaV, threadIndexInWarp);
+
+			//Accumulate the DeltaVel arising from solver impulses applied to this link.
+			storeSpatialVector(tLink.mSolverSpatialDeltaVel, preloadedSolverSpatialDeltaVel + deltaV, threadIndexInWarp);
+
+			//zeroing mDeferredQstZ
+			for (PxU32 ind = 0; ind < nbDofs; ++ind)
+			{
+				dofs[ind].mDeferredQstZ[threadIndexInWarp] = 0.f;
+			}
+
+			motionV += deltaV;
+
+			storeSpatialVector(tLink.mScratchDeltaV, deltaV, threadIndexInWarp);
+
+			//const PxTransform& tBody2World = poses[i];
+			storeSpatialVector(tLink.mMotionVelocity, motionV, threadIndexInWarp);
+			
+			storeSpatialVector(tLink.mConstraintForces, preloadedConstraintForces + deltaV, threadIndexInWarp);
+
+			dofs += nbDofs;
+		}
+	}
+}
+
+#endif
+