feat(physics): wire physx sdk into build
This commit is contained in:
199
engine/third_party/physx/source/gpusolver/include/PxgConstraint.h
vendored
Normal file
199
engine/third_party/physx/source/gpusolver/include/PxgConstraint.h
vendored
Normal file
@@ -0,0 +1,199 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_H
|
||||
#define PXG_CONSTRAINT_H
|
||||
|
||||
#include "PxvConfig.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
struct PxgSolverBodyData;
|
||||
struct PxConstraintInvMassScale;
|
||||
|
||||
struct PxgSolverContactHeader
|
||||
{
|
||||
float4 invMass0_1_angDom0_1;
|
||||
float4 normal_staticFriction;
|
||||
|
||||
PxU32 flags;
|
||||
PxU32 numNormalConstr;
|
||||
PxU32 forceWritebackOffset;
|
||||
PxReal accumNormalForce;
|
||||
};
|
||||
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverContactHeader) == 48);
|
||||
|
||||
/**
|
||||
\brief A single articulation contact point for the solver.
|
||||
*/
|
||||
struct PxgSolverContactPointExt
|
||||
{
|
||||
PxVec3 angDeltaVA; //12 12
|
||||
PxVec3 linDeltaVA; //12 24
|
||||
PxVec3 angDeltaVB; //12 36
|
||||
PxVec3 linDeltaVB; //12 48
|
||||
PxVec3 raXn; //12 60
|
||||
PxVec3 rbXn; //12 72
|
||||
PxReal velMultiplier; //4 76
|
||||
PxReal maxImpulse; //4 80
|
||||
PxReal biasedErr; //4 84
|
||||
PxReal unbiasedErr; //4 88
|
||||
PxReal appliedForce; //4 92
|
||||
PxU32 padding; //4 96
|
||||
};
|
||||
|
||||
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverContactPointExt) == 96);
|
||||
|
||||
struct PxgSolverFrictionHeader
|
||||
{
|
||||
float4 frictionNormals[2];
|
||||
PxU32 numFrictionConstr;
|
||||
PxReal dynamicFriction;
|
||||
PxU32 broken;
|
||||
};
|
||||
|
||||
/**
|
||||
\brief A single articulation friction constraint for the solver.
|
||||
*/
|
||||
#if PX_VC
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
struct PX_ALIGN_PREFIX(16) PxgSolverContactFrictionExt
|
||||
{
|
||||
PxVec3 angDeltaVA; //12 12
|
||||
PxVec3 linDeltaVA; //12 24
|
||||
PxVec3 angDeltaVB; //12 36
|
||||
PxVec3 linDeltaVB; //12 48
|
||||
PxVec3 raXn; //12 60
|
||||
PxVec3 rbXn; //12 72
|
||||
PxReal velMultiplier; //4 76
|
||||
PxReal targetVel; //4 80
|
||||
PxReal bias; //4 84
|
||||
PxReal appliedForce; //4 88
|
||||
PxU32 padding[2]; //8 96
|
||||
|
||||
} PX_ALIGN_SUFFIX(16);
|
||||
#if PX_VC
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
struct PxgContactParams
|
||||
{
|
||||
PxgSolverContactHeader* contactHeader;
|
||||
PxgSolverFrictionHeader* frictionHeader;
|
||||
PxgSolverContactPointExt* solverContacts;
|
||||
PxgSolverContactFrictionExt* solverFrictions;
|
||||
};
|
||||
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverContactFrictionExt) == 96);
|
||||
|
||||
|
||||
struct PxgTGSSolverContactHeader
|
||||
{
|
||||
float4 dom0_1_angDom0_1; //16
|
||||
float4 normal_maxPenBias; //32
|
||||
|
||||
PxReal staticFriction;
|
||||
PxReal dynamicFriction;
|
||||
PxReal minNormalForce;
|
||||
PxU32 flags; //48
|
||||
|
||||
PxU16 numNormalConstr;
|
||||
PxU16 numFrictionConstr;
|
||||
PxU32 forceWritebackOffset;
|
||||
PxU32 broken;
|
||||
PxU32 pad; //64
|
||||
|
||||
};
|
||||
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgTGSSolverContactHeader) == 64);
|
||||
|
||||
struct PxgTGSSolverContactPointExt
|
||||
{
|
||||
//Grouped together in contiguous memory so we can load all 48 bytes in a single instruction
|
||||
PxVec3 angDeltaVA; //12 12
|
||||
PxVec3 linDeltaVA; //12 24
|
||||
PxVec3 angDeltaVB; //12 36
|
||||
PxVec3 linDeltaVB; //12 48
|
||||
|
||||
//Grouped so we can load 24 bytes in single instruction
|
||||
PxVec3 raXn; //12 60
|
||||
PxVec3 rbXn; //12 72
|
||||
|
||||
//All the loose items - loaded incoherently
|
||||
PxReal separation; //4 76
|
||||
PxReal velMultiplier; //4 80
|
||||
PxReal targetVelocity; //4 84
|
||||
PxReal biasCoefficient; //4 88
|
||||
PxReal maxImpulse; //4 92
|
||||
PxReal appliedForce; //4 96
|
||||
};
|
||||
|
||||
struct PxgTGSSolverFrictionExt
|
||||
{
|
||||
//Grouped together in contiguous memory so we can load all 48 bytes in a single instruction
|
||||
PxVec3 angDeltaVA; //12 12
|
||||
PxVec3 linDeltaVA; //12 24
|
||||
PxVec3 angDeltaVB; //12 36
|
||||
PxVec3 linDeltaVB; //12 48
|
||||
|
||||
//Grouped so we can load 24 bytes in single instruction
|
||||
PxVec3 raXn; //12 60
|
||||
PxVec3 rbXn; //12 72
|
||||
|
||||
//Loose items - loaded incoherently
|
||||
PxVec3 normal; //12 84
|
||||
PxReal error; //4 88
|
||||
PxReal targetVel; //4 92
|
||||
PxReal velMultiplier; //4 96
|
||||
|
||||
PxReal biasScale; //4 100
|
||||
PxReal frictionScale; //4 104
|
||||
PxReal appliedForce; //4 108
|
||||
PxU32 pad; //4 112
|
||||
};
|
||||
|
||||
struct PxgTGSContactParams
|
||||
{
|
||||
PxgTGSSolverContactHeader* contactHeader;
|
||||
PxgTGSSolverContactPointExt* solverContacts;
|
||||
PxgTGSSolverFrictionExt* solverFrictions;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
237
engine/third_party/physx/source/gpusolver/include/PxgConstraintBlock.h
vendored
Normal file
237
engine/third_party/physx/source/gpusolver/include/PxgConstraintBlock.h
vendored
Normal file
@@ -0,0 +1,237 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_BLOCK_H
|
||||
#define PXG_CONSTRAINT_BLOCK_H
|
||||
|
||||
#include "PxvConfig.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "PxgSolverBody.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
|
||||
struct PxgBlockSolverContactHeader
|
||||
{
|
||||
PX_ALIGN(128, float4 invMass0_1_angDom0_1[32]); //512 512
|
||||
PX_ALIGN(128, float4 normal_staticFriction[32]); //1024 512
|
||||
PX_ALIGN(128, PxReal accumNormalForce[32]);
|
||||
//Only used by articulation constraints. Forces the minimum normal force for friction.
|
||||
//Without this, articulations can drift due to no normal force when multi-link systems contact with surfaces.
|
||||
PX_ALIGN(128, PxReal minNormalForce[32]);
|
||||
|
||||
PX_ALIGN(128, PxU32 flags[32]); //1152 128
|
||||
PX_ALIGN(128, PxU32 numNormalConstr[32]); //1280 128
|
||||
PX_ALIGN(128, PxU32 forceWritebackOffset[32]); //1408 128
|
||||
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// recipResponse, velMultipler, biasCoefficient, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them every sub-timestep (or iteration), restitution and cfm are additionally stored.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
PX_ALIGN(128, PxReal restitution[32]);
|
||||
PX_ALIGN(128, PxReal cfm[32]);
|
||||
|
||||
};
|
||||
|
||||
struct PxgBlockSolverFrictionHeader
|
||||
{
|
||||
PX_ALIGN(128, float4 frictionNormals[2][32]); //1024 1024
|
||||
PX_ALIGN(128, PxU32 numFrictionConstr[32]); //1152 128
|
||||
PX_ALIGN(128, PxReal dynamicFriction[32]); //1280 128
|
||||
PX_ALIGN(128, PxU32 broken[32]); //1408 128
|
||||
};
|
||||
|
||||
//PX_COMPILE_TIME_ASSERT(sizeof(PxgBlockSolverContactHeader) == 1280);
|
||||
|
||||
/**
|
||||
\brief A single rigid body contact point for the solver.
|
||||
*/
|
||||
struct PxgBlockSolverContactPoint
|
||||
{
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// unitResponse, recipResponse, velMultiplier, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them at every sub-timestep (or iteration), resp0, resp1, and other relevant data are stored additionally.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
|
||||
PX_ALIGN(128, float4 raXn_targetVelocity[32]);
|
||||
PX_ALIGN(128, float4 rbXn_maxImpulse[32]);
|
||||
PX_ALIGN(128, PxReal appliedForce[32]);
|
||||
|
||||
PX_ALIGN(128, PxReal resp0[32]);
|
||||
PX_ALIGN(128, PxReal resp1[32]);
|
||||
|
||||
// Two coefficients used in "queryReducedCompliantContactCoefficients" and "computeCompliantContactCoefficients"
|
||||
PX_ALIGN(128, PxReal coeff0[32]);
|
||||
PX_ALIGN(128, PxReal coeff1[32]);
|
||||
};
|
||||
|
||||
struct PxgArticulationBlockResponse
|
||||
{
|
||||
PX_ALIGN(128, float deltaRALin_x[32]);
|
||||
PX_ALIGN(128, float deltaRALin_y[32]);
|
||||
PX_ALIGN(128, float deltaRALin_z[32]);
|
||||
PX_ALIGN(128, float deltaRAAng_x[32]);
|
||||
PX_ALIGN(128, float deltaRAAng_y[32]);
|
||||
PX_ALIGN(128, float deltaRAAng_z[32]);
|
||||
PX_ALIGN(128, float deltaRBLin_x[32]);
|
||||
PX_ALIGN(128, float deltaRBLin_y[32]);
|
||||
PX_ALIGN(128, float deltaRBLin_z[32]);
|
||||
PX_ALIGN(128, float deltaRBAng_x[32]);
|
||||
PX_ALIGN(128, float deltaRBAng_y[32]);
|
||||
PX_ALIGN(128, float deltaRBAng_z[32]);
|
||||
};
|
||||
|
||||
/**
|
||||
\brief A single friction constraint for the solver.
|
||||
*/
|
||||
struct PxgBlockSolverContactFriction
|
||||
{
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// unitResponse, recipResponse, velMultiplier, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them at every sub-timestep (or iteration), resp0, resp1, and other relevant data are stored additionally.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
|
||||
PX_ALIGN(128, float4 raXn_bias[32]);
|
||||
PX_ALIGN(128, float4 rbXn_targetVelW[32]);
|
||||
PX_ALIGN(128, PxReal appliedForce[32]);
|
||||
PX_ALIGN(128, PxReal resp0[32]);
|
||||
PX_ALIGN(128, PxReal resp1[32]);
|
||||
};
|
||||
|
||||
|
||||
struct PxgTGSBlockSolverContactHeader
|
||||
{
|
||||
PX_ALIGN(128, float4 invMass0_1_angDom0_1[32]); //512 512
|
||||
PX_ALIGN(128, float4 normal_staticFriction[32]); //1024 512
|
||||
|
||||
//Only used by articulation constraints. Forces the minimum normal force for friction.
|
||||
//Without this, articulations can drift due to no normal force when multi-link systems contact with surfaces.
|
||||
PX_ALIGN(128, PxReal minNormalForce[32]);
|
||||
|
||||
PX_ALIGN(128, PxF32 maxPenBias[32]); //1152 128
|
||||
PX_ALIGN(128, PxU32 flags[32]); //1408 128
|
||||
PX_ALIGN(128, PxU32 numNormalConstr[32]); //1536 128
|
||||
PX_ALIGN(128, PxU32 forceWritebackOffset[32]); //1664 128
|
||||
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// recipResponse, velMultipler, biasCoefficient, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them every sub-timestep (or iteration), restitution, cfm, and p8 are additionally stored.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
|
||||
PX_ALIGN(128, PxReal restitutionXdt[32]);
|
||||
PX_ALIGN(128, PxReal cfm[32]);
|
||||
PX_ALIGN(128, PxReal p8[32]);
|
||||
};
|
||||
|
||||
struct PxgTGSBlockSolverFrictionHeader
|
||||
{
|
||||
PX_ALIGN(128, float4 frictionNormals[2][32]); //1024 1024
|
||||
PX_ALIGN(128, PxU32 numFrictionConstr[32]); //1152 128
|
||||
PX_ALIGN(128, PxReal dynamicFriction[32]); //1280 128
|
||||
PX_ALIGN(128, PxU32 broken[32]); //1408 128
|
||||
PX_ALIGN(128, PxReal biasCoefficient[32]);
|
||||
PX_ALIGN(128, PxReal torsionalFrictionScale[32]);
|
||||
};
|
||||
|
||||
//PX_COMPILE_TIME_ASSERT(sizeof(PxgBlockSolverContactHeader) == 1280);
|
||||
|
||||
/**
|
||||
\brief A single rigid body contact point for the solver.
|
||||
*/
|
||||
struct PxgTGSBlockSolverContactPoint
|
||||
{
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// unitResponse, recipResponse, velMultiplier, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them at every sub-timestep (or iteration), resp0, resp1, and other relevant data are stored additionally.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
|
||||
PX_ALIGN(128, float4 raXn_extraCoeff[32]); // For contact constraints, extraCoeff is the compliant contact coefficient "a"
|
||||
// used in "computeCompliantContactCoefficientsTGS".
|
||||
|
||||
PX_ALIGN(128, float4 rbXn_targetVelW[32]);
|
||||
PX_ALIGN(128, PxReal separation[32]);
|
||||
PX_ALIGN(128, PxReal maxImpulse[32]);
|
||||
PX_ALIGN(128, PxReal appliedForce[32]);
|
||||
PX_ALIGN(128, PxReal biasCoefficient[32]);
|
||||
|
||||
PX_ALIGN(128, PxReal resp0[32]);
|
||||
PX_ALIGN(128, PxReal resp1[32]);
|
||||
};
|
||||
|
||||
/**
|
||||
\brief A single friction constraint for the solver.
|
||||
*/
|
||||
struct PxgTGSBlockSolverContactFriction
|
||||
{
|
||||
// To use different mass for mass-splitting every sub-timestep (or iteration),
|
||||
// unitResponse, recipResponse, velMultiplier, etc. are computed every sub-timestep (or iteration).
|
||||
// To compute them every sub-timestep (or iteration), resp0 and resp1 are stored separately.
|
||||
// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
|
||||
|
||||
PX_ALIGN(128, float4 raXn_error[32]);
|
||||
PX_ALIGN(128, float4 rbXn_targetVelW[32]);
|
||||
PX_ALIGN(128, PxReal appliedForce[32]);
|
||||
|
||||
PX_ALIGN(128, PxReal resp0[32]);
|
||||
PX_ALIGN(128, PxReal resp1[32]);
|
||||
};
|
||||
|
||||
struct PxgContactBlockParams
|
||||
{
|
||||
PxgBlockSolverContactHeader* blockContactHeader;
|
||||
PxgBlockSolverFrictionHeader* blockFrictionHeader;
|
||||
PxgBlockSolverContactPoint* blockContactPoints;
|
||||
PxgBlockSolverContactFriction* blockFrictions;
|
||||
};
|
||||
|
||||
struct PxgTGSContactBlockParams
|
||||
{
|
||||
PxgTGSBlockSolverContactHeader* blockContactHeader;
|
||||
PxgTGSBlockSolverFrictionHeader* blockFrictionHeader;
|
||||
PxgTGSBlockSolverContactPoint* blockContactPoints;
|
||||
PxgTGSBlockSolverContactFriction* blockFrictions;
|
||||
};
|
||||
|
||||
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
314
engine/third_party/physx/source/gpusolver/include/PxgConstraintHelper.h
vendored
Normal file
314
engine/third_party/physx/source/gpusolver/include/PxgConstraintHelper.h
vendored
Normal file
@@ -0,0 +1,314 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_HELPER_H
|
||||
#define PXG_CONSTRAINT_HELPER_H
|
||||
|
||||
#include "PxgD6JointLimit.h"
|
||||
#include "foundation/PxQuat.h"
|
||||
#include "foundation/PxMathUtils.h"
|
||||
#include "CmConeLimitHelper.h"
|
||||
|
||||
// PT: TODO: refactor/share remaining code. One reason for the duplication is that the CPU code uses
|
||||
// SIMD here and there, while the GPU code doesn't. But we could still merge the two eventually.
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// PT: TODO: this is a duplicate of the one in Extensions, but less robust?
|
||||
PX_CUDA_CALLABLE PX_INLINE void computeJacobianAxes(PxVec3 row[3], const PxQuat& qa, const PxQuat& qb)
|
||||
{
|
||||
// Compute jacobian matrix for (qa* qb) [[* means conjugate in this expr]]
|
||||
// d/dt (qa* qb) = 1/2 L(qa*) R(qb) (omega_b - omega_a)
|
||||
// result is L(qa*) R(qb), where L(q) and R(q) are left/right q multiply matrix
|
||||
|
||||
PxReal wa = qa.w, wb = qb.w;
|
||||
const PxVec3 va(qa.x,qa.y,qa.z), vb(qb.x,qb.y,qb.z);
|
||||
|
||||
const PxVec3 c = vb*wa + va*wb;
|
||||
const PxReal d = wa*wb - va.dot(vb);
|
||||
|
||||
row[0] = (va * vb.x + vb * va.x + PxVec3(d, c.z, -c.y)) * 0.5f;
|
||||
row[1] = (va * vb.y + vb * va.y + PxVec3(-c.z, d, c.x)) * 0.5f;
|
||||
row[2] = (va * vb.z + vb * va.z + PxVec3(c.y, -c.x, d)) * 0.5f;
|
||||
}
|
||||
|
||||
PX_INLINE PX_CUDA_CALLABLE void computeJointFrames(PxTransform& cA2w, PxTransform& cB2w, const PxgJointData& data, const PxTransform& bA2w, const PxTransform& bB2w)
|
||||
{
|
||||
PX_ASSERT(bA2w.isValid() && bB2w.isValid());
|
||||
|
||||
cA2w = bA2w.transform(data.c2b[0]);
|
||||
cB2w = bB2w.transform(data.c2b[1]);
|
||||
|
||||
PX_ASSERT(cA2w.isValid() && cB2w.isValid());
|
||||
}
|
||||
|
||||
class PxgConstraintHelper
|
||||
{
|
||||
PxVec3 mRa, mRb;
|
||||
|
||||
public:
|
||||
PX_CUDA_CALLABLE PxgConstraintHelper(/*Px1DConstraint* c,*/ const PxVec3& ra, const PxVec3& rb)
|
||||
: /*mConstraints(c), mCurrent(c),*/ mRa(ra), mRb(rb) {}
|
||||
|
||||
PX_CUDA_CALLABLE PxgConstraintHelper(
|
||||
PxTransform& cA2w, PxTransform& cB2w,
|
||||
const PxgJointData& data, const PxTransform& bA2w, const PxTransform& bB2w)
|
||||
{
|
||||
computeJointFrames(cA2w, cB2w, data, bA2w, bB2w);
|
||||
|
||||
mRa = cB2w.p - bA2w.p;
|
||||
mRb = cB2w.p - bB2w.p;
|
||||
}
|
||||
|
||||
// hard linear & angular
|
||||
PX_FORCE_INLINE void linearHard(Px1DConstraint* c, const PxVec3& axis, PxReal posErr)
|
||||
{
|
||||
linear(c, axis, posErr, PxConstraintSolveHint::eEQUALITY);
|
||||
c->flags |= Px1DConstraintFlag::eOUTPUT_FORCE;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void angularHard(Px1DConstraint* c, const PxVec3& axis, PxReal posErr)
|
||||
{
|
||||
angular(c, axis, posErr, PxConstraintSolveHint::eEQUALITY);
|
||||
c->flags |= Px1DConstraintFlag::eOUTPUT_FORCE;
|
||||
}
|
||||
|
||||
// limited linear & angular
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 linearLimit(Px1DConstraint* c, PxU32 currentIndex, const PxVec3& axis, PxReal ordinate, PxReal limitValue, const PxgJointLimitParameters& limit)
|
||||
{
|
||||
if(!limit.isSoft() || ordinate > limitValue)
|
||||
{
|
||||
Px1DConstraint* cConstraint = &c[currentIndex++];
|
||||
linear(cConstraint, axis,limitValue - ordinate, PxConstraintSolveHint::eNONE);
|
||||
addLimit(cConstraint ,limit);
|
||||
}
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
PX_FORCE_INLINE PxU32 angularLimit(Px1DConstraint* c, PxU32 currentIndex, const PxVec3& axis, PxReal ordinate, PxReal limitValue, PxReal pad, const PxgJointLimitParameters& limit)
|
||||
{
|
||||
if(limit.isSoft())
|
||||
pad = 0;
|
||||
|
||||
if(ordinate + pad > limitValue)
|
||||
{
|
||||
Px1DConstraint* cConstraint = &c[currentIndex++];
|
||||
angular(cConstraint, axis,limitValue - ordinate, PxConstraintSolveHint::eNONE);
|
||||
addLimit(cConstraint,limit);
|
||||
}
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void angularLimit(Px1DConstraint* c, const PxVec3& axis, PxReal error, const PxgJointLimitParameters& limit)const
|
||||
{
|
||||
angular(c, axis,error, PxConstraintSolveHint::eNONE);
|
||||
addLimit(c,limit);
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 anglePair(Px1DConstraint* c, PxU32 currentIndex, PxReal angle, PxReal lower, PxReal upper, const PxVec3& axis, const PxgJointLimitParameters& limit)const
|
||||
{
|
||||
PX_ASSERT(lower<upper);
|
||||
const bool softLimit = limit.isSoft();
|
||||
|
||||
if (!softLimit || angle < lower)
|
||||
angularLimit(&c[currentIndex++], -axis, -(lower - angle), limit);
|
||||
if (!softLimit || angle > upper)
|
||||
angularLimit(&c[currentIndex++], axis, (upper - angle), limit);
|
||||
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
// driven linear & angular
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void linear(Px1DConstraint* c, const PxVec3& axis, PxReal velTarget, PxReal error, const PxgD6JointDrive& drive)const
|
||||
{
|
||||
linear(c, axis,error,PxConstraintSolveHint::eNONE);
|
||||
addDrive(c,velTarget,drive);
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void angular(Px1DConstraint* c, const PxVec3& axis, PxReal velTarget, PxReal error, const PxgD6JointDrive& drive, PxConstraintSolveHint::Enum hint = PxConstraintSolveHint::eNONE)const
|
||||
{
|
||||
angular(c, axis,error,hint);
|
||||
addDrive(c,velTarget,drive);
|
||||
}
|
||||
|
||||
//PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 getCount() { return PxU32(mCurrent - mConstraints); }
|
||||
|
||||
PX_CUDA_CALLABLE PxU32 prepareLockedAxes(Px1DConstraint* c, PxU32 currentIndex, const PxQuat& qA, const PxQuat& qB, const PxVec3& cB2cAp, PxU32 lin, PxU32 ang,
|
||||
PxVec3& raOut)
|
||||
{
|
||||
//Px1DConstraint* current = mCurrent;
|
||||
//const PxU32 startIndex = currentIndex;
|
||||
|
||||
PxVec3 errorVector(0.f);
|
||||
|
||||
PxVec3 ra = mRa;
|
||||
|
||||
if(lin)
|
||||
{
|
||||
PxMat33 axes(qA);
|
||||
|
||||
if (lin & 1) errorVector -= axes.column0 * cB2cAp.x;
|
||||
if (lin & 2) errorVector -= axes.column1 * cB2cAp.y;
|
||||
if (lin & 4) errorVector -= axes.column2 * cB2cAp.z;
|
||||
|
||||
ra += errorVector;
|
||||
|
||||
if(lin&1) linear(&c[currentIndex++], axes[0], ra, mRb, -cB2cAp[0], PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
if(lin&2) linear(&c[currentIndex++], axes[1], ra, mRb, -cB2cAp[1], PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
if(lin&4) linear(&c[currentIndex++], axes[2], ra, mRb, -cB2cAp[2], PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
}
|
||||
|
||||
if (ang)
|
||||
{
|
||||
PxQuat qB2qA = qA.getConjugate() * qB;
|
||||
/*if (qB2qA.w<0)
|
||||
qB2qA = -qB2qA;*/
|
||||
|
||||
PxVec3 row[3];
|
||||
computeJacobianAxes(row, qA, qB);
|
||||
PxVec3 imp = qB2qA.getImaginaryPart();
|
||||
if (ang & 1) angular(&c[currentIndex++], row[0], -imp.x, PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
if (ang & 2) angular(&c[currentIndex++], row[1], -imp.y, PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
if (ang & 4) angular(&c[currentIndex++], row[2], -imp.z, PxConstraintSolveHint::eEQUALITY, Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
}
|
||||
|
||||
raOut = ra;
|
||||
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
private:
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void linear(Px1DConstraint* c, const PxVec3& axis, PxReal posErr, PxConstraintSolveHint::Enum hint)const
|
||||
{
|
||||
c->solveHint = PxU16(hint);
|
||||
c->linear0 = axis; c->angular0 = mRa.cross(axis);
|
||||
c->linear1 = axis; c->angular1 = mRb.cross(axis);
|
||||
PX_ASSERT(c->linear0.isFinite());
|
||||
PX_ASSERT(c->linear1.isFinite());
|
||||
PX_ASSERT(c->angular0.isFinite());
|
||||
PX_ASSERT(c->angular1.isFinite());
|
||||
|
||||
c->geometricError = posErr;
|
||||
|
||||
c->flags = 0;
|
||||
c->minImpulse = -PX_MAX_REAL;
|
||||
c->maxImpulse = PX_MAX_REAL;
|
||||
c->mods.spring.damping = 0.f;
|
||||
c->mods.spring.stiffness = 0.f;
|
||||
c->velocityTarget =0.f;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void linear(Px1DConstraint* c, const PxVec3& axis, const PxVec3& ra, const PxVec3& rb, PxReal posErr, PxConstraintSolveHint::Enum hint,
|
||||
PxU32 flags = 0)const
|
||||
{
|
||||
c->solveHint = PxU16(hint);
|
||||
c->linear0 = axis; c->angular0 = ra.cross(axis);
|
||||
c->linear1 = axis; c->angular1 = rb.cross(axis);
|
||||
PX_ASSERT(c->linear0.isFinite());
|
||||
PX_ASSERT(c->linear1.isFinite());
|
||||
PX_ASSERT(c->angular0.isFinite());
|
||||
PX_ASSERT(c->angular1.isFinite());
|
||||
|
||||
c->geometricError = posErr;
|
||||
|
||||
c->flags = flags;
|
||||
c->minImpulse = -PX_MAX_REAL;
|
||||
c->maxImpulse = PX_MAX_REAL;
|
||||
c->mods.spring.damping = 0.f;
|
||||
c->mods.spring.stiffness = 0.f;
|
||||
c->velocityTarget = 0.f;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE PX_FORCE_INLINE void angular(Px1DConstraint* c, const PxVec3& axis, PxReal posErr, PxConstraintSolveHint::Enum hint,
|
||||
PxU32 flags = 0)const
|
||||
{
|
||||
c->solveHint = PxU16(hint);
|
||||
c->linear0 = PxVec3(0); c->angular0 = axis;
|
||||
c->linear1 = PxVec3(0); c->angular1 = axis;
|
||||
c->geometricError = posErr;
|
||||
|
||||
c->flags = flags | Px1DConstraintFlag::eANGULAR_CONSTRAINT;
|
||||
c->minImpulse = -PX_MAX_REAL;
|
||||
c->maxImpulse = PX_MAX_REAL;
|
||||
c->mods.spring.damping = 0.f;
|
||||
c->mods.spring.stiffness = 0.f;
|
||||
c->velocityTarget = 0.f;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE void addLimit(Px1DConstraint* c, const PxgJointLimitParameters& limit)const
|
||||
{
|
||||
PxU16 flags = PxU16(c->flags | Px1DConstraintFlag::eOUTPUT_FORCE);
|
||||
|
||||
if(limit.isSoft())
|
||||
{
|
||||
flags |= Px1DConstraintFlag::eSPRING;
|
||||
c->mods.spring.stiffness = limit.stiffness;
|
||||
c->mods.spring.damping = limit.damping;
|
||||
}
|
||||
else
|
||||
{
|
||||
c->solveHint = PxConstraintSolveHint::eINEQUALITY;
|
||||
c->mods.bounce.restitution = limit.restitution;
|
||||
c->mods.bounce.velocityThreshold = limit.bounceThreshold;
|
||||
if(c->geometricError>0)
|
||||
flags |= Px1DConstraintFlag::eKEEPBIAS;
|
||||
if(limit.restitution>0)
|
||||
flags |= Px1DConstraintFlag::eRESTITUTION;
|
||||
}
|
||||
|
||||
c->flags = flags;
|
||||
c->minImpulse = 0;
|
||||
}
|
||||
|
||||
PX_CUDA_CALLABLE void addDrive(Px1DConstraint* c, PxReal velTarget, const PxgD6JointDrive& drive)const
|
||||
{
|
||||
c->velocityTarget = velTarget;
|
||||
|
||||
PxU16 flags = PxU16(c->flags | Px1DConstraintFlag::eSPRING | Px1DConstraintFlag::eHAS_DRIVE_LIMIT);
|
||||
|
||||
if(drive.flags & PxgD6JointDriveFlag::eACCELERATION)
|
||||
flags |= Px1DConstraintFlag::eACCELERATION_SPRING;
|
||||
|
||||
if (drive.flags & PxgD6JointDriveFlag::eOUTPUT_FORCE)
|
||||
flags |= Px1DConstraintFlag::eOUTPUT_FORCE;
|
||||
|
||||
c->flags = flags;
|
||||
c->mods.spring.stiffness = drive.stiffness;
|
||||
c->mods.spring.damping = drive.damping;
|
||||
|
||||
c->minImpulse = -drive.forceLimit;
|
||||
c->maxImpulse = drive.forceLimit;
|
||||
|
||||
//PX_ASSERT(c->linear0.isFinite());
|
||||
//PX_ASSERT(c->angular0.isFinite());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
455
engine/third_party/physx/source/gpusolver/include/PxgConstraintPartition.h
vendored
Normal file
455
engine/third_party/physx/source/gpusolver/include/PxgConstraintPartition.h
vendored
Normal file
@@ -0,0 +1,455 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_PARTITION_H
|
||||
#define PXG_CONSTRAINT_PARTITION_H
|
||||
|
||||
#define PX_PARTITION_COMPACTION 1
|
||||
|
||||
#include "foundation/PxPinnedArray.h"
|
||||
#include "foundation/PxSList.h"
|
||||
#include "foundation/PxUserAllocated.h"
|
||||
#include "foundation/PxUtilities.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxsSimpleIslandManager.h"
|
||||
#include "PxgDynamicsConfiguration.h"
|
||||
#include "PxgEdgeType.h"
|
||||
#include "PxgPartitionNode.h"
|
||||
#include "PxsPartitionEdge.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
class PxsContactManagerOutputIterator;
|
||||
class PxgBodySimManager;
|
||||
class PxgJointManager;
|
||||
struct PxsContactManagerOutputCounts;
|
||||
|
||||
namespace Cm
|
||||
{
|
||||
class FlushPool;
|
||||
}
|
||||
|
||||
#define SLAB_SIZE 512
|
||||
|
||||
// PT: defines controlling a large dense array of 32 pointers per node using up a lot of memory.
|
||||
//
|
||||
// STORE_INDICES_IN_NODE_ENTRIES stores indices instead of PartitionEdge ptrs in NodeEntries.
|
||||
// Half the memory usage compared to initial version but more expensive address computation
|
||||
// and we still decode and use a pointer in the end.
|
||||
//
|
||||
// STORE_EDGE_DATA_IN_NODE_ENTRIES stores the edge data directly instead of indices.
|
||||
// One less indirection but same memory usage as initial version.
|
||||
//
|
||||
// Initial version | Less indirection | Better Mem usage
|
||||
// ---------------------------------|-------------------|---------------------
|
||||
// Initial version | No | No
|
||||
// STORE_INDICES_IN_NODE_ENTRIES | No | Yes
|
||||
// STORE_EDGE_DATA_IN_NODE_ENTRIES | Yes | No
|
||||
//
|
||||
// Jury is still out regarding which one is best for perf. Meanwhile we use the one with best mem usage.
|
||||
// Selects how per-node edge entries are stored (see the trade-off table in the
// comment above). The indices variant is the default: best memory usage.
#define STORE_INDICES_IN_NODE_ENTRIES	1
#if STORE_INDICES_IN_NODE_ENTRIES
#define STORE_EDGE_DATA_IN_NODE_ENTRIES	0
#if STORE_EDGE_DATA_IN_NODE_ENTRIES
	// Stores the edge data directly in the node entries: one less indirection,
	// but the same memory usage as storing full pointers.
	struct EdgeData
	{
		PxU32	mUniqueIndex;	// unique edge id
		PxU32	mNode0Index;	// index of the edge's first node
	};
	typedef EdgeData	NodeEntryStorage;
	typedef EdgeData	NodeEntryDecoded;
	PX_FORCE_INLINE void resetNodeEntryStorage(EdgeData& edge)	{ edge.mUniqueIndex = IG_INVALID_EDGE; }
	PX_FORCE_INLINE PxU32 getUniqueId(const EdgeData& edge)		{ return edge.mUniqueIndex; }
	PX_FORCE_INLINE PxU32 getNode0Index(const EdgeData& edge)	{ return edge.mNode0Index; }
#else
	// Stores 32-bit edge indices: half the memory of pointer storage, decoded
	// back into a PartitionEdge pointer on access.
	typedef PxU32	NodeEntryStorage;
	typedef const PartitionEdge*	NodeEntryDecoded;
	PX_FORCE_INLINE void resetNodeEntryStorage(PxU32& edge)			{ edge = IG_INVALID_EDGE; }
	PX_FORCE_INLINE PxU32 getUniqueId(const PartitionEdge* edge)	{ return edge->mUniqueIndex; }
	PX_FORCE_INLINE PxU32 getNode0Index(const PartitionEdge* edge)	{ return edge->mNode0.index(); }
#endif
#else
	// Initial version: node entries store full PartitionEdge pointers.
	typedef const PartitionEdge*	NodeEntryStorage;
	typedef const PartitionEdge*	NodeEntryDecoded;
	PX_FORCE_INLINE void resetNodeEntryStorage(const PartitionEdge*& edge)	{ edge = NULL; }
	PX_FORCE_INLINE PxU32 getUniqueId(const PartitionEdge* edge)	{ return edge->mUniqueIndex; }
	PX_FORCE_INLINE PxU32 getNode0Index(const PartitionEdge* edge)	{ return edge->mNode0.index(); }
#endif
|
||||
|
||||
typedef Cm::BlockArray<PxU32> PartitionIndices;
|
||||
//typedef PxArray<PxU32> PartitionIndices;
|
||||
|
||||
// Fixed-size block of PartitionEdge objects. PartitionEdgeManager allocates
// edges in units of one slab (SLAB_SIZE entries) at a time.
struct PartitionEdgeSlab
{
	PartitionEdge	mEdges[SLAB_SIZE];	//! The slabs
};
|
||||
|
||||
// Pool of PartitionEdge objects allocated in fixed-size slabs. An edge's
// unique id encodes its slab number and offset within the slab, so lookups
// are O(1); freed edges are recycled via mFreeEdges.
class PartitionEdgeManager
{
	// Edges returned by putEdge(), available for reuse (presumably chained
	// through the edges themselves — confirm in the .cpp).
	PartitionEdge*				mFreeEdges;

	PxArray<PartitionEdgeSlab*>	mPartitionEdgeSlabs;	// one entry per allocated slab
	PxArray<void*>				mMemory;				// raw allocations backing the slabs

	PxU32						mEdgeCount;				// number of edges handed out so far

	// Grows the pool by one slab of SLAB_SIZE edges.
	PX_NOINLINE	void	allocateSlab();
public:
	PartitionEdgeManager();
	~PartitionEdgeManager();

	// Fetches an edge for the given island-gen edge index (from the free-list
	// or a new slab).
	PX_FORCE_INLINE	PartitionEdge*	getEdge(IG::EdgeIndex index);
	// Returns an edge to the pool for reuse.
	PX_FORCE_INLINE	void			putEdge(PartitionEdge* edge);

	// Decodes a unique id into slab + offset. SLAB_SIZE is a power of two, so
	// the mask is equivalent to (uniqueId % SLAB_SIZE).
	PX_FORCE_INLINE	const PartitionEdge*	getPartitionEdge(PxU32 uniqueId)	const
	{
		return &mPartitionEdgeSlabs[uniqueId / SLAB_SIZE]->mEdges[uniqueId & (SLAB_SIZE - 1)];
	}

	PX_FORCE_INLINE	PxU32	getEdgeCount()	const	{ return mEdgeCount; }
};
|
||||
|
||||
struct Partition
|
||||
{
|
||||
PartitionIndices mPartitionIndices[PxgEdgeType::eEDGE_TYPE_COUNT];
|
||||
|
||||
Partition() {}
|
||||
|
||||
//Adds an edge to the partition
|
||||
bool addToPartition(PxU32 uniqueIndex, PartitionIndexData& indexData)
|
||||
{
|
||||
PartitionIndices& indices = mPartitionIndices[indexData.mCType];
|
||||
indexData.mPartitionEntryIndex = indices.size();
|
||||
indices.pushBack(uniqueIndex);
|
||||
return true;
|
||||
}
|
||||
|
||||
void removeFromPartition(PxU32 uniqueIndex, PxPinnedArray<PartitionIndexData>& iterator)
|
||||
{
|
||||
const PartitionIndexData& indexData = iterator[uniqueIndex];
|
||||
PartitionIndices& indices = mPartitionIndices[indexData.mCType];
|
||||
|
||||
// AD: defensive coding for OM-90842. If this assert hits, you maybe hit the same issue
|
||||
PxU32 size = indices.size();
|
||||
if (size == 0)
|
||||
{
|
||||
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "PxgConstraintPartition: attempting to remove an edge from an empty partition. Skipping.");
|
||||
PX_ASSERT(false);
|
||||
return;
|
||||
}
|
||||
|
||||
size--;
|
||||
|
||||
const PxU32 uniqueIdx = indices[size];
|
||||
const PxU32 partitionEntryIndex = indexData.mPartitionEntryIndex;
|
||||
iterator[uniqueIdx].mPartitionEntryIndex = partitionEntryIndex;
|
||||
indices[partitionEntryIndex] = uniqueIdx;
|
||||
indices.forceSize_Unsafe(size);
|
||||
}
|
||||
};
|
||||
|
||||
// Per-node edge entries: one storage slot per partition in a slab
// (PXG_BATCH_SIZE partitions per PartitionSlab). Storage format is selected
// by the STORE_*_IN_NODE_ENTRIES defines above.
struct NodeEntries
{
	NodeEntryStorage	mEdges[PXG_BATCH_SIZE];
};
|
||||
|
||||
// A batch of PXG_BATCH_SIZE partitions plus per-node occupancy data for those
// partitions.
struct PartitionSlab : public PxUserAllocated
{
	Partition	mPartitions[PXG_BATCH_SIZE];	//! Each slab has 32 partitions

	PxArray<PxU32>	mNodeBitmap;	//! Each slab 1 integer per-node, recording the presence of the node in any of the slabs

	// One NodeEntries record per node (indexing scheme defined in the .cpp).
	PxArray<NodeEntries>	mNodeEntries;

	PartitionSlab() {}
};
|
||||
|
||||
// A list of partitions plus running totals of rigid-body and articulation
// partition counts accumulated across preceding arrays.
struct PartitionArray : public PxUserAllocated
{
	PxArray<Partition*>	mPartitions;
	PxU32				mAccumulatedPartitionCount;		// running count for rigid-body partitions
	PxU32				mAccumulatedArtiPartitionCount;	// running count for articulation partitions

	// Pre-sizes the partition list to 1024 entries.
	PartitionArray() : mPartitions(1024), mAccumulatedPartitionCount(0), mAccumulatedArtiPartitionCount(0)
	{
	}

	// Empties the list (keeps its storage) and zeroes the accumulated counts.
	void clear()
	{
		mPartitions.forceSize_Unsafe(0);
		mAccumulatedPartitionCount = 0;
		mAccumulatedArtiPartitionCount = 0;
	}
};
|
||||
|
||||
class PxgCombinedPartitionSlab : public PxUserAllocated
|
||||
{
|
||||
public:
|
||||
|
||||
PxU32 mNbPartitions;
|
||||
const PxU32 mUserNbMaxPartitions;
|
||||
PxU32 mNbMaxPartitions;
|
||||
PartitionArray mPartitionArray[32];
|
||||
|
||||
PxgCombinedPartitionSlab(PxU32 maxNumPartitions) : mUserNbMaxPartitions(maxNumPartitions), mNbMaxPartitions(maxNumPartitions)
|
||||
{
|
||||
}
|
||||
|
||||
~PxgCombinedPartitionSlab()
|
||||
{
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
mNbPartitions = 0;
|
||||
for (PxU32 i = 0; i< 32; ++i)
|
||||
mPartitionArray[i].clear();
|
||||
}
|
||||
};
|
||||
|
||||
// Incremental constraint partitioner for the GPU solver: maintains, frame to
// frame, an assignment of constraint edges (contacts/joints, rigid and
// articulation) to conflict-free partitions, updating only what island-gen
// reports as changed rather than rebuilding from scratch.
class PxgIncrementalPartition
{
	PX_NOCOPY(PxgIncrementalPartition)
public: // PT: TODO: revisit after the dust settles

	PxArray<PartitionSlab*>	mPartitionSlabs;	// slabs of PXG_BATCH_SIZE partitions each

	PartitionEdgeManager	mEdgeManager;		// pool of PartitionEdge objects

	PxU32	mNodeCount;
	PxU32	mNbContactBatches;
	PxU32	mNbConstraintBatches;
	PxU32	mNbArtiContactBatches;
	PxU32	mNbArtiConstraintBatches;

	PxU32	mNbPartitions;
	PxU32	mTotalContacts;
	PxU32	mTotalConstraints;
	PxU32	mTotalArticulationContacts;
	PxU32	mTotalArticulationConstraints;

	PxU32	mMaxSlabCount;
	PxU32	mNbForceThresholds;

#if PX_ENABLE_ASSERTS
	// Debug-only accumulated counts used to validate the partitioning.
	PxArray<PxU32>	mAccumulatedPartitionCount;			// for contact
	PxArray<PxU32>	mAccumulatedConstraintCount;		// for joint
	PxArray<PxU32>	mAccumulatedArtiPartitionCount;		// for contact
	PxArray<PxU32>	mAccumulatedArtiConstraintCount;	// for constraint
#endif

	PxBitMap	mIsDirtyNode;	// nodes whose partition assignment must be revisited

	// Per-edge lookup tables; pinned arrays are host memory visible to the GPU.
	PxArray<PxU32>										mNpIndexArray;
	PxPinnedArray<PartitionIndexData>					mPartitionIndexArray;
	PxPinnedArray<PartitionNodeData>					mPartitionNodeArray;
	PxPinnedArray<PxgSolverConstraintManagerConstants>	mSolverConstants;
	PxInt32ArrayPinned									mNodeInteractionCountArray;

	PxInt32ArrayPinned	mDestroyedContactEdgeIndices;

	// Per-partition layout data consumed by the GPU solver.
	PxInt32ArrayPinned	mStartSlabPerPartition;
	PxInt32ArrayPinned	mArticStartSlabPerPartition;
	PxInt32ArrayPinned	mNbJointsPerPartition;
	PxInt32ArrayPinned	mNbArtiJointsPerPartition;

	PxArray<PxU32>	mJointStartIndices;
	PxArray<PxU32>	mContactStartIndices;
	PxArray<PxU32>	mArtiContactStartIndices;
	PxArray<PxU32>	mArtiJointStartIndices;

	PxgCombinedPartitionSlab	mCSlab;

	const PxU64	mContextID;	// profiler context id

public:

	PxgIncrementalPartition(const PxVirtualAllocator& allocator, PxU32 maxNumPartitions, PxU64 contextID);
	~PxgIncrementalPartition();

	// Multithreaded entry points (tasks spawned via the flush pool).
	void processLostFoundPatches(	Cm::FlushPool& flushPool, PxBaseTask* continuation,
									IG::IslandSim& islandSim, PxgBodySimManager& bodySimManager, PxgJointManager& jointManager,
									PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, const PxsContactManagerOutputCounts* lostFoundPairOutputs);

	void updateIncrementalIslands(	IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData,
									Cm::FlushPool* flushPool, PxBaseTask* continuation,
									PxsContactManagerOutputIterator& iterator, PxgBodySimManager& bodySimManager, PxgJointManager& jointManager);

	// PT: internal reference versions, exposed for UTs
	void processLostPatches_Reference(	IG::IslandSim& islandSim, PxgBodySimManager& bodySimManager, PxgJointManager& jointManager,
										PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, const PxsContactManagerOutputCounts* lostFoundPairOutputs);

	void processFoundPatches_Reference(	IG::IslandSim& islandSim, PxgBodySimManager& bodySimManager,
										PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, const PxsContactManagerOutputCounts* lostFoundPairOutputs);

	void updateIncrementalIslands_Reference(IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData,
											PxsContactManagerOutputIterator& iterator,
											PxgBodySimManager& bodySimManager, PxgJointManager& jointManager);

	// PT: edge data
	PX_FORCE_INLINE const PxPinnedArray<PxgSolverConstraintManagerConstants>&	getSolverConstants()	const	{ return mSolverConstants; }

	// PT: TODO: what's the difference between mNbPartitions and mCSlab.mNbPartitions ?
	PX_FORCE_INLINE PxU32	getNbPartitions()					const	{ return mNbPartitions; }
	PX_FORCE_INLINE PxU32	getCombinedSlabMaxNbPartitions()	const	{ return mCSlab.mNbMaxPartitions; }
	PX_FORCE_INLINE PxU32	getCombinedSlabNbPartitions()		const	{ return mCSlab.mNbPartitions; }

	PX_FORCE_INLINE const PxPinnedArray<PartitionIndexData>&	getPartitionIndexArray()	const	{ return mPartitionIndexArray; }
	PX_FORCE_INLINE const PxPinnedArray<PartitionNodeData>&		getPartitionNodeArray()		const	{ return mPartitionNodeArray; }

	PX_FORCE_INLINE const PxInt32ArrayPinned&	getStartSlabPerPartition()			const	{ return mStartSlabPerPartition; }
	PX_FORCE_INLINE const PxInt32ArrayPinned&	getArticStartSlabPerPartition()		const	{ return mArticStartSlabPerPartition; }
	PX_FORCE_INLINE const PxInt32ArrayPinned&	getNbJointsPerPartition()			const	{ return mNbJointsPerPartition; }
	PX_FORCE_INLINE const PxInt32ArrayPinned&	getNbArticJointsPerPartition()		const	{ return mNbArtiJointsPerPartition; }
	PX_FORCE_INLINE const PxInt32ArrayPinned&	getNodeInteractionCountArray()		const	{ return mNodeInteractionCountArray; }
	PX_FORCE_INLINE const PxInt32ArrayPinned&	getDestroyedContactEdgeIndices()	const	{ return mDestroyedContactEdgeIndices; }

	PX_FORCE_INLINE const PxArray<PxU32>&			getNpIndexArray()		const	{ return mNpIndexArray; }
	PX_FORCE_INLINE const PxArray<PartitionSlab*>&	getPartitionSlabs()		const	{ return mPartitionSlabs; }

	PX_FORCE_INLINE const PxArray<PxU32>&	getContactStartIndices()		const	{ return mContactStartIndices; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getJointStartIndices()			const	{ return mJointStartIndices; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getArtiContactStartIndices()	const	{ return mArtiContactStartIndices; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getArtiJointStartIndices()		const	{ return mArtiJointStartIndices; }

	PX_FORCE_INLINE PxU32 getCSlabAccumulatedPartitionCount(PxU32 index) const
	{
		PX_ASSERT(index < 32);
		return mCSlab.mPartitionArray[index].mAccumulatedPartitionCount;
	}

	PX_FORCE_INLINE PxU32 getCSlabAccumulatedArtiPartitionCount(PxU32 index) const
	{
		PX_ASSERT(index < 32);
		return mCSlab.mPartitionArray[index].mAccumulatedArtiPartitionCount;
	}

#if PX_ENABLE_ASSERTS
	PX_FORCE_INLINE const PxArray<PxU32>&	getAccumulatedPartitionCount()		const	{ return mAccumulatedPartitionCount; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getAccumulatedConstraintCount()		const	{ return mAccumulatedConstraintCount; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getAccumulatedArtiPartitionCount()	const	{ return mAccumulatedArtiPartitionCount; }
	PX_FORCE_INLINE const PxArray<PxU32>&	getAccumulatedArtiConstraintCount()	const	{ return mAccumulatedArtiConstraintCount; }
#endif
	PX_FORCE_INLINE PxU32	getNbContactBatches()			const	{ return mNbContactBatches; }
	PX_FORCE_INLINE PxU32	getNbConstraintBatches()		const	{ return mNbConstraintBatches; }
	PX_FORCE_INLINE PxU32	getNbArtiContactBatches()		const	{ return mNbArtiContactBatches; }
	PX_FORCE_INLINE PxU32	getNbArtiConstraintBatches()	const	{ return mNbArtiConstraintBatches; }

	PX_FORCE_INLINE PxU32	getTotalContacts()					const	{ return mTotalContacts; }
	PX_FORCE_INLINE PxU32	getTotalConstraints()				const	{ return mTotalConstraints; }
	PX_FORCE_INLINE PxU32	getTotalArticulationContacts()		const	{ return mTotalArticulationContacts; }
	PX_FORCE_INLINE PxU32	getTotalArticulationConstraints()	const	{ return mTotalArticulationConstraints; }

	PX_FORCE_INLINE bool	hasForceThresholds()	const	{ return mNbForceThresholds != 0; }

	//private: // PT: TODO: revisit after the dust settles

#if USE_SPLIT_SECOND_PASS_ISLAND_GEN
	PxBitMap mActiveCMBitmapCopy;
#endif
	void reserveNodes(PxU32 nodeCount);

	void getPreviousAndNextReferencesInSlab(NodeEntryDecoded& prev, NodeEntryDecoded& next, PxU32 index, PxU32 uniqueId, const PartitionSlab* slab, PxU32 slabMask) const;

	// Two-stage edge insertion (stage 2 performs the actual partition placement).
	PartitionEdge* addEdge_Stage1(const IG::IslandSim& islandSim, IG::EdgeIndex edgeIndex, PxU32 patchIndex, PxU32 npIndex, PxNodeIndex node1, PxNodeIndex node2);

	void addEdge_Stage2(IG::GPUExternalData& islandSimGpuData, IG::EdgeIndex edgeIndex, PartitionEdge* partitionEdge, bool specialHandled, bool doPart1, bool doPart2);

	bool addJointManager(const PartitionEdge* edge, PxgBodySimManager& bodySimManager);
	bool addContactManager(PartitionEdge* edge, const PxcNpWorkUnit& unit, PxgBodySimManager& bodySimManager);

	void removeEdge(PartitionEdge* edge, IG::GPUExternalData& islandSimGpuData, PxgBodySimManager& manager);
	PX_FORCE_INLINE void removeAllEdges(IG::GPUExternalData& islandSimGpuData, PxgBodySimManager& bodySimManager, PartitionEdge* partitionEdge);
	void destroyEdges(const IG::CPUExternalData& islandSimCpuData, IG::GPUExternalData& islandSimGpuData, PxgBodySimManager& bodySimManager, PxgJointManager& jointManager, bool clearDestroyedEdges, bool recordDestroyedEdges);

	void addEdgeInternal(const PartitionEdge* PX_RESTRICT partitionEdge, PartitionSlab* PX_RESTRICT slab, PxU16 id, PxU16 baseId);
	void removeEdgeInternal(PartitionSlab* PX_RESTRICT slab, const PartitionEdge* PX_RESTRICT edge, PxU32 id);

	void doCompaction();
#if PX_PARTITION_COMPACTION
	void pullForwardConstraints(PxU32 nodeIndex);
#endif

public: // PT: TODO: revisit after the dust settles

	// Split phases of updateIncrementalIslands(), exposed for the MT code path.
	void updateIncrementalIslands_Part1(
		IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData,
		PxsContactManagerOutputIterator& iterator,
		PxgBodySimManager& bodySimManager, PxgJointManager& jointManager);

	void updateIncrementalIslands_Part2(
		IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData,
		PxsContactManagerOutputIterator& iterator,
		PxgBodySimManager& bodySimManager);

	// Classification of edges that need special placement handling.
	enum SpecialCase
	{
		SPECIAL_CASE_NONE,
		SPECIAL_CASE_STATIC_RB,
		SPECIAL_CASE_ARTI_SELF,
		SPECIAL_CASE_STATIC_ARTI0,
		SPECIAL_CASE_STATIC_ARTI1
	};

	// Work item produced by Part2_0/Part2_1 and consumed by Part2_2.
	struct Part2WorkItem
	{
		PxU32			mEdgeID;
		PxU16			mPatchIndex;
		PxU16			mSpecialCase;	// a SpecialCase value
		PartitionEdge*	mPartitionEdge;
	};

	PxArray<Part2WorkItem>	mPart2WorkItems;
	PxArray<PxU32>			mPart2EdgeCases;	// PT: indices into mPart2WorkItems

	void updateIncrementalIslands_Part2_0(IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData, PxsContactManagerOutputIterator& iterator);

	void updateIncrementalIslands_Part2_1(PxU32 startIndex, PxU32 nbToProcess, IG::IslandSim& islandSim, const IG::AuxCpuData& islandManagerData);

	void updateIncrementalIslands_Part2_2(IG::IslandSim& islandSim, PxgBodySimManager& bodySimManager, bool dopart1, bool dopart2, bool dopart3);

	void updateIncrementalIslands_Part2_2_ProcessEdgeCases(IG::IslandSim& islandSim);

	void updateIncrementalIslands_Part3(IG::IslandSim& islandSim, PxgJointManager& jointManager);

	void processLostPatchesMT(	IG::IslandSim& islandSim, Cm::FlushPool& flushPool, PxBaseTask* continuation,
								PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, const PxsContactManagerOutputCounts* lostFoundPairOutputs,
								PxgBodySimManager& bodySimManager, PxgJointManager& jointManager);

	// Batch of edges processed together by the MT lost-patch path.
	struct PartitionEdgeBatch : public PxUserAllocated
	{
		PxArray<PartitionEdge*> mEdges;
	};
	PxArray<PartitionEdgeBatch*> mBatches;
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
125
engine/third_party/physx/source/gpusolver/include/PxgConstraintPrep.h
vendored
Normal file
125
engine/third_party/physx/source/gpusolver/include/PxgConstraintPrep.h
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_PREP_H
|
||||
#define PXG_CONSTRAINT_PREP_H
|
||||
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgD6JointData.h"
|
||||
#include "AlignedTransform.h"
|
||||
|
||||
#include "PxcNpWorkUnit.h"
|
||||
|
||||
namespace physx
{
typedef PxcNpWorkUnitFlag PxgNpWorkUnitFlag;

struct PxgSolverBody;
struct PxgSolverBodyData;
struct PxgSolverConstraintDesc;

// Per-joint data that can be updated every frame.
// NOTE(review): the trailing "//offset size" annotations below are carried
// over from the original and look partially inconsistent — verify before
// relying on them.
struct PxgConstraintPrePrep
{
public:
	PxNodeIndex	mNodeIndexA;	//8	8
	PxNodeIndex	mNodeIndexB;	//16	8
	PxU32		mFlags;			//20	4
	float		mLinBreakForce;	//24	4
	float		mAngBreakForce;	//28	4
};

// Per-joint constraint-prep data, 16-byte fields for GPU-friendly access.
struct PxgConstraintData
{
	PxgConstraintInvMassScale	mInvMassScale;				//16
	float4						mRaWorld_linBreakForceW;	//16
	float4						mRbWorld_angBreakForceW;	//16
	uint4						mNumRows_Flags_StartIndex;	//16
};

// Batched (32-wide, SoA) version of the 1D-constraint data: one slot per
// constraint in the batch.
struct PxgBlockConstraint1DData
{
	PX_ALIGN(256, PxgConstraintInvMassScale	mInvMassScale[32]);			//512	512
	PX_ALIGN(256, float4					mRAWorld_linBreakForce[32]);	//1024	512
	PX_ALIGN(256, float4					mRBWorld_AngBreakForce[32]);	//1152	128
	PX_ALIGN(128, PxU32						mNumRows[32]);					//1284	128
	PX_ALIGN(128, PxU32						mFlags[32]);					//1412	128
};

// Single-constraint (non-batched) version of the 1D-constraint data.
struct PxgConstraint1DData
{
	PxgConstraintInvMassScale	mInvMassScale;					//16	16
	float4						mBody0WorldOffset_linBreakForce;	//16	32
	float						mAngBreakForce;					//4	36
	PxU32						mNumRows;						//4	40
	PxU32						mFlags;							//4	44
	PxU32						mPadding;						//4	48
};

// Per-pair material/contact summary, 16 bytes, 16-byte aligned for the GPU.
struct /*__device_builtin__*/ __builtin_align__(16) PxgMaterialContactData
{
	PxReal	restDistance;			//4	4
	PxReal	staticFriction;			//8	4
	PxReal	dynamicFriction;		//12	4
	PxU8	mNumContacts;			//13	1
	PxU8	mSolverFlags;			//14	1
	PxU8	prevFrictionPatchCount;	//15	1
	PxU8	pad;					//16	1
};

// Batched (32-wide, SoA) contact data for rigid-body pairs.
struct PxgBlockContactData
{
	PX_ALIGN(128, PxgConstraintInvMassScale	mInvMassScale[32]);		//512	512
	PX_ALIGN(128, float4					normal_restitutionW[32]);	//1024	512
	PX_ALIGN(128, PxgMaterialContactData	contactData[32]);			//1536	512
	PX_ALIGN(128, PxReal					damping[32]);				//1664	128
};

//This is for articulation contact
struct PxgContactData
{
	PxgConstraintInvMassScale	mInvMassScale;
	float4						normal_restitutionW;
	PxgMaterialContactData		contactData;
};

// Batched (32-wide, SoA) contact points: xyz plus a packed scalar in w.
struct PxgBlockContactPoint
{
	PX_ALIGN(256, float4 point_separationW[32]);
	PX_ALIGN(256, float4 targetVel_maxImpulseW[32]);
};

// Single contact point (articulation path).
struct PxgContactPoint
{
	float4 point_separationW;
	float4 targetVel_maxImpulseW;
};
}
|
||||
|
||||
#endif
|
||||
44
engine/third_party/physx/source/gpusolver/include/PxgConstraintWriteBack.h
vendored
Normal file
44
engine/third_party/physx/source/gpusolver/include/PxgConstraintWriteBack.h
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONSTRAINT_WRITE_BACK_H
|
||||
#define PXG_CONSTRAINT_WRITE_BACK_H
|
||||
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
namespace physx
{
	// Per-constraint solver results written back from the GPU. Field names
	// suggest the xyz components carry the accumulated linear/angular impulse
	// while the w components pack the "broken" flag and the solver residual —
	// NOTE(review): confirm the w-component packing against the solver kernels.
	struct PxgConstraintWriteback
	{
		float4 linearImpulse_broken;
		float4 angularImpulse_residual;
	};
}
|
||||
|
||||
#endif
|
||||
603
engine/third_party/physx/source/gpusolver/include/PxgContext.h
vendored
Normal file
603
engine/third_party/physx/source/gpusolver/include/PxgContext.h
vendored
Normal file
@@ -0,0 +1,603 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CONTEXT_H
|
||||
#define PXG_CONTEXT_H
|
||||
|
||||
#include "DyContext.h"
|
||||
#include "PxSimulationStatistics.h"
|
||||
#include "PxgConstraintPartition.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "PxvNphaseImplementationContext.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
class PxgCudaBroadPhaseSap;
|
||||
class PxgSolverCore;
|
||||
class PxgGpuNarrowphaseCore;
|
||||
class PxgArticulationCore;
|
||||
class PxgSimulationCore;
|
||||
class PxgSoftBodyCore;
|
||||
class PxgFEMClothCore;
|
||||
class PxgPBDParticleSystemCore;
|
||||
class PxgSimulationController;
|
||||
struct PxgIslandContext;
|
||||
class PxgHeapMemoryAllocatorManager;
|
||||
struct PxsTorsionalFrictionData;
|
||||
|
||||
// PT: TODO: all these tasks are missing a proper context ID for the profiler...
|
||||
|
||||
class PxgCpuJointPrePrepTask : public Cm::Task
|
||||
{
|
||||
PxgSimulationController& mSimController;
|
||||
|
||||
const Dy::Constraint*const* mConstraints;
|
||||
PxgConstraintData* mConstraintData;
|
||||
Px1DConstraint* mConstraintRows;
|
||||
|
||||
const PxU32 mStartIndex;
|
||||
const PxU32 mNbToProcess;
|
||||
const PxU32 mGpuJointOffset;
|
||||
|
||||
PxI32* mRowCounts;
|
||||
|
||||
PX_NOCOPY(PxgCpuJointPrePrepTask)
|
||||
|
||||
public:
|
||||
PxgCpuJointPrePrepTask(PxgSimulationController& simConstroller, PxU32 startIndex, PxU32 nbToProcess, PxU32 gpuJointOffset,
|
||||
const Dy::Constraint*const* constraints, PxgConstraintData* constraintData, Px1DConstraint* constraintRows, PxI32* rowCounts) :
|
||||
Cm::Task(0), mSimController(simConstroller), mConstraints(constraints), mConstraintData(constraintData), mConstraintRows(constraintRows),
|
||||
mStartIndex(startIndex), mNbToProcess(nbToProcess), mGpuJointOffset(gpuJointOffset), mRowCounts(rowCounts)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void runInternal() PX_OVERRIDE PX_FINAL;
|
||||
|
||||
virtual const char* getName() const PX_OVERRIDE PX_FINAL
|
||||
{
|
||||
return "PxgTGSCpuJointPrePrepTask";
|
||||
}
|
||||
};
|
||||
|
||||
class PxgGpuContext;
|
||||
|
||||
// CPU task that triggers the pre-integration stage on the owning GPU context
// (the actual work is in runInternal(), implemented in the solver .cpp).
class PxgCpuPreIntegrationTask : public Cm::Task
{
	PxgGpuContext& mContext;	// owning context; outlives this task

	PX_NOCOPY(PxgCpuPreIntegrationTask)

public:

	PxgCpuPreIntegrationTask(PxgGpuContext& context) : Cm::Task(0), mContext(context)
	{
	}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgCpuPreIntegrationTask";
	}
};
|
||||
|
||||
// CPU task that pre-prepares a range of contact pairs belonging to one
// partition of the incremental constraint partitioning. The members below are
// the inputs/outputs consumed by runInternal() (implemented in the solver
// .cpp).
class PxgCpuContactPrePrepTask : public Cm::Task
{
	// From the below, we should be able to iterate over the partitions and
	// process contact pairs.
	const PxgIncrementalPartition&	mPartition;					// partitioning this task reads from (not owned)
	const PxU32						mPartitionIndex;			// which partition this task handles
	const PxU32						mStartIndexWithinPartition;	// first pair handled within that partition
	const PxU32						mNbToProcess;				// number of pairs handled by this task

	const PxU32*					mStartSlabIter;				// slab iteration data — semantics defined in the .cpp
	const PxU32						mStartSlabOffset;
	const PxU32*					mContactStartIndices;

	PxgConstraintBatchHeader*		mBatchHeaders;				// output: constraint batch headers
	const PxU32						mNumBatches;
	const PxU32						mWorkUnitStartIndex;

	PxU32*							mPinnedEdgeIds;				// output: edge ids written to pinned host memory

	const PxsContactManagerOutputIterator&	mOutputIterator;	// narrowphase outputs (not owned)

	const PxU8*						mBaseContactPatch;			// base of the contact patch stream
	const PxU8*						mBaseContactPointer;		// base of the contact point stream

	PX_NOCOPY(PxgCpuContactPrePrepTask)

public:
	PxgCpuContactPrePrepTask(const PxgIncrementalPartition& partition, PxU32 partitionIndex, PxU32 startIndexWithinPartition, PxU32 nbToProcess,
		const PxU32* startSlabIter, PxU32 startSlabOffset, const PxU32* contactStartIndices,
		PxgConstraintBatchHeader* batchHeaders, PxU32 nbBatches, PxU32 workUnitStartIndex,
		PxU32* pinnedEdgeIds, PxsContactManagerOutputIterator& outputIter,
		const PxU8* baseContactPatch, const PxU8* baseContactPointer) : Cm::Task(0),
		mPartition(partition), mPartitionIndex(partitionIndex), mStartIndexWithinPartition(startIndexWithinPartition), mNbToProcess(nbToProcess),
		mStartSlabIter(startSlabIter), mStartSlabOffset(startSlabOffset), mContactStartIndices(contactStartIndices),
		mBatchHeaders(batchHeaders), mNumBatches(nbBatches), mWorkUnitStartIndex(workUnitStartIndex),
		mPinnedEdgeIds(pinnedEdgeIds), mOutputIterator(outputIter),
		mBaseContactPatch(baseContactPatch), mBaseContactPointer(baseContactPointer)
	{
	}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgCpuContactPrePrepTask";
	}
};
|
||||
|
||||
// CPU task that pre-prepares a range of rigid-body joint constraint edges:
// batches edges [mStartEdgeIdx, mStartEdgeIdx + mNumEdges) into constraint
// batch headers and writes their ids to pinned memory. Work happens in
// runInternal() (implemented in the solver .cpp).
class PxgCpuConstraintPrePrepTask : public Cm::Task
{
	const PartitionIndices&		mEdgeIds;					// edge ids for this partition (not owned)
	const PxU32					mStartEdgeIdx;				// first edge handled by this task
	const PxU32					mNumEdges;					// number of edges handled by this task
	PxgConstraintBatchHeader*	mBatchHeaders;				// output: batch headers

	const PxU32					mNumBatches;
	const PxU32					mConstraintBlockStartIndex;	// start index into the constraint block buffer
	const PxU32					mUniqueIdStartIndex;		// start index for unique constraint ids

	PxU32*						mPinnedEdgeIds;				// output: edge ids written to pinned host memory

	const PxgConstraintPrePrep*	mConstraintPrePrep;			// per-constraint pre-prep data (not owned)

	PX_NOCOPY(PxgCpuConstraintPrePrepTask)

public:

	// Per-task workload sizes; TGS uses smaller chunks than PGS.
	static const PxU32 NbConstraintsPerTaskTGS = 2048u;
	static const PxU32 NbConstraintsPerTaskPGS = 8192u;

	PxgCpuConstraintPrePrepTask(const PartitionIndices& edgeIds, PxU32 startEdgeIdx, PxU32 nbEdges, PxgConstraintBatchHeader* batchHeaders, PxU32 nbBatches,
		PxU32 constraintBlockStartIndex, PxU32 uniqueIdStartIndex, PxU32* pinnedEdgeIds,
		const PxgConstraintPrePrep* constraintPrePrep) : Cm::Task(0),
		mEdgeIds(edgeIds), mStartEdgeIdx(startEdgeIdx), mNumEdges(nbEdges), mBatchHeaders(batchHeaders), mNumBatches(nbBatches),
		mConstraintBlockStartIndex(constraintBlockStartIndex), mUniqueIdStartIndex(uniqueIdStartIndex), mPinnedEdgeIds(pinnedEdgeIds)
		, mConstraintPrePrep(constraintPrePrep)
	{
	}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgCpuConstraintPrePrepTask";
	}
};
|
||||
|
||||
// Articulation variant of PxgCpuConstraintPrePrepTask. Handles both contacts
// and joint constraints; mIsContact selects which kind this instance
// processes. Work happens in runInternal() (implemented in the solver .cpp).
class PxgCpuArtiConstraintPrePrepTask : public Cm::Task
{
	const PartitionIndices&		mEdgeIds;					// edge ids for this partition (not owned)
	const PxU32					mStartEdgeIdx;				// first edge handled by this task
	const PxU32					mNumEdges;					// number of edges handled by this task
	PxgConstraintBatchHeader*	mBatchHeaders;				// output: batch headers

	const PxU32					mNumBatches;
	const PxU32					mConstraintBlockStartIndex;	// start index into the constraint block buffer
	const PxU32					mUniqueIdStartIndex;		// start index for unique constraint ids

	PxU32*						mPinnedEdgeIds;				// output: edge ids written to pinned host memory

	const PxgConstraintPrePrep*	mConstraintPrePrep;			// per-constraint pre-prep data (not owned)
	const bool					mIsContact;					// true: contacts; false: joint constraints

	PX_NOCOPY(PxgCpuArtiConstraintPrePrepTask)

public:

	// Per-task workload sizes; TGS uses smaller chunks than PGS.
	static const PxU32 NbConstraintsPerTaskPGS = 8192u;
	static const PxU32 NbConstraintsPerTaskTGS = 512u;

	PxgCpuArtiConstraintPrePrepTask(const PartitionIndices& edgeIds, PxU32 startEdgeIdx, PxU32 nbEdges, PxgConstraintBatchHeader* batchHeaders,
		PxU32 nbBatches, PxU32 constraintBlockStartIndex, PxU32 uniqueIdStartIndex, PxU32* pinnedEdgeIds,
		const PxgConstraintPrePrep* constraintPrePrep, bool isContact) : Cm::Task(0),
		mEdgeIds(edgeIds), mStartEdgeIdx(startEdgeIdx), mNumEdges(nbEdges), mBatchHeaders(batchHeaders), mNumBatches(nbBatches),
		mConstraintBlockStartIndex(constraintBlockStartIndex), mUniqueIdStartIndex(uniqueIdStartIndex), mPinnedEdgeIds(pinnedEdgeIds)
		, mConstraintPrePrep(constraintPrePrep), mIsContact(isContact)
	{
	}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgCpuArtiConstraintPrePrepTask";
	}
};
|
||||
|
||||
// CPU task that triggers the constraint-prep stage on the owning GPU context
// (actual work in runInternal(), implemented in the solver .cpp).
class PxgCpuPrepTask : public Cm::Task
{
	PxgGpuContext& mContext;	// owning context; outlives this task

	PX_NOCOPY(PxgCpuPrepTask)

public:
	PxgCpuPrepTask(PxgGpuContext& context) : Cm::Task(0), mContext(context) {}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgCpuPrepTask";
	}
};
|
||||
|
||||
// Task that triggers the GPU-side constraint pre-prep stage on the owning
// context (actual work in runInternal(), implemented in the solver .cpp).
class PxgGpuPrePrepTask : public Cm::Task
{
	PxgGpuContext& mContext;	// owning context; outlives this task

	PX_NOCOPY(PxgGpuPrePrepTask)

public:
	PxgGpuPrePrepTask(PxgGpuContext& context) : Cm::Task(0), mContext(context) {}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgGpuPrePrepTask";
	}
};
|
||||
|
||||
// Task that runs the post-solve stage on the owning context (actual work in
// runInternal(), implemented in the solver .cpp).
class PxgPostSolveTask : public Cm::Task
{
	PxgGpuContext& mContext;	// owning context; outlives this task

	PX_NOCOPY(PxgPostSolveTask)

public:
	PxgPostSolveTask(PxgGpuContext& context) : Cm::Task(0), mContext(context) {}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgPostSolveTask";
	}
};
|
||||
|
||||
// Umbrella task for the GPU solver pipeline: covers constraint pre-prepare,
// prepare, solve and integration stages (see the owning context).
// setMaxNodesAndWordCounts() must be called before the task runs if the
// changed-handle map is needed.
class PxgGpuTask : public Cm::Task
{
	PxgGpuContext&	mContext;			// owning context; outlives this task
	PxU32			mMaxNodes;			// set via setMaxNodesAndWordCounts(); 0 until then
	PxBitMapPinned*	mChangedHandleMap;	// for the simulation controller; NULL until set

	PX_NOCOPY(PxgGpuTask)

public:
	PxgGpuTask(PxgGpuContext& context) : Cm::Task(0), mContext(context), mMaxNodes(0), mChangedHandleMap(NULL) {}

	// Stores per-run inputs consumed by runInternal(); the map is kept by
	// pointer, so the caller's object must outlive the task run.
	void setMaxNodesAndWordCounts(const PxU32 maxNodes, PxBitMapPinned& changedHandleMap) { mMaxNodes = maxNodes; mChangedHandleMap = &changedHandleMap; }

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgGpuTask";
	}
};
|
||||
|
||||
// Task that triggers the GPU integration stage on the owning context (actual
// work in runInternal(), implemented in the solver .cpp).
class PxgGpuIntegrationTask : public Cm::Task
{
	PxgGpuContext& mContext;	// owning context; outlives this task

	PX_NOCOPY(PxgGpuIntegrationTask)
public:
	PxgGpuIntegrationTask(PxgGpuContext& context) : Cm::Task(0), mContext(context) {}

	virtual void runInternal() PX_OVERRIDE PX_FINAL;

	virtual const char* getName() const PX_OVERRIDE PX_FINAL
	{
		return "PxgGpuIntegrationTask";
	}
};
|
||||
|
||||
// Central GPU dynamics context. Owns the GPU cores (solver, narrowphase,
// articulation, simulation, soft body, FEM cloth, particle systems), the
// pinned host staging buffers, and the CPU/GPU task objects that drive the
// per-frame constraint pre-prep / prep / solve / integrate pipeline.
// NOTE: member order below is load-bearing for out-of-view code (object
// layout, constructor init order) — do not reorder.
class PxgGpuContext : public Dy::Context
{
	PX_NOCOPY(PxgGpuContext)

public:

	PxgGpuContext(Cm::FlushPool& flushPool, IG::SimpleIslandManager& islandManager,
		PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions, bool enableStabilization, bool useEnhancedDeterminism,
		PxReal maxBiasCoefficient, PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
		PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled, bool isTGS);

	virtual ~PxgGpuContext();

	// Accessors for the GPU cores owned by this context.
	PX_FORCE_INLINE PxgSolverCore* getGpuSolverCore() { return mGpuSolverCore;}

	PX_FORCE_INLINE PxgArticulationCore* getArticulationCore() { return mGpuArticulationCore; }

	PX_FORCE_INLINE PxgGpuNarrowphaseCore* getNarrowphaseCore() { return mGpuNpCore; }

	PX_FORCE_INLINE PxgSimulationCore* getSimulationCore() { return mGpuSimulationCore; }

	PX_FORCE_INLINE PxgCudaBroadPhaseSap* getGpuBroadPhase() { return mGpuBp; }

	PX_FORCE_INLINE PxgSoftBodyCore* getGpuSoftBodyCore() { return mGpuSoftBodyCore; }

	PX_FORCE_INLINE PxgFEMClothCore* getGpuFEMClothCore() { return mGpuFEMClothCore; }

	PX_FORCE_INLINE PxgParticleSystemCore** getGpuParticleSystemCores() { return mGpuParticleSystemCores.begin(); }
	PX_FORCE_INLINE PxU32 getNbGpuParticleSystemCores() { return mGpuParticleSystemCores.size(); }

	PxgParticleSystemCore* getGpuParticleSystemCore();

	// Index of the currently active contact stream (double-buffered, see
	// mContactStreamAllocators[2]).
	PX_FORCE_INLINE PxU32 getCurrentContactStreamIndex() { return mCurrentContactStream; }

	PX_FORCE_INLINE Cm::FlushPool& getFlushPool() { return mFlushPool; }

	PX_FORCE_INLINE PxU8* getPatchStream(const PxU32 index) { return mPatchStreamAllocators[index]->mStart; }
	PX_FORCE_INLINE PxU8* getContactStream(const PxU32 index) { return mContactStreamAllocators[index]->mStart; }

	PX_FORCE_INLINE bool enforceConstraintWriteBackToHostCopy() const { return mEnforceConstraintWriteBackToHostCopy; }

	// This method makes sure we get a PxgSimulationController instead of the
	// PxsSimulationController base.
	PxgSimulationController* getSimulationController();

	// NOTE(review): the parameter carries an 'm' member-style prefix in the
	// original declaration; kept as-is since this is only a declaration.
	virtual void setSimulationController(PxsSimulationController* mSimulationController) PX_OVERRIDE;

	virtual void mergeResults() PX_OVERRIDE;
	virtual void getDataStreamBase(void*& contactStreamBase, void*& patchStreamBase, void*& forceAndIndexStreamBase) PX_OVERRIDE;

	virtual void updateBodyCore(PxBaseTask* continuation) PX_OVERRIDE;

	// Per-frame entry points (Dy::Context overrides).
	virtual void update( Cm::FlushPool& flushPool, PxBaseTask* continuation, PxBaseTask* postPartitioningTask, PxBaseTask* lostTouchTask,
		PxvNphaseImplementationContext* nphase, PxU32 maxPatchesPerCM, PxU32 maxArticulationLinks, PxReal dt,
		const PxVec3& gravity, PxBitMapPinned& changedHandleMap) PX_OVERRIDE;

	virtual void updatePostPartitioning( PxBaseTask* lostTouchTask,
		PxvNphaseImplementationContext* nphase, PxU32 maxPatchesPerCM, PxU32 maxArticulationLinks, PxReal dt,
		const PxVec3& gravity, PxBitMapPinned& changedHandleMap) PX_OVERRIDE;

	// When any breakable constraint is active, writeback data must be copied
	// to the host even with the direct GPU API enabled (see
	// mEnforceConstraintWriteBackToHostCopy).
	virtual void setActiveBreakableConstraintCount(PxU32 activeBreakableConstraintCount) PX_OVERRIDE
	{
		mEnforceConstraintWriteBackToHostCopy = (activeBreakableConstraintCount > 0);
	}

	// This is the pre-prepare code for block-format joints loaded from the
	// non-block-format joints.
	void doConstraintJointBlockPrePrepGPU();

	void doStaticArticulationConstraintPrePrep(physx::PxBaseTask* continuation, const PxU32 articulationConstraintBatchIndex, const PxU32 articulationContactBatchIndex);

	void doStaticRigidConstraintPrePrep(physx::PxBaseTask* continuation);

	void doConstraintSolveGPU(PxU32 maxNodes, PxBitMapPinned& changedHandleMap);

	void doPostSolveTask(physx::PxBaseTask* continuation);

	virtual void processPatches( Cm::FlushPool& flushPool, PxBaseTask* continuation,
		PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, PxsContactManagerOutputCounts* outCounts) PX_OVERRIDE;

	bool isTGS() const { return mIsTGS; }
	bool isExternalForcesEveryTgsIterationEnabled() const { return mIsExternalForcesEveryTgsIterationEnabled; }

	// Helpers shared between the PGS and TGS pipelines.
	void doPreIntegrationTaskCommon(physx::PxBaseTask* continuation);

	void doConstraintPrePrepCommon(physx::PxBaseTask* continuation);

	void doConstraintPrePrepGPUCommon(bool hasForceThresholds);

	void cpuJointPrePrepTask(physx::PxBaseTask* continuation);

	void allocateTempPinnedSolverMemoryCommon();

	PX_FORCE_INLINE bool getEnableDirectGPUAPI() const { return mEnableDirectGPUAPI; }

	PxvSimStats& getSimStats() { return mSimStats; }

	PxBaseTask* mLostTouchTask;

	PxU32 mTotalEdges;
	PxU32 mTotalPreviousEdges;

	PxsContactManagerOutputIterator mOutputIterator;

	PxReal* mGPURestDistances;
	Sc::ShapeInteraction** mGPUShapeInteractions;
	PxsTorsionalFrictionData* mGPUTorsionalData;

	Cm::FlushPool& mFlushPool;
	bool mSolvedThisFrame;
	PxgIncrementalPartition mIncrementalPartition;

	PxPinnedArray<PxNodeIndex> mActiveNodeIndex; // changes every frame; includes rigid bodies and articulations

	// World-body placeholders used for constraints against the static world.
	PxgSolverBody mWorldSolverBody;
	PxgSolverBodyData mWorldSolverBodyData;
	PxgSolverBodySleepData mWorldSolverBodySleepData;
	PxgSolverTxIData mWorldTxIData;

	PxPinnedArray<PxgSolverBody> mSolverBodyPool;
	PxPinnedArray<PxAlignedTransform> mBody2WorldPool;

	// Write-back from the active articulations.
	// Each articulation has max 64 links and max 3 * 63 dofs and 1 wake counter.
	// See PxgArticulationLinkJointRootStateData.
	PxInt8ArrayPinned mLinkAndJointAndRootStateDataPool;

	PxPinnedArray<PxgSolverBodySleepData> mArticulationSleepDataPool;
	PxPinnedArray<Dy::ErrorAccumulator> mInternalResidualPerArticulationVelIter; //Internal residuals in first half (do not include residuals from external constraints, e. g. contacts or PxConstraints), second half contains residual from contacts
	PxPinnedArray<Dy::ErrorAccumulator> mInternalResidualPerArticulationPosIter; //Internal residuals in first half (do not include residuals from external constraints, e. g. contacts or PxConstraints), second half contains residual from contacts

	PxInt32ArrayPinned m1dConstraintBatchIndices;
	PxInt32ArrayPinned mContactConstraintBatchIndices;
	PxInt32ArrayPinned mArti1dConstraintBatchIndices;
	PxInt32ArrayPinned mArtiContactConstraintBatchIndices;

	PxInt32ArrayPinned mConstraintsPerPartition;
	PxInt32ArrayPinned mArtiConstraintsPerPartition;

	PxPinnedArray<PxgSolverBodyData> mSolverBodyDataPool;
	PxPinnedArray<PxgSolverBodySleepData> mSolverBodySleepDataPool;
	PxPinnedArray<PxgSolverTxIData> mSolverTxIDataPool;

	// Pinned host allocators for DMA staging (contact streams are
	// double-buffered, see getCurrentContactStreamIndex()).
	PxgPinnedHostLinearMemoryAllocator* mPinnedMemoryAllocator;

	PxgPinnedHostLinearMemoryAllocator* mContactStreamAllocators[2];
	PxgPinnedHostLinearMemoryAllocator* mPatchStreamAllocators[2];
	PxgPinnedHostLinearMemoryAllocator* mForceStreamAllocator;
	PxgPinnedHostLinearMemoryAllocator* mFrictionPatchStreamAllocator;

	PxU32 mCurrentContactStream;

	PxgIslandContext* mIslandContextPool;
	PxU32 mNumIslandContextPool;

	PxU32 mNum1DConstraintBlockPrepPool; // this and mNum1dConstraintBatches are the same; we can get rid of it later

	PxU32 mNumContactManagers;
	PxU32 mNum1DConstraints;

	PxU32 mKinematicCount;
	PxU32 mArticulationCount;
	PxU32 mArticulationStartIndex; // start node index in mActiveNodeIndex for articulations

	PxU32 mBodyCount;

	// Batch counts per constraint category (dynamic/static/self, rigid/articulation).
	PxI32 mNumContactBatches;
	PxI32 mNum1dConstraintBatches;
	PxI32 mNumArtiContactBatches;
	PxI32 mNumArti1dConstraintBatches;
	PxI32 mNumStaticArtiContactBatches;
	PxI32 mNumStaticArti1dConstraintBatches;
	PxI32 mNumSelfArtiContactBatches;
	PxI32 mNumSelfArti1dConstraintBatches;

	PxI32 mNumStaticRigidContactBatches;
	PxI32 mNumStaticRigid1dConstraintBatches;

	PxI32 mArtiStaticConstraintBatchOffset;
	PxI32 mArtiStaticContactBatchOffset;

	//KS - we can't know this on CPU because the offset comes after the articulation constraints, which
	//are computed on GPU
	//PxI32 mRigidStaticConstraintBatchOffset;
	//PxI32 mRigidStaticContactBatchOffset;

	PxU32* mConstraintUniqueIndices;
	PxU32* mContactUniqueIndices;
	PxU32* mArtiConstraintUniqueIndices;
	PxU32* mArtiContactUniqueIndices;
	PxU32* mArtiStaticConstraintUniqueIndices;
	PxU32* mArtiStaticContactUniqueIndices;
	PxU32* mArtiSelfConstraintUniqueIndices;
	PxU32* mArtiSelfContactUniqueIndices;
	PxU32* mArtiStaticConstraintStartIndex;

	PxU32* mRigidStaticConstraintUniqueIndices;
	PxU32* mRigidStaticContactUniqueIndices;

	PxU32* mArtiStaticConstraintCount;
	PxU32* mArtiStaticContactStartIndex;
	PxU32* mArtiStaticContactCount;

	PxU32* mRigidStaticConstraintStartIndex;
	PxU32* mRigidStaticConstraintCount;

	PxI32 mCachedPositionIterations;
	PxI32 mCachedVelocityIterations;

	PxInt32ArrayPinned mArtiStaticContactCounts;
	PxInt32ArrayPinned mArtiStaticJointCounts;

	PxInt32ArrayPinned mArtiStaticContactIndices;
	PxInt32ArrayPinned mArtiStaticJointIndices;

	PxInt32ArrayPinned mArtiSelfContactCounts;
	PxInt32ArrayPinned mArtiSelfJointCounts;

	PxInt32ArrayPinned mArtiSelfContactIndices;
	PxInt32ArrayPinned mArtiSelfJointIndices;

	PxInt32ArrayPinned mRigidStaticContactCounts;
	PxInt32ArrayPinned mRigidStaticJointCounts;

	PxInt32ArrayPinned mRigidStaticContactIndices;
	PxInt32ArrayPinned mRigidStaticJointIndices;

	PxInt32ArrayPinned mNodeIndicesStagingBuffer;
	PxInt32ArrayPinned mIslandIds;
	PxInt32ArrayPinned mIslandStaticTouchCounts;

	// Other joint types (not D6) — CPU constraints.
	PxgConstraintBatchHeader* mConstraintBatchHeaders;
	PxgConstraintBatchHeader* mArticConstraintBatchHeaders;

	PxU32 mNumConstraintBatches;

	PxU32 mNumArticConstraintBatches;
	PxU32 mNumArtiStaticConstraintBatches;
	PxU32 mNumArtiSelfConstraintBatches;

	PxU32 mNumRigidStaticConstraintBatches;

	bool mHasForceThresholds;
	const bool mIsTGS;
	bool mIsExternalForcesEveryTgsIterationEnabled;

	// Owned GPU cores (exposed via the accessors above).
	PxgCudaBroadPhaseSap* mGpuBp;
	PxgGpuNarrowphaseCore* mGpuNpCore;
	PxgArticulationCore* mGpuArticulationCore;
	PxgSimulationCore* mGpuSimulationCore;
	PxgSoftBodyCore* mGpuSoftBodyCore;
	PxgFEMClothCore* mGpuFEMClothCore;
	PxArray<PxgParticleSystemCore*> mGpuParticleSystemCores;
	PxgParticleSystemCore* mGpuPBDParticleSystemCore;

	PxgSolverCore* mGpuSolverCore;

	PxU32 mMaxNumStaticPartitions;

	const bool mEnableDirectGPUAPI;
	bool mRecomputeArticulationBlockFormat;

	// When the direct GPU API is enabled, the constraint writeback data might
	// have to be copied to host to support breakable D6 joints.
	bool mEnforceConstraintWriteBackToHostCopy;

	// Task objects driving the pipeline stages (see the task classes above).
	PxgCpuPreIntegrationTask mPreIntegrationTask;
	PxgCpuPrepTask mPrepTask;
	PxgGpuPrePrepTask mGpuPrePrepTask;
	PxgGpuIntegrationTask mGpuIntegrationTask;
	PxgGpuTask mGpuTask; // includes the pre-prepare, prepare, solve and integration stages
	PxgPostSolveTask mPostSolveTask;

	void doConstraintPrepGPU();
	void doPreIntegrationGPU(); // pre-integration: copies data from PxgBodySim to solver body data
	void doArticulationGPU(); // articulation forward dynamics
	void doSoftbodyGPU(); // soft body: update tetrahedron rotations
	void doFEMClothGPU();
	void doConstraintPrePrepGPU(); // pre-prepare for block-format contacts and non-block-format joints
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
208
engine/third_party/physx/source/gpusolver/include/PxgCudaSolverCore.h
vendored
Normal file
208
engine/third_party/physx/source/gpusolver/include/PxgCudaSolverCore.h
vendored
Normal file
@@ -0,0 +1,208 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_CUDA_SOLVERCORE_H
|
||||
#define PXG_CUDA_SOLVERCORE_H
|
||||
|
||||
#include "PxgSolverCore.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// PGS flavor of the GPU solver core: owns the device-side constraint/
// friction/threshold buffers and exposes the host entry points that DMA data
// up, launch the constraint prep/solve/integrate stages and DMA results back.
// PT: TODO: rename to PxgPGSSolverCore ?
class PxgCudaSolverCore : public PxgSolverCore
{
	PX_NOCOPY(PxgCudaSolverCore)
private:

	// This is the warp-based constraint format. TODO - remove!
	PxgTypedCudaBuffer<PxgSolverContactHeader> mContactHeaderStream;
	PxgTypedCudaBuffer<PxgSolverFrictionHeader> mFrictionHeaderStream;
	PxgTypedCudaBuffer<PxgSolverContactPointExt> mContactStream;
	PxgTypedCudaBuffer<PxgSolverContactFrictionExt> mFrictionStream;

	// Each bit encodes the activation of a slab (32 bits). When there are more than 32 slabs, use multiple indices.
	// To query the reference count, count the number of active slabs/bits.
	PxgTypedCudaBuffer<PxU32> mSolverEncodedReferenceCount;

	// This is the new articulation block constraint format!
	// It shares the original rigid body contact/constraint format but adds in
	// an additional buffer for the response vectors.
	PxgTypedCudaBuffer<PxgArticulationBlockResponse> mArtiConstraintBlockResponse;

	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mForceThresholdStream;
	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mTmpForceThresholdStream;

	PxgTypedCudaBuffer<PxU32> mConstraint1DBatchIndices;
	PxgTypedCudaBuffer<PxU32> mContactBatchIndices;

	PxgTypedCudaBuffer<PxU32> mArtiContactBatchIndices;
	PxgTypedCudaBuffer<PxU32> mArtiConstraint1dBatchIndices;

	PxgTypedCudaBuffer<PxReal> mAccumulatedForceObjectPairs; // accumulated force per pair of objects
	PxgCudaBufferN<2> mExceededForceElements;
	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mForceChangeThresholdElements;

	PxgTypedCudaBuffer<PxReal> mThresholdStreamAccumulatedForce;
	PxgTypedCudaBuffer<PxReal> mBlocksThresholdStreamAccumulatedForce;

	PxgTypedCudaBuffer<PxU32> mThresholdStreamWriteIndex;
	PxgTypedCudaBuffer<PxU32> mBlocksThresholdStreamWriteIndex;
	PxgTypedCudaBuffer<bool> mThresholdStreamWriteable;

	PxgTypedCudaBuffer<PxU32> mIslandIds;
	PxgTypedCudaBuffer<PxU32> mIslandStaticTouchCount;

	PxgSolverSharedDesc<IterativeSolveData>* mSharedDesc;

	void radixSort(const PxU32 nbPasses);

	friend class PxgArticulationCore;

public:

	bool mFrictionEveryIteration;

	PxgCudaSolverCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, PxgGpuContext* dynamicContext,
		PxgHeapMemoryAllocatorManager* heapMemoryManager, const PxGpuDynamicsMemoryConfig& init,
		const bool frictionEveryIteration);
	~PxgCudaSolverCore();

	// Descriptor construction: fills the host-side descriptor structs that
	// are later DMA'd to the device.
	void constructSolverSharedDesc(PxgSolverSharedDesc<IterativeSolveData>& desc,
		const PxgConstantData& cData, Cm::UnAlignedSpatialVector* deferredZ, PxU32* articulationDirty, uint4* articulationSlabMask);

	// NOTE(review): "Constrait" typo is in the out-of-view definition too —
	// renaming would need a coordinated change in the .cpp.
	void constructConstraitPrepareDesc(PxgConstraintPrepareDesc& desc, const PxU32 numDynamicConstraintBatchHeader,
		const PxU32 numStaticConstraintBatchHeaders, const PxU32 numDynamic1dConstraintBatches, const PxU32 numStatic1dConstraintBatches,
		const PxU32 numDynamicContactBatches, const PxU32 numStaticContactBatches,
		const PxU32 numArtiConstraints, const PxU32 numArtiContacts,
		const PxU32 numArtiStatic1dConstraintBatches, const PxU32 numArtiStaticContactBatches,
		const PxU32 numArtiSelf1dConstraintBatches, const PxU32 numArtiSelfContactBatches,
		const PxgConstantData& cData, PxU32 totalCurrentEdges,
		PxU32 totalPreviousEdges, PxU32 totalBodies);

	void constructSolverDesc(PxgSolverCoreDesc& desc, PxU32 numIsland, PxU32 numSolverBodies, PxU32 numConstraintBatchHeader, PxU32 numArticConstraints, PxU32 numSlabs, bool enableStabilization);

	void syncSimulationController();

	virtual void createStreams();
	virtual void releaseStreams();

	virtual void acquireContext();
	virtual void releaseContext();

	PxU32 getDescriptorsAllocationSize();
	void allocatePinnedDescriptors(PxgPinnedHostLinearMemoryAllocator& hostAllocator);

	// Host-to-device DMA of per-frame contact/partition data.
	void gpuMemDMAUpContactData(PxgPinnedHostLinearMemoryAllocator* compressedContactsHostMemoryAllocator,
		PxU32 compressedContactStreamUpperPartSize,
		PxU32 compressedContactStreamLowerPartSize,
		PxgPinnedHostLinearMemoryAllocator* compressedPatchesHostMemoryAllocator,
		PxU32 compressedPatchStreamUpperPartSize,
		PxU32 compressedPatchStreamLowerPartSize,
		PxU32 totalContactManagers,
		const PartitionIndexData* partitionIndexData,
		const PartitionNodeData* partitionNodeData,
		const PxgSolverConstraintManagerConstants* constantData,
		PxU32 constantDataCount,
		PxU32 partitionIndexDataCount,
		const PxU32* partitionConstraintBatchStartIndices,
		const PxU32* partitionArticConstraintBatchStartIndices,
		const PxU32* partitionJointBatchCounts,
		const PxU32* partitionArtiJointBatchCounts,
		PxU32 nbPartitions,
		const PxU32* destroyedEdges,
		PxU32 nbDestroyedEdges,
		const PxU32* npIndexArray, PxU32 npIndexArraySize,
		PxU32 totalNumJoints,
		const PxU32* islandIds, const PxU32* nodeInteractionCounts, PxU32 nbNodes, const PxU32* islandStaticTouchCount, PxU32 nbIslands);

	// Host-to-device DMA of solver body data.
	void gpuMemDmaUpBodyData(PxPinnedArray<PxgSolverBodyData>& solverBodyDataPool,
		PxPinnedArray<PxgSolverTxIData>& solverTxIDataPool,
		const PxU32 numSolverBodies,
		const PxU32 totalNumRigidBatches, const PxU32 totalNumArticBatches,
		const PxU32 nbSlabs, const PxU32 nbStaticSlabs, const PxU32 maxNumStaticPartitions);

	void allocateSolverBodyBuffers(const PxU32 numSolverBodies,
		PxPinnedArray<PxNodeIndex>& islandNodeIndices,
		const PxU32 numActiveActiculations, const PxU32 maxArticulationLinks);

	// Main host-to-device upload of constraint prep data for the frame.
	void gpuMemDMAUp(PxgPinnedHostLinearMemoryAllocator& hostAllocator, const PxgConstraintPrePrepData& data,
		const PxU32 numSolverBodies, PxgConstraintBatchHeader* constraintBatchHeaders,
		PxgIslandContext* islandContextPool, const PxU32 numIslands, const PxgPartitionData& partitionData,
		const PxU32 numConstraintBatchHeader, const PxU32 numStaticConstraintBatchHeader,
		const PxU32 numArticConstraintBatchHeader, const PxU32 numStaticArticulationBatchHeader,
		const PxU32 numArtiSelfConstraintBatchHeader, const PxgConstantData& cData,
		const PxU32 numContactBlockes, const PxU32 numFrictionBlockes,
		const PxU32 numArtiContacts, const PxU32 numArtiFrictions,
		const PxU32 totalCurrentEdges, const PxU32 totalPreviousEdges, const PxU32 numSlabs, const PxU32 maxNbPartitions,
		const bool enableStabilization, PxU8* cpuContactPatchStreamBase, PxU8* cpuContactStreamBase, PxU8* cpuForceStreamBase, PxsContactManagerOutputIterator& outputIterator,
		const PxU32 totalActiveBodyCount, const PxU32 activeBodyStartIndex, const PxU32 nbArticulations, Cm::UnAlignedSpatialVector* deferredZ,
		PxU32* articulationDirty, uint4* articulationSlabMask, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances,
		PxsTorsionalFrictionData* torsionalData,
		PxU32* artiStaticContactIndices, const PxU32 artiContactIndSize, PxU32* artiStaticJointIndices, PxU32 artiStaticJointSize,
		PxU32* artiStaticContactCounts, PxU32* artiStaticJointCounts,
		PxU32* artiSelfContactIndices, const PxU32 artiSelfContactIndSize, PxU32* artiSelfJointIndices, PxU32 artiSelfJointSize,
		PxU32* artiSelfContactCounts, PxU32* artiSelfJointCounts,
		PxU32* rigidStaticContactIndices, const PxU32 rigidContactIndSize, PxU32* rigidStaticJointIndices, const PxU32 rigidStaticJointSize,
		PxU32* rigidStaticContactCounts, PxU32* rigidSaticJointCounts, const PxReal lengthScale, bool hasForceThresholds);

	// Device-to-host DMA of solver results (forces, thresholds, writeback).
	void gpuMemDMAbackSolverData(PxU8* forceBufferPool, PxU32 forceBufferOffset, PxU32 forceBufferUpperPartSize,
		PxU32 forceBufferLowerPartSize, Dy::ThresholdStreamElement* changedElems, bool hasForceThresholds, Dy::ConstraintWriteback* constraintWriteBack,
		const PxU32 writeBackSize, bool copyAllToHost, Dy::ErrorAccumulator*& contactError);

	void syncDmaBack(PxU32& nbChangedThresholdElements);

	void preIntegration(const PxU32 offset, const PxU32 nbSolverBodies, const PxReal dt, const PxVec3& gravity);

	void jointConstraintBlockPrePrepParallel(PxU32 nbConstraintBatches);

	// Constraint prep launches per constraint category.
	void jointConstraintPrepareParallel(PxU32 nbJointBatches);
	void contactConstraintPrepareParallel(PxU32 nbContactBatches);
	void artiJointConstraintPrepare(PxU32 nbArtiJointBatches);
	void artiContactConstraintPrepare(PxU32 nbArtiContactBatches);
	// Soft body / cloth / particle constraint prepare.
	void nonRigidConstraintPrepare(PxU32 nbArticulations);

	void solveContactMultiBlockParallel(PxgIslandContext* islandContexts, const PxU32 numIslands, const PxU32 maxPartitions,
		PxInt32ArrayPinned& constraintsPerPartition, PxInt32ArrayPinned& artiConstraintsPerPartition, const PxVec3& gravity,
		PxReal* posIterResidualSharedMem, PxU32 posIterResidualSharedMemSize, Dy::ErrorAccumulator* posIterError, PxPinnedArray<Dy::ErrorAccumulator>& artiContactPosIterError,
		PxPinnedArray<Dy::ErrorAccumulator>& perArticulationInternalError);

	void writeBackBlock(PxU32 a, PxgIslandContext& context);

	void solvePartitions(PxgIslandContext* islandContexts, PxInt32ArrayPinned& constraintsPerPartition, PxInt32ArrayPinned& artiConstraintsPerPartition,
		PxU32 islandIndex, bool doFriction, bool anyArticulationConstraints);

	void accumulatedForceThresholdStream(PxU32 maxNodes);
	void integrateCoreParallel(const PxU32 offset, const PxU32 nbSolverBodies);

	void getDataStreamBase(void*& contactStreamBase, void*& patchStreamBase, void*& forceAndIndexStreamBase);
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
149
engine/third_party/physx/source/gpusolver/include/PxgD6Joint.h
vendored
Normal file
149
engine/third_party/physx/source/gpusolver/include/PxgD6Joint.h
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_D6_JOINT_H
|
||||
#define PXG_D6_JOINT_H
|
||||
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "PxgD6JointLimit.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// GPU-side mirror of PxD6Drive: identifies the individual drives of a D6 joint.
// NOTE(review): values are assumed to match the CPU-side PxD6Drive enum — keep in sync.
struct PxgD6Drive // equivalent of PxD6Drive
{
	enum Enum
	{
		eX = 0,      //!< drive along the X-axis
		eY = 1,      //!< drive along the Y-axis
		eZ = 2,      //!< drive along the Z-axis
		eSWING = 3,  //!< drive of displacement from the X-axis
		eTWIST = 4,  //!< drive of the displacement around the X-axis
		eSLERP = 5,  //!< drive of all three angular degrees along a SLERP-path
		eSWING1 = 6, //!< presumably drive around the Y-axis (matches PxD6Drive::eSWING1 — confirm)
		eSWING2 = 7, //!< presumably drive around the Z-axis (matches PxD6Drive::eSWING2 — confirm)
		eCOUNT       //!< number of drive entries (8)
	};
};
|
||||
|
||||
// GPU-side mirror of PxD6Motion: allowed range of motion for a single degree of freedom.
struct PxgD6Motion
{
	enum Enum
	{
		eLOCKED,  //!< The DOF is locked, it does not allow relative motion.
		eLIMITED, //!< The DOF is limited, it only allows motion within a specific range.
		eFREE,    //!< The DOF is free and has its full range of motion.
		eFORCE_DWORD = 0x7fffffff // forces the compiler to use 32-bit storage for this enum
	};
};
|
||||
|
||||
// GPU-side mirror of PxD6Axis: indices of the six degrees of freedom of a D6 joint.
struct PxgD6Axis
{
	enum Enum
	{
		eX = 0,      //!< motion along the X axis
		eY = 1,      //!< motion along the Y axis
		eZ = 2,      //!< motion along the Z axis
		eTWIST = 3,  //!< motion around the X axis
		eSWING1 = 4, //!< motion around the Y axis
		eSWING2 = 5, //!< motion around the Z axis
		eCOUNT = 6   //!< number of DOFs
	};
};
|
||||
|
||||
// GPU-side mirror of PxD6JointDriveFlag: behavior flags for a D6 joint drive.
struct PxgD6JointDriveFlag
{
	PX_CUDA_CALLABLE PxgD6JointDriveFlag(){}

	enum Enum
	{
		// IMPORTANT: the enum values need to match the ones in PxD6JointDriveFlag. Unfortunately, the GPU
		//            version just copy pasted all the D6 logic. Testing with a compile time assert would
		//            create a bit of a mess with our code hierarchy (on CPU, joints are a concept known
		//            to the PhysXExtensions library only)

		eACCELERATION = (1 << 0), //!< drive spring is for the acceleration at the joint (rather than the force)
		eOUTPUT_FORCE = (1 << 1)  //!< see PxD6JointDriveFlag::eOUTPUT_FORCE
	};
};
// Flags container over PxgD6JointDriveFlag::Enum, stored in 32 bits.
typedef PxFlags<PxgD6JointDriveFlag::Enum, PxU32> PxgD6JointDriveFlags;
|
||||
|
||||
// GPU-side mirror of PxSpring: a simple spring description used by drives and soft limits.
class PxgSpring
{
public:

	PxReal stiffness; //!< the spring strength of the drive: that is, the force proportional to the position error
	PxReal damping;   //!< the damping strength of the drive: that is, the force proportional to the velocity error

	PX_CUDA_CALLABLE PxgSpring(PxReal stiffness_, PxReal damping_): stiffness(stiffness_), damping(damping_) {}
};
|
||||
|
||||
// GPU-side mirror of PxD6JointDrive: a spring drive plus a force limit and flags.
// NOTE(review): layout is presumably expected to match the CPU D6JointDrive — confirm before changing.
class PxgD6JointDrive : public PxgSpring
{
public:
	PxReal forceLimit;         //!< the force limit of the drive - may be an impulse or a force depending on PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES
	PxgD6JointDriveFlags flags; //!< the joint drive flags

	/**
	\brief default constructor for PxgD6JointDrive.

	Zero spring, unlimited force (PX_MAX_F32), no flags set.
	*/
	PX_CUDA_CALLABLE PxgD6JointDrive(): PxgSpring(0,0), forceLimit(PX_MAX_F32), flags(0) {}

	/**
	\brief constructor a PxgD6JointDrive.

	\param[in] driveStiffness	the stiffness of the drive spring.
	\param[in] driveDamping		the damping of the drive spring
	\param[in] driveForceLimit	the maximum impulse or force that can be exerted by the drive
	\param[in] isAcceleration	whether the drive is an acceleration drive or a force drive
	*/
	PX_CUDA_CALLABLE PxgD6JointDrive(PxReal driveStiffness, PxReal driveDamping, PxReal driveForceLimit, bool isAcceleration = false)
		: PxgSpring(driveStiffness, driveDamping)
		, forceLimit(driveForceLimit)
		, flags(isAcceleration?(PxU32)PxgD6JointDriveFlag::eACCELERATION : 0)
	{}

	/**
	\brief returns true if the drive is valid

	Valid means stiffness, damping and forceLimit are all finite and non-negative.
	*/
	bool isValid() const
	{
		return PxIsFinite(stiffness) && stiffness>=0 &&
		       PxIsFinite(damping) && damping >=0 &&
		       PxIsFinite(forceLimit) && forceLimit >=0;
	}
};
|
||||
}
|
||||
|
||||
#endif
|
||||
117
engine/third_party/physx/source/gpusolver/include/PxgD6JointData.h
vendored
Normal file
117
engine/third_party/physx/source/gpusolver/include/PxgD6JointData.h
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_D6_JOINT_DATA_H
|
||||
#define PXG_D6_JOINT_DATA_H
|
||||
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "PxgD6JointLimit.h"
|
||||
#include "AlignedTransform.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// Base data shared by GPU joints: inverse-mass/inertia scales and two transforms.
struct PxgJointData
{
	PxgConstraintInvMassScale invMassScale; // per-body inverse mass/inertia scaling for the constraint
	PxTransform32 c2b[2];                   // presumably the constraint-frame-to-body transforms for body 0 and body 1 — confirm against CPU JointData
};
|
||||
|
||||
// GPU-side data block for a D6 joint.
// IMPORTANT: the field layout must match the CPU-side D6JointData (see note below) —
// do not reorder or insert fields.
struct PxgD6JointData : public PxgJointData
{

public:

	PX_CUDA_CALLABLE PxgD6JointData(){}

	// number of entries in the drive array (one per PxgD6Drive entry used by the data layout)
	static constexpr PxU32 sDriveEntryCapacity = 6;

	PxgD6Motion::Enum motion[6];            // per-DOF motion type (indexed by PxgD6Axis)
	PxgJointLinearLimit distanceLimit;      // legacy single distance limit
	PxgJointLinearLimitPair linearLimitX;   // per-axis linear limit, X
	PxgJointLinearLimitPair linearLimitY;   // per-axis linear limit, Y
	PxgJointLinearLimitPair linearLimitZ;   // per-axis linear limit, Z
	PxgJointAngularLimitPair twistLimit;    // twist (around X) limit
	PxgJointLimitCone swingLimit;           // cone swing limit
	PxgJointLimitPyramid pyramidSwingLimit; // pyramid swing limit

	PxgD6JointDrive drive[sDriveEntryCapacity]; // drive settings, one per drive entry

	PxTransform drivePosition;     // drive target pose
	PxVec3 driveLinearVelocity;    // drive target linear velocity
	PxVec3 driveAngularVelocity;   // drive target angular velocity

	// derived quantities

	PxU32 locked;   // bitmap of locked DOFs
	PxU32 limited;  // bitmap of limited DOFs
	PxU32 driving;  // bitmap of active drives (implies driven DOFs not locked)

	PxReal distanceMinDist; // distance limit minimum distance to get a good direction

	// PT: the PxD6Motion values are now shared for both kind of linear limits, so we need
	// an extra bool to know which one(s) should be actually used.
	bool mUseDistanceLimit;
	bool mUseNewLinearLimits;

	// PT: the swing limits can now be a cone or a pyramid, so we need
	// an extra bool to know which one(s) should be actually used.
	bool mUseConeLimit;
	bool mUsePyramidLimits;

	PxU8 angularDriveConfig; // stores the angular drive config (PxD6AngularDriveConfig::Enum)

	//Please don't add fields above this line since the layout must match D6JointData

	// forestall compiler complaints about not being able to generate a constructor
private:
	PxgD6JointData(const PxgJointLinearLimit& distance,
		const PxgJointLinearLimitPair& linearX,
		const PxgJointLinearLimitPair& linearY,
		const PxgJointLinearLimitPair& linearZ,
		const PxgJointAngularLimitPair& twist,
		const PxgJointLimitCone& swing,
		const PxgJointLimitPyramid& pyramid):
		distanceLimit(distance),
		linearLimitX(linearX),
		linearLimitY(linearY),
		linearLimitZ(linearZ),
		twistLimit(twist),
		swingLimit(swing),
		pyramidSwingLimit(pyramid),
		mUseDistanceLimit(false),
		mUseNewLinearLimits(false),
		mUseConeLimit(false),
		mUsePyramidLimits(false),
		angularDriveConfig(0)
	{}

};
// Guard against the data block outgrowing the GPU-side budget for joint data.
PX_COMPILE_TIME_ASSERT(sizeof(PxgD6JointData) <= 512);
|
||||
}
|
||||
#endif
|
||||
340
engine/third_party/physx/source/gpusolver/include/PxgD6JointLimit.h
vendored
Normal file
340
engine/third_party/physx/source/gpusolver/include/PxgD6JointLimit.h
vendored
Normal file
@@ -0,0 +1,340 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_D6_JOINT_LIMIT_H
|
||||
#define PXG_D6_JOINT_LIMIT_H
|
||||
|
||||
#include "common/PxTolerancesScale.h"
|
||||
#include "PxgD6Joint.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// PT: why aren't these shared with the CPU ?
|
||||
|
||||
// Common parameters shared by all GPU joint limits: restitution behavior and an
// optional spring (a limit with non-zero stiffness or damping is "soft", see isSoft()).
class PxgJointLimitParameters
{
public:

	PxReal restitution;     // bounce restitution in [0,1] (see isValid())
	PxReal bounceThreshold; // minimum impact speed below which no bounce occurs — TODO confirm semantics against PxJointLimitParameters
	PxReal stiffness;       // limit spring stiffness (0 = hard limit)
	PxReal damping;         // limit spring damping (0 = hard limit)

	PX_CUDA_CALLABLE PxgJointLimitParameters()
		: restitution(0)
		, bounceThreshold(0)
		, stiffness(0)
		, damping(0)
	{
	}

	/**
	\brief Returns true if the current settings are valid.

	All four parameters must be finite and non-negative; restitution must not exceed 1.

	\return true if the current settings are valid
	*/
	// NOTE(review): unlike isSoft(), this is not PX_CUDA_CALLABLE — presumably only used host-side; confirm.
	PX_INLINE bool isValid() const
	{
		return PxIsFinite(restitution) && restitution >= 0 && restitution <= 1 &&
		       PxIsFinite(stiffness) && stiffness >= 0 &&
		       PxIsFinite(damping) && damping >= 0 &&
		       PxIsFinite(bounceThreshold) && bounceThreshold >= 0;
	}

	// A limit is "soft" when it has a spring (either stiffness or damping is set).
	PX_CUDA_CALLABLE PX_INLINE bool isSoft() const
	{
		return damping>0 || stiffness>0;
	}
};
|
||||
|
||||
|
||||
// A single-sided linear limit: motion is restricted to a distance of `value` (> 0).
class PxgJointLinearLimit : public PxgJointLimitParameters
{

public:

	PxReal value; // the extent of the limit; must be finite and strictly positive (see isValid())

	// Construct a hard linear limit with the given extent.
	PX_CUDA_CALLABLE PxgJointLinearLimit(PxReal extent) : value(extent)
	{
	}

	// Construct a soft linear limit: the spring's stiffness/damping are copied into
	// the base parameters, making isSoft() true.
	PX_CUDA_CALLABLE PxgJointLinearLimit(PxReal extent, const PxgSpring& spring) : value(extent)
	{
		stiffness = spring.stiffness;
		damping = spring.damping;
	}

	// Default constructor: members are left uninitialized (mirrors CPU-side behavior).
	PX_CUDA_CALLABLE PxgJointLinearLimit() {}

	/**
	\brief Returns true if the limit is valid: base parameters are valid and the
	extent is finite and strictly positive.
	*/
	PX_INLINE bool isValid() const
	{
		return PxgJointLimitParameters::isValid() &&
		       PxIsFinite(value) &&
		       value > 0;
	}
};
|
||||
|
||||
// A two-sided linear limit with independent lower and upper distances.
class PxgJointLinearLimitPair : public PxgJointLimitParameters
{
public:

	// Default constructor: members are left uninitialized (mirrors CPU-side behavior).
	PX_CUDA_CALLABLE PxgJointLinearLimitPair() {}

	/**
	\brief the range of the limit. The upper limit must be no lower than the
	lower limit, and if they are equal the limited degree of freedom will be treated as locked.

	<b>Range:</b> See the joint on which the limit is used for details<br>
	<b>Default:</b> lower = -PX_MAX_F32/3, upper = PX_MAX_F32/3
	*/
	PxReal upper, lower;

	/**
	\brief Construct a linear hard limit pair. The lower distance value must be less than the upper distance value.

	The bounce threshold is derived from the tolerances scale (2 * scale.length).

	\param[in] scale		A PxTolerancesScale struct. Should be the same as used when creating the PxPhysics object.
	\param[in] lowerLimit	The lower distance of the limit
	\param[in] upperLimit	The upper distance of the limit

	\see PxJointLimitParameters PxTolerancesScale
	*/
	PX_CUDA_CALLABLE PxgJointLinearLimitPair(const PxTolerancesScale& scale, PxReal lowerLimit, PxReal upperLimit) :
		upper(upperLimit),
		lower(lowerLimit)
	{
		bounceThreshold = 2.0f*scale.length;
	}

	/**
	\brief construct a linear soft limit pair

	The spring's stiffness/damping are copied into the base parameters, making isSoft() true.

	\param[in] lowerLimit	The lower distance of the limit
	\param[in] upperLimit	The upper distance of the limit
	\param[in] spring		The stiffness and damping parameters of the limit spring

	\see PxJointLimitParameters PxTolerancesScale
	*/
	PX_CUDA_CALLABLE PxgJointLinearLimitPair(PxReal lowerLimit, PxReal upperLimit, const PxgSpring& spring) :
		upper(upperLimit),
		lower(lowerLimit)
	{
		stiffness = spring.stiffness;
		damping = spring.damping;
	}

	/**
	\brief Returns true if the limit is valid.

	Requires valid base parameters, finite bounds with upper >= lower, and a finite range.

	\return true if the current settings are valid
	*/
	PX_INLINE bool isValid() const
	{
		return PxgJointLimitParameters::isValid() &&
		       PxIsFinite(upper) && PxIsFinite(lower) && upper >= lower &&
		       PxIsFinite(upper - lower);
	}
};
|
||||
|
||||
|
||||
// A two-sided angular limit (e.g. for twist) with lower and upper angles in radians.
class PxgJointAngularLimitPair : public PxgJointLimitParameters
{

public:

	PxReal upper, lower; // limit angles; upper must be >= lower (see isValid())

	// Construct a hard angular limit pair; bounce threshold defaults to 0.5 (rad/s — TODO confirm units).
	PX_CUDA_CALLABLE PxgJointAngularLimitPair(PxReal lowerLimit, PxReal upperLimit)
		: upper(upperLimit)
		, lower(lowerLimit)
	{
		bounceThreshold = 0.5f;
	}

	// Construct a soft angular limit pair: the spring's stiffness/damping are copied
	// into the base parameters, making isSoft() true.
	PX_CUDA_CALLABLE PxgJointAngularLimitPair(PxReal lowerLimit, PxReal upperLimit, const PxgSpring& spring)
		: upper(upperLimit)
		, lower(lowerLimit)
	{
		stiffness = spring.stiffness;
		damping = spring.damping;
	}

	// Default constructor: members are left uninitialized (mirrors CPU-side behavior).
	PX_CUDA_CALLABLE PxgJointAngularLimitPair(){}

	/**
	\brief Returns true if the limit is valid.

	Requires valid base parameters and finite bounds with upper >= lower.

	\return true if the current settings are valid
	*/
	PX_INLINE bool isValid() const
	{
		return PxgJointLimitParameters::isValid() &&
		       PxIsFinite(upper) && PxIsFinite(lower) && upper >= lower;
	}
};
|
||||
|
||||
// An elliptical cone swing limit, described by maximum angles from the Y and Z axes
// of the constraint frame (both in (0, PI), radians).
class PxgJointLimitCone : public PxgJointLimitParameters
{

public:

	PxReal yAngle; // maximum angle from the Y axis of the constraint frame
	PxReal zAngle; // maximum angle from the Z axis of the constraint frame

	// Construct a hard cone limit; bounce threshold defaults to 0.5 (rad/s — TODO confirm units).
	PX_CUDA_CALLABLE PxgJointLimitCone(PxReal yLimitAngle, PxReal zLimitAngle):
		yAngle(yLimitAngle),
		zAngle(zLimitAngle)
	{
		bounceThreshold = 0.5f;
	}

	// Construct a soft cone limit: the spring's stiffness/damping are copied into
	// the base parameters, making isSoft() true.
	PX_CUDA_CALLABLE PxgJointLimitCone(PxReal yLimitAngle, PxReal zLimitAngle, const PxgSpring& spring):
		yAngle(yLimitAngle),
		zAngle(zLimitAngle)
	{
		stiffness = spring.stiffness;
		damping = spring.damping;
	}

	// Default constructor: members are left uninitialized (mirrors CPU-side behavior).
	PX_CUDA_CALLABLE PxgJointLimitCone(){}

	/**
	\brief Returns true if the limit is valid.

	Requires valid base parameters and both angles finite and strictly inside (0, PI).

	\return true if the current settings are valid
	*/
	PX_INLINE bool isValid() const
	{
		return PxgJointLimitParameters::isValid() &&
		       PxIsFinite(yAngle) && yAngle>0 && yAngle<PxPi &&
		       PxIsFinite(zAngle) && zAngle>0 && zAngle<PxPi;
	}
};
|
||||
|
||||
// A pyramid swing limit, described by independent min/max angles from the Y and Z
// axes of the constraint frame (all in (-PI, PI), radians).
class PxgJointLimitPyramid : public PxgJointLimitParameters
{
public:
	/**
	\brief the minimum angle from the Y axis of the constraint frame.

	<b>Unit:</b> Angular: Radians
	<b>Range:</b> Angular: (-PI,PI)<br>
	<b>Default:</b> -PI/2
	*/
	PxReal yAngleMin;

	/**
	\brief the maximum angle from the Y axis of the constraint frame.

	<b>Unit:</b> Angular: Radians
	<b>Range:</b> Angular: (-PI,PI)<br>
	<b>Default:</b> PI/2
	*/
	PxReal yAngleMax;

	/**
	\brief the minimum angle from the Z-axis of the constraint frame.

	<b>Unit:</b> Angular: Radians
	<b>Range:</b> Angular: (-PI,PI)<br>
	<b>Default:</b> -PI/2
	*/
	PxReal zAngleMin;

	/**
	\brief the maximum angle from the Z-axis of the constraint frame.

	<b>Unit:</b> Angular: Radians
	<b>Range:</b> Angular: (-PI,PI)<br>
	<b>Default:</b> PI/2
	*/
	PxReal zAngleMax;

	// Default constructor: members are left uninitialized (mirrors CPU-side behavior).
	PX_CUDA_CALLABLE PxgJointLimitPyramid() {}

	/**
	\brief Construct a pyramid hard limit.

	The bounce threshold defaults to 0.5 (rad/s — TODO confirm units).

	\param[in] yLimitAngleMin	The minimum limit angle from the Y-axis of the constraint frame
	\param[in] yLimitAngleMax	The maximum limit angle from the Y-axis of the constraint frame
	\param[in] zLimitAngleMin	The minimum limit angle from the Z-axis of the constraint frame
	\param[in] zLimitAngleMax	The maximum limit angle from the Z-axis of the constraint frame

	\see PxJointLimitParameters
	*/
	PX_CUDA_CALLABLE PxgJointLimitPyramid(PxReal yLimitAngleMin, PxReal yLimitAngleMax, PxReal zLimitAngleMin, PxReal zLimitAngleMax) :
		yAngleMin(yLimitAngleMin),
		yAngleMax(yLimitAngleMax),
		zAngleMin(zLimitAngleMin),
		zAngleMax(zLimitAngleMax)
	{
		bounceThreshold = 0.5f;
	}

	/**
	\brief Construct a pyramid soft limit.

	The spring's stiffness/damping are copied into the base parameters, making isSoft() true.

	\param[in] yLimitAngleMin	The minimum limit angle from the Y-axis of the constraint frame
	\param[in] yLimitAngleMax	The maximum limit angle from the Y-axis of the constraint frame
	\param[in] zLimitAngleMin	The minimum limit angle from the Z-axis of the constraint frame
	\param[in] zLimitAngleMax	The maximum limit angle from the Z-axis of the constraint frame
	\param[in] spring			The stiffness and damping of the limit spring

	\see PxJointLimitParameters
	*/
	PX_CUDA_CALLABLE PxgJointLimitPyramid(PxReal yLimitAngleMin, PxReal yLimitAngleMax, PxReal zLimitAngleMin, PxReal zLimitAngleMax, const PxgSpring& spring) :
		yAngleMin(yLimitAngleMin),
		yAngleMax(yLimitAngleMax),
		zAngleMin(zLimitAngleMin),
		zAngleMax(zLimitAngleMax)
	{
		stiffness = spring.stiffness;
		damping = spring.damping;
	}

	/**
	\brief Returns true if the limit is valid.

	Requires valid base parameters, all four angles finite and strictly inside (-PI, PI),
	and each max angle no smaller than its corresponding min angle.

	\return true if the current settings are valid
	*/
	PX_INLINE bool isValid() const
	{
		return PxgJointLimitParameters::isValid() &&
		       PxIsFinite(yAngleMin) && yAngleMin>-PxPi && yAngleMin<PxPi &&
		       PxIsFinite(yAngleMax) && yAngleMax>-PxPi && yAngleMax<PxPi &&
		       PxIsFinite(zAngleMin) && zAngleMin>-PxPi && zAngleMin<PxPi &&
		       PxIsFinite(zAngleMax) && zAngleMax>-PxPi && zAngleMax<PxPi &&
		       yAngleMax >= yAngleMin && zAngleMax >= zAngleMin;
	}
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
35
engine/third_party/physx/source/gpusolver/include/PxgDynamicsConfiguration.h
vendored
Normal file
35
engine/third_party/physx/source/gpusolver/include/PxgDynamicsConfiguration.h
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_DYNAMICS_CONFIGURATION_H
|
||||
#define PXG_DYNAMICS_CONFIGURATION_H
|
||||
|
||||
#define PXG_BATCH_SIZE 32u
|
||||
|
||||
|
||||
#endif
|
||||
64
engine/third_party/physx/source/gpusolver/include/PxgDynamicsContext.h
vendored
Normal file
64
engine/third_party/physx/source/gpusolver/include/PxgDynamicsContext.h
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_DYNAMICS_CONTEXT_H
|
||||
#define PXG_DYNAMICS_CONTEXT_H
|
||||
|
||||
#include "PxgContext.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Cm
|
||||
{
|
||||
class FlushPool;
|
||||
}
|
||||
|
||||
class PxBaseTask;
|
||||
|
||||
class PxsKernelWranglerManager;
|
||||
|
||||
/**
|
||||
\brief A class to represent a GPU dynamics context for the GPU rigid body solver
|
||||
*/
|
||||
/**
\brief A class to represent a GPU dynamics context for the GPU rigid body solver

Concrete PGS-solver specialization of PxgGpuContext (see getSolverType()).
Constructor and destroy() are defined elsewhere.
*/
class PxgDynamicsContext : public PxgGpuContext
{
	PX_NOCOPY(PxgDynamicsContext)

public:
	// Constructs the PGS GPU dynamics context; parameters configure memory budgets,
	// solver behavior (stabilization, determinism, friction cadence) and reporting.
	PxgDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
		const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
		bool enableStabilization, bool useEnhancedDeterminism, PxReal maxBiasCoefficient, PxvSimStats& simStats,
		PxgHeapMemoryAllocatorManager* heapMemoryManager, bool frictionEveryIteration, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled);

	// Tears down the context (overrides base-class destroy).
	virtual void destroy();

	// This context implements the projected Gauss-Seidel solver.
	virtual PxSolverType::Enum getSolverType() const { return PxSolverType::ePGS; }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
51
engine/third_party/physx/source/gpusolver/include/PxgEdgeType.h
vendored
Normal file
51
engine/third_party/physx/source/gpusolver/include/PxgEdgeType.h
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_EDGE_TYPE_H
#define PXG_EDGE_TYPE_H

namespace physx
{
	// Same set of values as IG::Edge::EdgeType, extended with additional
	// entries so that articulation contacts and articulation joints can be
	// represented as edge types as well.
	struct PxgEdgeType
	{
		enum Enum
		{
			eCONTACT_MANAGER = 0,
			eCONSTRAINT,
			eARTICULATION_CONTACT,
			eARTICULATION_CONSTRAINT,
			eEDGE_TYPE_COUNT
		};
	};

}

#endif
|
||||
126
engine/third_party/physx/source/gpusolver/include/PxgFrictionPatch.h
vendored
Normal file
126
engine/third_party/physx/source/gpusolver/include/PxgFrictionPatch.h
vendored
Normal file
@@ -0,0 +1,126 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_FRICTION_PATCH_H
#define PXG_FRICTION_PATCH_H

// NOTE(review): these includes previously appeared BEFORE the include guard,
// so every repeated inclusion of this header re-preprocessed them. They are
// now inside the guard; the set of includes is unchanged.
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "PxvConfig.h"

namespace physx
{

	// Per-contact-manager friction patch: contact normals and up to two
	// friction anchors for each body, plus bookkeeping flags.
	struct PxgFrictionPatch
	{
		float4 body0Normal;
		float4 body1Normal;

		float4 body0Anchors[2];
		float4 body1Anchors[2];

		PxU32 anchorCount;		// number of valid entries in the anchor arrays (<= 2)
		PxU32 broken;
		PxU32 contactID[2];

		// Member-wise copy; defined explicitly so it is callable from CUDA code.
		PX_CUDA_CALLABLE PX_FORCE_INLINE void operator = (const PxgFrictionPatch& other)
		{
			broken = other.broken;
			anchorCount = other.anchorCount;
			body0Normal = other.body0Normal;
			body1Normal = other.body1Normal;
			body0Anchors[0] = other.body0Anchors[0];
			body0Anchors[1] = other.body0Anchors[1];
			body1Anchors[0] = other.body1Anchors[0];
			body1Anchors[1] = other.body1Anchors[1];
			contactID[0] = other.contactID[0];
			contactID[1] = other.contactID[1];
		}
	};

	PX_COMPILE_TIME_ASSERT(sizeof(PxgFrictionPatch)==112);

	// 32-wide structure-of-arrays layout of the friction patch data
	// (one slot per thread in a warp, see PxgBlockFrictionIndex below).
	struct PxgBlockFrictionPatch
	{
		PX_ALIGN(256, float4 body0Normal[32]);
		PX_ALIGN(256, float4 body1Normal[32]);

		PX_ALIGN(128, PxU32 anchorCount[32]);
		PX_ALIGN(128, PxU32 broken[32]);
		PX_ALIGN(128, PxU32 contactID[2][32]);

		PX_ALIGN(256, float4 anchorPoints[2][32]);
		PX_ALIGN(128, PxU32 patchIndex[32]);
	};

	// 32-wide SoA layout of the per-body anchor positions.
	struct PxgBlockFrictionAnchorPatch
	{
		PX_ALIGN(256, float4 body0Anchors[2][32]);	//1024	1024
		PX_ALIGN(256, float4 body1Anchors[2][32]);	//2048	1024
	};

	// Non-blocked (single-constraint) anchor data.
	struct PxgFrictionAnchorPatch
	{
		float4 body0Anchors[2];
		float4 body1Anchors[2];
	};

	struct PxgFrictionPatchGPU
	{
		static const PxU32 MAX_ANCHORS = 2;	//!< Patch friction anchor max count
		PxVec3 points[MAX_ANCHORS];			//!< Patch friction anchors points
		PxVec3 impulses[MAX_ANCHORS];		//!< Patch friction impulses at anchors
		PxU32 anchors;						//!< Patch friction anchor count
	};

	/**
	This class is used for friction correlation using the block friction format. The idea is simple - we have an array of these block friction index objects.
	These objects contain a pointer to the block patch and the index that this particular constraint is in that structure.
	This allows us to allocate individual block friction patches per-block constraint and then index into them.

	Advantage - we can use coalesced memory access patterns for the friction patches, broken flags etc. Can remove the need for multiple pointers to friction patches in constraint descs.
	Disadvantage - one extra level of indirection to access previous friction patches. No guarantees that accessing previous friction patches won't diverge (in practice,
	they should be similar but they could still diverge if new constraints are introduced that change the layout of constraints within a given partition).
	*/
	struct PxgBlockFrictionIndex
	{
		// Packed as (patchIndex << 5) | threadIndexInWarp: low 5 bits select
		// the lane slot [0,31], the remaining bits are the patch index.
		PxU64 mPatchIndex_threadIdxLow;

		PX_CUDA_CALLABLE PxU64 getPatchIndex() const { return mPatchIndex_threadIdxLow >> 5ull; }
		PX_CUDA_CALLABLE PxU32 getThreadIdx() const { return PxU32(mPatchIndex_threadIdxLow&31); }

		PX_CUDA_CALLABLE void createPatchIndex(const PxU32 patchIndex, const PxU32 threadIndexInWarp)
		{
			mPatchIndex_threadIdxLow = (PxU64(patchIndex) << 5ull) | threadIndexInWarp;
		}
	};


}

#endif
|
||||
66
engine/third_party/physx/source/gpusolver/include/PxgIslandContext.h
vendored
Normal file
66
engine/third_party/physx/source/gpusolver/include/PxgIslandContext.h
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_ISLAND_CONTEXT_H
#define PXG_ISLAND_CONTEXT_H

#include "foundation/PxArray.h"

namespace physx
{
	// Per-island bookkeeping for the GPU solver: index ranges into the shared
	// body / constraint-descriptor / partition / batch arrays, plus the
	// iteration counts and bias coefficient used when solving this island.
	struct PxgIslandContext
	{
		PxU32 mBodyStartIndex;			// first body slot belonging to this island
		PxU32 mBodyCount;

		PxU32 mArticulationCount;

		PxU32 mDescStartIndex;			// first constraint descriptor for this island
		PxU32 mDescCount;

		PxI32 mNumPositionIterations;
		PxI32 mNumVelocityIterations;

		PxU32 mStartPartitionIndex;
		PxU32 mNumPartitions;

		PxU32 mBatchStartIndex;
		PxU32 mBatchCount;

		PxU32 mArtiBatchStartIndex;
		PxU32 mArtiBatchCount;
		PxU32 mStaticArtiBatchCount;
		PxU32 mSelfArtiBatchCount;

		PxU32 mStaticRigidBatchCount;

		PxReal mBiasCoefficient;

	};
}
#endif
|
||||
60
engine/third_party/physx/source/gpusolver/include/PxgPartitionNode.h
vendored
Normal file
60
engine/third_party/physx/source/gpusolver/include/PxgPartitionNode.h
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_PARTITION_NODE_H
#define PXG_PARTITION_NODE_H

#include "PxNodeIndex.h"

namespace physx
{
	struct PartitionIndexData
	{
		PxU16 mPartitionIndex;		//! The current partition this edge is in. Used to find the edge efficiently. PxU8 is probably too small (256 partitions max) but PxU16 should be more than enough
		PxU8 mPatchIndex;			//! The patch index for this partition edge. There may be multiple entries for a given edge if there are multiple patches.
		PxU8 mCType;				//! The type of constraint this is (PxgEdgeType)
		PxU32 mPartitionEntryIndex;	//! index of partition edges for this partition
	};

	// PT: stored in incremental partition code's mPartitionNodeArray,
	// indexed by a partition edge's unique index.
	struct PartitionNodeData
	{
		// PT: copies of PartitionEdge's node indices (the nodes connected by this edge)
		// - created in PxgIncrementalPartition::addEdge_Stage1
		PxNodeIndex mNodeIndex0;
		PxNodeIndex mNodeIndex1;

		// PT: links to next edge unique indices containing the same nodes
		// - computed in PxgIncrementalPartition::addEdge_Stage2 => PxgIncrementalPartition::addEdgeInternal
		// - used in constraintContactBlockPrePrepLaunch / constraint1DBlockPrePrepLaunch
		// - unclear what we need this for
		PxU32 mNextIndex[2];
	};
}
#endif
|
||||
40
engine/third_party/physx/source/gpusolver/include/PxgSolver.h
vendored
Normal file
40
engine/third_party/physx/source/gpusolver/include/PxgSolver.h
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_H
#define PXG_SOLVER_H


namespace physx
{

	// Referencing this symbol forces the PhysXSolverGpu module to be linked
	// when it is built as a static library.
	void createPxgSolver();
}

#endif
|
||||
202
engine/third_party/physx/source/gpusolver/include/PxgSolverBody.h
vendored
Normal file
202
engine/third_party/physx/source/gpusolver/include/PxgSolverBody.h
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_BODY_H
#define PXG_SOLVER_BODY_H

#include "PxvConfig.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxMat33.h"
#include "foundation/PxTransform.h"
#if !PX_CUDA_COMPILER
#include <vector_types.h>	// host compilers need float4 etc.; nvcc provides them implicitly
#endif
#include "AlignedMat33.h"
#include "AlignedTransform.h"
#include "PxNodeIndex.h"
#include "PxSpatialMatrix.h"

namespace physx
{

	class PxsRigidBody;
	struct PxgSolverBody;
	class PxgArticulation;

	struct PxgSolverTxIData
	{
		PxTransform deltaBody2World;	// 64 body delta transform
		PxMat33 sqrtInvInertia;			// 36 inverse inertia in world space
	};

	// Per-body data consumed by the constraint-prep stage. The CPU and CUDA
	// compile paths use different member layouts (PxVec3+PxReal vs packed
	// float4) but keep the same 32-byte footprint for the velocity/mass block.
	struct PxgSolverBodyPrepData
	{
#if !PX_CUDA_COMPILER
		PX_ALIGN(16, PxVec3 initialAngVel);	// 12 initial ang vel
		PxReal penBiasClamp;				// 16 the penetration bias clamp
		PxVec3 initialLinVel;				// 28 initial lin vel
		PxReal invMass;						// 32 inverse mass
#else
		float4 initialAngVelXYZ_penBiasClamp;
		float4 initialLinVelXYZ_invMassW;
#endif

		PxAlignedTransform body2World;

#if !PX_CUDA_COMPILER
		// Projected velocity of this body along the given linear/angular directions.
		PX_FORCE_INLINE PxReal projectVelocity(const PxVec3& lin, const PxVec3& ang) const
		{
			return initialLinVel.dot(lin) + initialAngVel.dot(ang);
		}
#else
		PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal projectVelocity(const PxVec3& lin, const PxVec3& ang) const
		{
			// Unpack the float4-packed velocities, then project as on the CPU path.
			const PxVec3 initialLinVel(initialLinVelXYZ_invMassW.x, initialLinVelXYZ_invMassW.y, initialLinVelXYZ_invMassW.z);
			const PxVec3 initialAngVel(initialAngVelXYZ_penBiasClamp.x, initialAngVelXYZ_penBiasClamp.y, initialAngVelXYZ_penBiasClamp.z);
			return initialLinVel.dot(lin) + initialAngVel.dot(ang);
		}
#endif
	};

#if PX_VC
#pragma warning(push)
#pragma warning(disable : 4324)	// structure padded due to alignment specifier
#endif

	struct PxgSolverBodyData : public PxgSolverBodyPrepData
	{
		PxNodeIndex islandNodeIndex;	// 40
		PxReal reportThreshold;			// 44 contact force threshold
		PxReal maxImpulse;				// 48
		PxU32 flags;					// 52 hasSpeculativeCCD etc.
		PxReal offsetSlop;
	};
#if PX_VC
#pragma warning(pop)
#endif

	PX_COMPILE_TIME_ASSERT((sizeof(PxgSolverBodyData)& 0xf) == 0);

	class PxgSolverExtBody
	{
	public:
		union
		{
			const PxgArticulation* articulation;
			const PxgSolverBodyData* body;
		};

		//if linkIndex is 0xffff, the solver body is rigid body, otherwise, it is articulation
		PxU16 linkIndex;
		PxU16 isKinematic;
		PxU32 bodyIndex;
		PxU32 islandNodeIndex;

	};

	struct PxgSolverExtBody2
	{
		PxSpatialMatrix mSpatialResponse;	//144
		Cm::UnAlignedSpatialVector velocity;//168
		PxTransform body2World;				//196
		PxReal penBiasClamp;				//200
		PxReal maxImpulse;					//204
		PxU16 linkIndex;					//206
		PxU16 isKinematic;					//208
		PxU32 bodyIndex;					//212
		PxNodeIndex islandNodeIndex;		//216
		PxReal cfm;							//220
		PxReal offsetSlop;					//224
	};

	//we need to DMA back the sleep data to CPU. PxgBodySim has the same information. However, PxgBodySim is too
	//big to dma back.
	struct PxgSolverBodySleepData
	{
		PxReal wakeCounter;
		PxU32 internalFlags;
	};

#if PX_VC
#pragma warning(push)
#pragma warning (disable : 4201)	// nameless struct/union
#endif
	struct PxgSolverBody
	{
#if !PX_CUDA_COMPILER
		PX_ALIGN(16, PxVec3 linearVelocity);	// post-solver linear velocity in world space
		PxU32 pad;
		PxVec3 angularVelocity;					// post-solver angular velocity in world space
		PxU32 pad2;
#else
		float4 linearVelocity;
		float4 angularVelocity;
#endif
	};
#if PX_VC
#pragma warning(pop)
#endif

	PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverBody) == 32);


#if PX_VC
#pragma warning(push)
#pragma warning (disable : 4201)	// nameless struct/union
#endif
	struct PxgTGSSolverBody
	{
#if !PX_CUDA_COMPILER
		PX_ALIGN(16, PxVec3 linearVelocity);	// 12 post-solver linear velocity in world space
		PxVec3 angularVelocity;					// 24 post-solver angular velocity in world space
		PxVec3 linearDelta;						// 36 linear delta motion in world space
		PxVec3 angularDelta;					// 48 angular delta motion in world space
#else
		float4 linearVelocityXYZ_angX;
		float4 angularVelocityYZ_linDeltaXY;
		float4 linDeltaZ_angDeltaXYZ;
#endif
	};
#if PX_VC
#pragma warning(pop)
#endif

	// Fixed: this assert previously re-checked sizeof(PxgSolverBody) == 32, a
	// copy-paste duplicate of the assert above. Check the TGS body it follows:
	// both compile paths pack it into three 16-byte quads (48 bytes).
	PX_COMPILE_TIME_ASSERT(sizeof(PxgTGSSolverBody) == 48);


	struct PxgSolverReferences
	{
		PxU32 mRemappedBodyIndex;
	};

}

#endif

|
||||
|
||||
159
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraint1D.h
vendored
Normal file
159
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraint1D.h
vendored
Normal file
@@ -0,0 +1,159 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CONSTRAINT_1D_H
#define PXG_SOLVER_CONSTRAINT_1D_H

#include "foundation/PxVec3.h"
#include "PxConstraintDesc.h"
#include "PxgSolverConstraintDesc.h"
#include "DySolverConstraintTypes.h"
#include "CmSpatialVector.h"

namespace physx
{

	// Shared header preceding the rows of a PGS 1D joint constraint.
	struct PxgSolverConstraint1DHeader
	{
		float4 body0WorldOffset_linBreakImpulse;
		//Strict ordering required - invInertiaScale0->invMassScale0->invInertiaScale1->invMassScale1. Do not change!
		PxReal invInertiaScale0;
		PxReal invMassScale0;
		PxReal invInertiaScale1;
		PxReal invMassScale1;


		PxU32 rowCounts;		// numbers of rows each 1D constraints
		PxU32 breakable;		// indicate whether the constraint are breakable or not
		PxReal angBreakImpulse;
		PxU32 writeBackOffset;
	};

	PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverConstraint1DHeader) == 48);

	PX_ALIGN_PREFIX(16)
	struct PxgSolverConstraint1DCon
	{
		PxVec3 ang0;						//12	12
		PxVec3 lin0;						//12	24
		PxVec3 ang1;						//12	36
		PxVec3 lin1;						//12	48
		Cm::UnAlignedSpatialVector deltaVA;	//24	72
		Cm::UnAlignedSpatialVector deltaVB;	//24	96
		PxReal minImpulse;					//4		100
		PxReal maxImpulse;					//4		104
		PxReal velMultiplier;				//4		108
		PxReal impulseMultiplier;			//4		112

	} PX_ALIGN_SUFFIX(16);

	PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverConstraint1DCon) == 112);

	struct PxgSolverConstraint1DMod
	{
		PxVec3 ang0Writeback;	//!< unscaled angular velocity projection (body 0)
		PxReal constant;		//!< constant
		PxReal unbiasedConstant;//!< unbiased constant
		PxReal appliedForce;	//!< applied force to correct velocity+bias
		PxU32 flags;
	};



	// TGS variant of the 1D constraint header.
	struct PxgTGSSolverConstraint1DHeader
	{
		PxU16 rowCounts;		// numbers of rows each 1D constraints
		PxU16 breakable;		// indicate whether the constraint are breakable or not
		PxReal linBreakImpulse;
		PxReal angBreakImpulse;
		PxU32 writeBackOffset;

		PxVec4 raWorld;
		PxVec4 rbWorld;

		//Strict ordering required - invInertiaScale0->invMassScale0->invInertiaScale1->invMassScale1. Do not change!
		PxReal invInertiaScale0;
		PxReal invMassScale0;
		PxReal invInertiaScale1;
		PxReal invMassScale1;


		//There is no orthogonalization with articulation constraints, so we do not need to
		//add anything to reflect that in this code!
	};

	PX_ALIGN_PREFIX(16)
	struct PxgTGSSolverConstraint1DCon
	{
		PxVec3 ang0;						//12	12
		PxVec3 lin0;						//12	24
		PxVec3 ang1;						//12	36
		PxVec3 lin1;						//12	48
		Cm::UnAlignedSpatialVector deltaVA;	//24	72
		Cm::UnAlignedSpatialVector deltaVB;	//24	96

		PxReal minImpulse;					//4		100
		PxReal maxImpulse;					//4		104
		PxReal velMultiplier;				//4		108
		PxReal impulseMultiplier;			//4		112

		PxReal error;						//4		116
		PxReal velTarget;					//4		120
		PxReal recipResponse;				//4		124
		PxReal angularErrorScale;			//4		128

	} PX_ALIGN_SUFFIX(16);

	PX_COMPILE_TIME_ASSERT(sizeof(PxgTGSSolverConstraint1DCon) == 128);

	struct PxgTGSSolverConstraint1DMod
	{
		PxReal appliedForce;	//!< applied force to correct velocity+bias
		PxReal maxBias;
		PxReal biasScale;
		PxU32 flags;
	};

	// Bundles the three pieces of a PGS joint constraint for convenient passing.
	struct PxgJointParams
	{
		PxgSolverConstraint1DHeader* jointHeader;
		PxgSolverConstraint1DCon* jointCon;
		PxgSolverConstraint1DMod* jointMod;
	};

	// Bundles the three pieces of a TGS joint constraint for convenient passing.
	struct PxgTGSJointParams
	{
		PxgTGSSolverConstraint1DHeader* jointHeader;
		PxgTGSSolverConstraint1DCon* jointCon;
		PxgTGSSolverConstraint1DMod* jointMod;
	};


}

#endif
|
||||
202
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraintBlock1D.h
vendored
Normal file
202
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraintBlock1D.h
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CONSTRAINT_BLOCK_1D_H
|
||||
#define PXG_SOLVER_CONSTRAINT_BLOCK_1D_H
|
||||
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "vector_types.h"
|
||||
#include "vector_functions.h"
|
||||
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
struct PxgBlockSolverConstraint1DHeader
|
||||
{
|
||||
PX_ALIGN(128, PxU16 rowCounts[32]); // numbers of rows each 1D constraints
|
||||
PX_ALIGN(128, PxU16 breakable[32]); // indicate whether the constraint are breakable or not
|
||||
PX_ALIGN(128, PxReal angBreakImpulse[32]);
|
||||
PX_ALIGN(128, float4 body0WorldOffset_linBreakImpulse[32]);
|
||||
PX_ALIGN(128, PxReal invMass0D0[32]);
|
||||
PX_ALIGN(128, PxReal invMass1D1[32]);
|
||||
PX_ALIGN(128, PxReal invInertiaScale0[32]);
|
||||
PX_ALIGN(128, PxReal invInertiaScale1[32]);
|
||||
PX_ALIGN(128, PxU32 writeBackOffset[32]);
|
||||
|
||||
PX_ALIGN(128, PxReal cfm[32]);
|
||||
};
|
||||
|
||||
|
||||
|
||||
// Constant ("Con") row data for blocks of 32 1D PGS constraints: the velocity
// jacobians plus the terms needed to rebuild solver constants on the fly.
struct PxgBlockSolverConstraint1DCon
{
public:

	// To use different mass for mass-splitting every sub-timestep (or iteration),
	// unitResponse, recipResponse, velMultiplier, etc. are computed every sub-timestep (or iteration).
	// To compute them at every sub-timestep (or iteration), resp0, resp1, and other relevant data are stored additionally.
	PX_ALIGN(256, float4 lin0XYZ_minImpulse[32]);	//512	512	//!< linear velocity projection (body 0) and min impulse term
	PX_ALIGN(256, float4 lin1XYZ_maxImpulse[32]);	//1024	512	//!< linear velocity projection (body 1) and max impulse term
	PX_ALIGN(256, float4 ang0XYZ_resp0[32]);		//1536	512	//!< angular velocity projection (body 0) and resp0
	PX_ALIGN(256, float4 ang1XYZ_resp1[32]);		//2048	512	//!< angular velocity projection (body 1) and resp1
	PX_ALIGN(256, PxReal initJointSpeed[32]);					//!< initial relative joint speed along this row
} ;
|
||||
|
||||
// Mutable ("Mod") row state for blocks of 32 1D PGS constraints: values the
// solver updates while iterating, kept separate from the constant jacobian data.
struct PxgBlockSolverConstraint1DMod
{
public:
	PX_ALIGN(128, PxVec3 ang0Writeback[32]);	//!< unscaled angular velocity projection (body 0)
	PX_ALIGN(128, PxReal appliedForce[32]);		//!< applied force to correct velocity+bias
	PX_ALIGN(128, PxU32 flags[32]);				// per-row constraint flags
	PX_ALIGN(128, PxReal residual[32]);			// accumulated solver residual for this row

	// coeff0, coeff1: coefficients used to compute constant, unbiasedConstant, velMultiplier, and impulseMultiplier.
	// See also "queryReduced1dConstraintSolverConstantsPGS"
	PX_ALIGN(128, PxReal coeff0[32]);
	PX_ALIGN(128, PxReal coeff1[32]);
} ;
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
// Per-batch header for blocks of 32 1D constraints in the TGS solver.
// Like the PGS variant but additionally stores orthogonalization data used to
// decouple the angular constraint axes.
struct PxgTGSBlockSolverConstraint1DHeader
{
	PX_ALIGN(128, uchar4 rowCounts_breakable_orthoAxisCount[32]);	// packed per-constraint: row count, breakable flag, ortho axis count
	PX_ALIGN(128, float4 rAWorld_invMass0D0[32]);					// xyz: rA in world space, w: inverse mass term for body 0
	PX_ALIGN(128, float4 rBWorld_invMass1D1[32]);					// xyz: rB in world space, w: inverse mass term for body 1
	PX_ALIGN(128, PxReal invInertiaScale0[32]);						// inverse inertia scale for body 0
	PX_ALIGN(128, PxReal invInertiaScale1[32]);						// inverse inertia scale for body 1
	PX_ALIGN(128, PxU32 writeBackOffset[32]);						// offset used when writing back solver results

	//Orthogonalization data
	PX_ALIGN(128, float4 angOrthoAxis0_recipResponseW[3][32]);		// up to 3 ortho axes (body 0), w: reciprocal response
	PX_ALIGN(128, float4 angOrthoAxis1_ErrorW[3][32]);				// up to 3 ortho axes (body 1), w: error term

	PX_ALIGN(128, PxReal linBreakImpulse[32]);						// linear break impulse threshold
	PX_ALIGN(128, PxReal angBreakImpulse[32]);						// angular break impulse threshold

	PX_ALIGN(128, PxReal cfm[32]);									// constraint force mixing term
};
|
||||
|
||||
|
||||
|
||||
// Row data for blocks of 32 1D constraints in the TGS solver. Unlike the PGS
// layout there is no separate "Mod" struct; constant and mutable state live here.
struct PxgTGSBlockSolverConstraint1DCon
{
public:
	// For rigid body joints, coef0, coef1, coef2, and coef3 store initBias, biasScale, velMultiplier, and velTarget,
	// respectively.

	// For articulation, the coefficients used in "compute1dConstraintSolverConstantsTGS" and
	// "queryReduced1dConstraintSolverConstantsTGS" are stored in the last (w) component.

	PX_ALIGN(128, float4 lin0XYZ_initBiasOrCoeff0[32]);			//512	512	//!< linear velocity projection (body 0) and an additional coef
	PX_ALIGN(128, float4 lin1XYZ_biasScaleOrCoeff1[32]);		//1024	512	//!< linear velocity projection (body 1) and an additional coef
	PX_ALIGN(128, float4 ang0XYZ_velMultiplierOrCoeff2[32]);	//1536	512	//!< angular velocity projection (body 0) and an additional coef
	PX_ALIGN(128, float4 ang1XYZ_velTargetOrCoeff3[32]);		//2048	512	//!< angular velocity projection (body 1) and an additional coef

	// resp0, resp1, and other relevant data are stored additionally.
	PX_ALIGN(128, PxReal resp0[32]);							// response term (body 0)
	PX_ALIGN(128, PxReal resp1[32]);							// response term (body 1)
	PX_ALIGN(128, PxReal geometricError[32]);					// geometric error along this row

	PX_ALIGN(128, PxReal minImpulse[32]);						// minimum impulse the solver may apply
	PX_ALIGN(128, PxReal maxImpulse[32]);						// maximum impulse the solver may apply
	PX_ALIGN(128, PxReal maxBias[32]);							// clamp for the bias term
	PX_ALIGN(128, PxReal angularErrorScale[32]);				// scale applied to the angular error
	PX_ALIGN(128, PxU32 flags[32]);								// per-row constraint flags
	PX_ALIGN(128, PxReal appliedForce[32]);						// accumulated applied force
	PX_ALIGN(128, PxReal residual[32]);							// accumulated solver residual
};
|
||||
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
|
||||
// Initializes slot 'index' of a block of 1D PGS constraints: packs the linear
// jacobians together with the impulse limits (w component), packs the angular
// jacobians with a zeroed response term, and resets all mutable row state.
// Callable from both host and device code.
PX_CUDA_CALLABLE PX_FORCE_INLINE void init(PxgBlockSolverConstraint1DCon& ccon, PxgBlockSolverConstraint1DMod& cmod,
	const PxVec3& _linear0, const PxVec3& _linear1,
	const PxVec3& _angular0, const PxVec3& _angular1,
	const PxReal _minImpulse, const PxReal _maxImpulse,
	const PxU32 index)
{
	PX_ASSERT(_linear0.isFinite());
	PX_ASSERT(_linear1.isFinite());

	// Pack jacobians once into registers, then store to the SoA lane arrays.
	const float4 packedLin0 = make_float4(_linear0.x, _linear0.y, _linear0.z, _minImpulse);
	const float4 packedLin1 = make_float4(_linear1.x, _linear1.y, _linear1.z, _maxImpulse);
	const float4 packedAng0 = make_float4(_angular0.x, _angular0.y, _angular0.z, 0.f);
	const float4 packedAng1 = make_float4(_angular1.x, _angular1.y, _angular1.z, 0.f);

	ccon.lin0XYZ_minImpulse[index] = packedLin0;
	ccon.lin1XYZ_maxImpulse[index] = packedLin1;
	ccon.ang0XYZ_resp0[index] = packedAng0;
	ccon.ang1XYZ_resp1[index] = packedAng1;
	ccon.initJointSpeed[index] = 0.f;

	// Clear the mutable per-row solver state.
	cmod.flags[index] = 0;
	cmod.appliedForce[index] = 0.f;
	cmod.residual[index] = 0.f;
	cmod.coeff0[index] = 0.f;
	cmod.coeff1[index] = 0.f;
}
|
||||
|
||||
// Parameter bundle handed to the PGS joint constraint prep/solve code:
// pointers to the block constraint data plus the timestep terms.
struct PxgJointBlockParams
{
	PxgBlockSolverConstraint1DHeader* jointHeader;	// per-batch constraint headers
	PxgBlockSolverConstraint1DCon* jointCon;		// constant row data
	PxgBlockSolverConstraint1DMod* jointMod;		// mutable row data
	PxReal dt;										// timestep
	PxReal invDt;									// 1/dt
};
|
||||
|
||||
// Parameter bundle for the TGS joint constraint prep/solve code. TGS needs
// both per-substep and total timestep terms, plus scale/bias coefficients.
struct PxgTGSJointBlockParams
{
	PxgTGSBlockSolverConstraint1DHeader* jointHeader;	// per-batch constraint headers
	PxgTGSBlockSolverConstraint1DCon* jointCon;			// row data
	PxReal dt;											// substep timestep
	PxReal totalDt;										// full-step timestep
	PxReal invDt;										// 1/dt
	PxReal invTotalDt;									// 1/totalDt
	PxReal lengthScale;									// scene length scale
	PxReal biasCoefficient;								// bias coefficient for error correction
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
217
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraintDesc.h
vendored
Normal file
217
engine/third_party/physx/source/gpusolver/include/PxgSolverConstraintDesc.h
vendored
Normal file
@@ -0,0 +1,217 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CONSTRAINT_DESC_H
|
||||
#define PXG_SOLVER_CONSTRAINT_DESC_H
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
#include <vector_types.h>
|
||||
#include "PxNodeIndex.h"
|
||||
|
||||
|
||||
namespace physx
|
||||
{
|
||||
struct PxgSolverBody;
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4324)
|
||||
#endif
|
||||
// Block (32-wide SoA) form of the velocity jacobian of one 1D constraint row,
// with the scalar row terms packed into the w components.
struct PxgBlockConstraint1DVelocities
{
	PX_ALIGN(256, float4 linear0XYZ_geometricErrorW[32]);	//!< linear component of velocity jacobian in world space, geometric error of the constraint along this axis
	PX_ALIGN(256, float4 angular0XYZ_velocityTargetW[32]);	//!< angular component of velocity jacobian in world space, velocity target for the constraint along this axis

	PX_ALIGN(256, float4 linear1XYZ_minImpulseW[32]);		//!< linear component of velocity jacobian in world space, minimum impulse the solver may apply to enforce this constraint
	PX_ALIGN(256, float4 angular1XYZ_maxImpulseW[32]);		//!< angular component of velocity jacobian in world space, maximum impulse the solver may apply to enforce this constraint
};
|
||||
|
||||
// Block (32-wide SoA) form of the per-row 1D constraint parameters. The union
// mirrors Px1DConstraint: a row is interpreted either as a spring or as a
// restitution ("bounce") row depending on its flags.
struct PxgBlockConstraint1DParameters
{
	union
	{
		struct SpringModifiers
		{
			PX_ALIGN(128, PxReal stiffness[32]);			//!< spring parameter, for spring constraints
			PX_ALIGN(128, PxReal damping[32]);				//!< damping parameter, for spring constraints
		} spring;
		struct RestitutionModifiers
		{
			PX_ALIGN(128, PxReal restitution[32]);			//!< restitution parameter for determining additional "bounce"
			PX_ALIGN(128, PxReal velocityThreshold[32]);	//!< minimum impact velocity for bounce
		} bounce;
	} mods;

	PX_ALIGN(128, PxU32 flags[32]);							//!< a set of Px1DConstraintFlags
	PX_ALIGN(128, PxU32 solveHint[32]);						//!< constraint optimization hint, should be an element of PxConstraintSolveHint
};
|
||||
|
||||
|
||||
// Scalar (non-block) form of PxgBlockConstraint1DVelocities: the velocity
// jacobian of a single 1D constraint row with row terms in the w components.
struct PxgConstraint1DVelocities
{
	float4 linear0XYZ_geometricErrorW;		//!< linear component of velocity jacobian in world space, geometric error of the constraint along this axis
	float4 angular0XYZ_velocityTargetW;		//!< angular component of velocity jacobian in world space, velocity target for the constraint along this axis

	float4 linear1XYZ_minImpulseW;			//!< linear component of velocity jacobian in world space, minimum impulse the solver may apply to enforce this constraint
	float4 angular1XYZ_maxImpulseW;			//!< angular component of velocity jacobian in world space, maximum impulse the solver may apply to enforce this constraint
};
|
||||
|
||||
// Scalar (non-block) form of PxgBlockConstraint1DParameters: per-row 1D
// constraint parameters, with spring/bounce interpretation selected by flags.
struct PxgConstraint1DParameters
{
	union
	{
		struct SpringModifiers
		{
			PxReal stiffness;			//!< spring parameter, for spring constraints
			PxReal damping;				//!< damping parameter, for spring constraints
		} spring;
		struct RestitutionModifiers
		{
			PxReal restitution;			//!< restitution parameter for determining additional "bounce"
			PxReal velocityThreshold;	//!< minimum impact velocity for bounce
		} bounce;
	} mods;

	PxU16 flags;						//!< a set of Px1DConstraintFlags
	PxU16 solveHint;					//!< constraint optimization hint, should be an element of PxConstraintSolveHint

	PxU32 pad;							// explicit padding to keep the struct size/alignment stable
};
|
||||
|
||||
// Descriptor connecting one solver constraint to its constraint data and the
// two body slots it couples. Kept at 16 bytes on 32-bit platforms (see the
// compile-time assert below).
struct PxgSolverConstraintDesc
{
	// Discriminates what 'constraint' points to.
	enum PxgConstraintType
	{
		eCONSTRAINT_1D,
		eCONTACT,
		eARTICULATION_CONSTRAINT_1D,
		eARTICULATION_CONTACT,
	};
	PxU8* constraint;		//8	// raw pointer to the constraint data, interpreted per constraintType
	PxU32 bodyAIndex;		// solver body index of body A
	PxU32 bodyBIndex;		// solver body index of body B
	PxU16 constraintType;	// one of PxgConstraintType
	PxU16 patchIndex;		// contact patch index (for contact constraints)
	//PxU16 pad;
};
|
||||
|
||||
#if !PX_P64_FAMILY
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgSolverConstraintDesc) == 16);
|
||||
#endif
|
||||
|
||||
// Inverse mass/inertia multipliers for the two bodies of a constraint.
// On the CUDA side the four scalars are viewed as a single float4 so they can
// be loaded in one instruction; both layouts are 16 bytes, 16-byte aligned.
PX_ALIGN_PREFIX(16)
struct PxgConstraintInvMassScale
{
#if !PX_CUDA_COMPILER
	PxReal linear0;		//!< multiplier for inverse mass of body0
	PxReal angular0;	//!< multiplier for inverse MoI of body0
	PxReal linear1;		//!< multiplier for inverse mass of body1
	PxReal angular1;	//!< multiplier for inverse MoI of body1
#else
	float4 lin0X_ang0Y_lin1Z_ang1W;	// same four multipliers packed as one float4
#endif
}PX_ALIGN_SUFFIX(16);
|
||||
|
||||
|
||||
struct PxgBlockContactData;
|
||||
struct PxgBlockContactPoint;
|
||||
|
||||
namespace Sc
|
||||
{
|
||||
class ShapeInteraction;
|
||||
}
|
||||
|
||||
// Header describing one batch of up to 32 constraints of the same type.
struct PxgConstraintBatchHeader
{
	PxU16 mDescStride;			//number between 1 to 32
	PxU16 constraintType;		//constraint type (joint or contact)
	PxU32 mConstraintBatchIndex;	//constraint batch index (the index for the specific joint/contact batch)
	PxU32 mStartPartitionIndex;	//start partition index (the start index for the set of partition edges representing this batch)
	PxU32 mask;					//Only used by the articulation internal constraint solver
};
|
||||
|
||||
|
||||
// Full description of one 32-wide constraint batch: scalar batch metadata plus
// 32-wide SoA lane arrays holding the per-constraint body references.
struct PxgBlockConstraintBatch
{
	PxU16 mDescStride;					//number between 1 to 32
	PxU16 constraintType;				// joint or contact (see PxgSolverConstraintDesc::PxgConstraintType)
	PxU32 blockContactIndex;			// index into the block contact data
	PxU32 mConstraintBatchIndex;		// index of this joint/contact batch
	PxU32 startConstraintIndex;			// first constraint index covered by this batch
	PxU32 startFrictionIndex;			// first friction index covered by this batch
	PxU32 mStartPartitionIndex;			// start index of the partition edges representing this batch
	PxU32 mArticulationResponseIndex;	//Only required for articulation constraints!
	PxU32 mask;							// only used by the articulation internal constraint solver

	PX_ALIGN(128, PxNodeIndex bodyANodeIndex[32]);			// island node index of body A, per lane
	PX_ALIGN(128, PxNodeIndex bodyBNodeIndex[32]);			// island node index of body B, per lane

	PX_ALIGN(128, PxU32 bodyAIndex[32]);					// solver body index of body A, per lane
	PX_ALIGN(128, PxU32 bodyBIndex[32]);					// solver body index of body B, per lane

	PX_ALIGN(128, PxU32 remappedBodyAIndex[32]);			// remapped solver body index of body A, per lane
	PX_ALIGN(128, PxU32 remappedBodyBIndex[32]);			// remapped solver body index of body B, per lane

	PX_ALIGN(128, PxU32 slabId[32]);						// solver slab id, per lane

	PX_ALIGN(128, Sc::ShapeInteraction* shapeInteraction[32]);	// used for force-threshold reporting
};
|
||||
|
||||
// 32-wide SoA per-contact-manager work data consumed by the contact prep code.
struct PxgBlockWorkUnit
{
	PX_ALIGN(128, PxU32 mWriteback[32]);				// writeback buffer index, per lane

	PX_ALIGN(128, float mRestDistance[32]);				// rest distance of the contact pair

	PX_ALIGN(128, PxU32 mEdgeIndex[32]);				// island-manager edge index
	PX_ALIGN(128, PxU32 mFlags[32]);					// per-pair flags
	PX_ALIGN(128, PxU32 mPatchIndex[32]);				// contact patch index
	PX_ALIGN(128, PxU32 mFrictionPatchIndex[32]);		// friction patch index
	PX_ALIGN(128, float2 mTorsionalFrictionData[32]);	// torsional friction parameters (two scalars per pair)
};
|
||||
|
||||
#if PX_VC
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
//This used in contact preprep(constraintContactBlockPrePrepLaunch) and joint prep code(setupSolverConstraintBlockGPU) in GPU
|
||||
//This is used in contact preprep (constraintContactBlockPrePrepLaunch) and joint prep code (setupSolverConstraintBlockGPU) on the GPU
struct PxgSolverConstraintManagerConstants
{
	PxU32 mEdgeIndex;					// island-manager edge index of this constraint manager
	PxU32 mConstraintWriteBackIndex;	// index into the constraint writeback buffer
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
49
engine/third_party/physx/source/gpusolver/include/PxgSolverContext.h
vendored
Normal file
49
engine/third_party/physx/source/gpusolver/include/PxgSolverContext.h
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CONTEXT_H
|
||||
#define PXG_SOLVER_CONTEXT_H
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
struct PxgThresholdStreamElement;
|
||||
struct PxgSolverBodyData;
|
||||
|
||||
// Minimal GPU solver context. Note: the default constructor intentionally
// leaves 'doFriction' uninitialized; callers are expected to set it before use.
struct PxgSolverContext
{
	bool doFriction;	// whether friction constraints should be solved this pass

	PX_CUDA_CALLABLE PxgSolverContext()
	{
	}
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
485
engine/third_party/physx/source/gpusolver/include/PxgSolverCore.h
vendored
Normal file
485
engine/third_party/physx/source/gpusolver/include/PxgSolverCore.h
vendored
Normal file
@@ -0,0 +1,485 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CORE_H
|
||||
#define PXG_SOLVER_CORE_H
|
||||
|
||||
#include "foundation/PxPinnedArray.h"
|
||||
#include "foundation/PxUserAllocated.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxvNphaseImplementationContext.h"
|
||||
#include "PxgCudaBuffer.h"
|
||||
#include "PxScene.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Dy
|
||||
{
|
||||
struct ConstraintWriteback;
|
||||
}
|
||||
|
||||
// Aggregated counts, unique-index arrays, and batch-index arrays produced by
// the constraint pre-prep stage and consumed by the GPU constraint prep.
// Covers rigid-body and articulation joints/contacts, dynamic and static.
struct PxgConstraintPrePrepData
{
public:
	PxU32 nbGpuRigidJoints;		//gpu preprep joints
	PxU32 nbTotalRigidJoints;	//cpu + gpu preprep joints
	PxU32 nbGpuArtiJoints;		//gpu preprep joints
	PxU32 nbTotalArtiJoints;	//cpu + gpu preprep joints
	PxU32 numContactBatches;					// rigid dynamic contact batches
	PxU32 num1dConstraintBatches;				// rigid dynamic 1D constraint batches
	PxU32 numStaticContactBatches;				// rigid static contact batches
	PxU32 numStatic1dConstraintBatches;			// rigid static 1D constraint batches

	PxU32 numArtiContactsBatches;				// articulation dynamic contact batches
	PxU32 numArti1dConstraintBatches;			// articulation dynamic 1D constraint batches
	PxU32 numArtiStaticContactsBatches;			// articulation static contact batches
	PxU32 numArtiStatic1dConstraintBatches;		// articulation static 1D constraint batches
	PxU32 numArtiSelfContactsBatches;			// articulation self-collision contact batches
	PxU32 numArtiSelf1dConstraintBatches;		// articulation self 1D constraint batches

	PxU32 artiStaticConstraintBatchOffset;		// offset of articulation static constraint batches
	PxU32 artiStaticContactBatchOffset;			// offset of articulation static contact batches

	// Unique-index arrays for the various constraint/contact categories.
	PxU32* constraintUniqueIndices;
	PxU32* contactUniqueIndices;
	PxU32* constraintStaticUniqueIndices;
	PxU32* contactStaticUniqueIndices;
	PxU32* artiConstraintUniqueindices;
	PxU32* artiContactUniqueIndices;
	PxU32* artiStaticConstraintUniqueIndices;
	PxU32* artiStaticContactUniqueIndices;

	// Per-articulation start/count pairs for static interactions.
	PxU32* artiStaticConstraintStartIndex;
	PxU32* artiStaticConstraintCount;
	PxU32* artiStaticContactStartIndex;
	PxU32* artiStaticContactCount;

	// Per-rigid-body start/count pair for static constraints.
	PxU32* rigidStaticConstraintStartIndex;
	PxU32* rigidStaticConstraintCount;

	//mapped memory
	PxU32* constraint1DBatchIndices;
	PxU32* constraintContactBatchIndices;
	PxU32* artiConstraintContactBatchIndices;
	PxU32* artiConstraint1dBatchindices;
};
|
||||
|
||||
// Per-step scalar simulation constants uploaded to the GPU solver.
struct PxgConstantData
{
public:
	PxReal dt;						// timestep
	PxReal invDtF32;				// 1/dt
	PxReal bounceThresholdF32;		// minimum impact speed for restitution to apply
	PxReal frictionOffsetThreshold;	// distance threshold for generating friction anchors
	PxReal correlationDistance;		// contact correlation distance
	PxReal ccdMaxSeparation;		// maximum separation handled by CCD
	PxReal biasCoefficient;			// error-correction bias coefficient
	PxVec3 gravity;					// scene gravity
};
|
||||
|
||||
// Constraint partitioning results: per-partition counts plus total counts of
// each constraint/contact category, for rigid bodies and articulations.
struct PxgPartitionData
{
public:
	const PxU32* constraintsPerPartition;	//rigid body contact and 1d constraint
	PxU32 numConstraintsPerPartition;		// number of entries in constraintsPerPartition

	const PxU32* artiConstraintsPerPartition;	// articulation contact and 1d constraint
	PxU32 numArtiConstraintsPerPartition;		// number of entries in artiConstraintsPerPartition

	PxU32 numTotalConstraints;				// total rigid dynamic 1D constraints
	PxU32 numTotalContacts;					// total rigid dynamic contacts
	PxU32 numTotalStaticConstraints;		// total rigid static 1D constraints
	PxU32 numTotalStaticContacts;			// total rigid static contacts

	PxU32 numTotalArtiContacts;			//dynamic contacts
	PxU32 numTotalArtiConstraints;		//external constraints
	PxU32 numTotalArtiStaticContacts;	//static contacts
	PxU32 numTotalArtiStaticConstraints;	//static constraints
	PxU32 numTotalArtiSelfContacts;		//self contacts
	PxU32 numTotalArtiSelfConstraints;	//self constraints

	PxU32 artiStaticContactBatchOffset;		// offset of articulation static contact batches
	PxU32 artiStaticConstraintBatchOffset;	// offset of articulation static constraint batches
};
|
||||
|
||||
class PxgPinnedHostLinearMemoryAllocator;
|
||||
|
||||
// Owns the device buffers (keys/ranks ping-pong pairs plus per-digit counts)
// used by the GPU radix sort, and fills in the matching PxgRadixSortDesc.
class PxgRadixSortBuffers
{
public:
	PxgRadixSortBuffers(PxgHeapMemoryAllocatorManager* heapMemoryManager);

	// Populates 'rsDesc' with pointers into the buffers below.
	void constructRadixSortDesc(PxgRadixSortDesc* rsDesc) const;
	// Sizes the buffers for 'totalContactBatches' elements.
	void allocate(PxU32 totalContactBatches);

	PxgCudaBuffer mInputKeys;		// sort keys (input side of the ping-pong)
	PxgCudaBuffer mInputRanks;		// ranks paired with the input keys
	PxgCudaBuffer mOutputKeys;		// sort keys (output side)
	PxgCudaBuffer mOutputRanks;		// ranks paired with the output keys
	PxgCudaBuffer mRadixCounts;		// per-digit histogram/counts
};
|
||||
|
||||
class PxgSolverCore : public PxUserAllocated
|
||||
{
|
||||
protected:
|
||||
|
||||
PxgCudaKernelWranglerManager* mGpuKernelWranglerManager;
|
||||
PxCudaContextManager* mCudaContextManager;
|
||||
PxCudaContext* mCudaContext;
|
||||
PxgGpuContext* mGpuContext;
|
||||
PxgHeapMemoryAllocatorManager* mHeapMemoryManager;
|
||||
|
||||
//PxgSimulationController* mSimulationController;
|
||||
/*PxgArticulationCore* mArticulationCore;*/
|
||||
|
||||
PxgSolverCoreDesc* mSolverCoreDesc;
|
||||
PxgConstraintPrepareDesc* mPrepareDesc;
|
||||
PxgPrePrepDesc* mPrePrepDesc;
|
||||
PxgRadixSortDesc* mRsDesc;
|
||||
|
||||
CUdeviceptr mIslandContextPool;
|
||||
CUdeviceptr mSolverCoreDescd;
|
||||
CUdeviceptr mSharedDescd;
|
||||
CUdeviceptr mPrepareDescd;
|
||||
CUdeviceptr mPrePrepDescd;
|
||||
CUdeviceptr mPartionDescd;
|
||||
CUdeviceptr mRadixSortDescd[2];
|
||||
|
||||
PxU32 mNbStaticRigidSlabs;
|
||||
PxU32 mMaxNumStaticPartitions;
|
||||
|
||||
PxU32 mTotalContactManagers;
|
||||
PxU32 mNbPrevExceededForceElements;
|
||||
|
||||
PxU32 mNbArticSlabs;
|
||||
PxU32 mNbConstraintSlabs; // slabs used for contacts and joints.
|
||||
|
||||
void allocateNodeInteractionCounts(PxU32 nbNodes);
|
||||
void uploadNodeInteractionCounts(const PxU32* nodeInteractionCounts, PxU32 nbNodes);
|
||||
|
||||
public:
|
||||
|
||||
PxgSolverCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
|
||||
PxgGpuContext* dynamicContext, PxgHeapMemoryAllocatorManager* heapMemoryManager);
|
||||
|
||||
virtual ~PxgSolverCore(){}
|
||||
|
||||
/*PX_FORCE_INLINE void setSimulationController(PxgSimulationController* simulationController) { mSimulationController = simulationController; }
|
||||
PX_FORCE_INLINE PxgSimulationController* getSimulationController() { return mSimulationController; }*/
|
||||
|
||||
/*PX_FORCE_INLINE void setArticulationCore(PxgArticulationCore* articulationCore) { mArticulationCore = articulationCore; }
|
||||
PX_FORCE_INLINE PxgArticulationCore* getArticulationCore() { return mArticulationCore; }
|
||||
*/
|
||||
PX_FORCE_INLINE CUdeviceptr getPrePrepDescDeviceptr() { return mPrePrepDescd; }
|
||||
PX_FORCE_INLINE CUdeviceptr getPrepDescDeviceptr() { return mPrepareDescd; }
|
||||
PX_FORCE_INLINE CUdeviceptr getSolverCoreDescDeviceptr() { return mSolverCoreDescd; }
|
||||
PX_FORCE_INLINE CUdeviceptr getSharedDescDeviceptr() { return mSharedDescd; }
|
||||
|
||||
virtual PxU32 getDescriptorsAllocationSize() = 0;
|
||||
virtual void allocatePinnedDescriptors(PxgPinnedHostLinearMemoryAllocator& hostAllocator) = 0;
|
||||
|
||||
virtual void syncSimulationController() = 0;
|
||||
|
||||
virtual void gpuMemDMAUpContactData(PxgPinnedHostLinearMemoryAllocator* compressedContactsHostMemoryAllocator,
|
||||
PxU32 compressedContactStreamUpperPartSize,
|
||||
PxU32 compressedContactStreamLowerPartSize,
|
||||
PxgPinnedHostLinearMemoryAllocator* compressedPatchesHostMemoryAllocator,
|
||||
PxU32 compressedPatchStreamUpperPartSize,
|
||||
PxU32 compressedPatchStreamLowerPartSize,
|
||||
PxU32 totalContactManagers,
|
||||
const PartitionIndexData* partitionIndexData,
|
||||
const PartitionNodeData* partitionNodeData,
|
||||
const PxgSolverConstraintManagerConstants* constantData,
|
||||
PxU32 constantDataCount,
|
||||
PxU32 partitionIndexDataCount,
|
||||
const PxU32* partitionConstraintBatchStartIndices,
|
||||
const PxU32* partitionArticConstraintBatchStartIndices,
|
||||
const PxU32* partitionJointBatchCounts,
|
||||
const PxU32* partitionArtiJointBatchCounts,
|
||||
PxU32 nbPartitions,
|
||||
const PxU32* destroyedEdges,
|
||||
PxU32 nbDestroyedEdges,
|
||||
const PxU32* npIndexArray, PxU32 npIndexArraySize,
|
||||
PxU32 totalNumJoints,
|
||||
const PxU32* islandIds, const PxU32* nodeInteractionCounts, PxU32 nbNodes, const PxU32* islandStaticTouchCount, PxU32 nbIslands) = 0;
|
||||
|
||||
virtual void gpuMemDmaUpBodyData(PxPinnedArray<PxgSolverBodyData>& solverBodyDataPool,
|
||||
PxPinnedArray<PxgSolverTxIData>& solverTxIDataPool,
|
||||
const PxU32 numSolverBodies,
|
||||
const PxU32 totalNumRigidBatches, const PxU32 totalNumArticBatches,
|
||||
const PxU32 nbSlabs, const PxU32 nbStaticSlabs, const PxU32 maxNumStaticPartitions) = 0;
|
||||
|
||||
virtual void allocateSolverBodyBuffers(const PxU32 numSolverBodies,
|
||||
PxPinnedArray<PxNodeIndex>& islandNodeIndices,
|
||||
const PxU32 numActiveActiculations, const PxU32 maxArticulationLinks) = 0;
|
||||
|
||||
virtual void gpuMemDMAUp(PxgPinnedHostLinearMemoryAllocator& hostAllocator, const PxgConstraintPrePrepData& data,
|
||||
const PxU32 numSolverBodies, PxgConstraintBatchHeader* constraintBatchHeaders,
|
||||
PxgIslandContext* islandContextPool, const PxU32 numIslands, const PxgPartitionData& partitionData ,
|
||||
const PxU32 numConstraintBatchHeader, const PxU32 numStaticConstraintBatchHeader,
|
||||
const PxU32 numArticConstraintBatchHeader, const PxU32 numArticStaticConstraintBatchHeader,
|
||||
const PxU32 numArtiSelfConstraintBatchHeader, const PxgConstantData& cData,
|
||||
const PxU32 numContactBlockes, const PxU32 numFrictionBlockes,
|
||||
const PxU32 numArtiContacts, const PxU32 numArtiFrictions,
|
||||
const PxU32 totalCurrentEdges, const PxU32 totalPreviousEdges, const PxU32 numSlabs, const PxU32 maxNbPartitions,
|
||||
const bool enableStabilization,
|
||||
PxU8* cpuContactPatchStreamBase, PxU8* cpuContactStreamBase, PxU8* cpuForceStreamBase, PxsContactManagerOutputIterator& outputIterator,
|
||||
const PxU32 totalActiveBodyCount, const PxU32 activeBodyStartIndex, const PxU32 numArticulations, Cm::UnAlignedSpatialVector* deferredZ,
|
||||
PxU32* articulationDirty, uint4* articulationSlabMask, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances,
|
||||
PxsTorsionalFrictionData* torsionalData,
|
||||
PxU32* artiStaticContactIndices, const PxU32 artiContactIndSize, PxU32* artiStaticJointIndices, PxU32 artiStaticJointSize,
|
||||
PxU32* artiStaticContactCounts, PxU32* artiStaticJointCounts,
|
||||
PxU32* artiSelfContactIndices, const PxU32 artiSelfContactIndSize, PxU32* artiSelfJointIndices, PxU32 artiSelfJointSize,
|
||||
PxU32* artiSelfContactCounts, PxU32* artiSelfJointCounts,
|
||||
PxU32* rigidStaticContactIndices, const PxU32 rigidContactIndSize, PxU32* rigidStaticJointIndices, const PxU32 rigidStaticJointSize,
|
||||
PxU32* rigidStaticContactCounts, PxU32* rigidSaticJointCounts, const PxReal lengthScale, bool hasForceThresholds) = 0;
|
||||
|
||||
virtual void gpuMemDMAbackSolverData(PxU8* forceBufferPool, PxU32 forceBufferOffset, PxU32 forceBufferUpperPartSize,
|
||||
PxU32 forceBufferLowerPartSize, Dy::ThresholdStreamElement* changedElems, bool hasForceThresholds, Dy::ConstraintWriteback* constraintWriteBack,
|
||||
const PxU32 writeBackSize, bool copyAllToHost, Dy::ErrorAccumulator*& contactError) = 0;
|
||||
|
||||
|
||||
virtual void syncDmaBack(PxU32& nbChangedThresholdElements) = 0;
|
||||
|
||||
virtual void createStreams() = 0;
|
||||
virtual void releaseStreams() = 0;
|
||||
|
||||
virtual void acquireContext() = 0;
|
||||
virtual void releaseContext() = 0;
|
||||
|
||||
virtual void preIntegration(const PxU32 offset, const PxU32 nbSolverBodies, const PxReal dt, const PxVec3& gravity) = 0;
|
||||
|
||||
virtual void jointConstraintBlockPrePrepParallel(PxU32 nbConstraintBatches) = 0;
|
||||
|
||||
virtual void jointConstraintPrepareParallel(PxU32 nbJointBatches) = 0;
|
||||
virtual void contactConstraintPrepareParallel(PxU32 nbContactBatches) = 0;
|
||||
virtual void artiJointConstraintPrepare(PxU32 nbArtiJointBatches) = 0;
|
||||
virtual void artiContactConstraintPrepare(PxU32 nbArtiContactBatches) = 0;
|
||||
virtual void nonRigidConstraintPrepare(PxU32 nbParticulations) = 0;
|
||||
|
||||
virtual void solveContactMultiBlockParallel(PxgIslandContext* islandContexts, const PxU32 numIslands, const PxU32 maxPartitions,
|
||||
PxInt32ArrayPinned& constraintsPerPartition, PxInt32ArrayPinned& artiConstraintsPerPartition, const PxVec3& gravity,
|
||||
PxReal* posIterResidualSharedMem, PxU32 posIterResidualSharedMemSize, Dy::ErrorAccumulator* posIterError, PxPinnedArray<Dy::ErrorAccumulator>& artiContactPosIterError,
|
||||
PxPinnedArray<Dy::ErrorAccumulator>& perArticulationInternalError) = 0;
|
||||
|
||||
virtual void accumulatedForceThresholdStream(PxU32 maxNodes) = 0;
|
||||
virtual void integrateCoreParallel( const PxU32 offset, const PxU32 nbSolverBodies) = 0;
|
||||
|
||||
virtual void getDataStreamBase(void*& contactStreamBase, void*& patchStreamBase, void*& forceAndIndexStreamBase) = 0;
|
||||
|
||||
PX_FORCE_INLINE PxgDevicePointer<PxNodeIndex> getGpuIslandNodeIndices() { return mIslandNodeIndices2.getTypedDevicePtr(); }
|
||||
|
||||
PX_FORCE_INLINE void setGpuContactManagerOutputBase(PxsContactManagerOutput* gpuContactManagerOutputBase) { mGpuContactManagerOutputBase = reinterpret_cast<CUdeviceptr>(gpuContactManagerOutputBase); }
|
||||
|
||||
PX_FORCE_INLINE CUstream getStream() { return mStream; }
|
||||
|
||||
PX_FORCE_INLINE PxgDevicePointer<PxU32> getSolverBodyIndices() { return mSolverBodyIndices.getTypedDevicePtr(); }
|
||||
|
||||
PX_FORCE_INLINE PxgTypedCudaBuffer<PxgSolverBodyData>* getSolverBodyData() { return &mSolverBodyDataPool; }
|
||||
|
||||
PX_FORCE_INLINE PxgDevicePointer<PxgSolverBodySleepData> getSolverBodySleepData() { return mSolverBodySleepDataPool.getTypedDevicePtr();}
|
||||
|
||||
PX_FORCE_INLINE PxNodeIndex* getCpuIslandNodeIndices() { return mCpuIslandNodeIndices; }
|
||||
|
||||
PX_FORCE_INLINE PxgDevicePointer<PxgConstraintWriteback> getConstraintWriteBackBufferDevicePtr() const { return mConstraintWriteBackBuffer.getTypedDevicePtr(); }
|
||||
|
||||
// --- Allocation helpers ------------------------------------------------------

// Sizes the friction patch streams for the given numbers of rigid and
// articulation contact batches.
void allocateFrictionPatchStream(PxI32 numContactBatches, PxI32 numArtiContactBatches);

// Allocates and returns storage for totalFrictionPatchCount friction indices.
PxgBlockFrictionIndex* allocateFrictionPatchIndexStream(PxU32 totalFrictionPatchCount);

// Sizes the per-edge friction count buffers.
void allocateFrictionCounts(PxU32 totalEdges);

// DMAs solver body results (velocities, body-to-world transforms, sleep data)
// back to the host-side pinned arrays.
void gpuMemDMAbackSolverBodies(float4* solverBodyPool, PxU32 nbSolverBodies,
	PxPinnedArray<PxAlignedTransform>& body2WorldPool,
	PxPinnedArray<PxgSolverBodySleepData>& solverBodySleepDataPool, bool enableDirectGPUAPI);

// Allocates the solver body buffers shared by the PGS/TGS backends.
void allocateSolverBodyBuffersCommon(PxU32 numSolverBodies, PxPinnedArray<PxNodeIndex>& islandNodeIndices);

// --- Descriptor construction -------------------------------------------------

// Fills the pre-preparation descriptor (PxgPrePrepDesc) consumed by the
// constraint pre-prep kernels: batch counts, CPU-side compressed contact
// bases, partition data and body/articulation bookkeeping.
void constructConstraintPrePrepDesc(PxgPrePrepDesc& preDesc, PxU32 numBatches, PxU32 numStaticBatches, PxU32 numArticBatches, PxU32 numArticStaticBatches, PxU32 numArticSelfBatches,
	const PxgPartitionData& pData, PxContact* cpuCompressedcontactsBase, PxContactPatch* cpuCompressedPatchesBase, PxReal* cpuForceBufferBase,
	PxU32 nbD6RigidJoint, PxU32 nbD6ArtiJoint, PxU32 nbTotalArtiJoints,
	PxsContactManagerOutputIterator& outputIterator, PxU32 maxConstraintPartitions, PxU32 totalActiveBodies, PxU32 totalActiveArticulations,
	PxU32 activeBodyStartOffset, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances, PxsTorsionalFrictionData* torsionalData, PxU32 nbElementsPerBody, PxU32 numSlabs);

// Fills the parts of the shared solve/prepare descriptor common to both
// solver flavors (constants, articulation scratch pointers).
void constructSolverSharedDescCommon(PxgSolverSharedDescBase& desc,
	const PxgConstantData& cData, Cm::UnAlignedSpatialVector* deferredZ, PxU32* articulationDirty, uint4* articulationSlabMask);

// Fills the core solver descriptor (PxgSolverCoreDesc).
void constructSolverDesc(PxgSolverCoreDesc& scDesc, PxU32 numIslands, PxU32 numSolverBodies, PxU32 numConstraintBatchHeader, PxU32 numArticConstraints, PxU32 numSlabs, bool enableStabilization);

// --- Host-to-device joint uploads -------------------------------------------

// Uploads CPU-prepared rigid joint data/rows to the device.
void gpuMemDMAUpJointData(const PxPinnedArray<PxgConstraintData>& cpuJointDataPool, const PxPinnedArray<Px1DConstraint>& cpuJointRowPool,
	PxU32 nbCpuJoints, PxU32 nbGpuJoints, PxU32 totalCpuRows);

// Uploads CPU-prepared articulation joint data/rows to the device.
void gpuMemDMAUpArtiJointData(const PxPinnedArray<PxgConstraintData>& artiJointDataPool, const PxPinnedArray<Px1DConstraint>& artiJointRowPool,
	PxU32 nbCpuArtiJoints, PxU32 nbGpuArtiJoints, PxU32 totalArtiRows);

// Runs the parallel constraint pre-preparation pass.
void constraintPrePrepParallel(PxU32 nbConstraintBatches, PxU32 nbD6Joints, PxU32 numBodies);

// Precomputes per-body reference counts ahead of the solve. isTGS selects the
// solver flavor; minPen/elapsedTime appear to be TGS-only tuning inputs --
// TODO(review): confirm against the implementation.
void precomputeReferenceCount(PxgIslandContext* islandContext, PxU32 islandIndex, PxInt32ArrayPinned& constraintsPerPartition,
	PxInt32ArrayPinned& artiConstraintsPerPartition, bool isTGS, PxReal minPen = 0.0f, PxReal elapsedTime = 0.0f);

// Zeroes solver velocities for the given solver flavor.
void resetVelocities(bool isTGS);
|
||||
|
||||
// Flips the double-buffer index between 0 and 1, swapping which of the
// two-entry PxgCudaBufferN<2> friction streams (declared below) is current.
// NOTE(review): "1 - mCurrentIndex" only ping-pongs correctly if
// mCurrentIndex is always 0 or 1 -- invariant assumed, not enforced here.
PX_FORCE_INLINE void resetMemoryAllocator()
{
	mCurrentIndex = 1 - mCurrentIndex;
}
|
||||
|
||||
// --- Block-organized constraint streams (untyped: element type differs
// --- between the PGS and TGS backends) ---------------------------------------
PxgCudaBuffer	mContactHeaderBlockStream;		//Different types for PGS and TGS
PxgCudaBuffer	mFrictionHeaderBlockStream;		//Different types for PGS and TGS
PxgCudaBuffer	mContactBlockStream;			//Different types for PGS and TGS
PxgCudaBuffer	mFrictionBlockStream;			//Different types for PGS and TGS

PxgCudaBuffer	mJointHeaderBlockStream;		//Different types for PGS and TGS
PxgCudaBuffer	mJointRowBlockStreamCon;		//Different types for PGS and TGS
PxgTypedCudaBuffer<PxgBlockSolverConstraint1DMod>	mJointRowBlockStreamMod;

// Constraint-prep inputs, block layout.
PxgTypedCudaBuffer<PxgBlockContactData>				mConstraintContactPrepBlockPool;

PxgTypedCudaBuffer<PxgBlockConstraint1DData>		mConstraint1DPrepBlockPool;
PxgTypedCudaBuffer<PxgBlockConstraint1DVelocities>	mConstraint1DPrepBlockPoolVel;
PxgTypedCudaBuffer<PxgBlockConstraint1DParameters>	mConstraint1DPrepBlockPoolPar;

// Rigid and articulation joint constraint data/rows.
PxgTypedCudaBuffer<PxgConstraintData>	mConstraintDataPool;
PxgTypedCudaBuffer<Px1DConstraint>		mConstraintRowPool;

PxgTypedCudaBuffer<PxgConstraintData>	mArtiConstraintDataPool;
PxgTypedCudaBuffer<Px1DConstraint>		mArtiConstraintRowPool;

// --- Solver body state -------------------------------------------------------
PxgTypedCudaBuffer<float4>		mSolverBodyPool;
PxgTypedCudaBuffer<float4>		mTempStaticBodyOutputPool;
PxgTypedCudaBuffer<PxNodeIndex>	mIslandNodeIndices2;
PxgTypedCudaBuffer<PxU32>		mSolverBodyIndices;

PxgTypedCudaBuffer<float4>				mOutVelocityPool;	//this is the output of linear and angular velocity for the solver body
PxgTypedCudaBuffer<PxAlignedTransform>	mOutBody2WorldPool;	//this is the output of body to world transform for the solver body
PxgTypedCudaBuffer<PxgSolverBodyData>	mSolverBodyDataPool;
PxgTypedCudaBuffer<PxgSolverBodySleepData>	mSolverBodySleepDataPool;

PxgTypedCudaBuffer<float4>	mOutArtiVelocityPool;	//velocity(linear and angular) of the link for the articulations

PxgTypedCudaBuffer<PxgSolverTxIData>	mSolverTxIDataPool;
PxgTypedCudaBuffer<PxU32>				mConstraintsPerPartition;
PxgTypedCudaBuffer<PxU32>				mArtiConstraintsPerPartition;
PxgTypedCudaBuffer<float4>				mMotionVelocityArray;

// --- Batch headers and unique-index tables -----------------------------------
PxgTypedCudaBuffer<PxgBlockConstraintBatch>	mBlockConstraintBatches;
CUdeviceptr	mConstraintBatchHeaders;
CUdeviceptr	mConstraintUniqueIndices;
CUdeviceptr	mContactUniqueIndices;

CUdeviceptr	mArtiConstraintUniqueIndices;
CUdeviceptr	mArtiContactUniqueIndices;

CUdeviceptr	mArtiStaticConstraintUniqueIndices;
CUdeviceptr	mArtiStaticContactUniqueIndices;

PxgTypedCudaBuffer<PxU32>	mArtiOrderedStaticConstraints;
PxgTypedCudaBuffer<PxU32>	mArtiOrderedStaticContacts;

PxgTypedCudaBuffer<PxgSolverReferences>	mSolverBodyReferences;
PxgTypedCudaBuffer<PxgBlockWorkUnit>	mBlockWorkUnits;

//Body remapping information
PxgTypedCudaBuffer<PartitionIndexData>	mPartitionIndexData;
PxgTypedCudaBuffer<PartitionNodeData>	mPartitionNodeData;
PxgTypedCudaBuffer<PxgSolverConstraintManagerConstants>	mSolverConstantData;
PxgTypedCudaBuffer<PxU32>	mPartitionStartBatchIndices;
PxgTypedCudaBuffer<PxU32>	mPartitionArticulationStartBatchIndices;
PxgTypedCudaBuffer<PxU32>	mPartitionJointBatchCounts;
PxgTypedCudaBuffer<PxU32>	mPartitionArtiJointBatchCounts;

PxgTypedCudaBuffer<PxU32>	mDestroyedEdgeIndices;
PxgTypedCudaBuffer<PxU32>	mNpIndexArray;

// --- Contact data and write-back buffers -------------------------------------
PxgTypedCudaBuffer<PxgBlockContactPoint>	mGpuContactBlockBuffer;
PxgTypedCudaBuffer<PxU32>					mDataBuffer;
PxgTypedCudaBuffer<PxContact>				mCompressedContacts;
PxgTypedCudaBuffer<PxContactPatch>			mCompressedPatches;
PxgTypedCudaBuffer<PxgConstraintWriteback>	mConstraintWriteBackBuffer;	//1d constraint write back buffer
PxgTypedCudaBuffer<PxReal>					mForceBuffer;				// contact write back buffer
PxgTypedCudaBuffer<PxFrictionPatch>			mFrictionPatches;

// Raw device base of the narrowphase contact manager outputs
// (set via setGpuContactManagerOutputBase).
CUdeviceptr	mGpuContactManagerOutputBase;

// --- Articulation static/self interaction tables -----------------------------
PxgTypedCudaBuffer<PxU32>	mArtiStaticContactIndices;
PxgTypedCudaBuffer<PxU32>	mArtiStaticJointIndices;
PxgTypedCudaBuffer<PxU32>	mArtiStaticContactCounts;
PxgTypedCudaBuffer<PxU32>	mArtiStaticJointCounts;

// --- Rigid-body static interaction tables ------------------------------------
PxgTypedCudaBuffer<PxU32>	mRigidStaticContactIndices;
PxgTypedCudaBuffer<PxU32>	mRigidStaticJointIndices;
PxgTypedCudaBuffer<PxU32>	mRigidStaticContactCounts;
PxgTypedCudaBuffer<PxU32>	mRigidStaticJointCounts;
PxgTypedCudaBuffer<PxU32>	mRigidStaticContactStartIndices;
PxgTypedCudaBuffer<PxU32>	mRigidStaticJointStartIndices;

// Scratch buffers used while building unique-index/header tables.
PxgTypedCudaBuffer<PxU32>	mTempContactUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32>	mTempConstraintUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32>	mTempContactHeaderBlockBuffer;
PxgTypedCudaBuffer<PxU32>	mTempConstraintHeaderBlockBuffer;

PxgTypedCudaBuffer<PxU32>	mArtiSelfContactIndices;
PxgTypedCudaBuffer<PxU32>	mArtiSelfJointIndices;
PxgTypedCudaBuffer<PxU32>	mArtiSelfContactCounts;
PxgTypedCudaBuffer<PxU32>	mArtiSelfJointCounts;

PxgTypedCudaBuffer<PxU32>	mNodeInteractionCounts;

// --- Double-buffered friction streams (index selected by mCurrentIndex,
// --- flipped by resetMemoryAllocator) ----------------------------------------
PxgCudaBufferN<2>	mFrictionPatchBlockStream;
PxgCudaBufferN<2>	mFrictionAnchorPatchBlockStream;
PxgCudaBufferN<2>	mFrictionIndexStream;
PxgCudaBufferN<2>	mFrictionPatchCounts;

//Non-block versions for articulation contacts. Remove!
PxgCudaBufferN<2>	mFrictionPatchStream;
PxgCudaBufferN<2>	mFrictionAnchorPatchStream;

// Current side (0 or 1) of the double-buffered streams above.
PxU32	mCurrentIndex;

CUdeviceptr	mArtiStaticConstraintStartIndex;
CUdeviceptr	mArtiStaticConstraintCount;
CUdeviceptr	mArtiStaticContactStartIndex;
CUdeviceptr	mArtiStaticContactCount;

// --- CUDA streams / events ---------------------------------------------------
CUstream	mStream;	// primary solver stream (see getStream())
CUstream	mStream2;

// Host-pinned word used for host/device signalling -- TODO(review): confirm
// producer/consumer against the implementation.
PxU32*	mPinnedEvent;

CUevent	mEventDmaBack;

CUevent	mIntegrateEvent;

// Host-side mirror of the island node indices (see getCpuIslandNodeIndices()).
PxNodeIndex*	mCpuIslandNodeIndices;

PxU32	mSolverBodyOutputVelocityOffset;

PxgRadixSortBuffers	mRadixSort;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
495
engine/third_party/physx/source/gpusolver/include/PxgSolverCoreDesc.h
vendored
Normal file
495
engine/third_party/physx/source/gpusolver/include/PxgSolverCoreDesc.h
vendored
Normal file
@@ -0,0 +1,495 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_CORE_DESC_H
|
||||
#define PXG_SOLVER_CORE_DESC_H
|
||||
|
||||
#include "PxgNarrowphaseCore.h"
|
||||
#include "DyResidualAccumulator.h"
|
||||
|
||||
struct float4;
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Cm
|
||||
{
|
||||
struct UnAlignedSpatialVector;
|
||||
}
|
||||
|
||||
namespace Sc
|
||||
{
|
||||
class ShapeInteraction;
|
||||
}
|
||||
|
||||
struct PxgConstraintData;
|
||||
struct PxgConstraintPrePrep;
|
||||
|
||||
struct PxgBlockConstraint1DData;
|
||||
struct PxgBlockConstraint1DVelocities;
|
||||
struct PxgBlockConstraint1DParameters;
|
||||
struct PxgBlockContactData;
|
||||
struct PxgBlockContactPoint;
|
||||
struct PxgConstraint1DData;
|
||||
struct PxgConstraint1DVelocities;
|
||||
struct PxgConstraint1DParameters;
|
||||
|
||||
struct PxgSolverBodyData;
|
||||
struct PxgSolverBodySleepData;
|
||||
struct PxgSolverTxIData;
|
||||
|
||||
struct PxgIslandContext;
|
||||
struct PxgBodySim;
|
||||
struct PxgBodySimVelocities;
|
||||
class PxgArticulation;
|
||||
|
||||
struct PxgSolverConstraintDesc;
|
||||
struct PxgBlockWorkUnit;
|
||||
|
||||
struct PxgBlockConstraintBatch;
|
||||
|
||||
struct PxgBlockFrictionPatch;
|
||||
struct PxgBlockFrictionAnchorPatch;
|
||||
|
||||
struct PxgBlockSolverConstraint1DHeader;
|
||||
struct PxgBlockSolverConstraint1DCon;
|
||||
struct PxgBlockSolverConstraint1DMod;
|
||||
|
||||
struct PxgTGSBlockSolverConstraint1DHeader;
|
||||
struct PxgTGSBlockSolverConstraint1DCon;
|
||||
|
||||
struct PxgBlockSolverContactHeader;
|
||||
struct PxgBlockSolverFrictionHeader;
|
||||
struct PxgBlockSolverContactPoint;
|
||||
struct PxgBlockSolverContactFriction;
|
||||
struct PxgBlockFrictionIndex;
|
||||
|
||||
struct PxgTGSBlockSolverContactHeader;
|
||||
struct PxgTGSBlockSolverFrictionHeader;
|
||||
struct PxgTGSBlockSolverContactPoint;
|
||||
struct PxgTGSBlockSolverContactFriction;
|
||||
|
||||
struct PxgFrictionPatch;
|
||||
struct PxgFrictionAnchorPatch;
|
||||
|
||||
struct PxgSolverConstraint1DHeader;
|
||||
struct PxgSolverConstraint1DCon;
|
||||
struct PxgSolverConstraint1DMod;
|
||||
|
||||
struct PxgTGSSolverConstraint1DHeader;
|
||||
struct PxgTGSSolverConstraint1DCon;
|
||||
struct PxgTGSSolverConstraint1DMod;
|
||||
|
||||
struct PxgSolverContactHeader;
|
||||
struct PxgSolverFrictionHeader;
|
||||
struct PxgSolverContactPointExt;
|
||||
struct PxgSolverContactFrictionExt;
|
||||
|
||||
struct PxContact;
|
||||
struct PxContactPatch;
|
||||
struct PxgD6JointData;
|
||||
struct PxgSolverReferences;
|
||||
struct PxFrictionPatch;
|
||||
|
||||
struct PxsContactManagerOutput;
|
||||
struct PartitionIndexData;
|
||||
struct PartitionNodeData;
|
||||
struct PxgSolverConstraintManagerConstants;
|
||||
struct PxgConstraintBatchHeader;
|
||||
struct PxgConstraintWriteback;
|
||||
class PxAlignedTransform;
|
||||
struct Px1DConstraint;
|
||||
|
||||
struct PxgTGSSolverContactHeader;
|
||||
struct PxgTGSSolverContactPointExt;
|
||||
struct PxgTGSSolverFrictionExt;
|
||||
|
||||
struct PxgArticulationBlockResponse;
|
||||
|
||||
struct PxsTorsionalFrictionData;
|
||||
|
||||
namespace Dy
|
||||
{
|
||||
struct ThresholdStreamElement;
|
||||
class ThresholdStream;
|
||||
}
|
||||
|
||||
// PGS iterative-solve data: raw device pointers into the block-organized
// constraint streams consumed by the PGS solver kernels, plus the non-block
// (per-constraint) articulation variants. Plain aggregate filled on the host;
// member order is part of the host/device layout -- do not reorder.
struct IterativeSolveData
{
	PxgBlockConstraintBatch*			blockConstraintBatch;
	PxgBlockSolverConstraint1DHeader*	blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DCon*		blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod*		blockJointConstraintRowsMod;

	PxgBlockSolverContactHeader*		blockContactHeaders;
	PxgBlockSolverFrictionHeader*		blockFrictionHeaders;
	PxgBlockSolverContactPoint*			blockContactPoints;
	PxgBlockSolverContactFriction*		blockFrictions;

	//first numSolverBodies float4s are linear velocity, last numSolverBodies float4s are angular velocity
	float4*	solverBodyVelPool;
	float4*	tempStaticBodyOutputPool;

	// Each bit encodes the activation of a slab (32 bits). When there are more than 32 slabs, use multiple indices.
	// To query the reference count, count the number of active slabs/bits.
	PxU32*	solverEncodedReferenceCount;

	// Non-block (extended) constraint streams, used for articulations.
	PxgSolverContactHeader*			contactHeaders;
	PxgSolverFrictionHeader*		frictionHeaders;
	PxgSolverContactPointExt*		contactPoints;
	PxgSolverContactFrictionExt*	frictions;

	PxgArticulationBlockResponse*	artiResponse;
|
||||
|
||||
// TGS counterpart of IterativeSolveData: same role, but the contact/joint
// stream element types are the TGS variants. Plain aggregate; member order is
// part of the host/device layout -- do not reorder.
struct IterativeSolveDataTGS
{
	PxgBlockConstraintBatch*				blockConstraintBatch;
	PxgTGSBlockSolverConstraint1DHeader*	blockJointConstraintHeaders;
	PxgTGSBlockSolverConstraint1DCon*		blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod*			blockJointConstraintRowsMod;

	PxgTGSBlockSolverContactHeader*		blockContactHeaders;
	PxgTGSBlockSolverFrictionHeader*	blockFrictionHeaders;
	PxgTGSBlockSolverContactPoint*		blockContactPoints;
	PxgTGSBlockSolverContactFriction*	blockFrictions;

	//first numSolverBodies float4s are linear velocity, last numSolverBodies float4s are angular velocity
	float4*	solverBodyVelPool;
	float4*	tempStaticBodyOutputs;

	// Each bit encodes the activation of a slab (32 bits). When there are more than 32 slabs, use multiple indices.
	// To query the reference count, count the number of active slabs/bits.
	PxU32*	solverEncodedReferenceCount;

	PxgTGSSolverContactHeader*		contactHeaders;
	PxgSolverFrictionHeader*		frictionHeaders; //Technically, not needed
	PxgTGSSolverContactPointExt*	contactPoints;

	PxgTGSSolverFrictionExt*	frictions;

	PxgArticulationBlockResponse*	artiResponse;
};
|
||||
|
||||
// State shared by the solve and prepare kernels, common to PGS and TGS:
// friction patch streams (current/previous frame), body-sim and articulation
// pools, articulation scratch data, and the timestep constants.
// Plain aggregate; member order is part of the host/device layout.
struct PxgSolverSharedDescBase
{
	// Block-organized friction patches for the current and previous frame.
	PxgBlockFrictionPatch*	blockCurrentFrictionPatches;
	PxgBlockFrictionPatch*	blockPreviousFrictionPatches;

	// Non-block friction patches (articulation path).
	PxgFrictionPatch*	currentFrictionPatches;
	PxgFrictionPatch*	previousFrictionPatches;

	PxgBodySim*			mBodySimBufferDeviceData; //If the body is articulation, we will have a remap index to the articulation array
	PxgArticulation*	articulations;

	// Articulation scratch: deferred impulses, dirty flags, per-slab masks.
	Cm::UnAlignedSpatialVector*	articulationDeferredZ;
	PxU32*	articulationDirty;
	uint4*	articulationSlabMask;
	PxU32	deltaOutOffset;

	// Timestep constants: full-step and sub-step dt with their reciprocals.
	float	dt;
	float	stepDt;
	float	invDtF32;
	float	stepInvDtF32;

	float	lengthScale;
};
|
||||
|
||||
//this desc is shared by solve and prepare kernels
// IterData selects the solver flavor's stream layout:
// IterativeSolveData (PGS) or IterativeSolveDataTGS (TGS).
template <typename IterData>
struct PxgSolverSharedDesc : PxgSolverSharedDescBase
{
	IterData iterativeData;
};
|
||||
|
||||
// Top-level descriptor handed to the core solver kernels: output pools,
// island/partition bookkeeping, the force-threshold event pipeline, and
// write-back buffers. Plain aggregate filled on the host
// (see constructSolverDesc); member order is part of the layout.
struct PxgSolverCoreDesc
{
	// Per-body outputs.
	float4*					outSolverVelocity;
	PxAlignedTransform*		outBody2World;
	PxgSolverBodyData*		solverBodyDataPool;
	PxgSolverTxIData*		solverBodyTxIDataPool;
	PxgSolverBodySleepData*	solverBodySleepDataPool;

	// Per-articulation-link velocity outputs.
	float4*	outArtiVelocity;

	PxgIslandContext*	islandContextPool;
	float4*	motionVelocityArray; // first numSolverBodies float4s are linear velocity, last numSolverBodies float4s are angular velocity
	PxU32*	constraintsPerPartition;
	PxU32*	artiConstraintsPerPartition;

	// Force-threshold event pipeline (exceeded/lost force events).
	Dy::ThresholdStreamElement*	thresholdStream;
	Dy::ThresholdStreamElement*	tmpThresholdStream;
	Dy::ThresholdStreamElement*	exceededForceElements;
	Dy::ThresholdStreamElement*	prevExceededForceElements;
	Dy::ThresholdStreamElement*	forceChangeThresholdElements; //this is store all pairs which will trigger force exceeded or lost events
	PxReal*	thresholdStreamAccumulatedForce;
	PxReal*	thresholdStreamAccumulatedForceBetweenBlocks;
	PxU32*	thresholdStreamWriteIndex;
	PxU32*	thresholdStreamWriteIndexBetweenBlocks;
	bool*	thresholdStreamWriteable;
	PxReal*	accumulatedForceObjectPairs;

	// Write-back buffers.
	PxgConstraintWriteback*	constraintWriteBack; // 1D constraint write back
	PxF32*					forceBuffer;         // contact write back
	PxFrictionPatch*		frictionPatches;

	// Rigid-static interaction bookkeeping.
	PxU32*	mRigidStaticContactCounts;
	PxU32*	mRigidStaticContactStartIndices;

	PxU32*	mRigidStaticJointCounts;
	PxU32*	mRigidStaticJointStartIndices;

	PxgSolverReferences*	solverBodyReferences;
	PxsContactManagerOutput*	contactManagerOutputBase;
	PxgBodySim*				mBodySimBufferDeviceData;
	PxgBodySimVelocities*	mBodySimPrevVelocitiesBufferDeviceData;

	// Scene/solve dimensions.
	PxU32	numIslands;
	PxU32	numBatches;
	PxU32	numArticBatches;
	PxU32	numSolverBodies;
	PxU32	numSlabs;
	PxU32	accumulatedBodyDeltaVOffset;

	PxI32	sharedThresholdStreamIndex;

	bool	enableStabilization;

	PxU32	nbExceededThresholdElements;
	PxU32	nbPrevExceededThresholdElements;
	PxU32	nbForceChangeElements;

	PxU32	maxLinksPerArticulation;

	Dy::ErrorAccumulator	contactErrorAccumulator;
};
|
||||
|
||||
// Descriptor for the constraint-preparation kernels: batch index tables,
// block-layout prep pools for joints and contacts, friction correlation
// state, and the scalar prep parameters. Plain aggregate; member order is
// part of the host/device layout.
struct PxgConstraintPrepareDesc
{
	PxU32* jointConstraintBatchIndices;			//indices for joint batch
	PxU32* contactConstraintBatchIndices;		//indices for contact batch
	PxU32* artiJointConstraintBatchIndices;		//indices for articulation joint batch
	PxU32* artiContactConstraintBatchIndices;	//indices for articulation contact batch

	PxgSolverConstraintManagerConstants*	solverConstantData;
	PxgBlockConstraint1DData*				blockJointPrepPool;
	PxgBlockConstraint1DVelocities*			blockJointPrepPool0;
	PxgBlockConstraint1DParameters*			blockJointPrepPool1;

	PxgSolverBodyData*	solverBodyDataPool;
	PxgSolverTxIData*	solverBodyTxIDataPool;
	PxgBlockWorkUnit*	blockWorkUnit;

	// Friction correlation: indices into current/previous-frame patches.
	PxgBlockFrictionIndex*	blockCurrentFrictionIndices;
	PxgBlockFrictionIndex*	blockPreviousFrictionIndices;

	PxgBlockContactData*	blockContactCurrentPrepPool;
	PxgBlockContactPoint*	blockContactPoints;

	PxgBlockFrictionAnchorPatch*	blockCurrentAnchorPatches;
	PxgBlockFrictionAnchorPatch*	blockPreviousAnchorPatches;

	////////////////////////////////////////////////////////////////////////////
	//for articulation
	PxgFrictionPatch*	currentFrictionPatches;
	PxgFrictionPatch*	previousFrictionPatches;

	PxgFrictionAnchorPatch*	currentAnchorPatches;
	PxgFrictionAnchorPatch*	previousAnchorPatches;

/*	PxgConstraint1DData* jointPrepPool;
	PxgConstraint1DVelocities* jointPrepPool0;
	PxgConstraint1DParameters* jointPrepPool1;*/

	//////////////////////////////////////////////////////////////////////////////

	PxAlignedTransform*	body2WorldPool;

	PxsContactManagerOutput*	contactManagerOutputBase;

	PxU32*	constraintUniqueIndices;
	PxU32*	artiConstraintUniqueIndices;
	PxU32*	artiContactUniqueIndices;

	// Batch counts per category (1D-constraint vs contact; dynamic, static,
	// articulation-dynamic/static/self).
	PxU32	num1dConstraintBatches;
	PxU32	numContactBatches;

	PxU32	numStatic1dConstraintBatches;
	PxU32	numStaticContactBatches;

	PxU32	numArti1dConstraintBatches;
	PxU32	numArtiStatic1dConstraintBatches;
	PxU32	numArtiSelf1dConstraintBatches;
	PxU32	numArtiContactBatches;
	PxU32	numArtiStaticContactBatches;
	PxU32	numArtiSelfContactBatches;

	PxU32	totalBodyCount;

	PxU32	numBatches;
	PxU32	numStaticBatches;

	// Scalar prep parameters (contact/friction tuning).
	float	bounceThresholdF32;
	float	frictionOffsetThreshold;
	float	correlationDistance;
	float	ccdMaxSeparation;

	PxU32	totalPreviousEdges;
	PxU32	totalCurrentEdges;

	PxU32	articContactIndex;
	PxU32	articJointIndex;
	PxU32	nbElementsPerBody;

	PxReal	biasCoefficient;
};
|
||||
|
||||
|
||||
// Descriptor for the constraint pre-preparation kernels: batch/constraint
// counts per category, compressed contact streams (GPU and CPU-produced),
// joint input data, partition/remap tables, and static/self interaction
// bookkeeping (filled by constructConstraintPrePrepDesc). Plain aggregate;
// member order is part of the host/device layout.
struct PxgPrePrepDesc
{
	PxgBlockConstraintBatch*	blockBatches;

	// Batch counts per category.
	PxU32	numBatches;
	PxU32	numStaticBatches;
	PxU32	numArtiBatches;
	PxU32	numArtiStaticBatches;
	PxU32	numArtiSelfBatches;
	PxU32	nbD6RigidJoints;
	PxU32	nbD6ArtiJoints;
	PxU32	nbTotalArtiJoints; //Only used for an assert

	// Totals per constraint/contact category.
	PxU32	numTotalContacts;
	PxU32	numTotalConstraints;
	PxU32	numTotalStaticContacts;
	PxU32	numTotalStaticConstraints;

	PxU32	numTotalArtiContacts;			//dynamic contacts
	PxU32	numTotalArtiConstraints;		//external constraints
	PxU32	numTotalStaticArtiContacts;		//static contacts
	PxU32	numTotalStaticArtiConstraints;	//static constraints
	PxU32	numTotalSelfArtiContacts;		//static contacts
	PxU32	numTotalSelfArtiConstraints;	//static constraints

	PxU32	artiStaticConstraintBatchOffset;
	PxU32	artiStaticContactBatchOffset;

	PxgBlockWorkUnit*		blockWorkUnit;
	PxgBlockContactData*	blockContactData; //GPU output data
	PxgBlockContactPoint*	blockContactPoints;

	// GPU-produced compressed contact streams.
	PxContact*		compressedContacts;
	PxContactPatch*	compressedPatches;
	PxU8*			forceBuffer;

	// CPU-produced compressed contact stream bases.
	PxContact*		cpuCompressedContactsBase;
	PxContactPatch*	cpuCompressedPatchesBase;
	PxReal*			cpuForceBufferBase;

	PxgBlockConstraint1DData*		blockPrepData;			//GPU output data
	PxgBlockConstraint1DVelocities*	blockPrepVelocityData;	//GPU output data
	PxgBlockConstraint1DParameters*	blockPrepParameterData;	//GPU output data

	PxgConstraintData*	constraintData;	//GPU output/Input data for d6 joint, GPU input for cpu joint
	Px1DConstraint*		constraintRows;	//GPU output/Input joint row data for d6 joint. GPU input for cpu joint

	PxgConstraintData*	artiConstraintData;	//GPU input data
	Px1DConstraint*		artiConstraintRows;	//GPU input joint row data

	const PxgD6JointData*		rigidJointData;			//GPU input data
	const PxgConstraintPrePrep*	rigidConstraintPrePrep;	//GPU input data

	const PxgD6JointData*		artiJointData;			//GPU input data
	const PxgConstraintPrePrep*	artiConstraintPrePrep;	//GPU input data

	PxsContactManagerOutput*	contactManagerOutputBase;

	// Shared allocation cursors.
	PxU32	sharedJointRowIndex;
	PxU32	sharedFrictionConstraintIndex;
	PxU32	sharedContactConstraintIndex;
	PxU32	sharedArticulationResponseIndex;
	PxU32*	solverBodyIndices;

	// Partition/remap tables.
	PartitionIndexData*	mPartitionIndices;
	PxU32*	mPartitionstartBatchIndices;
	PxU32*	mPartitionArtiStartBatchIndices;
	PxU32*	mPartitionJointCounts;
	PxU32*	mPartitionArtiJointCounts;

	PxU32*	prevFrictionPatchCount;
	PxU32*	currFrictionPatchCount;

	PxU32*	mNpOutputIndices;

	PxgSolverBodyData*	mSolverBodyData;

	PxU32	mCmOutputOffsets[GPU_BUCKET_ID::eCount];
	PartitionNodeData*	mPartitionNodeData;
	PxgSolverConstraintManagerConstants*	mContactConstantData;

	PxgConstraintBatchHeader*	mBatchHeaders;
	PxU32*	mContactUniqueIndices;
	PxU32*	mConstraintUniqueIndices;

	PxU32*	mArtiConstraintUniqueIndices;	//external constraints
	PxU32*	mArtiContactUniqueIndices;		//dynamic contacts

	PxgSolverReferences*	mSolverBodyReferences;
	PxU32	mMaxConstraintPartitions;
	PxU32	mTotalSlabs;
	PxU32	mTotalActiveBodies;
	PxU32	mTotalActiveArticulations;
	PxU32	mActiveBodyStartOffset;
	PxU32	nbElementsPerBody;

	Sc::ShapeInteraction**		mShapeInteractions;
	PxReal*						mRestDistances;
	PxsTorsionalFrictionData*	mTorsionalFrictionData;

	//Static articulation contact data
	PxU32*	mArtiStaticContactIndices;
	PxU32*	mArtiStaticConstraintIndices;
	PxU32*	mArtiStaticContactCounts;
	PxU32*	mArtiStaticConstraintCounts;

	PxU32*	mArtiSelfContactIndices;
	PxU32*	mArtiSelfConstraintIndices;
	PxU32*	mArtiSelfContactCounts;
	PxU32*	mArtiSelfConstraintCounts;

	//Static rigid body contact data
	PxU32*	mRigidStaticContactIndices;
	PxU32*	mRigidStaticConstraintIndices;
	PxU32*	mRigidStaticContactCounts;
	PxU32*	mRigidStaticConstraintCounts;

	PxU32*	mRigidStaticContactStartIndices;
	PxU32*	mRigidStaticConstraintStartIndices;

	// Scratch tables used while building unique-index/header data.
	PxU32*	mTempContactUniqueIndices;
	PxU32*	mTempConstraintUniqueIndices;
	PxU32*	mTempContactBlockHeader;
	PxU32*	mTempConstraintBlockHeader;
};
|
||||
}
|
||||
|
||||
#endif
|
||||
59
engine/third_party/physx/source/gpusolver/include/PxgSolverFlags.h
vendored
Normal file
59
engine/third_party/physx/source/gpusolver/include/PxgSolverFlags.h
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_FLAGS_H
|
||||
#define PXG_SOLVER_FLAGS_H
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// Per-contact solver flags. Values are bit flags combinable with |, and the
// combined value is stored in a PxU8 (see compile-time assert below).
struct PxgSolverContactFlags
{
	enum Enum
	{
		eHAS_FORCE_THRESHOLDS = 1 << 0,

		// This flag...
		// - disables correlation of contact patches with friction patches from the previous frame
		// - enables target velocities being read from the friction anchor contact points
		//
		// Two scenarios will raise this flag:
		// - strong/sticky friction is disabled
		// - contact modification sets a target velocity on contact points
		ePER_POINT_FRICTION = 1 << 1,

		eDISABLE_FRICTION = 1 << 2,
		eCOMPLIANT_ACCELERATION_SPRING = 1 << 3,

		// Sentinel: one past the highest flag bit; used only by the assert below.
		eLAST
	};
};
PX_COMPILE_TIME_ASSERT(PxgSolverContactFlags::eLAST <= ((1 << 7) + 1)); // we store these Flags as PxU8
|
||||
|
||||
} // namespace physx
|
||||
|
||||
#endif
|
||||
129
engine/third_party/physx/source/gpusolver/include/PxgSolverKernelIndices.h
vendored
Normal file
129
engine/third_party/physx/source/gpusolver/include/PxgSolverKernelIndices.h
vendored
Normal file
@@ -0,0 +1,129 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_SOLVER_KERNEL_INDICES_H
|
||||
#define PXG_SOLVER_KERNEL_INDICES_H
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
#define PXG_USE_SHARED_MEMORY_PRE_PREP 0
|
||||
|
||||
// Thread-block sizes (threads per block) used when launching the GPU solver
// kernels. Grouped to mirror the solver pipeline stages; the matching grid
// sizes live in PxgKernelGridDim.
struct PxgKernelBlockDim
{
	enum
	{
		//Constraint partition
		CONSTRAINT_PRE_PARTITION = 1024,
		CONSTRAINT_PARTITION = 1024,

		PRE_INTEGRATION = 128,

		//Constraint pre-preparation
		CONSTRAINT_PREPREP_BLOCK = 128,

		//Constraint preparation
		CONSTRAINT_PREPARE_BLOCK_PARALLEL = 64,

		ARTI_CONSTRAINT_PREPARE = 64,

		//Multi-block solver code
		ZERO_BODIES = 256,
		SOLVE_BLOCK_PARTITION = 64,
		CONCLUDE_BLOCKS = 256,
		WRITEBACK_BLOCKS = 256,
		WRITE_BACK_BODIES = 256,
		COMPUTE_BODIES_AVERAGE_VELOCITY = 256,

		//threshold stream
		// NOTE: several of the threshold-stream kernels assume a fixed
		// blockDim/WARP_SIZE ratio (see e.g. the LOG2_WARP_PERBLOCK_SIZE
		// assert in computeAccumulateThresholdStream) - change with care.
		INITIALIZE_INPUT_AND_RANKS = 256,
		RADIXSORT = 256,
		REORGANIZE_THRESHOLDSTREAM = 256,
		COMPUTE_ACCUMULATED_THRESHOLDSTREAM = 256,
		OUTPUT_ACCUMULATED_THRESHOLDSTREAM = 256,
		WRITEOUT_ACCUMULATEDFORCEPEROBJECT = 256,
		COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE = 256,
		OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE = 256,
		SET_THRESHOLDELEMENT_MASK = 256,
		COMPUTE_THRESHOLDELEMENT_MASK_INDICES = 256,
		OUTPUT_THRESHOLDELEMENT_MASK_INDICES = 256,
		CREATE_FORCECHANGE_THRESHOLDELEMENTS = 256,

		//Integration
		INTEGRATE_CORE_PARALLEL = 128,
		CLEAR_FRICTION_PATCH_COUNTS = 256,
		DMA_CHANGED_ELEMS = 512,
		COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT = 512
	};
};
|
||||
|
||||
// Grid sizes (number of thread blocks) used when launching the GPU solver
// kernels; companions to the per-block sizes in PxgKernelBlockDim.
struct PxgKernelGridDim
{
	enum
	{
		//Constraint partition
		CONSTRAINT_PRE_PARTITION = 1,
		CONSTRAINT_PARTITION = 1,

		PRE_INTEGRATION = 64,
		//Constraint preparation
		CONSTRAINT_PREPREP_BLOCK = 128,

		CONSTRAINT_PREPARE_BLOCK_PARALLEL = 256,

		//Multi-block solver code
		ZERO_BODIES = 64,
		SOLVE_BLOCK_PARTITION = 64,
		CONCLUDE_BLOCKS = 64,
		WRITEBACK_BLOCKS = 64,
		WRITE_BACK_BODIES = 64,
		COMPUTE_BODIES_AVERAGE_VELOCITY = 128,

		//threshold stream
		INITIALIZE_INPUT_AND_RANKS = 64,
		RADIXSORT = 32, //this must be 32 to match the BLOCK_SIZE for the radix sort kernel
		REORGANIZE_THRESHOLDSTREAM = 64,
		COMPUTE_ACCUMULATED_THRESHOLDSTREAM = 32,//this must be 32 to match the BLOCK_SIZE for the compute kernel
		OUTPUT_ACCUMULATED_THRESHOLDSTREAM = 32,//this must be 32 to match the BLOCK_SIZE for the output kernel
		WRITEOUT_ACCUMULATEDFORCEPEROBJECT = 64,
		COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE = 32,
		OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE = 32,
		SET_THRESHOLDELEMENT_MASK = 64,
		COMPUTE_THRESHOLDELEMENT_MASK_INDICES = 32,
		OUTPUT_THRESHOLDELEMENT_MASK_INDICES = 32,
		CREATE_FORCECHANGE_THRESHOLDELEMENTS = 64,
		//Integration
		INTEGRATE_CORE_PARALLEL = 64,
		CLEAR_FRICTION_PATCH_COUNTS = 64,
		DMA_CHANGED_ELEMS = 64,
	};
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
204
engine/third_party/physx/source/gpusolver/include/PxgTGSCudaSolverCore.h
vendored
Normal file
204
engine/third_party/physx/source/gpusolver/include/PxgTGSCudaSolverCore.h
vendored
Normal file
@@ -0,0 +1,204 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_TGS_CUDA_SOLVER_CORE_H
|
||||
#define PXG_TGS_CUDA_SOLVER_CORE_H
|
||||
|
||||
#include "PxgSolverCore.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// PT: TODO: rename to just PxgTGSSolverCore ?
|
||||
// GPU (CUDA) constraint-solver core for the TGS solver. Declaration only -
// all member functions are implemented elsewhere (not visible in this header).
class PxgTGSCudaSolverCore : public PxgSolverCore
{
	PX_NOCOPY(PxgTGSCudaSolverCore)
private:

	//this is for articulation
	PxgCudaBuffer mConstraintContactPrepPool;
	// Per-contact solver data streams (TGS variants).
	PxgTypedCudaBuffer<PxgTGSSolverContactHeader> mContactHeaderStream;
	PxgTypedCudaBuffer<PxgTGSSolverContactPointExt> mContactStream;
	PxgTypedCudaBuffer<PxgTGSSolverFrictionExt> mFrictionStream;

	// Each bit encodes the activation of a slab (32 bits). When there are more than 32 slabs, use multiple indices.
	// To query the reference count, count the number of active slabs/bits.
	PxgTypedCudaBuffer<PxU32> mSolverEncodedReferenceCount;

	//This is the new articulation block constraint format!
	//It shares the original rigid body contact/constraint format but adds in
	//an additional buffer for the response vectors
	PxgTypedCudaBuffer<PxgArticulationBlockResponse> mArtiConstraintBlockResponse;

	// Force-threshold event stream plus a scratch copy (used during sorting;
	// see the radix-sort/accumulate kernels that consume both).
	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mForceThresholdStream;
	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mTmpForceThresholdStream;

	// Batch-index tables per constraint category (joints/contacts, rigid/articulation).
	PxgTypedCudaBuffer<PxU32> mConstraint1DBatchIndices;
	PxgTypedCudaBuffer<PxU32> mContactBatchIndices;
	PxgTypedCudaBuffer<PxU32> mArtiContactBatchIndices;
	PxgTypedCudaBuffer<PxU32> mArtiConstraint1dBatchIndices;

	PxgTypedCudaBuffer<PxReal> mAccumulatedForceObjectPairs; //store the accumulated force for a pair of objects
	PxgCudaBufferN<2> mExceededForceElements;
	PxgTypedCudaBuffer<Dy::ThresholdStreamElement> mForceChangeThresholdElements;

	// Scratch buffers for the two-level (per-element / per-block) accumulation
	// passes over the threshold stream.
	PxgTypedCudaBuffer<PxReal> mThresholdStreamAccumulatedForce;
	PxgTypedCudaBuffer<PxReal> mBlocksThresholdStreamAccumulatedForce;

	PxgTypedCudaBuffer<PxU32> mThresholdStreamWriteIndex;
	PxgTypedCudaBuffer<PxU32> mBlocksThresholdStreamWriteIndex;
	PxgTypedCudaBuffer<bool> mThresholdStreamWriteable;

	PxgTypedCudaBuffer<PxU32> mIslandIds;
	PxgTypedCudaBuffer<PxU32> mIslandStaticTouchCount;

	PxgSolverSharedDesc<IterativeSolveDataTGS>* mSharedDesc;

	// Multi-pass radix sort over the threshold stream; nbPasses selects how many
	// radix passes are launched.
	void radixSort(const PxU32 nbPasses);

	friend class PxgArticulationCore;

public:

	PxgTGSCudaSolverCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
		PxgGpuContext* dynamicContext, PxgHeapMemoryAllocatorManager* heapMemoryManager, const PxGpuDynamicsMemoryConfig& init);
	~PxgTGSCudaSolverCore();

	// Fills in the shared solver descriptor consumed by the solve kernels.
	void constructSolverSharedDesc(PxgSolverSharedDesc<IterativeSolveDataTGS>& desc, const PxgConstantData& cData,
		const PxU32 numIters, const PxReal lengthScale, Cm::UnAlignedSpatialVector* deferredZ, PxU32* articulationDirty,
		uint4* articulationSlabMask);

	// NOTE(review): "Constrait" is a long-standing typo ("Constraint") - renaming
	// would break external callers, so it is kept as-is.
	void constructConstraitPrepareDesc(PxgConstraintPrepareDesc& desc, const PxU32 numDynamicConstraintBatchHeader,
		const PxU32 numStaticConstraintBatchHeaders, const PxU32 numDynamic1dConstraintBatches, const PxU32 numStatic1dConstraintBatches,
		const PxU32 numDynamicContactBatches, const PxU32 numStaticContactBatches,
		const PxU32 numArti1dConstraintBatches, const PxU32 numArtiContactBatches,
		const PxU32 numArtiStatic1dConstraintBatches, const PxU32 numArtiStaticContactBatches,
		const PxU32 numArtiSelf1dConstraintBatches, const PxU32 numArtiSelfContactBatches,
		const PxgConstantData& cData, PxU32 totalCurrentEdges, PxU32 totalPreviousEdges, PxU32 totalBodies);

	void constructSolverDesc(PxgSolverCoreDesc& desc, PxU32 numIsland, PxU32 numSolverBodies, PxU32 numConstraintBatchHeader,
		PxU32 numArticConstraints, PxU32 numSlabs, bool enableStabilization);

	void syncSimulationController();

	virtual void createStreams();
	virtual void releaseStreams();

	virtual void acquireContext();
	virtual void releaseContext();

	// DMA of compressed contact/patch streams and partition metadata to the GPU.
	void gpuMemDMAUpContactData(PxgPinnedHostLinearMemoryAllocator* compressedContactsHostMemoryAllocator,
		PxU32 compressedContactStreamUpperPartSize,
		PxU32 compressedContactStreamLowerPartSize,
		PxgPinnedHostLinearMemoryAllocator* compressedPatchesHostMemoryAllocator,
		PxU32 compressedPatchStreamUpperPartSize,
		PxU32 compressedPatchStreamLowerPartSize,
		PxU32 totalContactManagers,
		const PartitionIndexData* partitionIndexData,
		const PartitionNodeData* partitionNodeData,
		const PxgSolverConstraintManagerConstants* constantData,
		PxU32 constantDataCount,
		PxU32 partitionIndexDataCount,
		const PxU32* partitionConstraintBatchStartIndices,
		const PxU32* partitionArticConstraintBatchStartIndices,
		const PxU32* partitionJointBatchCounts,
		const PxU32* partitionArtiJointBatchCounts,
		PxU32 nbPartitions,
		const PxU32* destroyedEdges,
		PxU32 nbDestroyedEdges,
		const PxU32* npIndexArray, PxU32 npIndexArraySize,
		PxU32 totalNumJoints,
		const PxU32* islandIds, const PxU32* nodeInteractionCounts, PxU32 nbNodes, const PxU32* islandStaticTouchCount, PxU32 nbIslands);

	// DMA of per-body solver data (pinned host arrays) to the GPU.
	void gpuMemDmaUpBodyData(PxPinnedArray<PxgSolverBodyData>& solverBodyDataPool,
		PxPinnedArray<PxgSolverTxIData>& solverTxIDataPool,
		const PxU32 numSolverBodies,
		const PxU32 totalNumRigidBatches, const PxU32 totalNumArticBatches,
		const PxU32 nbSlabs, const PxU32 nbStaticSlabs, const PxU32 maxNumStaticPartitions);

	void allocateSolverBodyBuffers(const PxU32 numSolverBodies,
		PxPinnedArray<PxNodeIndex>& islandNodeIndices,
		const PxU32 numActiveActiculations, const PxU32 maxArticulationLinks);

	PxU32 getDescriptorsAllocationSize();
	void allocatePinnedDescriptors(PxgPinnedHostLinearMemoryAllocator& hostAllocator);

	// Main host->device upload of all per-frame solver inputs.
	// NOTE(review): "rigidSaticJointCounts" below is a typo for
	// "rigidStaticJointCounts" - harmless in a declaration (matched by position).
	void gpuMemDMAUp(PxgPinnedHostLinearMemoryAllocator& hostAllocator, const PxgConstraintPrePrepData& data,
		const PxU32 numSolverBodies, PxgConstraintBatchHeader* constraintBatchHeaders,
		PxgIslandContext* islandContextPool, const PxU32 numIslands, const PxgPartitionData& partitionData,
		const PxU32 numConstraintBatchHeader, const PxU32 numStaticConstraintBatchHeader,
		const PxU32 numArticConstraintBatchHeader, const PxU32 numArticStaticConstraintBatchHeader,
		const PxU32 numArtiSelfConstraintBatchHeader, const PxgConstantData& cData,
		const PxU32 numContactBlockes, const PxU32 numFrictionBlockes,
		const PxU32 numArtiContacts, const PxU32 numArtiFrictions,
		const PxU32 totalCurrentEdges, const PxU32 totalPreviousEdges, const PxU32 numSlabs, const PxU32 maxNbPartitions,
		const bool enableStabilization, PxU8* cpuContactPatchStreamBase, PxU8* cpuContactStreamBase, PxU8* cpuForceStreamBase, PxsContactManagerOutputIterator& outputIterator,
		const PxU32 totalActiveBodyCount, const PxU32 activeBodyStartIndex, const PxU32 nbArticulations, Cm::UnAlignedSpatialVector* deferredZ,
		PxU32* articulationDirty, uint4* articulationSlabMask, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances,
		PxsTorsionalFrictionData* torsionalData,
		PxU32* artiStaticContactIndices, const PxU32 artiContactIndSize, PxU32* artiStaticJointIndices, PxU32 artiStaticJointSize,
		PxU32* artiStaticContactCounts, PxU32* artiStaticJointCounts,
		PxU32* artiSelfContactIndices, const PxU32 artiSelfContactIndSize, PxU32* artiSelfJointIndices, PxU32 artiSelfJointSize,
		PxU32* artiSelfContactCounts, PxU32* artiSelfJointCounts,
		PxU32* rigidStaticContactIndices, const PxU32 rigidContactIndSize, PxU32* rigidStaticJointIndices, const PxU32 rigidStaticJointSize,
		PxU32* rigidStaticContactCounts, PxU32* rigidSaticJointCounts, const PxReal lengthScale, bool hasForceThresholds);

	// Device->host readback of solver outputs (forces, writebacks, residuals).
	void gpuMemDMAbackSolverData(PxU8* forceBufferPool, PxU32 forceBufferOffset, PxU32 forceBufferUpperPartSize,
		PxU32 forceBufferLowerPartSize, Dy::ThresholdStreamElement* changedElems, bool hasForceThresholds, Dy::ConstraintWriteback* constraintWriteBack,
		const PxU32 writeBackSize, bool copyAllToHost, Dy::ErrorAccumulator*& contactError);

	void syncDmaBack(PxU32& nbChangedThresholdElements);

	void preIntegration(const PxU32 offset, const PxU32 nbSolverBodies, const PxReal dt, const PxVec3& gravity);

	void jointConstraintBlockPrePrepParallel(PxU32 nbConstraintBatches);

	void jointConstraintPrepareParallel(PxU32 nbJointBatches);
	void contactConstraintPrepareParallel(PxU32 nbContactBatches);
	void artiJointConstraintPrepare(PxU32 nbArtiJointBatches);
	void artiContactConstraintPrepare(PxU32 nbArtiContactBatches);
	//soft body/cloth/particle constraint prepare
	void nonRigidConstraintPrepare(PxU32 nbArticulations);

	void solveContactMultiBlockParallel(PxgIslandContext* islandContexts, const PxU32 numIslands, const PxU32 maxPartitions,
		PxInt32ArrayPinned& constraintsPerPartition, PxInt32ArrayPinned& artiConstraintsPerPartition, const PxVec3& gravity,
		PxReal* posIterResidualSharedMem, PxU32 posIterResidualSharedMemSize, Dy::ErrorAccumulator* posIterError, PxPinnedArray<Dy::ErrorAccumulator>& artiContactPosIterError,
		PxPinnedArray<Dy::ErrorAccumulator>& perArticulationInternalError);

	void writeBackBlock(PxU32 a, PxgIslandContext& context);

	void solvePartitions(PxgIslandContext* islandContexts, PxInt32ArrayPinned& constraintsPerPartition, PxInt32ArrayPinned& artiConstraintsPerPartition,
		PxU32 islandIndex, bool doFriction, PxReal accumulatedDt, PxReal minPen, bool anyArticulationConstraints, bool isVelocityIteration);

	void accumulatedForceThresholdStream(PxU32 maxNodes);

	void integrateCoreParallel(const PxU32 offset, const PxU32 nbSolverBodies);

	void getDataStreamBase(void*& contactStreamBase, void*& patchStreamBase, void*& forceAndIndexStreamBase);
};
|
||||
}
|
||||
|
||||
#endif
|
||||
64
engine/third_party/physx/source/gpusolver/include/PxgTGSDynamicsContext.h
vendored
Normal file
64
engine/third_party/physx/source/gpusolver/include/PxgTGSDynamicsContext.h
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef PXG_TGS_DYNAMICS_CONTEXT_H
|
||||
#define PXG_TGS_DYNAMICS_CONTEXT_H
|
||||
|
||||
#include "PxgContext.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Cm
|
||||
{
|
||||
class FlushPool;
|
||||
}
|
||||
|
||||
class PxBaseTask;
|
||||
|
||||
class PxsKernelWranglerManager;
|
||||
|
||||
/**
|
||||
\brief A class to represent a GPU dynamics context for the GPU rigid body solver
|
||||
*/
|
||||
// TGS variant of the GPU dynamics context. Declaration only; the constructor
// and destroy() are implemented elsewhere (not visible in this header).
class PxgTGSDynamicsContext : public PxgGpuContext
{
	PX_NOCOPY(PxgTGSDynamicsContext)

public:
	PxgTGSDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
		const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions, bool enableStabilization, bool useEnhancedDeterminism,
		PxReal maxBiasCoefficient, PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager, bool externalForcesEveryTgsIterationEnabled, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID,
		bool isResidualReportingEnabled);

	virtual void destroy();

	// Identifies this context as the TGS solver variant.
	virtual PxSolverType::Enum getSolverType() const { return PxSolverType::eTGS; }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
848
engine/third_party/physx/source/gpusolver/src/CUDA/accumulateThresholdStream.cu
vendored
Normal file
848
engine/third_party/physx/source/gpusolver/src/CUDA/accumulateThresholdStream.cu
vendored
Normal file
@@ -0,0 +1,848 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxgRadixSortDesc.h"
|
||||
#include "DyThresholdTable.h"
|
||||
#include "RadixSort.cuh"
|
||||
#include "PxgRadixSortDesc.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
|
||||
#include "reduction.cuh"
|
||||
#include <stdio.h>
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side entry point; presumably referenced externally to force this
// translation unit (and its kernels) to be linked in - TODO confirm against
// the kernel wrangler / module registration code.
extern "C" __host__ void initSolverKernels0() {}
|
||||
|
||||
// Single-block radix-sort pass over the threshold-stream keys/ranks.
// Reads the key count from the solver descriptor and forwards everything to
// the shared radixSortSingleBlock helper (keys/ranks are consumed as uint4
// quads, hence the padding performed by the initialRanksAndBodyIndex kernels).
extern "C" __global__ void bodyInputAndRanksSingleBlockLaunch(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
	const PxU32 keyCount = solverDesc->sharedThresholdStreamIndex;

	uint4* keyQuads = reinterpret_cast<uint4*>(desc->inputKeys);
	uint4* rankQuads = reinterpret_cast<uint4*>(desc->inputRanks);
	PxU32* blockHistograms = desc->radixBlockCounts;

	radixSortSingleBlock<PxgKernelBlockDim::RADIXSORT / WARP_SIZE>(keyQuads, rankQuads, keyCount, gStartBit, blockHistograms);
}
|
||||
|
||||
// Multi-block radix-sort rank-calculation pass over the threshold-stream
// keys/ranks: scatters sorted keys/ranks into the output arrays using the
// per-block histograms produced by the counting pass.
extern "C" __global__ void bodyInputAndRanksBlocksLaunch(const PxgSolverCoreDesc* solverDesc, PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
	const PxU32 keyCount = solverDesc->sharedThresholdStreamIndex;

	uint4* keyQuads = reinterpret_cast<uint4*>(desc->inputKeys);
	uint4* rankQuads = reinterpret_cast<uint4*>(desc->inputRanks);
	PxU32* blockHistograms = desc->radixBlockCounts;

	PxU32* sortedKeys = desc->outputKeys;
	PxU32* sortedRanks = desc->outputRanks;

	radixSortCalculateRanks<PxgKernelBlockDim::RADIXSORT / WARP_SIZE>(keyQuads, rankQuads, keyCount, gStartBit, blockHistograms, sortedKeys, sortedRanks);
}
|
||||
|
||||
// Seeds the radix-sort inputs for the first sort pass: key i is the nodeIndexB
// of threshold-stream element i, rank i is the identity permutation.
// Grid-stride loops, so any launch configuration is valid.
extern "C" __global__ void initialRanksAndBodyIndexB(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
	Dy::ThresholdStreamElement* stream = solverDesc->thresholdStream;
	const PxU32 elementCount = solverDesc->sharedThresholdStreamIndex;

	PxU32* keys = rsDesc->inputKeys;
	PxU32* ranks = rsDesc->inputRanks;

	const PxU32 stride = blockDim.x * gridDim.x;
	const PxU32 firstIndex = threadIdx.x + blockDim.x * blockIdx.x;

	for(PxU32 idx = firstIndex; idx < elementCount; idx += stride)
	{
		keys[idx] = stream[idx].nodeIndexB.index();
		ranks[idx] = idx;
	}

	// The radix sort consumes keys as uint4 quads, so pad the tail up to a
	// multiple of 4 with sentinel keys that sort to the end.
	const PxU32 padCount = (4 - (elementCount & 3)) & 3;

	for(PxU32 idx = firstIndex; idx < padCount; idx += stride)
	{
		const PxU32 padIndex = elementCount + idx;
		keys[padIndex] = 0xffffffff;
		ranks[padIndex] = padIndex;
	}
}
|
||||
|
||||
// Seeds the keys for the second sort pass: key i is the nodeIndexA of the
// element selected by the rank permutation left over from the nodeIndexB sort
// (ranks themselves are reused unchanged). Grid-stride loops throughout.
extern "C" __global__ void initialRanksAndBodyIndexA(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
	Dy::ThresholdStreamElement* stream = solverDesc->thresholdStream;
	const PxU32 elementCount = solverDesc->sharedThresholdStreamIndex;

	//we need to use the inputRanks from the bodyAIndex to reorganize the threshold stream
	PxU32* keys = rsDesc->inputKeys;
	PxU32* ranks = rsDesc->inputRanks;

	const PxU32 stride = blockDim.x * gridDim.x;
	const PxU32 firstIndex = threadIdx.x + blockDim.x * blockIdx.x;

	for(PxU32 idx = firstIndex; idx < elementCount; idx += stride)
		keys[idx] = stream[ranks[idx]].nodeIndexA.index();

	// Pad keys (only - ranks already hold valid pad entries) up to a multiple
	// of 4 with sentinels, as required by the uint4-based radix sort.
	const PxU32 padCount = (4 - (elementCount & 3)) & 3;

	for(PxU32 idx = firstIndex; idx < padCount; idx += stride)
		keys[elementCount + idx] = 0xffffffff;
}
|
||||
|
||||
// Gathers the sorted threshold stream back from the scratch copy: element i of
// thresholdStream receives tmpThresholdStream[inputRanks[i]]. Eight threads
// cooperate on each element, each moving one 32-bit word (assumes
// sizeof(Dy::ThresholdStreamElement) == 32 - matches the /8 partitioning;
// TODO confirm against the struct definition).
extern "C" __global__ void reorganizeThresholdElements(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
	Dy::ThresholdStreamElement* dstStream = solverDesc->thresholdStream;
	Dy::ThresholdStreamElement* srcStream = solverDesc->tmpThresholdStream;
	const PxU32 elementCount = solverDesc->sharedThresholdStreamIndex;
	const PxU32* sortedRanks = rsDesc->inputRanks;

	const PxU32 lane = threadIdx.x & 7;
	const PxU32 groupStride = (blockDim.x * gridDim.x) / 8;

	for(PxU32 elem = (threadIdx.x + blockIdx.x * blockDim.x) / 8; elem < elementCount; elem += groupStride)
	{
		PxU32* src = reinterpret_cast<PxU32*>(srcStream + sortedRanks[elem]);
		PxU32* dst = reinterpret_cast<PxU32*>(dstStream + elem);
		dst[lane] = src[lane];
	}
}
|
||||
|
||||
// First pass of a two-level segmented scan over the (sorted) threshold stream.
// For every element it computes:
//  - an inclusive running sum of normalForce within this block's range,
//  - an exclusive running count of completed (bodyA, bodyB) pairs,
//  - a flag marking the last element of each pair,
// and writes the per-block totals for a subsequent inter-block pass
// (see outputAccumulateThresholdStream).
// Launch configuration is fixed: grid = COMPUTE_ACCUMULATED_THRESHOLDSTREAM (32)
// blocks of COMPUTE_ACCUMULATED_THRESHOLDSTREAM (256) threads.
extern "C" __global__ void computeAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
{
	const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32);

	const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM / WARP_SIZE;

	const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;

	// warpScan below is instantiated with LOG2_WARP_PERBLOCK_SIZE, so the
	// warps-per-block count must be exactly 2^3 = 8.
	assert(WARP_PERBLOCK_SIZE == (1 << LOG2_WARP_PERBLOCK_SIZE));

	// Per-warp partial sums (level 1 of the scan) and the block-wide running
	// totals carried across loop iterations.
	__shared__ PxReal sWarpAccumulator[WARP_PERBLOCK_SIZE];
	__shared__ PxReal sBlockAccumulator;

	__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
	__shared__ PxU32 sBlockPairsAccumulator;

	//Each body can be made of multiple shapes, therefore, we need to accumulated difference forces from the shapes to the body pairs. In this case, we will have thresholdStreams have
	//same bodyAIndex and bodyBIndex

	//The threshold stream has been sorted based on the bodyAIndex and bodyBIndex, therefore, if pairs have the same bodyAIndex and bodyBIndex, they will laied in continuously memory
	Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;

	PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
	PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;

	PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
	PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;

	bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;

	const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;

	// Each block processes a contiguous range of numIterationPerBlock*blockDim.x
	// elements (ceil-divided so all elements are covered).
	const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;

	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
	const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);

	const PxU32 idx = threadIdx.x;

	if(threadIdx.x == 0)
	{
		sBlockAccumulator = 0;
		sBlockPairsAccumulator = 0;
	}

	__syncthreads();

	for(PxU32 i=0; i<numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;

		PxReal val = 0.f;
		bool isNewPair = false;
		PxNodeIndex nodeIndexA(PX_INVALID_NODE);
		PxNodeIndex nodeIndexB(PX_INVALID_NODE);

		if(workIndex < nbThresholdElements)
		{
			val = gThresholdStream[workIndex].normalForce;
			nodeIndexA = gThresholdStream[workIndex].nodeIndexA;
			nodeIndexB = gThresholdStream[workIndex].nodeIndexB;

			// An element ends a pair segment when its neighbour belongs to a
			// different (bodyA, bodyB) pair, or when it is the last element.
			if(workIndex+1 < nbThresholdElements)
			{
				Dy::ThresholdStreamElement& nElement = gThresholdStream[workIndex+1];
				if(!(nodeIndexA == nElement.nodeIndexA && nodeIndexB == nElement.nodeIndexB))
				{
					isNewPair = true;
				}
			}
			else
			{
				isNewPair = true;
			}
		}

		//warpScan is inclusive add but the accumVal is exclusive add result
		const PxReal accumVal = warpScan<AddOpPxReal, PxReal>(FULL_MASK, val) - val;

		// Mask of lanes strictly below this one: used to turn the ballot into
		// an exclusive per-warp count of completed pairs.
		const PxU32 threadMask = (1<<threadIndexInWarp)-1;

		const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isNewPair)&threadMask);

		// Last lane of each warp publishes the warp's inclusive totals.
		if(threadIndexInWarp == (WARP_SIZE-1))
		{
			sWarpAccumulator[warpIndex] = accumVal + val;
			sWarpPairsAccumulator[warpIndex] = accumPairs + isNewPair;
		}

		// Snapshot the running block totals from previous iterations BEFORE the
		// barrier: after the barrier, warp 0 updates sBlockAccumulator /
		// sBlockPairsAccumulator in place for the next iteration.
		const PxReal prevBlockAccumulator = sBlockAccumulator;
		const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;

		__syncthreads();

		// Level 2: the first WARP_PERBLOCK_SIZE threads scan the per-warp
		// totals, converting sWarpAccumulator/sWarpPairsAccumulator into
		// exclusive offsets, and fold the iteration's grand totals into the
		// block accumulators.
		unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
		if(idx < WARP_PERBLOCK_SIZE)
		{
			PxReal forceVal = sWarpAccumulator[idx];

			const PxReal accumulatedForce = warpScan<AddOpPxReal, PxReal, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, forceVal) - forceVal;
			sWarpAccumulator[idx] = accumulatedForce;

			PxU32 pairVal = sWarpPairsAccumulator[idx];
			const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
			sWarpPairsAccumulator[idx] = accumulatedPairs;

			if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
			{
				sBlockAccumulator += (accumulatedForce + forceVal);
				sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
			}

		}

		__syncthreads();

		if(workIndex < nbThresholdElements)
		{
			//accumVal is exclusive result within a warp and sWarpAccumulator is the exclusive result within a block
			gThresholdStreamAccumulatedForce[workIndex] = val + accumVal + prevBlockAccumulator + sWarpAccumulator[warpIndex]; //this is inclusive
			gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
			gThresholdStreamWriteable[workIndex] = isNewPair;
		}

	}

	// Publish this block's totals for the inter-block combination pass.
	if(threadIdx.x == 0)
	{
		gThresholdStreamAccumulatedForceBetweenBlocks[blockIdx.x] = sBlockAccumulator;
		gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
	}

}
|
||||
|
||||
extern "C" __global__ void outputAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_ACCUMULATED_THRESHOLDSTREAM;
|
||||
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
||||
|
||||
PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
|
||||
PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;
|
||||
|
||||
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
||||
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
||||
|
||||
|
||||
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
||||
|
||||
__shared__ PxReal sBlockForceAccum[nbBlocks];
|
||||
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
||||
|
||||
const PxU32 idx = threadIdx.x;
|
||||
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
||||
|
||||
PxReal val = 0;
|
||||
PxReal res = 0;
|
||||
PxU32 pairIndex = 0;
|
||||
PxU32 pairRes = 0;
|
||||
|
||||
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
||||
if(idx < nbBlocks)
|
||||
{
|
||||
val = gThresholdStreamAccumulatedForceBetweenBlocks[idx];
|
||||
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
|
||||
|
||||
res = warpScan<AddOpPxReal, PxReal>(mask_idx, val) - val;
|
||||
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
||||
|
||||
sBlockForceAccum[idx] = res;
|
||||
sBlockWriteIndexAccum[idx] = pairRes;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
||||
|
||||
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
||||
|
||||
const PxReal blockForceAccum = sBlockForceAccum[blockIdx.x];
|
||||
|
||||
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
||||
|
||||
//accumulate normal force between blocks
|
||||
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
||||
{
|
||||
const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
||||
|
||||
if(workIndex < nbThresholdElements)
|
||||
{
|
||||
gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
||||
|
||||
gThresholdStreamAccumulatedForce[workIndex] = gThresholdStreamAccumulatedForce[workIndex] + blockForceAccum;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
extern "C" __global__ void writeoutAccumulatedForcePerObject(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
|
||||
PxReal* gAccumulatedForces = solverDesc->thresholdStreamAccumulatedForce;
|
||||
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
||||
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
||||
|
||||
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
|
||||
|
||||
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
||||
|
||||
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
//accumulate normal force between blocks
|
||||
for(PxU32 workIndex = globalThreadIdx; workIndex < nbThresholdElements; workIndex+=(blockDim.x*gridDim.x))
|
||||
{
|
||||
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
|
||||
|
||||
bool isNewPairs = gThresholdStreamWriteable[workIndex];
|
||||
|
||||
if(isNewPairs)
|
||||
{
|
||||
gAccumulatedForceObjectPairs[writeIndex] = gAccumulatedForces[workIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" __global__ void computeExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc,
|
||||
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
|
||||
{
|
||||
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
|
||||
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
||||
|
||||
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE / WARP_SIZE;
|
||||
|
||||
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
|
||||
|
||||
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
|
||||
|
||||
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
|
||||
__shared__ PxU32 sBlockPairsAccumulator;
|
||||
|
||||
const PxReal dt = sharedDesc->dt;
|
||||
|
||||
Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;
|
||||
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
|
||||
|
||||
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
||||
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
||||
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
||||
|
||||
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
||||
|
||||
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
||||
|
||||
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
||||
|
||||
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
||||
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
|
||||
|
||||
const PxU32 idx = threadIdx.x;
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
sBlockPairsAccumulator = 0;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
||||
{
|
||||
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
|
||||
|
||||
bool isExceededForce = false;
|
||||
|
||||
if(workIndex < nbThresholdElements)
|
||||
{
|
||||
Dy::ThresholdStreamElement& element = gThresholdStream[workIndex];
|
||||
|
||||
//we are reusing the write index buffer. However, because the work index is the same, so as long as we read before we write, it should be safe
|
||||
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
|
||||
PxReal accumulatedForce = gAccumulatedForceObjectPairs[writeIndex];
|
||||
|
||||
if(writeIndex > 0)
|
||||
{
|
||||
accumulatedForce -= gAccumulatedForceObjectPairs[writeIndex-1];
|
||||
}
|
||||
|
||||
//write back the accumulated force
|
||||
element.accumulatedForce = accumulatedForce;
|
||||
|
||||
isExceededForce = accumulatedForce > (element.threshold * dt);
|
||||
}
|
||||
|
||||
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
|
||||
|
||||
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isExceededForce)&threadMask);
|
||||
|
||||
if(threadIndexInWarp == (WARP_SIZE-1))
|
||||
{
|
||||
sWarpPairsAccumulator[warpIndex] = accumPairs + isExceededForce;
|
||||
}
|
||||
|
||||
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
|
||||
if(idx < WARP_PERBLOCK_SIZE)
|
||||
{
|
||||
|
||||
PxU32 pairVal = sWarpPairsAccumulator[idx];
|
||||
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
|
||||
sWarpPairsAccumulator[idx] = accumulatedPairs;
|
||||
|
||||
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
|
||||
{
|
||||
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(workIndex < nbThresholdElements)
|
||||
{
|
||||
gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
|
||||
gThresholdStreamWriteable[workIndex] = isExceededForce;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
extern "C" __global__ void outputExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
|
||||
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
||||
|
||||
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE/WARP_SIZE;
|
||||
|
||||
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
||||
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
||||
|
||||
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
||||
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
||||
Dy::ThresholdStreamElement* gThresholdElements = solverDesc->thresholdStream;
|
||||
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
|
||||
|
||||
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
||||
|
||||
const PxU32 idx = threadIdx.x;
|
||||
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
||||
|
||||
PxU32 pairIndex = 0;
|
||||
PxU32 pairRes = 0;
|
||||
|
||||
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
||||
if(idx < nbBlocks)
|
||||
{
|
||||
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
|
||||
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
||||
|
||||
sBlockWriteIndexAccum[idx] = pairRes;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
||||
|
||||
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
||||
|
||||
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
||||
|
||||
//accumulate normal force between blocks
|
||||
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
||||
{
|
||||
const PxU32 workIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
||||
|
||||
if(workIndex < nbThresholdElements)
|
||||
{
|
||||
///gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
||||
|
||||
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
||||
|
||||
bool isExceededForce = gThresholdStreamWriteable[workIndex];
|
||||
|
||||
if(isExceededForce)
|
||||
{
|
||||
Dy::ThresholdStreamElement& element = gThresholdElements[workIndex];
|
||||
Dy::ThresholdStreamElement tempElement;
|
||||
tempElement.shapeInteraction = element.shapeInteraction;
|
||||
tempElement.nodeIndexA = element.nodeIndexA;
|
||||
tempElement.nodeIndexB = element.nodeIndexB;
|
||||
tempElement.normalForce = element.normalForce;
|
||||
tempElement.accumulatedForce = element.accumulatedForce;
|
||||
tempElement.threshold = element.threshold;
|
||||
gExceededForceElements[writeIndex] = tempElement;
|
||||
}
|
||||
|
||||
//last element
|
||||
if(workIndex == nbThresholdElements -1)
|
||||
{
|
||||
solverDesc->nbExceededThresholdElements = isExceededForce ? (writeIndex + 1) : writeIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//we will ensure nbPrevExceededThresholdPairs > 0 on the CPU and all pair masks have been set to 1
//The data laid out in gExceededForceElementMask is the previous exceeded threshold element mask first, then the current exceeded threshold element mask second,
//and the persistent exceeded threshold element mask last. Persistent exceeded elements have to be in both the previous exceeded threshold elements array and the
//current exceeded threshold elements array, so the number of persistent exceeded elements will be less than or equal to the number of previous exceeded threshold
//elements. Therefore, the persistent exceeded threshold element mask has the same size as the previous exceeded threshold mask and corresponds to the same
//elements as in the previous exceeded force elements array.
|
||||
extern "C" __global__ void setThresholdElementsMask(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
|
||||
Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
|
||||
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
||||
|
||||
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
||||
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
||||
|
||||
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
for(PxU32 workIndex = globalThreadIdx; workIndex < nbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
|
||||
{
|
||||
Dy::ThresholdStreamElement& element = gExceededForceElements[workIndex];
|
||||
|
||||
//this will find the last element match the element if value exist in the array
|
||||
PxU32 pos = binarySearch<Dy::ThresholdStreamElement>(gPrevExceededForceElements, nbPrevExceededThresholdElements, element);
|
||||
Dy::ThresholdStreamElement* prePair = &gPrevExceededForceElements[pos];
|
||||
|
||||
bool done = false;
|
||||
|
||||
while (!done)
|
||||
{
|
||||
done = true;
|
||||
if (prePair->nodeIndexA == element.nodeIndexA && prePair->nodeIndexB == element.nodeIndexB)
|
||||
{
|
||||
if (prePair->shapeInteraction == element.shapeInteraction)
|
||||
{
|
||||
//found a pair, raise 0 in the masks so that we won't generate any force change event. Because the mask array store previous and current exceeded force pairs, we need to
|
||||
// raise 0 in two position: one for the previous mask and one for the current mask
|
||||
gExceededForceElementMask[pos] = 0;
|
||||
gExceededForceElementMask[nbPrevExceededThresholdElements + workIndex] = 0;
|
||||
}
|
||||
else if (pos > 1)
|
||||
{
|
||||
pos = pos - 1;
|
||||
prePair = &gPrevExceededForceElements[pos];
|
||||
done = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Fills the persistent-pair section of the exceeded-force element mask.
// The mask buffer is laid out as [previous | current | persistent]; the
// persistent section mirrors the previous section element-for-element and is
// written as the logical negation of the previous-pair mask: a previous pair
// whose mask was cleared (a persistent match exists) yields 1, any other
// previous pair yields 0.
__device__ void setPersistentForceElementMask(PxgSolverCoreDesc* solverDesc)
{
	PxU32* maskBuffer = solverDesc->thresholdStreamWriteIndex;

	const PxU32 nbCurrent = solverDesc->nbExceededThresholdElements;
	const PxU32 nbPrevious = solverDesc->nbPrevExceededThresholdElements;

	// The persistent section starts right after the previous and current sections.
	const PxU32 persistentBase = nbPrevious + nbCurrent;

	const PxU32 stride = blockDim.x * gridDim.x;

	// Grid-stride loop over the previous-pair section of the mask.
	for (PxU32 i = threadIdx.x + blockIdx.x * blockDim.x; i < nbPrevious; i += stride)
	{
		maskBuffer[persistentBase + i] = !maskBuffer[i];
	}
}
|
||||
|
||||
extern "C" __global__ void computeThresholdElementMaskIndices(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
//this function should be called in setThresholdElementsMask. However, if there are no preExceededThresholdElements(which we will know in the CPU code), we don't
|
||||
//kick of setThresholdElementMask kernel at all so the persistentExceededThresholdElementMask is still set to be one. Therefore, we need to call the setPersistentForceElement
|
||||
//method in here
|
||||
|
||||
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES;
|
||||
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
||||
|
||||
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES / WARP_SIZE;
|
||||
|
||||
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
|
||||
|
||||
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
|
||||
|
||||
setPersistentForceElementMask(solverDesc);
|
||||
|
||||
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
|
||||
__shared__ PxU32 sBlockPairsAccumulator;
|
||||
|
||||
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
||||
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
||||
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
||||
|
||||
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
||||
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
||||
|
||||
//prev, current and persistent
|
||||
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
||||
|
||||
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
|
||||
|
||||
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
||||
|
||||
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
||||
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
|
||||
|
||||
const PxU32 idx = threadIdx.x;
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
sBlockPairsAccumulator = 0;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
||||
{
|
||||
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
|
||||
|
||||
PxU32 forceChangeMask = 0;
|
||||
|
||||
if(workIndex < totalNbExceededThresholdElements)
|
||||
{
|
||||
forceChangeMask = gExceededForceElementMask[workIndex];
|
||||
}
|
||||
|
||||
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
|
||||
|
||||
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, forceChangeMask)&threadMask);
|
||||
|
||||
if(threadIndexInWarp == (WARP_SIZE-1))
|
||||
{
|
||||
sWarpPairsAccumulator[warpIndex] = accumPairs + forceChangeMask;
|
||||
}
|
||||
|
||||
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
|
||||
if(idx < WARP_PERBLOCK_SIZE)
|
||||
{
|
||||
PxU32 pairVal = sWarpPairsAccumulator[idx];
|
||||
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
|
||||
sWarpPairsAccumulator[idx] = accumulatedPairs;
|
||||
|
||||
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
|
||||
{
|
||||
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(workIndex < totalNbExceededThresholdElements)
|
||||
{
|
||||
gExceededForceElementMask[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
|
||||
gThresholdStreamWriteable[workIndex] = !!(forceChangeMask);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
gExceededForceElementMaskBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void outputThresholdPairsMaskIndices(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES;
|
||||
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
||||
|
||||
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
||||
|
||||
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
||||
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
||||
|
||||
|
||||
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
||||
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
||||
|
||||
//previous, current and persistent
|
||||
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
||||
|
||||
const PxU32 idx = threadIdx.x;
|
||||
|
||||
|
||||
PxU32 pairIndex = 0;
|
||||
PxU32 pairRes = 0;
|
||||
|
||||
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
||||
if(idx < nbBlocks)
|
||||
{
|
||||
pairIndex = gExceededForceElementMaskBetweenBlocks[idx];
|
||||
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
||||
|
||||
sBlockWriteIndexAccum[idx] = pairRes;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
|
||||
|
||||
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
||||
|
||||
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
||||
|
||||
|
||||
//accumulate normal force between blocks
|
||||
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
||||
{
|
||||
const PxU32 workIndex = i*PxgKernelBlockDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
||||
|
||||
if(workIndex < totalNbExceededThresholdElements)
|
||||
{
|
||||
gExceededForceElementMask[workIndex] = gExceededForceElementMask[workIndex] + blockWriteIndexAccum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void createForceChangeThresholdElements(PxgSolverCoreDesc* solverDesc)
|
||||
{
|
||||
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
||||
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
|
||||
Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
|
||||
Dy::ThresholdStreamElement* gForceChangeElements = solverDesc->forceChangeThresholdElements;
|
||||
|
||||
//we copy the original mask value to thresholdStreamWriteable in computeThresholdElementMaskIndices so it corresponding with mask
|
||||
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
||||
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
||||
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
||||
|
||||
//previous, current and persistent
|
||||
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
||||
const PxU32 persistentExceededStart = nbPrevExceededThresholdElements + nbExceededThresholdElements;
|
||||
|
||||
for(PxU32 workIndex = globalThreadIdx; workIndex < totalNbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
|
||||
{
|
||||
const bool hasForceChangeOrPersistent = gThresholdStreamWriteable[workIndex];
|
||||
|
||||
const PxU32 writeIndex = gExceededForceElementMask[workIndex];
|
||||
|
||||
if (hasForceChangeOrPersistent)
|
||||
{
|
||||
bool lostPair = workIndex < nbPrevExceededThresholdElements;
|
||||
bool foundPair = (workIndex < persistentExceededStart) && !lostPair;
|
||||
|
||||
Dy::ThresholdStreamElement* pair = NULL;
|
||||
if (lostPair)
|
||||
{
|
||||
pair = &gPrevExceededForceElements[workIndex];
|
||||
}
|
||||
else if (foundPair)
|
||||
{
|
||||
pair = &gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
|
||||
}
|
||||
else
|
||||
{
|
||||
//persistent pair
|
||||
pair = &gPrevExceededForceElements[workIndex - persistentExceededStart];
|
||||
}
|
||||
|
||||
//Dy::ThresholdStreamElement& pair = lostPair ? gPrevExceededForceElements[workIndex] : gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
|
||||
|
||||
Dy::ThresholdStreamElement tempPair;
|
||||
tempPair.shapeInteraction = pair->shapeInteraction;
|
||||
tempPair.nodeIndexA = pair->nodeIndexA;
|
||||
tempPair.nodeIndexB = pair->nodeIndexB;
|
||||
tempPair.normalForce = pair->normalForce;
|
||||
tempPair.accumulatedForce = lostPair ? 0.f : pair->accumulatedForce;
|
||||
tempPair.threshold = pair->threshold;
|
||||
|
||||
gForceChangeElements[writeIndex] = tempPair;
|
||||
}
|
||||
|
||||
if(workIndex == totalNbExceededThresholdElements-1)
|
||||
{
|
||||
solverDesc->nbForceChangeElements = hasForceChangeOrPersistent ? (writeIndex + 1) : writeIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
1968
engine/third_party/physx/source/gpusolver/src/CUDA/artiConstraintPrep2.cu
vendored
Normal file
1968
engine/third_party/physx/source/gpusolver/src/CUDA/artiConstraintPrep2.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
37
engine/third_party/physx/source/gpusolver/src/CUDA/constant.cuh
vendored
Normal file
37
engine/third_party/physx/source/gpusolver/src/CUDA/constant.cuh
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
// NOTE(review): the guard name __CONSTRAINT_CUH__ uses a reserved identifier
// (leading double underscore) and does not match the file name (constant.cuh)
// — presumably historical; confirm before renaming.
#ifndef __CONSTRAINT_CUH__
#define __CONSTRAINT_CUH__

#include "PxgSolverCoreDesc.h"

// Per-translation-unit __constant__-memory copy of the solver core
// descriptor. Presumably filled from the host before the solver kernels that
// reference it are launched — confirm against the host-side setup code,
// which is not visible here.
__constant__ PxgSolverCoreDesc constraintSolverCoreDescC;

#endif
|
||||
1957
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrePrep.cu
vendored
Normal file
1957
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrePrep.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
494
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrep.cu
vendored
Normal file
494
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrep.cu
vendored
Normal file
@@ -0,0 +1,494 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgFrictionPatch.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "PxgArticulationCoreKernelIndices.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "DyConstraintPrep.h"
|
||||
#include "PxNodeIndex.h"
|
||||
#include "PxContact.h"
|
||||
#include "PxsContactManagerState.h"
|
||||
#include "contactConstraintBlockPrep.cuh"
|
||||
#include "contactConstraintPrep.cuh"
|
||||
#include "jointConstraintBlockPrep.cuh"
|
||||
#include "constant.cuh"
|
||||
#include "constraintPrepShared.cuh"
|
||||
#include <assert.h>
|
||||
#include "stdio.h"
|
||||
|
||||
|
||||
using namespace physx;
|
||||
|
||||
extern "C" __host__ void initSolverKernels1() {}
|
||||
|
||||
// Compiled-out experimental path: cooperative staging of PxgSolverBodyData
// through shared memory. Disabled (LOAD_BODY_DATA == 0); kept for reference.
#define LOAD_BODY_DATA 0

#if LOAD_BODY_DATA

//Enough memory to fit 32 warps and load 11 solver body data objects per-pass, i.e. load solverBodyData for all 32 warps in 3 passes.
//Note, we can +1 on the size to avoid bank conflicts but then 16 byte aligned structs won't be aligned anymore
#define BODIES_PER_BLOCK 11u
volatile __shared__ PxU8 bodyLoadData[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL/32][BODIES_PER_BLOCK][sizeof(PxgSolverBodyPrepData)];


// Warp-cooperative load of one solver-body prep struct per lane: each warp
// streams BODIES_PER_BLOCK structs at a time into its shared-memory slab,
// then each lane copies its own struct out to outBodyPrepData.
// NOTE(review): this disabled path references `threadCounts`, which is not
// declared anywhere in this region — it will not compile if re-enabled as-is.
static __device__ void loadBodyData(const PxgSolverBodyData* PX_RESTRICT datas, const PxU32 batchStride, const PxU32 bodyIndex, const PxU32 threadIndexInWarp, const PxU32 warpIndex,
	PxgSolverBodyPrepData& outBodyPrepData/*float4& initialLinVelXYZ_invMassW, float4& initialAngVelXYZ_penBiasClamp, PxAlignedMat33& sqrtInvInertia, PxAlignedTransform& body2World*/)
{
	//Iterate through the body datas, pulling in the data we need, then index into shared data, pull out the solver body data and return it by value to store on stack (either in register or in local mem).
	threadCounts[threadIdx.x] = bodyIndex;

	const PxU32 solverPrepDataWords = sizeof(PxgSolverBodyPrepData)/4;

	PxU32 warpStartIndex = warpIndex*32;

	for(PxU32 a = 0; a < batchStride; a+=BODIES_PER_BLOCK)
	{
		PxU32 remainder = PxMin(batchStride - a, BODIES_PER_BLOCK);

		// Stage up to BODIES_PER_BLOCK structs into shared memory, 32 words
		// at a time per warp.
		for(PxU32 b = 0; b < remainder; ++b)
		{
			PxU32 bodyIndex = threadCounts[warpStartIndex + a + b]; //KS - potentially can use SM3.0 shuffle instead

			const PxU32* PX_RESTRICT sourceData = reinterpret_cast<const PxU32*>(datas + bodyIndex);

			volatile PxU32* bodyData = reinterpret_cast<volatile PxU32*>(&bodyLoadData[warpIndex][b][0]);

			for(PxU32 i = threadIndexInWarp; i < solverPrepDataWords; i+=32)
			{
				bodyData[i] = sourceData[i];
			}

		}

		// Lanes whose struct was staged this pass copy it out.
		if((threadIndexInWarp - a) < BODIES_PER_BLOCK)
		{
			volatile PxgSolverBodyPrepData& data = reinterpret_cast<volatile PxgSolverBodyPrepData&>(bodyLoadData[warpIndex][threadIndexInWarp-a][0]);

			/*initialLinVelXYZ_invMassW = make_float4(data.initialLinVelXYZ_invMassW.x, data.initialLinVelXYZ_invMassW.y, data.initialLinVelXYZ_invMassW.z,
			initialLinVelXYZ_invMassW.w);
			initialAngVelXYZ_penBiasClamp = make_float4(data.initialAngVelXYZ_penBiasClamp.x, data.initialAngVelXYZ_penBiasClamp.y, data.initialAngVelXYZ_penBiasClamp.z,
			data.initialAngVelXYZ_penBiasClamp.w);

			body2World.p = make_float4(data.body2World.p.x, data.body2World.p.y, data.body2World.p.z, data.body2World.p.w);
			body2World.q = make_float4(data.body2World.q.q.x, data.body2World.q.q.y, data.body2World.q.q.z, data.body2World.q.q.w);

			sqrtInvInertia = (PxAlignedMat33&)data.sqrtInvInertia;*/
			outBodyPrepData = (PxgSolverBodyPrepData&)data;

			/*PxU32* outPrepDataU32 = reinterpret_cast<PxU32*>(&outPrepData);
			for(PxU32 i = 0; i < solverPrepDataWords; ++i)
			{
				outPrepDataU32[i] = bodyLoadData[warpIndex][threadIndexInWarp - a][i];
			}*/
		}

	}

	threadCounts[threadIdx.x] = 0; //Reset thread counts to 0 because they're used for accumulators in later code

}

#endif
|
||||
|
||||
//Prepares block-format (SOA, 32-wide) 1D joint constraints for the PGS solver.
//Launch layout: one warp per joint-constraint batch (dynamic batches followed by the
//static ones); lane t of the warp prepares the t-th joint of its batch. The grid must
//supply at least (num1dConstraintBatches + numStatic1dConstraintBatches) warps.
extern "C" __global__ void jointConstraintBlockPrepareParallelLaunch(
	PxgConstraintPrepareDesc* solverDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	//Use the shared WARP_SIZE constant rather than a literal 32, for consistency with
	//contactConstraintBlockPrepareParallelLaunch below.
	const PxU32 warpSize = WARP_SIZE;

	const PxU32 blockStride = blockDim.x/warpSize;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/warpSize;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(warpSize-1);

	PxgSolverBodyData* solverBodyDatas = solverDesc->solverBodyDataPool;
	PxgSolverTxIData* solverTxIData = solverDesc->solverBodyTxIDataPool;

	PxgBlockSolverConstraint1DHeader* jointConstraintHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DCon* jointConstraintRowsCon = sharedDesc->iterativeData.blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod* jointConstraintRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;
	PxU32* batchIndices = solverDesc->jointConstraintBatchIndices;

	//Static joint batches are appended after the dynamic ones; both are prepared here.
	const PxU32 num1dConstraintBatches = solverDesc->num1dConstraintBatches + solverDesc->numStatic1dConstraintBatches;

	//Each warp handles at most one batch.
	PxU32 i = warpIndex;
	if (i < num1dConstraintBatches)
	{
		const PxU32 batchIndex = batchIndices[i];
		PxgBlockConstraintBatch& batch = sharedDesc->iterativeData.blockConstraintBatch[batchIndex];
		const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
		const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];

		const PxU32 descIndexBatch = batch.mConstraintBatchIndex;

		const PxU32 descStride = batch.mDescStride;

		//mDescStride might less than 32, we need to guard against it
		if(threadIndexInWarp < descStride)
		{
			//desc.descIndex for joint in fact is the batch index
			PxgBlockConstraint1DData& constraintData = solverDesc->blockJointPrepPool[descIndexBatch];
			PxgBlockConstraint1DVelocities* rowVelocities = &solverDesc->blockJointPrepPool0[descIndexBatch * Dy::MAX_CONSTRAINT_ROWS];
			PxgBlockConstraint1DParameters* rowParameters = &solverDesc->blockJointPrepPool1[descIndexBatch * Dy::MAX_CONSTRAINT_ROWS];

			PxgSolverBodyData* bodyData0 = &solverBodyDatas[bodyAIndex];
			PxgSolverBodyData* bodyData1 = &solverBodyDatas[bodyBIndex];
			PxgSolverTxIData* txIData0 = &solverTxIData[bodyAIndex];
			PxgSolverTxIData* txIData1 = &solverTxIData[bodyBIndex];

			//Selects this joint's entry in the solver constant-data pool.
			PxU32 uniqueIndex = solverDesc->constraintUniqueIndices[batch.mStartPartitionIndex + threadIndexInWarp];

			setupSolverConstraintBlockGPU<PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL>(&constraintData, rowVelocities, rowParameters, bodyData0, bodyData1, txIData0, txIData1, sharedDesc->dt, sharedDesc->invDtF32, batch, threadIndexInWarp,
				&jointConstraintHeaders[descIndexBatch], &jointConstraintRowsCon[batch.startConstraintIndex], &jointConstraintRowsMod[batch.startConstraintIndex],
				solverDesc->solverConstantData[uniqueIndex]);
		}
	}
}
|
||||
|
||||
//Prepares block-format (SOA, 32-wide) contact constraints for the PGS solver.
//One warp processes one contact batch; lane t prepares the t-th pair in the batch.
//Shared memory serves two purposes: (a) descriptor-derived pointers are loaded once
//by thread 0 and broadcast to the whole block, and (b) sInertias is a per-warp
//staging area through which each body's sqrtInvInertia tensor is gathered with
//float4 loads before being read back into registers.
extern "C" __global__ void contactConstraintBlockPrepareParallelLaunch(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	//threadCounts[threadIdx.x] = 0;

	//__syncthreads();

	PxgBlockWorkUnit* workUnits = constraintPrepDesc->blockWorkUnit;

	const PxU32 warpSize = WARP_SIZE;

	const PxU32 blockStride = blockDim.x/warpSize;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/warpSize;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(warpSize-1);

	//total numbers of warps in all blocks
	//const PxU32 totalNumWarps = blockStride * gridDim.x;

	//PxF32* baseForceStream = constraintPrepDesc->forceBuffer;

	const PxU32 totalPreviousEdges = constraintPrepDesc->totalPreviousEdges;
	const PxU32 totalCurrentEdges = constraintPrepDesc->totalCurrentEdges;
	//Static contact batches are appended after the dynamic ones; both are prepared here.
	const PxU32 nbContactBatches = constraintPrepDesc->numContactBatches + constraintPrepDesc->numStaticContactBatches;

	/*if (warpIndex == 0 && threadIndexInWarp == 0)
	{
		printf("NumBatches = %i, numContactBatches = %i, numStaticContactBatches = %i %p\n", nbContactBatches,
			constraintPrepDesc->numContactBatches, constraintPrepDesc->numStaticContactBatches, constraintPrepDesc);
	}*/

	//These pointers are identical for every thread: thread 0 loads them once and the
	//__syncthreads() below publishes them to the block.
	__shared__ PxgSolverBodyData* solverBodyDatas;
	__shared__ PxgSolverTxIData* solverTxIDatas;

	__shared__ PxgBlockSolverContactHeader* contactHeaders;
	__shared__ PxgBlockSolverFrictionHeader* frictionHeaders;
	__shared__ PxgBlockSolverContactPoint* contactPoints;
	__shared__ PxgBlockSolverContactFriction* frictions;
	__shared__ PxU32* batchIndices;
	__shared__ PxgBlockFrictionIndex* frictionIndices;
	__shared__ PxgBlockFrictionIndex* prevFrictionIndices;
	__shared__ PxgBlockContactPoint* contactBase;
	__shared__ PxgBlockConstraintBatch* constraintBatch;
	__shared__ PxgBlockContactData* contactCurrentPrepPool;
	__shared__ PxgBlockFrictionPatch* prevFrictionPatches;
	__shared__ PxgBlockFrictionPatch* currFrictionPatches;
	__shared__ PxgBlockFrictionAnchorPatch* prevFrictionAnchors;
	__shared__ PxgBlockFrictionAnchorPatch* currFrictionAnchors;
	__shared__ PxAlignedTransform* bodyFrames;

	//One PxMat33 staging slot per thread in the block (warpSize slots per warp).
	volatile __shared__ char sInertias[sizeof(PxMat33) * (PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize) * warpSize];
	//volatile __shared__ PxMat33 inertias[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize][warpSize];

	volatile PxMat33* inertias = reinterpret_cast<volatile PxMat33*>(sInertias);

	if(threadIdx.x == 0)
	{
		solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
		solverTxIDatas = constraintPrepDesc->solverBodyTxIDataPool;

		contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
		frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
		contactPoints = sharedDesc->iterativeData.blockContactPoints;
		frictions = sharedDesc->iterativeData.blockFrictions;
		batchIndices = constraintPrepDesc->contactConstraintBatchIndices;
		frictionIndices = constraintPrepDesc->blockCurrentFrictionIndices;
		prevFrictionIndices = constraintPrepDesc->blockPreviousFrictionIndices;

		contactBase = constraintPrepDesc->blockContactPoints;
		constraintBatch = sharedDesc->iterativeData.blockConstraintBatch;
		contactCurrentPrepPool = constraintPrepDesc->blockContactCurrentPrepPool;
		currFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
		prevFrictionPatches = sharedDesc->blockPreviousFrictionPatches;
		prevFrictionAnchors = constraintPrepDesc->blockPreviousAnchorPatches;
		currFrictionAnchors = constraintPrepDesc->blockCurrentAnchorPatches;
		bodyFrames = constraintPrepDesc->body2WorldPool;
	}

	//Publish the shared pointers loaded by thread 0 before any thread uses them.
	__syncthreads();

	//Each warp handles at most one batch.
	PxU32 i = warpIndex;
	//unsigned mask_nbContactBatches = __ballot_sync(FULL_MASK, i < nbContactBatches);
	if(i < nbContactBatches)
	{
		const PxU32 batchIndex = batchIndices[i];

		//if (batchIndex >= totalBatches)
		//{
		//	if(batchIndices[i-1] < totalBatches)
		//		assert(batchIndex < totalBatches); //Ensure we are not shooting past the max number of batches...
		//}

		PxgBlockConstraintBatch& batch = constraintBatch[batchIndex];
		const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
		const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];

		const PxU32 descIndexBatch = batch.mConstraintBatchIndex;

		const PxU32 descStride = batch.mDescStride;

		//PxgSolverBodyPrepData bodyData0, bodyData1;

#if LOAD_BODY_DATA
		loadBodyData(solverBodyDatas, descStride, bodyAIndex, threadIndexInWarp, warpIndexInBlock, bodyData0.initialLinVelXYZ_invMassW, bodyData0.initialAngVelXYZ_penBiasClamp,
			bodyData0.sqrtInvInertia, bodyData0.body2World);
		loadBodyData(solverBodyDatas, descStride, bodyBIndex, threadIndexInWarp, warpIndexInBlock, bodyData1.initialLinVelXYZ_invMassW, bodyData1.initialAngVelXYZ_penBiasClamp,
			bodyData1.sqrtInvInertia, bodyData1.body2World);
#endif

		//Read in 16 bytes at a time, we take 3 threads to read in a single inertia tensor, and we have some spare bandwidth. We can read
		//32 inertia tensors in 3 passes
		//NOTE(review): the loop below actually uses 2 lanes per tensor (descStride*2 work
		//items; idx/2 selects the body and idx&1 selects which float4 half is fetched).

		const PxU32 descStride2 = descStride*2;

		//Stage body A's sqrtInvInertia tensors into shared memory.
		for (PxU32 i = 0; i < descStride2; i += 32)
		{
			PxU32 idx = i + threadIndexInWarp;
			PxU32 bodyToLoad = idx/2;

			//Broadcast the target body index from the lane that owns slot 'bodyToLoad'.
			//Executed by all lanes (outside the guard) so every lane participates in the shuffle.
			PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyAIndex, bodyToLoad);

			if (idx < descStride2)
			{
				PxU32 offset = idx &1;
				//float4 view starting at column0.y: the 9-float tensor is fetched as
				//column0.x (scalar, below) plus two float4s.
				float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
				const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
				//volatile float* sh = reinterpret_cast<volatile float*>(&inertias[threadIdx.x / 32][bodyToLoad]);
				volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);

				float4 v = val[offset];

				float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;

				sh[1 + offset * 4] = v.x;
				sh[2 + offset * 4] = v.y;
				sh[3 + offset * 4] = v.z;
				sh[4 + offset * 4] = v.w;

				if(offset == 0)
					sh[offset*4] = v0;
			}
		}

		//Make the staged writes visible to the reads below (Volta+ independent scheduling).
		__syncwarp();

		PxMat33 invInertia0;
		const PxU32 index = (threadIdx.x / warpSize) * warpSize + threadIndexInWarp;
		if (threadIndexInWarp < descStride)
		{
			invInertia0.column0.x = inertias[index].column0.x;
			invInertia0.column0.y = inertias[index].column0.y;
			invInertia0.column0.z = inertias[index].column0.z;
			invInertia0.column1.x = inertias[index].column1.x;
			invInertia0.column1.y = inertias[index].column1.y;
			invInertia0.column1.z = inertias[index].column1.z;
			invInertia0.column2.x = inertias[index].column2.x;
			invInertia0.column2.y = inertias[index].column2.y;
			invInertia0.column2.z = inertias[index].column2.z;

			//printf("%i: (%f, %f, %f) (%f, %f, %f) (%f, %f, %f)\n", threadIdx.x, invInertia0.column0.x, invInertia0.column0.y, invInertia0.column0.z, invInertia0.column1.x, invInertia0.column1.y, invInertia0.column1.z, invInertia0.column2.x, invInertia0.column2.y, invInertia0.column2.z);
		}

		__syncwarp(); //Required (racecheck confirmed) because inertias (Ptr sh points to inertias) is written below and read above

		//Second pass: reuse the same staging slots for body B's tensors.
		for (PxU32 i = 0; i < descStride2; i += 32)
		{
			PxU32 idx = i + threadIndexInWarp;
			PxU32 bodyToLoad = idx / 2;

			PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyBIndex, bodyToLoad);

			if (idx < descStride2)
			{
				PxU32 offset = idx & 1;
				float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
				const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
				volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);

				float4 v = val[offset];

				float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;

				sh[1 + offset * 4] = v.x;
				sh[2 + offset * 4] = v.y;
				sh[3 + offset * 4] = v.z;
				sh[4 + offset * 4] = v.w;

				if (offset == 0)
					sh[offset * 4] = v0;
			}
		}

		__syncwarp();

		PxMat33 invInertia1;

		if (threadIndexInWarp < descStride)
		{
			invInertia1.column0.x = inertias[index].column0.x;
			invInertia1.column0.y = inertias[index].column0.y;
			invInertia1.column0.z = inertias[index].column0.z;
			invInertia1.column1.x = inertias[index].column1.x;
			invInertia1.column1.y = inertias[index].column1.y;
			invInertia1.column1.z = inertias[index].column1.z;
			invInertia1.column2.x = inertias[index].column2.x;
			invInertia1.column2.y = inertias[index].column2.y;
			invInertia1.column2.z = inertias[index].column2.z;
		}

		//mDescStride might less than 32, we need to guard against it
		if(threadIndexInWarp < descStride)
		{
			//port contact code
			PxgBlockContactData& contactData = contactCurrentPrepPool[descIndexBatch];
			PxgBlockContactPoint* baseContact = contactBase + batch.blockContactIndex;
			PxgBlockFrictionPatch& frictionPatch = currFrictionPatches[descIndexBatch];
			PxgBlockFrictionAnchorPatch& fAnchor = currFrictionAnchors[descIndexBatch];

			//Fill in correlation information for next frame...

			PxgBlockWorkUnit& unit = workUnits[descIndexBatch];

			//NOTE(review): this declaration shadows the PxU32 'index' computed above; the
			//outer value is not used again in this scope, but a distinct name would be clearer.
			PxgBlockFrictionIndex index;
			index.createPatchIndex(descIndexBatch, threadIndexInWarp);

			//PxU32 frictionIndex = unit.mFrictionIndex[threadIndexInWarp];
			PxU32 edgeIndex = unit.mEdgeIndex[threadIndexInWarp];
			PxU32 frictionIndex = edgeIndex + totalCurrentEdges * unit.mPatchIndex[threadIndexInWarp];
			PxgBlockFrictionIndex* targetIndex = &frictionIndices[frictionIndex];

			//Publish the friction index as a single 8-byte store.
			*reinterpret_cast<uint2*>(targetIndex) = reinterpret_cast<uint2&>(index);

			//KS - todo - get some of this in shared memory/registers as quickly as possible...
			PxgSolverBodyData* bodyData0 = &solverBodyDatas[bodyAIndex];
			PxgSolverBodyData* bodyData1 = &solverBodyDatas[bodyBIndex];
			//PxgSolverTxIData* txIData0 = &solverTxIDatas[bodyAIndex];
			//PxgSolverTxIData* txIData1 = &solverTxIDatas[bodyBIndex];

			const PxAlignedTransform bodyFrame0 = bodyFrames[bodyAIndex];
			const PxAlignedTransform bodyFrame1 = bodyFrames[bodyBIndex];

			//KS - temporarily read the velocities the "slow" way so we can store the inertia-scaled velocities
			//in velocities buffer for now. We can then switch over later when we create the new prep code for the
			//TGS solver and leave the PGS solver as-is
#if 0
			const float4 linVel_invMass0 = velocities[bodyAIndex];
			const float4 angVelXYZ_penBiasClamp0 = velocities[bodyAIndex + totalBodies];

			const float4 linVel_invMass1 = velocities[bodyBIndex];
			const float4 angVelXYZ_penBiasClamp1 = velocities[bodyBIndex + totalBodies];
#else
			const float4 linVel_invMass0 = bodyData0->initialLinVelXYZ_invMassW;
			const float4 angVelXYZ_penBiasClamp0 = bodyData0->initialAngVelXYZ_penBiasClamp;

			const float4 linVel_invMass1 = bodyData1->initialLinVelXYZ_invMassW;
			const float4 angVelXYZ_penBiasClamp1 = bodyData1->initialAngVelXYZ_penBiasClamp;
#endif

			const PxReal solverOffsetSlop = PxMax(bodyData0->offsetSlop, bodyData1->offsetSlop);

			/*if (i >= constraintPrepDesc->numContactBatches)
			{
				if(bodyBIndex != )
			}*/

			PxU32 offset = unit.mWriteback[threadIndexInWarp];
			createFinalizeSolverContactsBlockGPU(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices, *bodyData0, *bodyData1,
				invInertia0, invInertia1, bodyFrame0, bodyFrame1, linVel_invMass0, angVelXYZ_penBiasClamp0, linVel_invMass1, angVelXYZ_penBiasClamp1,
				sharedDesc->invDtF32, sharedDesc->dt, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
				threadIndexInWarp, offset, &contactHeaders[descIndexBatch], &frictionHeaders[descIndexBatch], &contactPoints[batch.startConstraintIndex],
				&frictions[batch.startFrictionIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, solverOffsetSlop);

			frictionPatch.patchIndex[threadIndexInWarp] = unit.mFrictionPatchIndex[threadIndexInWarp];

			//Transform body-0-local friction anchors by body2World for next-frame correlation.
			PxgBlockFrictionPatch& fpatch = frictionPatch;
			if (fpatch.anchorCount[threadIndexInWarp] >= 1)
				fpatch.anchorPoints[0][threadIndexInWarp] = PxSave3(bodyFrame0.transform(PxLoad3(fAnchor.body0Anchors[0][threadIndexInWarp])));
			if (fpatch.anchorCount[threadIndexInWarp] == 2)
				fpatch.anchorPoints[1][threadIndexInWarp] = PxSave3(bodyFrame0.transform(PxLoad3(fAnchor.body0Anchors[1][threadIndexInWarp])));
		}
	}
}
|
||||
410
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrepTGS.cu
vendored
Normal file
410
engine/third_party/physx/source/gpusolver/src/CUDA/constraintBlockPrepTGS.cu
vendored
Normal file
@@ -0,0 +1,410 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#define IS_TGS_SOLVER
|
||||
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgFrictionPatch.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "DyConstraintPrep.h"
|
||||
#include "contactConstraintBlockPrep.cuh"
|
||||
#include "jointConstraintBlockPrepTGS.cuh"
|
||||
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side stub. NOTE(review): presumably referenced from host setup code so the
// linker retains this translation unit (and registers its kernels) — confirm against the loader.
extern "C" __host__ void initSolverKernels10() {}
|
||||
|
||||
//Prepares block-format (SOA, 32-wide) 1D joint constraints for the TGS solver.
//Launch layout: one warp per joint-constraint batch (dynamic batches followed by the
//static ones); lane t of the warp prepares the t-th joint of its batch.
extern "C" __global__ void jointConstraintBlockPrepareParallelLaunchTGS( PxgConstraintPrepareDesc* solverDesc,
	PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
{
	const PxU32 warpSize = 32;

	//Flatten all warps across the grid into one linear warp id; each warp picks up
	//at most one batch of work.
	const PxU32 warpsPerBlock = blockDim.x / warpSize;
	const PxU32 globalWarpId = blockIdx.x * warpsPerBlock + threadIdx.x / warpSize;

	//Lane index of this thread within its warp.
	const PxU32 laneId = threadIdx.x & (warpSize - 1);

	//Dynamic joint batches are followed by the static ones; both are prepared here.
	const PxU32 totalBatches = solverDesc->num1dConstraintBatches + solverDesc->numStatic1dConstraintBatches;

	const PxU32 batchSlot = globalWarpId;
	if (batchSlot >= totalBatches)
		return;

	const PxU32 batchIndex = solverDesc->jointConstraintBatchIndices[batchSlot];
	PxgBlockConstraintBatch& batch = sharedDesc->iterativeData.blockConstraintBatch[batchIndex];

	//mDescStride may be less than a full warp; idle lanes bail out (no syncs follow).
	const PxU32 nbJointsInBatch = batch.mDescStride;
	if (laneId >= nbJointsInBatch)
		return;

	const PxU32 bodyAIndex = batch.bodyAIndex[laneId];
	const PxU32 bodyBIndex = batch.bodyBIndex[laneId];

	//For joints the constraint-batch index doubles as the index into the prep pools.
	const PxU32 prepPoolIndex = batch.mConstraintBatchIndex;

	PxgBlockConstraint1DData& constraintData = solverDesc->blockJointPrepPool[prepPoolIndex];
	PxgBlockConstraint1DVelocities* rowVelocities = &solverDesc->blockJointPrepPool0[prepPoolIndex * Dy::MAX_CONSTRAINT_ROWS];
	PxgBlockConstraint1DParameters* rowParameters = &solverDesc->blockJointPrepPool1[prepPoolIndex * Dy::MAX_CONSTRAINT_ROWS];

	PxgSolverBodyData* bodyData0 = &solverDesc->solverBodyDataPool[bodyAIndex];
	PxgSolverBodyData* bodyData1 = &solverDesc->solverBodyDataPool[bodyBIndex];
	PxgSolverTxIData* txIData0 = &solverDesc->solverBodyTxIDataPool[bodyAIndex];
	PxgSolverTxIData* txIData1 = &solverDesc->solverBodyTxIDataPool[bodyBIndex];

	//Selects this joint's entry in the solver constant-data pool.
	const PxU32 uniqueIndex = solverDesc->constraintUniqueIndices[batch.mStartPartitionIndex + laneId];

	PxgTGSBlockSolverConstraint1DHeader* header = &sharedDesc->iterativeData.blockJointConstraintHeaders[prepPoolIndex];
	PxgTGSBlockSolverConstraint1DCon* rowsCon = &sharedDesc->iterativeData.blockJointConstraintRowsCon[batch.startConstraintIndex];

	setupSolverConstraintBlockGPUTGS<PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL>(&constraintData, rowVelocities, rowParameters, bodyData0, bodyData1, txIData0, txIData1,
		sharedDesc->stepDt, sharedDesc->stepInvDtF32, sharedDesc->dt, sharedDesc->invDtF32, sharedDesc->lengthScale,
		solverDesc->biasCoefficient, batch, laneId,
		header, rowsCon,
		solverDesc->solverConstantData[uniqueIndex]);
}
|
||||
|
||||
extern "C" __global__ void contactConstraintBlockPrepareParallelLaunchTGS(
|
||||
PxgConstraintPrepareDesc* constraintPrepDesc,
|
||||
PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
|
||||
{
|
||||
//threadCounts[threadIdx.x] = 0;
|
||||
|
||||
//__syncthreads();
|
||||
|
||||
PxgBlockWorkUnit* workUnits = constraintPrepDesc->blockWorkUnit;
|
||||
|
||||
const PxU32 warpSize = 32;
|
||||
|
||||
const PxU32 blockStride = blockDim.x/warpSize;
|
||||
|
||||
//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
|
||||
//and we are going to index the work based on that
|
||||
const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/warpSize;
|
||||
|
||||
//This identifies which thread within a warp a specific thread is
|
||||
const PxU32 threadIndexInWarp = threadIdx.x&(warpSize-1);
|
||||
|
||||
const PxU32 totalPreviousEdges = constraintPrepDesc->totalPreviousEdges;
|
||||
const PxU32 totalCurrentEdges = constraintPrepDesc->totalCurrentEdges;
|
||||
const PxU32 nbContactBatches = constraintPrepDesc->numContactBatches + constraintPrepDesc->numStaticContactBatches;
|
||||
|
||||
|
||||
__shared__ PxgSolverBodyData* solverBodyDatas;
|
||||
__shared__ PxgSolverTxIData* solverTxIDatas;
|
||||
|
||||
__shared__ PxgTGSBlockSolverContactHeader* contactHeaders;
|
||||
__shared__ PxgTGSBlockSolverFrictionHeader* frictionHeaders;
|
||||
__shared__ PxgTGSBlockSolverContactPoint* contactPoints;
|
||||
__shared__ PxgTGSBlockSolverContactFriction* frictions;
|
||||
__shared__ PxU32* batchIndices;
|
||||
__shared__ PxgBlockFrictionIndex* frictionIndices;
|
||||
__shared__ PxgBlockFrictionIndex* prevFrictionIndices;
|
||||
__shared__ PxgBlockContactPoint* contactBase;
|
||||
__shared__ PxgBlockConstraintBatch* constraintBatch;
|
||||
__shared__ PxgBlockContactData* contactCurrentPrepPool;
|
||||
__shared__ PxgBlockFrictionPatch* prevFrictionPatches;
|
||||
__shared__ PxgBlockFrictionPatch* currFrictionPatches;
|
||||
__shared__ PxgBlockFrictionAnchorPatch* prevFrictionAnchors;
|
||||
__shared__ PxgBlockFrictionAnchorPatch* currFrictionAnchors;
|
||||
__shared__ PxAlignedTransform* bodyFrames;
|
||||
|
||||
volatile __shared__ char sInertias[sizeof(PxMat33) * (PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize) * warpSize];
|
||||
volatile PxMat33* inertias = reinterpret_cast<volatile PxMat33*>(sInertias);
|
||||
|
||||
//volatile __shared__ PxMat33 inertias[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / 32][32];
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
|
||||
solverTxIDatas = constraintPrepDesc->solverBodyTxIDataPool;
|
||||
|
||||
contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
|
||||
frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
|
||||
contactPoints = sharedDesc->iterativeData.blockContactPoints;
|
||||
frictions = sharedDesc->iterativeData.blockFrictions;
|
||||
batchIndices = constraintPrepDesc->contactConstraintBatchIndices;
|
||||
frictionIndices = constraintPrepDesc->blockCurrentFrictionIndices;
|
||||
prevFrictionIndices = constraintPrepDesc->blockPreviousFrictionIndices;
|
||||
|
||||
contactBase = constraintPrepDesc->blockContactPoints;
|
||||
constraintBatch = sharedDesc->iterativeData.blockConstraintBatch;
|
||||
contactCurrentPrepPool = constraintPrepDesc->blockContactCurrentPrepPool;
|
||||
currFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
|
||||
prevFrictionPatches = sharedDesc->blockPreviousFrictionPatches;
|
||||
prevFrictionAnchors = constraintPrepDesc->blockPreviousAnchorPatches;
|
||||
currFrictionAnchors = constraintPrepDesc->blockCurrentAnchorPatches;
|
||||
bodyFrames = constraintPrepDesc->body2WorldPool;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
PxU32 i = warpIndex;
|
||||
//unsigned mask_nbContactBatches = __ballot_sync(FULL_MASK, i < nbContactBatches);
|
||||
if(i < nbContactBatches)
|
||||
{
|
||||
/*if (threadIndexInWarp == 0)
|
||||
printf("Processing batch %i\n", i);*/
|
||||
const PxU32 batchIndex = batchIndices[i];
|
||||
PxgBlockConstraintBatch& batch = constraintBatch[batchIndex];
|
||||
const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
|
||||
const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];
|
||||
|
||||
/*if (threadIndexInWarp == 0)
|
||||
printf("Processing batchIndex %i\n", batchIndex);*/
|
||||
|
||||
const PxU32 descIndexBatch = batch.mConstraintBatchIndex;
|
||||
|
||||
const PxU32 descStride = batch.mDescStride;
|
||||
|
||||
//PxgSolverBodyPrepData bodyData0, bodyData1;
|
||||
|
||||
#if LOAD_BODY_DATA
|
||||
loadBodyData(solverBodyDatas, descStride, bodyAIndex, threadIndexInWarp, warpIndexInBlock, bodyData0.initialLinVelXYZ_invMassW, bodyData0.initialAngVelXYZ_penBiasClamp,
|
||||
bodyData0.sqrtInvInertia, bodyData0.body2World);
|
||||
loadBodyData(solverBodyDatas, descStride, bodyBIndex, threadIndexInWarp, warpIndexInBlock, bodyData1.initialLinVelXYZ_invMassW, bodyData1.initialAngVelXYZ_penBiasClamp,
|
||||
bodyData1.sqrtInvInertia, bodyData1.body2World);
|
||||
#endif
|
||||
|
||||
//Read in 16 bytes at a time, we take 3 threads to read in a single inertia tensor, and we have some spare bandwidth. We can read
|
||||
//32 inertia tensors in 3 passes
|
||||
|
||||
const PxU32 descStride2 = descStride*2;
|
||||
|
||||
/*if(threadIndexInWarp == 0)
|
||||
printf("Loading first txIData\n");*/
|
||||
for (PxU32 i = 0; i < descStride2; i += 32)
|
||||
{
|
||||
PxU32 idx = i + threadIndexInWarp;
|
||||
PxU32 bodyToLoad = idx/2;
|
||||
|
||||
PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyAIndex, bodyToLoad);
|
||||
|
||||
if (idx < descStride2)
|
||||
{
|
||||
PxU32 offset = idx & 1;
|
||||
float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
|
||||
//volatile float* sh = reinterpret_cast<volatile float*>(&inertias[threadIdx.x / 32][bodyToLoad]);
|
||||
const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
|
||||
volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);
|
||||
|
||||
float4 v = val[offset];
|
||||
|
||||
float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
|
||||
|
||||
sh[1 + offset * 4] = v.x;
|
||||
sh[2 + offset * 4] = v.y;
|
||||
sh[3 + offset * 4] = v.z;
|
||||
sh[4 + offset * 4] = v.w;
|
||||
|
||||
if (offset == 0)
|
||||
sh[offset * 4] = v0;
|
||||
}
|
||||
}
|
||||
|
||||
__syncwarp();
|
||||
|
||||
PxMat33 invInertia0;
|
||||
const PxU32 index = (threadIdx.x / warpSize) * warpSize + threadIndexInWarp;
|
||||
if (threadIndexInWarp < descStride)
|
||||
{
|
||||
invInertia0.column0.x = inertias[index].column0.x;
|
||||
invInertia0.column0.y = inertias[index].column0.y;
|
||||
invInertia0.column0.z = inertias[index].column0.z;
|
||||
invInertia0.column1.x = inertias[index].column1.x;
|
||||
invInertia0.column1.y = inertias[index].column1.y;
|
||||
invInertia0.column1.z = inertias[index].column1.z;
|
||||
invInertia0.column2.x = inertias[index].column2.x;
|
||||
invInertia0.column2.y = inertias[index].column2.y;
|
||||
invInertia0.column2.z = inertias[index].column2.z;
|
||||
|
||||
//printf("%i: (%f, %f, %f) (%f, %f, %f) (%f, %f, %f)\n", threadIdx.x, invInertia0.column0.x, invInertia0.column0.y, invInertia0.column0.z, invInertia0.column1.x, invInertia0.column1.y, invInertia0.column1.z, invInertia0.column2.x, invInertia0.column2.y, invInertia0.column2.z);
|
||||
}
|
||||
|
||||
__syncwarp(); //Required (racecheck confirmed) because inertias (Ptr sh points to inertias) is written below and read above
|
||||
|
||||
/*if (threadIndexInWarp == 0)
|
||||
printf("Loading second txIData\n");*/
|
||||
for (PxU32 i = 0; i < descStride2; i += 32)
|
||||
{
|
||||
PxU32 idx = i + threadIndexInWarp;
|
||||
PxU32 bodyToLoad = idx / 2;
|
||||
|
||||
PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyBIndex, bodyToLoad);
|
||||
|
||||
if (idx < descStride2)
|
||||
{
|
||||
PxU32 offset = idx & 1;
|
||||
float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
|
||||
const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
|
||||
volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);
|
||||
|
||||
float4 v = val[offset];
|
||||
|
||||
float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
|
||||
|
||||
sh[1 + offset * 4] = v.x;
|
||||
sh[2 + offset * 4] = v.y;
|
||||
sh[3 + offset * 4] = v.z;
|
||||
sh[4 + offset * 4] = v.w;
|
||||
|
||||
if (offset == 0)
|
||||
sh[offset * 4] = v0;
|
||||
}
|
||||
}
|
||||
|
||||
__syncwarp();
|
||||
|
||||
/*if (threadIndexInWarp == 0)
|
||||
printf("Loaded second txIData\n");*/
|
||||
|
||||
PxMat33 invInertia1;
|
||||
|
||||
if (threadIndexInWarp < descStride)
|
||||
{
|
||||
invInertia1.column0.x = inertias[index].column0.x;
|
||||
invInertia1.column0.y = inertias[index].column0.y;
|
||||
invInertia1.column0.z = inertias[index].column0.z;
|
||||
invInertia1.column1.x = inertias[index].column1.x;
|
||||
invInertia1.column1.y = inertias[index].column1.y;
|
||||
invInertia1.column1.z = inertias[index].column1.z;
|
||||
invInertia1.column2.x = inertias[index].column2.x;
|
||||
invInertia1.column2.y = inertias[index].column2.y;
|
||||
invInertia1.column2.z = inertias[index].column2.z;
|
||||
}
|
||||
|
||||
//mDescStride might be less than 32, we need to guard against it
|
||||
if(threadIndexInWarp < descStride)
|
||||
{
|
||||
//port contact code
|
||||
PxgBlockContactData& contactData = contactCurrentPrepPool[descIndexBatch];
|
||||
PxgBlockContactPoint* baseContact = contactBase + batch.blockContactIndex;
|
||||
PxgBlockFrictionPatch& frictionPatch = currFrictionPatches[descIndexBatch];
|
||||
PxgBlockFrictionAnchorPatch& fAnchor = currFrictionAnchors[descIndexBatch];
|
||||
|
||||
//Fill in correlation information for next frame...
|
||||
|
||||
PxgBlockWorkUnit& unit = workUnits[descIndexBatch];
|
||||
|
||||
PxgBlockFrictionIndex index;
|
||||
index.createPatchIndex(descIndexBatch, threadIndexInWarp);
|
||||
|
||||
//PxU32 frictionIndex = unit.mFrictionIndex[threadIndexInWarp];
|
||||
PxU32 edgeIndex = unit.mEdgeIndex[threadIndexInWarp];
|
||||
PxU32 frictionIndex = edgeIndex + totalCurrentEdges * unit.mPatchIndex[threadIndexInWarp];
|
||||
PxgBlockFrictionIndex* targetIndex = &frictionIndices[frictionIndex];
|
||||
|
||||
*reinterpret_cast<uint2*>(targetIndex) = reinterpret_cast<uint2&>(index);
|
||||
|
||||
//KS - todo - get some of this in shared memory/registers as quickly as possible...
|
||||
PxgSolverBodyData* bodyData0 = &solverBodyDatas[bodyAIndex];
|
||||
PxgSolverBodyData* bodyData1 = &solverBodyDatas[bodyBIndex];
|
||||
//PxgSolverTxIData* txIData0 = &solverTxIDatas[bodyAIndex];
|
||||
//PxgSolverTxIData* txIData1 = &solverTxIDatas[bodyBIndex];
|
||||
|
||||
const PxAlignedTransform bodyFrame0 = bodyFrames[bodyAIndex];
|
||||
const PxAlignedTransform bodyFrame1 = bodyFrames[bodyBIndex];
|
||||
|
||||
//KS - temporarily read the velocities the "slow" way so we can store the inertia-scaled velocities
|
||||
//in velocities buffer for now. We can then switch over later when we create the new prep code for the
|
||||
//TGS solver and leave the PGS solver as-is
|
||||
#if 0
|
||||
const float4 linVel_invMass0 = velocities[bodyAIndex];
|
||||
const float4 angVelXYZ_penBiasClamp0 = velocities[bodyAIndex + totalBodies];
|
||||
|
||||
const float4 linVel_invMass1 = velocities[bodyBIndex];
|
||||
const float4 angVelXYZ_penBiasClamp1 = velocities[bodyBIndex + totalBodies];
|
||||
#else
|
||||
|
||||
//We use these velocities because these are not multiplied by sqrtInertia. This is a bit slower to read but
|
||||
//means we can treat kinematics and statics the same in the below code.
|
||||
const float4 linVel_invMass0 = bodyData0->initialLinVelXYZ_invMassW;
|
||||
const float4 angVelXYZ_penBiasClamp0 = bodyData0->initialAngVelXYZ_penBiasClamp;
|
||||
|
||||
const float4 linVel_invMass1 = bodyData1->initialLinVelXYZ_invMassW;
|
||||
const float4 angVelXYZ_penBiasClamp1 = bodyData1->initialAngVelXYZ_penBiasClamp;
|
||||
#endif
|
||||
|
||||
const PxReal offsetSlop = PxMax(bodyData0->offsetSlop, bodyData1->offsetSlop);
|
||||
|
||||
PxU32 offset = unit.mWriteback[threadIndexInWarp];
|
||||
const float2 torsionalData = unit.mTorsionalFrictionData[threadIndexInWarp];
|
||||
createFinalizeSolverContactsBlockGPUTGS(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices, *bodyData0, *bodyData1,
|
||||
invInertia0, invInertia1, bodyFrame0, bodyFrame1, linVel_invMass0, angVelXYZ_penBiasClamp0, linVel_invMass1, angVelXYZ_penBiasClamp1,
|
||||
sharedDesc->stepInvDtF32, sharedDesc->stepDt, sharedDesc->dt, sharedDesc->invDtF32, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
|
||||
constraintPrepDesc->biasCoefficient, threadIndexInWarp, offset, &contactHeaders[descIndexBatch], &frictionHeaders[descIndexBatch], &contactPoints[batch.startConstraintIndex],
|
||||
&frictions[batch.startFrictionIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, offsetSlop,
|
||||
torsionalData);
|
||||
|
||||
frictionPatch.patchIndex[threadIndexInWarp] = unit.mFrictionPatchIndex[threadIndexInWarp];
|
||||
|
||||
PxgBlockFrictionPatch& fpatch = frictionPatch;
|
||||
if (fpatch.anchorCount[threadIndexInWarp] >= 1)
|
||||
fpatch.anchorPoints[0][threadIndexInWarp] = bodyFrame0.transform(fAnchor.body0Anchors[0][threadIndexInWarp]);
|
||||
if (fpatch.anchorCount[threadIndexInWarp] == 2)
|
||||
fpatch.anchorPoints[1][threadIndexInWarp] = bodyFrame0.transform(fAnchor.body0Anchors[1][threadIndexInWarp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
378
engine/third_party/physx/source/gpusolver/src/CUDA/constraintPrepShared.cuh
vendored
Normal file
378
engine/third_party/physx/source/gpusolver/src/CUDA/constraintPrepShared.cuh
vendored
Normal file
@@ -0,0 +1,378 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CONSTRAINT_PREP_SHARED_CUH__
|
||||
#define __CONSTRAINT_PREP_SHARED_CUH__
|
||||
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "PxNodeIndex.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "PxgEdgeType.h"
|
||||
#include "PxgDynamicsConfiguration.h"
|
||||
#include "stdio.h"
|
||||
#include "utils.cuh"
|
||||
#include "PxgSolverFlags.h"
|
||||
#include "DyCpuGpu1dConstraint.h"
|
||||
#include "PxgPartitionNode.h"
|
||||
|
||||
#define PXC_SAME_NORMAL 0.999f
|
||||
|
||||
// Computes the remapped solver-velocity index for one rigid body referenced by a
// constraint batch.
//
// Rigid-body constraint batches hold up to PXG_BATCH_SIZE (32) body pairs each.
// Joints attached to articulations and articulation contacts are indexed like
// rigid bodies; their rigid-body slots appear after the rigid batches in the
// velocity buffer, before the per-slab accumulation region.
//
// Layout of the output index space (in units of float4 velocity elements):
//   [rigid batches][articulation batches][nbSlabs * bodyCount accumulation slots]
// The accumulation region is used when a body is referenced from a different
// slab than the current one, so its per-slab results can be averaged later.
//
// Parameters:
//   isSecondBody            - true when remapping body B of the pair (offsets by one batch stride).
//   partition*StartIndices/*JointCounts - per-partition batch offsets for contacts vs. joints.
//   indexData               - partition/type/entry info for this constraint edge.
//   solverBodyReferences    - out: records the remapped index when a new slab reference is created.
//   currPartition, maxNbPartitions, nbSlabs - slab bookkeeping.
//   bodyId, activeBodyOffset - bodies with bodyId < activeBodyOffset are static/kinematic
//                              and never get slab accumulation slots.
//   solverBodyOutputVelocityOffset - upper bound on the result, used only by the assert.
static __device__ physx::PxU32 computeRemapIndexRigidBody(bool isSecondBody,
	const physx::PxU32* const PX_RESTRICT partitionStartIndices,
	const physx::PxU32* const PX_RESTRICT partitionArticStartIndices,
	const physx::PxU32* const PX_RESTRICT partitionJointCounts,
	const physx::PxU32* const PX_RESTRICT partitionArticulationJointCounts,
	const physx::PartitionIndexData& indexData,
	physx::PxgSolverReferences* solverBodyReferences,
	physx::PxU32 currPartition,
	physx::PxU32 maxNbPartitions,
	const physx::PxU32 totalActiveBodyCount,
	const physx::PxU32 bodyId,
	const physx::PxU32 activeBodyOffset,
	const physx::PxU32 totalRigidBatches,
	const physx::PxU32 totalArticBatches,
	const physx::PxU32 nbElemsPerBody,
	const physx::PxU32 nbSlabs,
	const physx::PxU32 solverBodyOutputVelocityOffset //Only used for assert
	)
{
	using namespace physx;

	const PxU32 partitionIndex = indexData.mPartitionIndex;
	const PxU32 partitionEntryIndex = indexData.mPartitionEntryIndex;

	// BUGFIX: 'index' was previously left uninitialized; an unexpected edge type
	// fell through the empty default case and the indeterminate value was used
	// below (undefined behavior). Zero-initialize and trap in debug builds.
	PxU32 index = 0;

	// Select the batch slot for this edge. Within a partition the joint batches
	// come first, followed by contact batches; articulation batches follow all
	// rigid batches.
	switch (indexData.mCType)
	{
	case PxgEdgeType::eCONTACT_MANAGER:
		index = partitionStartIndices[partitionIndex] + partitionJointCounts[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eCONSTRAINT:
		index = partitionStartIndices[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eARTICULATION_CONTACT:
		index = totalRigidBatches + partitionArticStartIndices[partitionIndex] + partitionArticulationJointCounts[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eARTICULATION_CONSTRAINT:
		index = totalRigidBatches + partitionArticStartIndices[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	default:
		assert(0); // unknown edge type - index would otherwise be garbage
		break;
	}

	{
		// Expand the batch slot into an element index: each batch occupies
		// 2 * PXG_BATCH_SIZE * nbElemsPerBody elements (body A then body B),
		// with the lane within the batch selected by the entry index.
		const PxU32 batchMask = PXG_BATCH_SIZE - 1;

		index = (index*PXG_BATCH_SIZE * 2)*nbElemsPerBody + (partitionEntryIndex & batchMask);
		if (isSecondBody)
			index += PXG_BATCH_SIZE*nbElemsPerBody;
	}

	if (bodyId >= activeBodyOffset)
	{
		// Dynamic body: if this reference crosses into a different slab than the
		// current partition's slab (or wraps behind it), create a new solver
		// reference so the per-slab results can be averaged afterwards.
		if ((partitionIndex & (~(maxNbPartitions - 1))) != (currPartition & (~(maxNbPartitions - 1))) || partitionIndex <= currPartition)
		{
			PxU32 slabId = partitionIndex / maxNbPartitions;

			PxU32 referenceId = nbSlabs * (bodyId - activeBodyOffset) + slabId;

			solverBodyReferences[referenceId].mRemappedBodyIndex = index;

			// Reshape the reference id so that the averaged outputs are laid out
			// for coalesced loading (groups of 32 lanes, nbElemsPerBody apart).
			referenceId = (referenceId&(~31))*nbElemsPerBody + (referenceId & 31);

			// There are (totalRigidBatches + totalArticBatches) * 32 * 2 velocity
			// vectors for the solver proper, then the slab accumulation slots.
			index = (totalArticBatches + totalRigidBatches) * PXG_BATCH_SIZE * 2 * nbElemsPerBody + referenceId;
		}
	}

	assert(index < solverBodyOutputVelocityOffset);

	return index;
}
|
||||
|
||||
|
||||
// Returns true when the two friction anchors (one local to each body) still lie
// within 'correlDist' of each other when measured along 'axis' in body0's frame.
// Used to decide whether a cached friction anchor pair is still valid.
static __device__ PX_FORCE_INLINE bool pointsAreClose(const physx::PxAlignedTransform& body1ToBody0,
	const float4& localAnchor0, const float4& localAnchor1,
	const float4& axis, float correlDist)
{
	using namespace physx;

	// Bring body1's anchor into body0's frame, then measure the separation of
	// the two anchors projected onto the patch axis.
	const float4 anchor1InBody0 = body1ToBody0.transform(localAnchor1);
	const float separation = dot3(localAnchor0 - anchor1InBody0, axis);

	return PxAbs(separation) < correlDist;
}
|
||||
|
||||
// Decides whether the compliant-contact damper should be active. Damping is
// suppressed while the contact is separated and not expected to close this
// step, because a damper acting across an open gap can generate repulsive
// forces before the contact is actually closed.
static __device__ PX_FORCE_INLINE PxReal computeCompliantDamping(bool isSeparated, bool collidingWithVrel, PxReal damping)
{
	// Keep the damper only if the contact is closed or closing fast enough.
	return (isSeparated && !collidingWithVrel) ? 0.0f : damping;
}
|
||||
|
||||
// Stores the minimal per-row coefficients from which
// "compute1dConstraintSolverConstantsPGS" later reconstructs constant,
// unbiasedConstant, velMultiplier and impulseMultiplier.
//
// Spring rows:     coeff0 = "a", coeff1 = "b" of the implicit spring integration.
// Hard rows:       coeff0 = biased constant, coeff1 = unbiased constant
//                  (both still to be scaled by recipUnitResponse later).
static __device__ PX_FORCE_INLINE void queryReduced1dConstraintSolverConstantsPGS(
	const PxU16 constraintFlags, const PxReal springStiffness, const PxReal springDamping, const PxReal restitution,
	const PxReal bounceThreshold, const PxReal geometricError, const PxReal velocityTarget,
	const PxReal jointSpeedForRestitutionBounce, const PxReal erp, const PxReal simDt, const PxReal recipSimDt,
	PxReal& coeff0, PxReal& coeff1)
{
	if (constraintFlags & Px1DConstraintFlag::eSPRING)
	{
		// Implicit spring discretization over the full sim step.
		coeff0 = simDt * simDt * springStiffness + simDt * springDamping;
		coeff1 = simDt * (springDamping * velocityTarget - springStiffness * geometricError);
		return;
	}

	// Hard row: a restitution bounce overrides both constants when triggered.
	const PxReal bounceVel = Dy::computeBounceVelocity(constraintFlags, jointSpeedForRestitutionBounce, bounceThreshold, restitution, geometricError);
	if (bounceVel != 0.0f)
	{
		coeff0 = bounceVel;
		coeff1 = bounceVel;
	}
	else
	{
		// Baumgarte-style bias scaled by erp; the unbiased constant drops the
		// error term unless eKEEPBIAS requests otherwise.
		const PxReal scaledError = geometricError * erp;
		coeff0 = velocityTarget - scaledError * recipSimDt;
		coeff1 = (constraintFlags & Px1DConstraintFlag::eKEEPBIAS) ? coeff0 : velocityTarget;
	}
}
|
||||
|
||||
// Reconstructs constant, unbiasedConstant, velMultiplier and impulseMultiplier
// from the reduced coefficients produced by
// "queryReduced1dConstraintSolverConstantsPGS".
//
// coeff0/coeff1: spring "a"/"b" terms (spring rows) or the biased/unbiased
//                constants pre-scaling (hard rows).
// coeff2:        the initial joint speed, folded into both constants at the end.
static __device__ PX_FORCE_INLINE void compute1dConstraintSolverConstantsPGS
(bool isSpring, bool isAccelerationSpring, PxReal coeff0, PxReal coeff1, PxReal coeff2,
	const PxReal unitResponse, const PxReal recipUnitResponse,
	PxReal& constant, PxReal& unbiasedConstant, PxReal& velMultiplier, PxReal& impulseMultiplier)
{
	if (isSpring)
	{
		const PxReal a = coeff0;
		const PxReal b = coeff1;

		if (isAccelerationSpring)
		{
			// Acceleration spring: the response is not part of the implicit
			// solve, so outputs are scaled by the effective mass instead.
			const PxReal x = 1.0f / (1.0f + a);
			impulseMultiplier = 1.0f - x;
			velMultiplier = -x * recipUnitResponse * a;
			constant = x * recipUnitResponse * b;
			unbiasedConstant = constant;
		}
		else
		{
			// Force spring: fold the unit response into the implicit solve.
			const PxReal x = 1.0f / (1.0f + a * unitResponse);
			impulseMultiplier = 1.0f - x;
			velMultiplier = -x * a;
			constant = x * b;
			unbiasedConstant = constant;
		}
	}
	else
	{
		// Hard constraint: scale the reduced constants by the inverse response.
		velMultiplier = -recipUnitResponse;
		impulseMultiplier = 1.0f;
		constant = coeff0 * recipUnitResponse;
		unbiasedConstant = coeff1 * recipUnitResponse;
	}

	// Bias both constants by the initial joint speed (coeff2).
	const PxReal velBias = coeff2 * velMultiplier;
	constant += velBias;
	unbiasedConstant += velBias;
}
|
||||
|
||||
|
||||
// Computes the compliant-contact solver terms (velMultiplier, impulseMultiplier,
// biased/unbiased error) via an implicit spring discretization over 'dt'.
// 'flags' selects acceleration-spring vs force-spring behavior; the damper is
// disabled while the contact is open and not closing (see
// computeCompliantDamping). Returns the spring coefficient "a" for reuse.
static __device__ PX_FORCE_INLINE PxReal
computeCompliantContactCoefficients(PxReal dt, PxU8 flags, PxReal restitution, PxReal damping, PxReal unitResponse,
	PxReal recipResponse, PxReal penetration, PxReal targetVelocity, bool isSeparated,
	bool collidingWithVrel, PxReal& velMultiplier, PxReal& impulseMultiplier,
	PxReal& unbiasedErr, PxReal& biasedErr)
{
	// Acceleration springs operate in acceleration space: scale by the
	// effective mass (recipResponse) and drop the response from the solve.
	const bool accelSpring = !!(flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING);
	const PxReal massIfAccelElseOne = accelSpring ? recipResponse : 1.0f;
	const PxReal oneIfAccelElseR = accelSpring ? 1.0f : unitResponse;

	const PxReal nrdt = dt * restitution;
	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);

	const PxReal a = dt * (activeDamping - nrdt);
	const PxReal b = -(nrdt * penetration * massIfAccelElseOne);
	const PxReal x = 1.f / (a * oneIfAccelElseR + 1.f);
	const PxReal scaledBias = x * b;

	velMultiplier = x * a * massIfAccelElseOne;
	impulseMultiplier = 1.f - x;
	unbiasedErr = biasedErr = targetVelocity * velMultiplier - scaledBias;

	return a;
}
|
||||
|
||||
// Precomputes the two per-contact coefficients (coeff0, coeff1) from which the
// full compliant-contact terms can be cheaply rebuilt at every sub-timestep or
// iteration. See "computeCompliantContactCoefficients".
static __device__ PX_FORCE_INLINE void
queryReducedCompliantContactCoefficients(PxReal dt, PxU8 flags, PxReal restitution, PxReal damping, PxReal penetration,
	PxReal targetVelocity, bool isSeparated, bool collidingWithVrel,
	PxReal& coeff0, PxReal& coeff1)
{
	const PxReal nrdt = dt * restitution;
	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);

	coeff0 = dt * (activeDamping - nrdt);	// "a" term of the implicit spring
	coeff1 = -nrdt * penetration;			// "b" term; still to be scaled by massIfAccelElseOne
}
|
||||
|
||||
|
||||
// Rebuilds velMultiplier, impulseMultiplier and the biased/unbiased error terms
// from precomputed coefficients coeff0/coeff1.
// restitution < 0 selects the compliant (spring) path, completing the solve
// started in "queryReducedCompliantContactCoefficients"; otherwise coeff0 and
// coeff1 are the biased/unbiased constants of a regular restitution contact.
static __device__ PX_FORCE_INLINE void
computeContactCoefficients(PxU8 flags, PxReal restitution, PxReal unitResponse, PxReal recipResponse, PxReal targetVelocity,
	PxReal coeff0, PxReal coeff1, PxReal& velMultiplier, PxReal& impulseMultiplier,
	PxReal& unbiasedErr, PxReal& biasedErr)
{
	if (restitution < 0.f)
	{
		// Compliant contact: finish the implicit-spring solve.
		const bool accelSpring = !!(flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING);
		const PxReal massIfAccelElseOne = accelSpring ? recipResponse : 1.0f;
		const PxReal oneIfAccelElseR = accelSpring ? 1.0f : unitResponse;

		const PxReal a = coeff0;
		const PxReal b = coeff1 * massIfAccelElseOne;

		const PxReal x = 1.f / (a * oneIfAccelElseR + 1.f);
		const PxReal scaledBias = x * b;

		velMultiplier = x * a * massIfAccelElseOne;
		impulseMultiplier = 1.f - x;
		unbiasedErr = biasedErr = targetVelocity * velMultiplier - scaledBias;
	}
	else
	{
		// Rigid contact: scale the stored constants by the effective mass.
		velMultiplier = recipResponse;
		impulseMultiplier = 1.f;
		biasedErr = coeff0 * velMultiplier;
		unbiasedErr = coeff1 * velMultiplier;
	}
}
|
||||
|
||||
// TGS variant: computes velMultiplier and the scaled bias coefficient for a
// compliant contact over a single TGS sub-step of length 'stepDt'. Returns the
// spring coefficient "a" so the per-iteration overload below can reuse it.
static __device__ PX_FORCE_INLINE PxReal
computeCompliantContactCoefficientsTGS(PxReal stepDt, PxU8 flags, PxReal restitution, PxReal damping,
	PxReal unitResponse, PxReal recipResponse, bool isSeparated, bool collidingWithVrel,
	PxReal& velMultiplier, PxReal& scaledBias)
{
	const bool accelSpring = !!(flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING);
	const PxReal massIfAccelElseOne = accelSpring ? recipResponse : 1.0f;
	const PxReal oneIfAccelElseR = accelSpring ? 1.0f : unitResponse;

	// Damper is switched off while the contact is open and not closing.
	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);
	const PxReal nrdt = stepDt * restitution;
	const PxReal a = stepDt * (activeDamping - nrdt);
	const PxReal x = 1.f / (a * oneIfAccelElseR + 1.f);

	velMultiplier = x * a * massIfAccelElseOne;
	scaledBias = nrdt * x * oneIfAccelElseR;

	return a;	// compliant contact coefficient "a"
}
|
||||
|
||||
// TGS per-iteration variant taking the precomputed spring coefficient "a" and
// nrdt (= stepDt * restitution), avoiding the damping/stiffness recomputation
// on every iteration.
static __device__ PX_FORCE_INLINE void
computeCompliantContactCoefficientsTGS(PxU8 flags, PxReal nrdt, PxReal unitResponse, PxReal recipResponse, PxReal a,
	PxReal& velMultiplier, PxReal& scaledBias)
{
	const bool accelSpring = !!(flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING);
	const PxReal massIfAccelElseOne = accelSpring ? recipResponse : 1.0f;
	const PxReal oneIfAccelElseR = accelSpring ? 1.0f : unitResponse;

	const PxReal x = 1.f / (a * oneIfAccelElseR + 1.f);

	velMultiplier = x * a * massIfAccelElseOne;
	scaledBias = nrdt * x * oneIfAccelElseR;
}
|
||||
|
||||
#endif
|
||||
1243
engine/third_party/physx/source/gpusolver/src/CUDA/contactConstraintBlockPrep.cuh
vendored
Normal file
1243
engine/third_party/physx/source/gpusolver/src/CUDA/contactConstraintBlockPrep.cuh
vendored
Normal file
File diff suppressed because it is too large
Load Diff
368
engine/third_party/physx/source/gpusolver/src/CUDA/contactConstraintPrep.cuh
vendored
Normal file
368
engine/third_party/physx/source/gpusolver/src/CUDA/contactConstraintPrep.cuh
vendored
Normal file
@@ -0,0 +1,368 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#ifndef __CONTACT_CONSTRAINT_PREP_CUH__
|
||||
#define __CONTACT_CONSTRAINT_PREP_CUH__
|
||||
|
||||
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraintBlock.h"
|
||||
#include "PxgFrictionPatch.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "cutil_math.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "DyCpuGpuArticulation.h"
|
||||
#include "PxMaterial.h"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
#include "PxgSolverFlags.h"
|
||||
#include "MemoryAllocator.cuh"
|
||||
#include "vector.cuh"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "constraintPrepShared.cuh"
|
||||
#include "copy.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
|
||||
|
||||
// Attempts to carry a friction patch (and its anchors) over from the previous
// frame. Scans the previous-frame patches referenced by 'prevFrictionIndices';
// if one matches the new patch's normal on both bodies and its anchors are
// still within 'correlationDistance' along the normal, the old patch data is
// copied into 'frictionPatch'/'anchorPatch' and the squared anchor extent is
// written to 'patchExtents'. Always returns true (the return value is kept for
// interface compatibility).
//
// Note: 'threadIndexInWarp' is unused here but kept to preserve the signature.
static __device__ bool getFrictionPatches(PxgFrictionPatch& frictionPatch,
	PxgFrictionAnchorPatch& anchorPatch,
	const PxgBlockFrictionIndex* PX_RESTRICT prevFrictionIndices,
	const PxU32 prevFrictionStartIndex,
	const PxgFrictionPatch* PX_RESTRICT previousPatches,
	const PxgFrictionAnchorPatch* PX_RESTRICT previousAnchors,
	PxU32 frictionPatchCount,
	const PxAlignedTransform& bodyFrame0,
	const PxAlignedTransform& bodyFrame1,
	PxReal correlationDistance,
	const PxU32 totalNbEdges,
	PxReal& patchExtents,
	const PxU32 threadIndexInWarp)
{
	// No cached friction from the previous frame.
	if (prevFrictionStartIndex == 0xFFFFFFFF || frictionPatchCount == 0)
		return true;

	PxgFrictionPatch& newPatch = frictionPatch;
	PxgFrictionAnchorPatch& newAnchor = anchorPatch;

	for (PxU32 a = 0; a < frictionPatchCount; a++)
	{
		// Indices for this edge are strided by totalNbEdges per patch.
		const PxU64 index = prevFrictionIndices[prevFrictionStartIndex + a*totalNbEdges].getPatchIndex();
		const PxgFrictionPatch& oldPatch = previousPatches[index];
		const PxgFrictionAnchorPatch& oldAnchor = previousAnchors[index];

		assert(oldPatch.broken == 0 || oldPatch.broken == 1);
		if (oldPatch.broken)
			continue;

		// Normals must still agree on body0. (Guard written as !(x > t) so a
		// NaN dot product rejects the patch, matching the original logic.)
		const float4 oldBody0Normal = oldPatch.body0Normal;
		if (!(dot3(oldBody0Normal, newPatch.body0Normal) > PXC_SAME_NORMAL)) //TODO - check that they're the same material!
			continue;

		const PxU8 anchorCount = oldPatch.anchorCount;
		if (anchorCount == 0)
			continue;
		assert(anchorCount <= 2);

		const PxAlignedTransform body1ToBody0 = bodyFrame0.transformInv(bodyFrame1);
		const float4 oldBody1Normal = oldPatch.body1Normal;

		// BUGFIX: the rotated-normal dot product was computed twice (once into
		// an unused local, once in the condition). Compute it once and reuse.
		const float normalAgreement = dot3(oldBody0Normal, body1ToBody0.rotate(oldBody1Normal));
		if (!(normalAgreement > PXC_SAME_NORMAL))
			continue;

		// First anchor pair must still correlate along the normal.
		const float4 body0Anchor0 = oldAnchor.body0Anchors[0];
		const float4 body1Anchor0 = oldAnchor.body1Anchors[0];
		if (!pointsAreClose(body1ToBody0, body0Anchor0, body1Anchor0, oldBody0Normal, correlationDistance))
			continue;

		// Second pair checked only when present; slot 1 is read unconditionally
		// (fixed-size storage) but only validated when anchorCount == 2.
		const float4 body0Anchor1 = oldAnchor.body0Anchors[1];
		const float4 body1Anchor1 = oldAnchor.body1Anchors[1];
		if (anchorCount < 2 || pointsAreClose(body1ToBody0, body0Anchor1, body1Anchor1, oldBody0Normal, correlationDistance))
		{
			newPatch.contactID[0] = 0xff;
			newPatch.contactID[1] = 0xff;
			newPatch.anchorCount = anchorCount;
			newPatch.body0Normal = oldBody0Normal;
			newPatch.body1Normal = oldBody1Normal;
			newAnchor.body0Anchors[0] = body0Anchor0;
			newAnchor.body0Anchors[1] = body0Anchor1;
			newAnchor.body1Anchors[0] = body1Anchor0;
			newAnchor.body1Anchors[1] = body1Anchor1;

			const float4 ext = (body0Anchor0 - body0Anchor1);
			patchExtents = ext.x*ext.x + ext.y*ext.y + ext.z*ext.z;
			return true; //Found a match = terminate!
		}
	}

	return true;
}
|
||||
|
||||
// Warp-cooperative routine that grows/rebuilds the anchor set of a friction patch
// from the current frame's contacts. Up to two anchors are kept, chosen to be as far
// apart as possible among contacts whose separation is below frictionOffsetThreshold.
//
// Thread layout: the whole warp executes this together; lanes 0..2 carry the x/y/z
// components of vectors through the component-wise helpers (transform/transformInv/
// negateMagnitudeSquared) and the __shfl_sync reductions. All branch conditions below
// are warp-uniform (they depend on shared scratch values or per-contact data every
// lane reads identically), so the __syncwarp() calls inside branches are safe.
//
// minimum/maximum are one component each of the patch bounds (see //PxBounds3 notes);
// the squared patch diagonal is assembled from lanes 0..2 via __shfl_sync.
// Scratch memory for the two world-space anchors and their contact IDs comes from
// sAlloc; ScratchMemoryMarker releases it on scope exit.
static __device__ void growPatches(PxgFrictionPatch& fp, PxgFrictionAnchorPatch& fAnchor,
	const physx::PxgContactPoint* msContacts, const PxU32 numContacts,
	const physx::PxTransform& msBodyFrame0,
	const physx::PxTransform& msBodyFrame1,
	float frictionOffsetThreshold,
	const PxReal anchorSqDistance,
	const float minimum, //PxBounds3
	const float maximum, //PxBounds3
	ScratchMemoryAllocator& sAlloc,
	const PxU32 threadIndexInWarp)
{
	using namespace physx;
	PxU32 oldAnchorCount = fp.anchorCount;

	// Releases all scratch allocations made below when this function returns.
	ScratchMemoryMarker marker(sAlloc);

	if (oldAnchorCount == 2)
	{
		// Lanes 0..2 hold the squared extent of the patch bounds in x/y/z.
		float dif = 0.f;
		if (threadIndexInWarp < 3)
		{
			dif = maximum - minimum;
			dif = dif * dif;
		}

		// Sum the three per-axis squared extents into the squared diagonal.
		const PxReal frictionPatchDiagonalSq = __shfl_sync(FULL_MASK, dif, 0)
			+ __shfl_sync(FULL_MASK, dif, 1)
			+ __shfl_sync(FULL_MASK, dif, 2);

		//If the squared distance between the anchors is more than a quarter of the patch diagonal, we can keep,
		//otherwise the anchors are potentially clustered around a corner so force a rebuild of the patch
		if ((anchorSqDistance * 4.f) >= frictionPatchDiagonalSq)
			return;

		// Anchors too clustered: discard them and rebuild from scratch below.
		oldAnchorCount = 0;
	}

	//__shared__ PxVec3 worldAnchors[2];
	//__shared__ PxU32 contactID[2];

	// Warp-shared scratch: candidate anchors in world space and their source contact IDs.
	PxVec3* msWorldAnchors = sAlloc.allocAligned<PxVec3>(sizeof(PxVec3) * 2);
	PxU32* msContactID = sAlloc.allocAligned<PxU32>(sizeof(PxU32) * 2);

	PxU16 anchorCount = 0;
	PxReal pointDistSq = 0.0f, dist0, dist1;

	// if we have an anchor already, keep it
	if (oldAnchorCount == 1)
	{
		// Lanes 0..2 load the x/y/z of the retained body0 anchor.
		float v = 0.f;
		if (threadIndexInWarp < 3)
		{
			float* anchors = reinterpret_cast<float*>(&fAnchor.body0Anchors[0].x);
			v = anchors[threadIndexInWarp];
		}

		// Component-wise transform of the anchor into world space.
		transform(v, msBodyFrame0, msWorldAnchors[0], threadIndexInWarp);

		// 0xFF marks an anchor carried over rather than taken from a new contact.
		if (threadIndexInWarp == 0)
			msContactID[0] = 0xFF;

		/*const PxVec3 v(fAnchor.body0Anchors[0].x, fAnchor.body0Anchors[0].y, fAnchor.body0Anchors[0].z);
		worldAnchors[0] = bodyFrame0.transform(v);
		contactID[0] = 0xFF;*/
		anchorCount++;
	}

	__syncwarp();

	//PxVec3& msWorldPoint = *sAlloc.alloc<PxVec3>(sizeof(PxVec3));

	// Greedy anchor selection over all sufficiently-close contacts:
	// first contact seeds anchor 0, the first distinct contact seeds anchor 1,
	// afterwards replace whichever anchor maximizes the anchor-pair separation.
	for (PxU32 j = 0; j<numContacts; j++)
	{
		const PxReal separation = msContacts[j].point_separationW.w;

		if (separation < frictionOffsetThreshold)
		{
			//const float* contacts = reinterpret_cast<const float*>(&msContacts[j].point_separationW.x);
			const PxVec3& worldPoint = reinterpret_cast<const PxVec3&>(msContacts[j].point_separationW.x);
			switch (anchorCount)
			{
			case 0:
				// No anchors yet: take this contact as anchor 0.
				if (threadIndexInWarp < 3)
				{
					msWorldAnchors[0][threadIndexInWarp] = worldPoint[threadIndexInWarp];
					if (threadIndexInWarp == 0)
						msContactID[0] = PxU16(j);
				}
				anchorCount++;
				__syncwarp();
				/*contactID[0] = PxU16(j);
				worldAnchors[0] = worldPoint;
				anchorCount++;*/
				break;
			case 1:
				//pointDistSq = (worldPoint - worldAnchors[0]).magnitudeSquared();
				// Only accept a second anchor that is measurably distinct from anchor 0.
				pointDistSq = negateMagnitudeSquared(worldPoint, msWorldAnchors[0], threadIndexInWarp);
				if (pointDistSq > 1e-8f)
				{
					if (threadIndexInWarp < 3)
					{
						msWorldAnchors[1][threadIndexInWarp] = worldPoint[threadIndexInWarp];
						if (threadIndexInWarp == 0)
							msContactID[1] = PxU16(j);
					}
					anchorCount++;

					__syncwarp();

				}

				break;
			default: //case 2
				// Both slots filled: replace an anchor only if doing so increases the
				// squared distance between the pair (tracked in pointDistSq).
				dist0 = negateMagnitudeSquared(worldPoint, msWorldAnchors[0], threadIndexInWarp);
				dist1 = negateMagnitudeSquared(worldPoint, msWorldAnchors[1], threadIndexInWarp);

				//dist0 = (worldPoint - worldAnchors[0]).magnitudeSquared();
				//dist1 = (worldPoint - worldAnchors[1]).magnitudeSquared();
				if (dist0 > dist1)
				{
					if (dist0 > pointDistSq)
					{
						if (threadIndexInWarp < 3)
						{
							msWorldAnchors[1][threadIndexInWarp] = worldPoint[threadIndexInWarp];
							if (threadIndexInWarp == 0)
								msContactID[1] = PxU16(j);
						}
						//contactID[1] = PxU16(j);
						//worldAnchors[1] = worldPoint;
						pointDistSq = dist0;

						__syncwarp();
					}
				}
				else if (dist1 > pointDistSq)
				{
					if (threadIndexInWarp < 3)
					{
						msWorldAnchors[0][threadIndexInWarp] = worldPoint[threadIndexInWarp];
						if (threadIndexInWarp == 0)
							msContactID[0] = PxU16(j);
					}
					/*contactID[0] = PxU16(j);
					worldAnchors[0] = worldPoint;*/
					pointDistSq = dist1;

					__syncwarp();
				}
			}
		}
	}

	// Write the selected anchors back into both body frames.
	// NOTE: the fall-through from case 2 into case 1 is intentional — a 2-anchor
	// result must also (re)write anchor 0 when the patch was rebuilt.
	switch (anchorCount)
	{
	case 2:
	{
		//KS - if there is a 2nd anchor, we always write it. If we already had 2 anchors, we would have exited earlier!

		transformInv(msWorldAnchors[1], msBodyFrame0, reinterpret_cast<PxVec3&>(fAnchor.body0Anchors[1]), threadIndexInWarp);
		transformInv(msWorldAnchors[1], msBodyFrame1, reinterpret_cast<PxVec3&>(fAnchor.body1Anchors[1]), threadIndexInWarp);

	}
	case 1:
		// Anchor 0 only needs writing when it wasn't carried over from last frame.
		if (oldAnchorCount == 0)
		{
			//KS - if there is a 2nd anchor, we always write it. If we already had 2 anchors, we would have exited earlier!
			transformInv(msWorldAnchors[0], msBodyFrame0, reinterpret_cast<PxVec3&>(fAnchor.body0Anchors[0]), threadIndexInWarp);
			transformInv(msWorldAnchors[0], msBodyFrame1, reinterpret_cast<PxVec3&>(fAnchor.body1Anchors[0]), threadIndexInWarp);
		}
	default:
		break;
	};

	// Single lane publishes the final anchor count.
	if (threadIndexInWarp == 0)
		fp.anchorCount = anchorCount;

	__syncwarp();
}
|
||||
|
||||
// Warp-cooperative initialization of a friction patch: lanes 0..2 each carry one
// component (x/y/z) of the body-space normals and write it into the patch; lane 0
// additionally zeroes the w components and resets anchorCount/broken.
// Lanes >= 3 do nothing.
static __device__ void initFrictionPatch(physx::PxgFrictionPatch& p,
	const float msBody0Normal, const float msBody1Normal,
	const PxU32 threadIndexInWarp)
{
	// Only the first three lanes participate; each owns one vector component.
	if (threadIndexInWarp >= 3)
		return;

	float* n0 = &p.body0Normal.x;
	float* n1 = &p.body1Normal.x;

	n0[threadIndexInWarp] = msBody0Normal;
	n1[threadIndexInWarp] = msBody1Normal;

	// Lane 0 clears the remaining scalar state exactly once per warp.
	if (threadIndexInWarp == 0)
	{
		n0[3] = 0.f;
		n1[3] = 0.f;
		p.anchorCount = 0;
		p.broken = 0;
	}
	/*p.body0Normal = make_float4(body0Normal.x, body0Normal.y, body0Normal.z, 0.f);
	p.body1Normal = make_float4(body1Normal.x, body1Normal.y, body1Normal.z, 0.f);
	p.anchorCount = 0;
	p.broken = 0;*/
}
|
||||
|
||||
|
||||
// Sets up the friction patch for a contact pair: rotates the shared contact normal
// (component held per-lane, see rotateInvR) into each body's local frame and
// initializes the patch via initFrictionPatch. Does nothing when there are no
// contacts. The trailing __syncwarp() makes the patch writes visible warp-wide.
// NOTE(review): contacts and normalTolerance are currently unused here —
// presumably kept for signature parity; confirm with callers.
static __device__ void correlatePatches(PxgFrictionPatch& frictionPatch, const physx::PxgContactPoint* contacts, const PxU32 nbContacts,
	const float msNormal, const physx::PxAlignedTransform& msBodyFrame0, const physx::PxAlignedTransform& msBodyFrame1,
	float normalTolerance, ScratchMemoryAllocator& sAlloc, const PxU32 threadIndexInWarp)
{
	using namespace physx;

	// Nothing to correlate without contacts.
	if (nbContacts == 0)
		return;

	const PxQuat& rot0 = reinterpret_cast<const PxQuat&>(msBodyFrame0.q);
	const PxQuat& rot1 = reinterpret_cast<const PxQuat&>(msBodyFrame1.q);

	// Bring the world-space normal into each body's local frame (per-lane component).
	const float localNormal0 = rotateInvR(msNormal, rot0, threadIndexInWarp);
	const float localNormal1 = rotateInvR(msNormal, rot1, threadIndexInWarp);

	initFrictionPatch(frictionPatch, localNormal0, localNormal1, threadIndexInWarp);

	__syncwarp();
}
|
||||
|
||||
#endif
|
||||
115
engine/third_party/physx/source/gpusolver/src/CUDA/integration.cu
vendored
Normal file
115
engine/third_party/physx/source/gpusolver/src/CUDA/integration.cu
vendored
Normal file
@@ -0,0 +1,115 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "DySleepingConfigulation.h"
|
||||
#include "PxvDynamics.h"
|
||||
#include "PxsRigidBody.h"
|
||||
#include "assert.h"
|
||||
#include "stdio.h"
|
||||
#include "integration.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side stub; presumably exists so the module registers/links its solver
// kernels when referenced from host code — TODO confirm against the loader.
extern "C" __host__ void initSolverKernels3() {}
|
||||
|
||||
// One-thread-per-body integration kernel (PGS path). Each thread (global index +
// offset) loads its solver body's state, calls integrateCore to advance velocities
// and pose and run the sleep check, then writes the results to the output velocity/
// pose buffers and back into the persistent PxgBodySim.
// Launch layout: 1D grid, flat indexing; threads with a >= numSolverBodies exit.
extern "C" __global__ void integrateCoreParallelLaunch(
	const uint32_t offset, const PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32* PX_RESTRICT islandIds,
	const PxU32* PX_RESTRICT islandStaticTouchCounts,
	const PxU32* PX_RESTRICT numCountedInteractions)
{
	//integrateCoreParallel(motionVelocity, solverBody, solverBodyData, numBodies, dt);
	uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
	const float4* PX_RESTRICT motionVelocityArray = solverCoreDesc->motionVelocityArray;
	const uint32_t numSolverBodies = solverCoreDesc->numSolverBodies;
	const float4* PX_RESTRICT solverBodyVelocity = sharedDesc->iterativeData.solverBodyVelPool;

	const PxgSolverTxIData* PX_RESTRICT txIData = solverCoreDesc->solverBodyTxIDataPool;

	float4* PX_RESTRICT outSolverVelocity = solverCoreDesc->outSolverVelocity;
	PxAlignedTransform* PX_RESTRICT outBody2World = solverCoreDesc->outBody2World;

	//for(uint32_t a = idx+offset; a < numSolverBodies; a+=blockSize)
	uint32_t a = idx+offset;
	// Bounds guard: the grid may be larger than the body count.
	if(a < numSolverBodies)
	{
		const PxgSolverBodyData& data = solverCoreDesc->solverBodyDataPool[a];

		const PxU32 nodeIndex = data.islandNodeIndex.index();// >> 2;

		PxgBodySim& bodySim = solverCoreDesc->mBodySimBufferDeviceData[nodeIndex];

		//KS - TODO - access all data via shared memory
		// PT: TODO: TGS version uses a copy here, what's better?
		const PxMat33& sqrtInvInertia = txIData[a].sqrtInvInertia;

		PxAlignedTransform body2World = bodySim.body2World; // PT: TODO: TGS version uses outBody2World[a] here, why?
		const float4 inverseInertia = bodySim.inverseInertiaXYZ_contactReportThresholdW;

		// Velocity pool layout: linear vels at [index], angular vels numSolverBodies later.
		const PxU32 index = solverCoreDesc->accumulatedBodyDeltaVOffset + a;

		// PT: TODO: TGS version uses tmp v0/v1 values here, why?
		float4 linVel = solverBodyVelocity[index];
		float4 angVel = solverBodyVelocity[index + numSolverBodies];

		// Static-touch count of this body's island feeds the stabilization/sleep logic.
		const PxU32 staticTouchCount = islandStaticTouchCounts[islandIds[nodeIndex]];

		//printf("Integrating %i: index = %i, a = %i\n", nodeIndex, index, a);

		//we need to dma the sleep data back for post solver task
		PxgSolverBodySleepData& sleepData = solverCoreDesc->solverBodySleepDataPool[a];

		// Advances linVel/angVel/body2World in place and updates bodySim/sleepData.
		integrateCore(motionVelocityArray[a], motionVelocityArray[a + numSolverBodies], inverseInertia, linVel, angVel, body2World, data, bodySim, sleepData, sqrtInvInertia,
			sharedDesc->dt, sharedDesc->invDtF32, solverCoreDesc->enableStabilization, staticTouchCount != 0, numCountedInteractions[nodeIndex],
			nodeIndex);

		// PT: TODO: why do we write out the vels & pose to 2 different buffers?
		outSolverVelocity[a] = linVel;
		outSolverVelocity[a+numSolverBodies] = angVel;
		outBody2World[a] = body2World;

		// PT: for acceleration getters (eENABLE_BODY_ACCELERATIONS)
		// Snapshot pre-integration velocities so the host can compute accelerations.
		PxgBodySimVelocities* prevVelocities = solverCoreDesc->mBodySimPrevVelocitiesBufferDeviceData;
		if(prevVelocities)
		{
			PxgBodySimVelocities& prev = prevVelocities[nodeIndex];
			prev.linearVelocity = bodySim.linearVelocityXYZ_inverseMassW;
			prev.angularVelocity = bodySim.angularVelocityXYZ_maxPenBiasW;
		}

		//write back linear velocity, angular velocity to pxgbodysim
		// (xyz only — the w lanes hold invMass / maxPenBias and must be preserved)
		bodySim.linearVelocityXYZ_inverseMassW.x = linVel.x; bodySim.linearVelocityXYZ_inverseMassW.y = linVel.y; bodySim.linearVelocityXYZ_inverseMassW.z = linVel.z;
		bodySim.angularVelocityXYZ_maxPenBiasW.x = angVel.x; bodySim.angularVelocityXYZ_maxPenBiasW.y = angVel.y; bodySim.angularVelocityXYZ_maxPenBiasW.z = angVel.z;
		bodySim.body2World = body2World;
		assert(body2World.isSane());
	}
}
|
||||
435
engine/third_party/physx/source/gpusolver/src/CUDA/integration.cuh
vendored
Normal file
435
engine/third_party/physx/source/gpusolver/src/CUDA/integration.cuh
vendored
Normal file
@@ -0,0 +1,435 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxvDynamics.h"
|
||||
#include "PxsRigidBody.h"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
#include "DySleepingConfigulation.h"
|
||||
#include "stdio.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Per-body sleep/freeze bookkeeping, run once per body at integration time.
//
// Updates the body's wake counter, accumulated sleep energy, freeze countdown and
// internal activation flags based on this frame's normalized kinetic energy
// (0.5 * (w^T I w * invMass + |v|^2)). Two regimes:
//  - enableStabilization: adds the freeze path — bodies in persistent static contact
//    whose energy stays below the freeze threshold get eFROZEN (and may have their
//    solver velocities damped just before sleeping); also raises
//    eFREEZE_THIS_FRAME / eUNFREEZE_THIS_FRAME on transitions.
//  - otherwise: classic wake-counter update driven purely by accumulated energy.
// In both regimes the wake counter is replenished when accumulated energy exceeds
// clusterFactor * sleepThreshold (more contacts => higher threshold) and otherwise
// decays by dt; hitting zero raises eDEACTIVATE_THIS_FRAME.
//
// Outputs: 'freeze' (out-param), solverBodyLinVel/AngVel (possibly damped),
// bodySim.internalFlags / wake counter / sleep accumulators / external accelerations
// (cleared unless eRETAIN_ACCELERATION), and sleepData (flags + wake counter copied
// out for the post-solver task).
// NOTE(review): invDt and nodeIndex are unused here — TODO confirm intended.
static __device__ void updateWakeCounter(bool& freeze, float4& solverBodyLinVel, float4& solverBodyAngVel, const PxAlignedTransform& body2World,
	PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
	const float4& inverseInertia, const PxVec3& linearMotionVel, const PxVec3& angularMotionVel, const float invertedMass, const float dt,
	const float invDt, const bool enableStabilization, const bool hasStaticTouch, PxU32 numCountedInteractions,
	PxU32 nodeIndex)
{
	// update the body's sleep state and
	// Wake counter reset time: 20 nominal frames at 20ms.
	PxReal wakeCounterResetTime = 20.0f*0.02f;

	// Local copies of the packed bodySim state (written back at the end).
	float4 freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex = bodySim.freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex;
	float4 sleepLinVelAccXYZ_freezeCountW = bodySim.sleepLinVelAccXYZ_freezeCountW;
	float4 sleepAngVelAccXYZ_accelScaleW = bodySim.sleepAngVelAccXYZ_accelScaleW;

	PxReal wc = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y; //wakeCounter;

	// Keep only the flags that persist across frames; transition flags are re-derived.
	PxU32 flags = bodySim.internalFlags & (PxsRigidBody::eDISABLE_GRAVITY_GPU | PxsRigidBody::eFROZEN | PxsRigidBody::eENABLE_GYROSCOPIC | PxsRigidBody::eRETAIN_ACCELERATION);
	PxReal freezeCount = sleepLinVelAccXYZ_freezeCountW.w;
	PxReal accelScale = sleepAngVelAccXYZ_accelScaleW.w;

	bool alreadyUpdateWC = false;
	PxVec3 sleepLinVelAcc(0.f), sleepAngVelAcc(0.f);

	{
		if (enableStabilization)
		{
			const PxU32 maxCountedInteractions = 10u; //KS - arbitrary limit to make sure that
			//bool freeze = false;
			//const PxAlignedTransform& body2World = solverBodyData.body2World;

			// calculate normalized energy: kinetic energy divided by mass
			// (guard against zero inverse inertia components => treat as unit inertia)
			const PxVec3 inertia(inverseInertia.x > 0.f ? 1.0f / inverseInertia.x : 1.f, inverseInertia.y > 0.f ? 1.0f / inverseInertia.y : 1.f, inverseInertia.z > 0.f ? 1.0f / inverseInertia.z : 1.f);
			sleepLinVelAcc = linearMotionVel;
			sleepAngVelAcc = angularMotionVel;

			// scale threshold by cluster factor (more contacts => higher sleep threshold)
			const PxU32 clusterFactor = PxMin(numCountedInteractions, maxCountedInteractions);

			PxReal invMass = invertedMass;// intialVel_invMass.w;
			if (invMass == 0.f)
				invMass = 1.f;

			const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
			const PxReal linear = sleepLinVelAcc.magnitudeSquared();
			PxReal frameNormalizedEnergy = 0.5f * (angular + linear);

			// Freezing requires static contact; without it the freeze threshold is 0.
			const PxReal cf = hasStaticTouch ? clusterFactor : 0.f;
			const PxReal freezeThresh = cf * freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x;// solverBodySleepData.freezeThreshold;
			freezeCount = PxMax(freezeCount - dt, 0.0f);

			bool settled = true;

			accelScale = PxMin(1.f, accelScale + dt);

			if (frameNormalizedEnergy >= freezeThresh)
			{
				settled = false;

				// Energetic this frame: restart the freeze countdown.
				freezeCount = PXD_FREEZE_INTERVAL;
			}

			if (!hasStaticTouch)
			{
				accelScale = 1.f;
				settled = false;
			}

			if (settled)
			{
				//Dampen bodies that are just about to go to sleep
				if (cf > 1)
				{
					const PxReal d = 1.0f - (PXD_SLEEP_DAMPING * dt);

					solverBodyLinVel = solverBodyLinVel * d;
					solverBodyAngVel = solverBodyAngVel * d;
					accelScale = accelScale * 0.75f + 0.25f*PXD_FREEZE_SCALE;
				}

				// Freeze once the countdown expired and energy is well below threshold.
				freeze = freezeCount == 0.f && frameNormalizedEnergy < (freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x * PXD_FREEZE_TOLERANCE);
			}

			if (freeze)
			{
				//current flag isn't frozen but freeze flag raise so we need to raise the frozen flag in this frame

				bool wasNotFrozen = (flags & PxsRigidBody::eFROZEN) == 0;
				flags |= PxsRigidBody::eFROZEN;
				if (wasNotFrozen)
				{
					flags |= PxsRigidBody::eFREEZE_THIS_FRAME;
				}
			}
			else
			{
				// Not freezing: drop eFROZEN, flag the unfreeze transition if needed.
				bool wasFrozen = (flags & PxsRigidBody::eFROZEN) != 0;
				flags &= (PxsRigidBody::eDISABLE_GRAVITY_GPU | PxsRigidBody::eENABLE_GYROSCOPIC | PxsRigidBody::eRETAIN_ACCELERATION);
				if (wasFrozen)
				{
					flags |= PxsRigidBody::eUNFREEZE_THIS_FRAME;
				}
			}

			/*KS: New algorithm for sleeping when using stabilization:
			* Energy *this frame* must be higher than sleep threshold and accumulated energy over previous frames
			* must be higher than clusterFactor*energyThreshold.
			*/
			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
			{
				//Accumulate energy
				sleepLinVelAcc.x += sleepLinVelAccXYZ_freezeCountW.x;
				sleepLinVelAcc.y += sleepLinVelAccXYZ_freezeCountW.y;
				sleepLinVelAcc.z += sleepLinVelAccXYZ_freezeCountW.z;

				sleepAngVelAcc.x += sleepAngVelAccXYZ_accelScaleW.x;
				sleepAngVelAcc.y += sleepAngVelAccXYZ_accelScaleW.y;
				sleepAngVelAcc.z += sleepAngVelAccXYZ_accelScaleW.z;

				//If energy this frame is high
				const PxReal sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;

				if (frameNormalizedEnergy >= sleepThreshold)
				{
					//Compute energy over sleep preparation time

					const PxReal sleepAngular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
					const PxReal sleepLinear = sleepLinVelAcc.magnitudeSquared();

					PxReal normalizedEnergy = 0.5f * (sleepAngular + sleepLinear);
					PxReal sleepClusterFactor = clusterFactor + 1.f;

					// scale threshold by cluster factor (more contacts => higher sleep threshold)
					const PxReal threshold = sleepClusterFactor * sleepThreshold;

					//If energy over sleep preparation time is high
					if (normalizedEnergy >= threshold)
					{
						//Wake up
						//assert(isActive());

						sleepAngVelAcc = PxVec3(0);
						sleepLinVelAcc = PxVec3(0);

						// Replenish the wake counter proportionally (capped at 2x).
						const float factor = sleepThreshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
						PxReal oldWc = wc;
						wc = factor * 0.5f * wakeCounterResetTime + dt * (sleepClusterFactor - 1.0f);

						//if (oldWc == 0.0f) // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
						//	notifyNotReadyForSleeping(bodyCore.nodeIndex);

						if (oldWc == 0.0f)
							flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;

						alreadyUpdateWC = true;
					}
				}
			}
		}
		else
		{
			// No stabilization: classic wake-counter path only.
			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
			{
				//const PxAlignedTransform& body2World = solverBodyData.body2World;

				// calculate normalized energy: kinetic energy divided by mass
				const PxVec3 inertia(inverseInertia.x > 0.f ? 1.0f / inverseInertia.x : 1.f, inverseInertia.y > 0.f ? 1.0f / inverseInertia.y : 1.f, inverseInertia.z > 0.f ? 1.0f / inverseInertia.z : 1.f);

				sleepLinVelAcc = linearMotionVel;// originalBody->mAcceleration.linear;
				// Angular energy uses body-space angular velocity on this path.
				sleepAngVelAcc = body2World.q.rotateInv(angularMotionVel);// originalBody->mAcceleration.angular;

				sleepLinVelAcc.x += sleepLinVelAccXYZ_freezeCountW.x;
				sleepLinVelAcc.y += sleepLinVelAccXYZ_freezeCountW.y;
				sleepLinVelAcc.z += sleepLinVelAccXYZ_freezeCountW.z;

				sleepAngVelAcc.x += sleepAngVelAccXYZ_accelScaleW.x;
				sleepAngVelAcc.y += sleepAngVelAccXYZ_accelScaleW.y;
				sleepAngVelAcc.z += sleepAngVelAccXYZ_accelScaleW.z;


				PxReal invMass = invertedMass;
				if (invMass == 0.f)
					invMass = 1.f;

				const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
				const PxReal linear = sleepLinVelAcc.magnitudeSquared();
				PxReal normalizedEnergy = 0.5f * (angular + linear);

				// scale threshold by cluster factor (more contacts => higher sleep threshold)
				const PxReal clusterFactor = PxReal(1 + numCountedInteractions);

				const PxReal sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;

				const PxReal threshold = clusterFactor * sleepThreshold;

				if (normalizedEnergy >= threshold)
				{
					//assert(isActive());
					sleepLinVelAcc = PxVec3(0);
					sleepAngVelAcc = PxVec3(0);
					const float factor = threshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
					PxReal oldWc = wc;
					wc = factor * 0.5f * wakeCounterResetTime + dt * (clusterFactor - 1.0f);

					if (oldWc == 0.0f) // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
					{
						flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
					}

					alreadyUpdateWC = true;
				}
			}
		}
	}

	// Default decay when the counter wasn't replenished above.
	if(!alreadyUpdateWC)
		wc = PxMax(wc - dt, 0.0f);

	bool wakeCounterZero = (wc == 0.0f);

	if (wakeCounterZero)
	{
		flags |= PxsRigidBody::eDEACTIVATE_THIS_FRAME;
		sleepLinVelAcc = PxVec3(0);
		sleepAngVelAcc = PxVec3(0);
	}

	// Write the updated packed state back to the persistent body sim.
	bodySim.internalFlags = flags;
	bodySim.freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y = wc;
	bodySim.sleepLinVelAccXYZ_freezeCountW = make_float4(sleepLinVelAcc.x, sleepLinVelAcc.y, sleepLinVelAcc.z, freezeCount);
	bodySim.sleepAngVelAccXYZ_accelScaleW = make_float4(sleepAngVelAcc.x, sleepAngVelAcc.y, sleepAngVelAcc.z, accelScale);

	// External accelerations are one-shot unless the body opted to retain them.
	if (!(flags & PxsRigidBody::eRETAIN_ACCELERATION))
	{
		bodySim.externalLinearAcceleration = make_float4(0.f, 0.f, 0.f, 0.f);
		bodySim.externalAngularAcceleration = make_float4(0.f, 0.f, 0.f, 0.f);
	}

	// Mirror flags/counter into the sleep-data pool DMA'd back to the host.
	sleepData.internalFlags = flags;
	sleepData.wakeCounter = wc;
}
|
||||
|
||||
// Thin wrapper over updateWakeCounter that returns the freeze decision as a value.
// All other effects (velocity damping, bodySim/sleepData updates) happen inside
// updateWakeCounter; see that function for the details.
static __device__ bool sleepCheck(float4& solverBodyLinVel, float4& solverBodyAngVel, const PxAlignedTransform& body2World, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
	const float4& inverseInertia, const PxVec3& linearMotionVel, const PxVec3& angularMotionVel, const float invertedMass, const float dt, const float invDt, const bool enableStabilization,
	const bool hasStaticTouch, const PxU32 numCountedInteractions, PxU32 nodeIndex)
{
	bool shouldFreeze = false;

	updateWakeCounter(shouldFreeze, solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData,
		inverseInertia, linearMotionVel, angularMotionVel, invertedMass,
		dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);

	return shouldFreeze;
}
|
||||
|
||||
// Core per-body velocity/pose integration (shared by the PGS path; TGS differs via
// IS_TGS_SOLVER). Converts the solver's delta velocities back into world-space body
// velocities (applying the sqrt-inertia transform and per-axis lock flags), runs the
// sleep/freeze check, and — unless the body froze — advances the pose: linear by
// linearMotionVel * dt, angular via the closed-form quaternion integrator.
// In/out: solverBodyLinVel/AngVel (rewritten as final world velocities, w = 0),
// body2World (advanced), bodySim/sleepData (updated by sleepCheck).
static __device__ void integrateCore(const float4 motionLinVelXYZW, const float4 motionAngVelXYZW, const float4& inverseInertia, float4& solverBodyLinVel,
	float4& solverBodyAngVel, PxAlignedTransform& body2World, const PxgSolverBodyData& solverBodyData, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
	const PxMat33& sqrtInvInertia, const float dt, const float invDt, const bool enableStabilization, const bool hasStaticTouch, const PxU32 numCountedInteractions,
	const PxU32 nodeIndex)
{
	// Integrate linear part
	const float4 initialLinVelXYZ_invMassW = solverBodyData.initialLinVelXYZ_invMassW;
	const float4 initialAngVelXYZ_penBiasClamp = solverBodyData.initialAngVelXYZ_penBiasClamp;
	//ML: solverBodyData.initialLinVelocity store the PxsBodyCore (original )linearVelocity and angularVelocity
	const PxVec3 initialLinVel(initialLinVelXYZ_invMassW.x, initialLinVelXYZ_invMassW.y, initialLinVelXYZ_invMassW.z);
	const PxVec3 initialAngVel(initialAngVelXYZ_penBiasClamp.x, initialAngVelXYZ_penBiasClamp.y, initialAngVelXYZ_penBiasClamp.z);

	PxU32 lockFlags = bodySim.lockFlags;

	//update body lin and ang velocity
	PxVec3 bodyLinearVelocity(solverBodyLinVel.x, solverBodyLinVel.y, solverBodyLinVel.z);
	PxVec3 bodyAngVelocity(solverBodyAngVel.x, solverBodyAngVel.y, solverBodyAngVel.z);

#ifndef IS_TGS_SOLVER
	// PGS: solver vels are deltas on top of the initial vels; angular deltas live in
	// sqrt-inertia space and are mapped back here.
	bodyLinearVelocity = initialLinVel + bodyLinearVelocity;
	bodyAngVelocity = initialAngVel + sqrtInvInertia * bodyAngVelocity;
#else
	bodyAngVelocity = sqrtInvInertia * bodyAngVelocity;
#endif

	// Zero out per-axis components locked via PxRigidDynamicLockFlag.
	solverBodyLinVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : bodyLinearVelocity.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : bodyLinearVelocity.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : bodyLinearVelocity.z, 0.f);
	solverBodyAngVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : bodyAngVelocity.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : bodyAngVelocity.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : bodyAngVelocity.z, 0.f);

	//we need to perform sleep check here to decide whether we want to update body2World transform for the body
	const PxVec3 motionLinVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : motionLinVelXYZW.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : motionLinVelXYZW.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : motionLinVelXYZW.z);
	const PxVec3 motionAngVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : motionAngVelXYZW.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : motionAngVelXYZW.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : motionAngVelXYZW.z);

#ifndef IS_TGS_SOLVER
	PxVec3 linearMotionVel = initialLinVel + motionLinVel;
	PxVec3 angularMotionVel = initialAngVel + sqrtInvInertia * motionAngVel;

	//printf("%i: DeltaLinVel = (%f, %f, %f)\n", nodeIndex, motionLinVel.x, motionLinVel.y, motionLinVel.z);
#else
	PxVec3 linearMotionVel = motionLinVel;
	PxVec3 angularMotionVel = sqrtInvInertia * motionAngVel;
#endif

	// Integrate the rotation using closed form quaternion integrator
	PxReal w = angularMotionVel.magnitudeSquared();
	w = PxSqrt(w);
	// Clamp extreme angular speeds so w*w below cannot overflow.
	const PxReal maxW = 1e+7f; //Should be about sqrt(PX_MAX_REAL/2) or smaller
	if (w > maxW)
	{
		angularMotionVel = angularMotionVel.getNormalized() * maxW;
		w = maxW;
	}

	const bool freeze = sleepCheck(solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData, inverseInertia, linearMotionVel, angularMotionVel, initialLinVelXYZ_invMassW.w,
		dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);

	// Frozen bodies keep their pose untouched this frame.
	if (!freeze)
	{
		PxVec3 delta = linearMotionVel * dt;
		body2World.p.x += delta.x; body2World.p.y += delta.y; body2World.p.z += delta.z;

		if (w != 0.0f)
		{
			// Closed-form update: q' = normalize(cos(v)*q + (sin(v)/w) * (omega ⊗ q)),
			// with v = dt * w / 2.
			const PxReal v = dt * w * 0.5f;
			PxReal s, q;
			//s = sin(v);
			//q = cos(v);
			__sincosf(v, &s, &q);
			s /= w;

			const PxVec3 pqr = angularMotionVel * s;
			const PxAlignedQuat quatVel(pqr.x, pqr.y, pqr.z, 0);
			PxAlignedQuat result = quatVel * body2World.q;

			result += body2World.q * q;

			//ML: solverBodyData store the current transform for PxsBodyCore
			body2World.q = result.getNormalized();
		}
	}
}
|
||||
|
||||
// Post-solver integration for one rigid body (TGS flavour).
// Masks the solver/motion velocities per-axis using the body's
// PxRigidDynamicLockFlag bits, runs the sleep check, and - if the body is not
// frozen - advances body2World by the pose delta accumulated during the solve.
// Outputs (by reference): solverBodyLinVel / solverBodyAngVel (lock-masked;
// angular transformed out of sqrt-inertia space) and body2World.
static __device__ void integrateCoreTGS(const float4 motionLinVelXYZW, const float4 motionAngVelXYZW, const float4& inverseInertia, float4& solverBodyLinVel,
	float4& solverBodyAngVel, PxAlignedTransform& body2World, const PxTransform& deltaBody2World, const PxgSolverBodyData& solverBodyData, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
	const PxMat33& sqrtInvInertia, const float dt, const float invDt, const bool enableStabilization, const bool hasStaticTouch, const PxU32 numCountedInteractions,
	const PxU32 nodeIndex)
{
	const PxU32 lockFlags = bodySim.lockFlags;

	//KS - TODO - optimize this away
	const float4 initialLinVelXYZ_invMassW = solverBodyData.initialLinVelXYZ_invMassW;

	//update body lin and ang velocity
	PxVec3 bodyLinearVelocity(solverBodyLinVel.x, solverBodyLinVel.y, solverBodyLinVel.z);
	PxVec3 bodyAngVelocity(solverBodyAngVel.x, solverBodyAngVel.y, solverBodyAngVel.z);
	// Angular velocity was solved in sqrt-inertia space; map it back to world space.
	bodyAngVelocity = sqrtInvInertia * bodyAngVelocity;

	// Zero out any velocity component the user locked via PxRigidDynamicLockFlag.
	solverBodyLinVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : bodyLinearVelocity.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : bodyLinearVelocity.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : bodyLinearVelocity.z, 0.f);
	solverBodyAngVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : bodyAngVelocity.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : bodyAngVelocity.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : bodyAngVelocity.z, 0.f);

	//we need to perform sleep check here to decide whether we want to update body2World transform for the body
	const PxVec3 motionLinVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : motionLinVelXYZW.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : motionLinVelXYZW.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : motionLinVelXYZW.z);
	const PxVec3 motionAngVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : motionAngVelXYZW.x,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : motionAngVelXYZW.y,
		lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : motionAngVelXYZW.z);

	PxVec3 linearMotionVel = motionLinVel;
	PxVec3 angularMotionVel = sqrtInvInertia * motionAngVel;

	// Integrate the rotation using closed form quaternion integrator
	PxReal w = angularMotionVel.magnitudeSquared();
	w = PxSqrt(w);
	// Clamp extreme angular speed so downstream arithmetic cannot overflow.
	const PxReal maxW = 1e+7f; //Should be about sqrt(PX_MAX_REAL/2) or smaller
	if (w > maxW)
	{
		angularMotionVel = angularMotionVel.getNormalized() * maxW;
		w = maxW;
	}

	const bool freeze = sleepCheck(solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData, inverseInertia, linearMotionVel, angularMotionVel, initialLinVelXYZ_invMassW.w,
		dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);

	if (!freeze)
	{
		//printf("DeltaP = (%f, %f, %f)\n", deltaBody2World.p.x, deltaBody2World.p.y, deltaBody2World.p.z);
		// TGS already accumulated the pose change during the solve, so integration
		// here is a plain composition with deltaBody2World rather than vel * dt.
		body2World.p.x += deltaBody2World.p.x; body2World.p.y += deltaBody2World.p.y; body2World.p.z += deltaBody2World.p.z;
		PxQuat q(body2World.q.q.x, body2World.q.q.y, body2World.q.q.z, body2World.q.q.w);
		q = (deltaBody2World.q * q).getNormalized();

		body2World.q.q.x = q.x; body2World.q.q.y = q.y; body2World.q.q.z = q.z; body2World.q.q.w = q.w;
	}
}
|
||||
140
engine/third_party/physx/source/gpusolver/src/CUDA/integrationTGS.cu
vendored
Normal file
140
engine/third_party/physx/source/gpusolver/src/CUDA/integrationTGS.cu
vendored
Normal file
@@ -0,0 +1,140 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#define IS_TGS_SOLVER
|
||||
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "DySleepingConfigulation.h"
|
||||
#include "PxvDynamics.h"
|
||||
#include "PxsRigidBody.h"
|
||||
#include "assert.h"
|
||||
#include "stdio.h"
|
||||
#include "integration.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Intentionally empty host-side stub; presumably exists so this translation
// unit (and its kernels) is referenced/linked from host code - TODO confirm
// against the module registration in the build.
extern "C" __host__ void initSolverKernels11() {}
|
||||
|
||||
//KS - this will change dramatically once we have all the TGS functionality working!
// Final integration kernel for the TGS solver: one thread per solver body.
// Each thread reads the solved delta velocities and the accumulated pose delta,
// calls integrateCoreTGS(), then writes the results to outSolverVelocity /
// outBody2World and back into the body's PxgBodySim.
// Launch layout: 1D grid; solver-body index a = threadIdx.x + blockIdx.x * blockDim.x + offset,
// with an early-out guard for a >= numSolverBodies.
extern "C" __global__ void integrateCoreParallelLaunchTGS(
	const uint32_t offset, const PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
	const PxgSolverSharedDesc<IterativeSolveDataTGS>* PX_RESTRICT sharedDesc,
	const PxU32* PX_RESTRICT islandIds,
	const PxU32* PX_RESTRICT islandStaticTouchCounts,
	const PxU32* PX_RESTRICT numCountedInteractions)
{
	//integrateCoreParallel(motionVelocity, solverBody, solverBodyData, numBodies, dt);
	uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
	const float4* PX_RESTRICT motionVelocityArray = solverCoreDesc->motionVelocityArray;
	const uint32_t numSolverBodies = solverCoreDesc->numSolverBodies;
	const float4* PX_RESTRICT solverBodyVelocity = sharedDesc->iterativeData.solverBodyVelPool;

	const PxgSolverTxIData* PX_RESTRICT txIDatas = solverCoreDesc->solverBodyTxIDataPool;

	float4* PX_RESTRICT outSolverVelocity = solverCoreDesc->outSolverVelocity;
	PxAlignedTransform* PX_RESTRICT outBody2World = solverCoreDesc->outBody2World;

	//for(uint32_t a = idx+offset; a < numSolverBodies; a+=blockSize)
	uint32_t a = idx + offset;
	if (a < numSolverBodies)
	{
		const PxgSolverBodyData& data = solverCoreDesc->solverBodyDataPool[a];

		const PxU32 nodeIndex = data.islandNodeIndex.index();// >> 2;

		PxgBodySim& bodySim = solverCoreDesc->mBodySimBufferDeviceData[nodeIndex];

		// PT: TODO: PGS version uses a reference here, what's better?
		const PxMat33 sqrtInvInertia = txIDatas[a].sqrtInvInertia;
		const PxTransform deltaTransform = txIDatas[a].deltaBody2World;

		PxAlignedTransform body2World = outBody2World[a]; // PT: TODO: PGS version uses bodySim.body2World here, why?

		const float4 inverseInertia = bodySim.inverseInertiaXYZ_contactReportThresholdW;

		// Motion-velocity buffer layout: linear in [0, numSolverBodies), angular in the second half.
		const float4 motionLinVel = motionVelocityArray[a];
		const float4 motionAngVel = motionVelocityArray[a + numSolverBodies];

		//if (index == PxgSolverBody::InvalidHandle)
		//{
		//	const float4 zero4 = make_float4(0.f);
		//	linVel = motionLinVel;
		//	angVel = motionAngVel;
		//}
		//else
		//{
		//	//PxU32 ind = 3*(index&(~31)) + (index&31);
		//	PxU32 ind = index;
		//	float4 linxyz_angx = solverBodyVelocity[ind];
		//	float4 angyz_lindxy = solverBodyVelocity[ind + 32];
		//	linVel = make_float4(linxyz_angx.x, linxyz_angx.y, linxyz_angx.z, 0.f);
		//	angVel = make_float4(linxyz_angx.w, angyz_lindxy.x, angyz_lindxy.y, 0.f);
		//}

		const PxU32 readIndex = solverCoreDesc->accumulatedBodyDeltaVOffset + a;

		// Accumulated delta velocities: two float4s per body, packed as
		// (lin.xyz, ang.x) in v0 and (ang.y, ang.z, ...) in v1.
		float4 v0 = solverBodyVelocity[readIndex];
		float4 v1 = solverBodyVelocity[readIndex + numSolverBodies];

		// PT: TODO: PGS version doesn't use tmp v0/v1 values here, why?
		float4 linVel = make_float4(v0.x, v0.y, v0.z, 0.f);
		float4 angVel = make_float4(v0.w, v1.x, v1.y, 0.f);

		const PxU32 staticTouchCount = islandStaticTouchCounts[islandIds[nodeIndex]];

		//this array need to be copied back to CPU
		PxgSolverBodySleepData& sleepData = solverCoreDesc->solverBodySleepDataPool[a];

		integrateCoreTGS(motionLinVel, motionAngVel, inverseInertia, linVel, angVel, body2World, deltaTransform, data, bodySim, sleepData, sqrtInvInertia,
			sharedDesc->dt, sharedDesc->invDtF32, solverCoreDesc->enableStabilization, staticTouchCount != 0, numCountedInteractions[nodeIndex],
			nodeIndex);

		// PT: TODO: why do we write out the vels & pose to 2 different buffers?
		outSolverVelocity[a] = linVel;
		outSolverVelocity[a + numSolverBodies] = angVel;
		outBody2World[a] = body2World;

		// PT: for acceleration getters (eENABLE_BODY_ACCELERATIONS)
		PxgBodySimVelocities* prevVelocities = solverCoreDesc->mBodySimPrevVelocitiesBufferDeviceData;
		if(prevVelocities)
		{
			// Snapshot pre-write velocities so accelerations can be derived later.
			PxgBodySimVelocities& prev = prevVelocities[nodeIndex];
			prev.linearVelocity = bodySim.linearVelocityXYZ_inverseMassW;
			prev.angularVelocity = bodySim.angularVelocityXYZ_maxPenBiasW;
		}

		//write back linear velocity, angular velocity to pxgbodysim
		bodySim.linearVelocityXYZ_inverseMassW.x = linVel.x; bodySim.linearVelocityXYZ_inverseMassW.y = linVel.y; bodySim.linearVelocityXYZ_inverseMassW.z = linVel.z;
		bodySim.angularVelocityXYZ_maxPenBiasW.x = angVel.x; bodySim.angularVelocityXYZ_maxPenBiasW.y = angVel.y; bodySim.angularVelocityXYZ_maxPenBiasW.z = angVel.z;
		bodySim.body2World = body2World;

		assert(body2World.isSane());
	}
}
|
||||
409
engine/third_party/physx/source/gpusolver/src/CUDA/jointConstraintBlockPrep.cuh
vendored
Normal file
409
engine/third_party/physx/source/gpusolver/src/CUDA/jointConstraintBlockPrep.cuh
vendored
Normal file
@@ -0,0 +1,409 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __JOINT_CONSTRAINT_BLOCK_PREP_CUH__
|
||||
#define __JOINT_CONSTRAINT_BLOCK_PREP_CUH__
|
||||
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "PxConstraint.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "DyCpuGpu1dConstraint.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgSolverConstraintBlock1D.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "MemoryAllocator.cuh"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
namespace physx
{
	// Mass properties for a constrained body pair, with the per-constraint
	// mass scales baked in: the inverse masses are pre-multiplied by the
	// linear scales (x/z of the packed scale vector) while the angular
	// inertia scales (y/w) are stored as-is.
	struct PxgMassProps
	{
		float invMass0;
		float invMass1;
		float invInertiaScale0;
		float invInertiaScale1;

		// lin0_ang0_lin1_ang1 packs (linScale0, angScale0, linScale1, angScale1).
		__device__ PxgMassProps(const PxReal inverseMass0, const PxReal inverseMass1,
			const float4 lin0_ang0_lin1_ang1)
		{
			invMass0         = inverseMass0 * lin0_ang0_lin1_ang1.x;
			invInertiaScale0 = lin0_ang0_lin1_ang1.y;
			invMass1         = inverseMass1 * lin0_ang0_lin1_ang1.z;
			invInertiaScale1 = lin0_ang0_lin1_ang1.w;
		}
	};
}
|
||||
|
||||
|
||||
//
// See orthogonalize() in DyConstraintSetup.cpp for a general explanation
//
// Gram-Schmidt-style elimination over the sorted constraint rows: each row has
// the components of the preceding (up to eqRowCount) rows projected out of its
// linear/angular Jacobians, geometric error and velocity target, so equality
// rows become mutually independent. Operates in-place on rvs (via the sorted
// indices) and on the angSqrtInvInertia scratch arrays.
static __device__ void orthogonalize( PxU32* sortedRowIndices, PxgBlockConstraint1DVelocities* rvs, PxgBlockConstraint1DParameters* rps,
	PxVec3* angSqrtInvInertia0,
	PxVec3* angSqrtInvInertia1,
	PxU32 rowCount,
	PxU32 eqRowCount,
	const physx::PxgMassProps* m,
	const PxU32 threadIndex)
{
	using namespace physx;

	// Local basis arrays below are sized for at most 6 equality rows.
	assert(eqRowCount<=6);

	PxVec3 lin1m[6], ang1m[6], lin1[6], ang1[6];
	PxVec3 lin0m[6], ang0m[6], lin0[6], ang0[6];

	PxReal geomErr[6];
	PxReal velTarget[6];

	for(PxU32 i=0;i<rowCount;i++)
	{
		const PxU32 index = sortedRowIndices[i];
		const float4 linear0XYZ_geometricErrorW = rvs[index].linear0XYZ_geometricErrorW[threadIndex];
		const float4 linear1XYZ_minImpulseW = rvs[index].linear1XYZ_minImpulseW[threadIndex];
		const float4 angular0XYZ_velocityTargetW = rvs[index].angular0XYZ_velocityTargetW[threadIndex];
		const float4 angular1XYZ_maxImpulseW = rvs[index].angular1XYZ_maxImpulseW[threadIndex];

		PxVec3 l0(linear0XYZ_geometricErrorW.x, linear0XYZ_geometricErrorW.y, linear0XYZ_geometricErrorW.z);
		PxVec3 a0(angular0XYZ_velocityTargetW.x, angular0XYZ_velocityTargetW.y, angular0XYZ_velocityTargetW.z);

		PxVec3 l1(linear1XYZ_minImpulseW.x, linear1XYZ_minImpulseW.y, linear1XYZ_minImpulseW.z);
		PxVec3 a1(angular1XYZ_maxImpulseW.x, angular1XYZ_maxImpulseW.y, angular1XYZ_maxImpulseW.z);

		PxVec3 angSqrtL0 = angSqrtInvInertia0[i];
		PxVec3 angSqrtL1 = angSqrtInvInertia1[i];

		PxReal g = linear0XYZ_geometricErrorW.w;
		PxReal T = angular0XYZ_velocityTargetW.w;

		// Project out the contributions of all earlier (equality) rows.
		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);
		for(PxU32 j=0;j<eliminationRows;j++)
		{
			// t = inner product of this row with (mass-scaled) basis row j.
			const PxVec3 s0 = l1.multiply(lin1m[j]) + l0.multiply(lin0m[j]);
			const PxVec3 s1 = angSqrtL1.multiply(ang1m[j]) + angSqrtL0.multiply(ang0m[j]);
			const PxVec3 s0s1 = s0+s1;
			float t = s0s1.x + s0s1.y + s0s1.z;

			l0 = l0 - (lin0[j] * t);
			a0 = a0 - (ang0[j] * t);
			l1 = l1 - (lin1[j] * t);
			a1 = a1 - (ang1[j] * t);
			g = g - (geomErr[j] * t);
			T = T - (velTarget[j] * t);
			angSqrtL0 = angSqrtL0 - (angSqrtInvInertia0[j] * t);
			angSqrtL1 = angSqrtL1 - (angSqrtInvInertia1[j] * t);
		}

		// Write the orthogonalized row back in place.
		rvs[index].linear0XYZ_geometricErrorW[threadIndex] = make_float4(l0.x, l0.y, l0.z, g);
		rvs[index].angular0XYZ_velocityTargetW[threadIndex] = make_float4(a0.x, a0.y, a0.z, T);
		rvs[index].linear1XYZ_minImpulseW[threadIndex] = make_float4(l1.x, l1.y, l1.z, linear1XYZ_minImpulseW.w);
		rvs[index].angular1XYZ_maxImpulseW[threadIndex] = make_float4(a1.x, a1.y, a1.z, angular1XYZ_maxImpulseW.w);
		angSqrtInvInertia0[i] = angSqrtL0;
		angSqrtInvInertia1[i] = angSqrtL1;


		if(i<eqRowCount)
		{
			// Capture this row as a new basis row for eliminating later rows.
			lin0[i] = l0;
			ang0[i] = a0;
			geomErr[i] = g;
			velTarget[i] = T;
			lin1[i] = l1;
			ang1[i] = a1;
			// NOTE(review): angSqrtInvInertia0/1[i] were already written above;
			// these re-assignments look redundant - confirm before removing.
			angSqrtInvInertia0[i] = angSqrtL0;
			angSqrtInvInertia1[i] = angSqrtL1;

			const PxVec3 l0m = l0 * m->invMass0;
			const PxVec3 l1m = l1 * m->invMass1;
			const PxVec3 a0m = angSqrtL0 * m->invInertiaScale0;
			const PxVec3 a1m = angSqrtL1 * m->invInertiaScale1;

			// Normalize by the row's (mass-scaled) self inner product.
			const PxVec3 s0 = l0.multiply(l0m) + l1.multiply(l1m);
			const PxVec3 s1 = a0m.multiply(angSqrtL0) + a1m.multiply(angSqrtL1);
			const PxVec3 s0s1 = s0 + s1;
			const float s = s0s1.x + s0s1.y + s0s1.z;
			const float a = s > 0 ? 1.f/s : 0.f;	// with mass scaling, it's possible for the inner product of a row to be zero

			lin0m[i] = l0m * a;
			ang0m[i] = a0m * a;
			lin1m[i] = l1m * a;
			ang1m[i] = a1m * a;
		}
	}
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Prepares the 1D constraint rows of one block constraint before setup:
// 1) fills sortedRowIndices with row indices ordered by ascending solveHint
//    (insertion sort - row counts are small),
// 2) transforms each row's angular Jacobians into sqrt-inertia space
//    (angSqrtInvInertia0/1 scratch arrays, indexed in sorted order),
// 3) unless preprocessing is disabled, orthogonalizes runs of rows that share
//    the same solveHint major group (groups 4 and 8).
static __device__ void preprocessRows(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData,
	PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	PxVec3* angSqrtInvInertia0, PxVec3* angSqrtInvInertia1,
	const physx::PxgSolverBodyPrepData* bd0, const physx::PxgSolverBodyPrepData* bd1,
	PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	const PxU32 threadIndex, bool disablePreprocessing)
{
	using namespace physx;

	//Px1DConstraint* sorted[MAX_CONSTRAINTS];
	// j is maxed at 12, typically around 7, so insertion sort is fine

	for(PxU32 i=0; i<constraintData->mNumRows[threadIndex]; i++)
	{
		PxgBlockConstraint1DParameters& r = rowParameters[i];

		PxU32 j = i;
		for(;j>0 && r.solveHint[threadIndex] < rowParameters[sortedRowIndices[j-1]].solveHint[threadIndex]; j--)
			sortedRowIndices[j] = sortedRowIndices[j-1];

		sortedRowIndices[j] = i;
	}

	/*for(PxU32 i=1;i<constraintData->mNumRows[threadIndex];i++)
		assert(sorted[i-1]->solveHint[threadIndex] <= sorted[i]->solveHint[threadIndex]);*/

	// Inverse masses come from the w components of initialLinVelXYZ_invMassW;
	// the per-constraint mass scales are read as a packed float4.
	PxgMassProps m(bd0->initialLinVelXYZ_invMassW.w, bd1->initialLinVelXYZ_invMassW.w, reinterpret_cast<float4*>(constraintData->mInvMassScale)[threadIndex]);

	const PxMat33 i0 = txIData0->sqrtInvInertia;
	const PxMat33 i1 = txIData1->sqrtInvInertia;

	for(PxU32 i = 0; i < constraintData->mNumRows[threadIndex]; ++i)
	{
		/*const PxVec3 angDelta0 = bd0->sqrtInvInertia * sorted[i]->angular0[threadIndex];
		const PxVec3 angDelta1 = bd1->sqrtInvInertia * sorted[i]->angular1[threadIndex];*/
		PxgBlockConstraint1DVelocities& rv = rowVelocities[sortedRowIndices[i]];

		const float4 angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];
		const PxVec3 angular0(angular0XYZ_velocityTargetW.x, angular0XYZ_velocityTargetW.y, angular0XYZ_velocityTargetW.z);
		const PxVec3 angular1(angular1XYZ_maxImpulseW.x, angular1XYZ_maxImpulseW.y, angular1XYZ_maxImpulseW.z);
		/*const PxVec3 angDelta0 = bd0->sqrtInvInertia * angular0;
		const PxVec3 angDelta1 = bd1->sqrtInvInertia * angular1;
		angSqrtInvInertia0[i] = angDelta0;
		angSqrtInvInertia1[i] = angDelta1;*/
		angSqrtInvInertia0[i] = i0 * angular0;
		angSqrtInvInertia1[i] = i1 * angular1;
	}


	if (!disablePreprocessing)
	{
		//MassProps m(bd0, bd1, ims);
		// Walk the sorted rows in runs sharing the same solveHint major id (hint >> 8).
		for (PxU32 i = 0; i < constraintData->mNumRows[threadIndex];)
		{
			PxgBlockConstraint1DParameters& rp = rowParameters[sortedRowIndices[i]];

			const PxU32 groupMajorId = PxU32(rp.solveHint[threadIndex] >> 8), start = i++;
			while (i < constraintData->mNumRows[threadIndex] && PxU32(rowParameters[sortedRowIndices[i]].solveHint[threadIndex] >> 8) == groupMajorId)
				i++;

			// Only groups 4 and 8 are orthogonalized - presumably matching the
			// CPU solver's solveHint groups; confirm against DyConstraintSetup.cpp.
			if (groupMajorId == 4 || (groupMajorId == 8))
			{
				PxU32 bCount = start; // count of bilateral constraints
				// Rows with (hint & 255) == 0 at the front of the run are the equality rows.
				for (; bCount < i && (rowParameters[sortedRowIndices[bCount]].solveHint[threadIndex] & 255) == 0; bCount++)
					;

				orthogonalize(sortedRowIndices + start, rowVelocities, rowParameters, angSqrtInvInertia0 + start, angSqrtInvInertia1 + start, i - start, bCount - start, &m, threadIndex);

			}
		}
	}
}
|
||||
|
||||
// (Name "intializeBlock1D" is a historical misspelling; kept for callers.)
// Writes one prepared 1D constraint row into the block-solver SoA layout:
// the "Con" part (Jacobians, impulse limits, responses, initial joint speed)
// and the "Mod" part (solver coefficients from
// queryReduced1dConstraintSolverConstantsPGS, zeroed accumulators, and
// DY_SC_FLAG_* spring flags derived from the row's Px1DConstraintFlag bits).
static __device__ void intializeBlock1D(const physx::PxgBlockConstraint1DVelocities& rv,
	const physx::PxgBlockConstraint1DParameters& rp,
	float jointSpeedForRestitutionBounce,
	float initJointSpeed,
	float resp0,
	float resp1,
	float erp,
	float dt,
	float recipdt,
	PxgBlockSolverConstraint1DCon& scon,
	PxgBlockSolverConstraint1DMod& smod,
	const PxVec3& _linear0, const PxVec3& _linear1,
	const PxVec3& _angular0, const PxVec3& _angular1,
	const PxReal _minImpulse, const PxReal _maxImpulse,
	const PxReal cfm,
	const PxU32 threadIndex
	)
{
	using namespace physx;

	{
		const PxU16 flags = rp.flags[threadIndex];
		const PxReal springStiffness = rp.mods.spring.stiffness[threadIndex];
		const PxReal springDamping = rp.mods.spring.damping[threadIndex];
		const PxReal restitution = rp.mods.bounce.restitution[threadIndex];
		const PxReal bounceThreshold = rp.mods.bounce.velocityThreshold[threadIndex];
		const PxReal geomError = rv.linear0XYZ_geometricErrorW[threadIndex].w;
		const PxReal velocityTarget = rv.angular0XYZ_velocityTargetW[threadIndex].w;

		// Precompute the two per-row solver constants used during iteration.
		PxReal coeff0, coeff1;
		queryReduced1dConstraintSolverConstantsPGS(flags, springStiffness, springDamping, restitution, bounceThreshold, geomError,
			velocityTarget, jointSpeedForRestitutionBounce, erp, dt, recipdt, coeff0, coeff1);

		// Pack Jacobians with impulse limits / unit responses in the w lanes.
		scon.lin0XYZ_minImpulse[threadIndex] = make_float4(_linear0.x, _linear0.y, _linear0.z, _minImpulse);
		scon.lin1XYZ_maxImpulse[threadIndex] = make_float4(_linear1.x, _linear1.y, _linear1.z, _maxImpulse);
		scon.ang0XYZ_resp0[threadIndex] = make_float4(_angular0.x, _angular0.y, _angular0.z, resp0);
		scon.ang1XYZ_resp1[threadIndex] = make_float4(_angular1.x, _angular1.y, _angular1.z, resp1);
		scon.initJointSpeed[threadIndex] = initJointSpeed;

		smod.coeff0[threadIndex] = coeff0;
		smod.coeff1[threadIndex] = coeff1;

		smod.appliedForce[threadIndex] = 0;
		smod.residual[threadIndex] = 0;

		// Instead of setting the flag to zero as in the previous implementation, the flag is used to mark spring and
		// acceleration spring.
		smod.flags[threadIndex] = 0;
		if (flags & Px1DConstraintFlag::eSPRING)
			smod.flags[threadIndex] |= DY_SC_FLAG_SPRING;

		if (flags & Px1DConstraintFlag::eACCELERATION_SPRING)
			smod.flags[threadIndex] |= DY_SC_FLAG_ACCELERATION_SPRING;
	}
}
|
||||
|
||||
// Converts every prepared (sorted + orthogonalized) 1D constraint row into its
// block-solver form: computes min/max impulse (converting force limits to
// impulses where flagged), the unit responses, and the initial joint speed,
// then delegates the actual packing to intializeBlock1D(). Expects
// angSqrtInvInertia0/1 to hold the sqrt-inertia-space angular Jacobians in
// sorted-row order, as produced by preprocessRows().
static __device__ void setUp1DConstraintBlock(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	PxVec3* angSqrtInvInertia0, PxVec3* angSqrtInvInertia1, PxgBlockSolverConstraint1DCon* constraintsCon, PxgBlockSolverConstraint1DMod* constraintsMod,
	float dt, float recipdt, const physx::PxgSolverBodyPrepData* sBodyData0, const physx::PxgSolverBodyPrepData* sBodyData1,
	const PxU32 threadIndex)
{
	using namespace physx;

	//PxU32 stride = sizeof(PxgSolverConstraint1D);

	const PxReal erp = 1.0f;
	const float4 sBodyData0_initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 sBodyData1_initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	for(PxU32 i=0;i<numRows;i++)
	{
		// Output slots are filled in sorted order; inputs are read via the sorted index.
		PxgBlockSolverConstraint1DCon& ccon = constraintsCon[i];
		PxgBlockSolverConstraint1DMod& cmod = constraintsMod[i];
		//Pxg1DConstraintBlock& c = *sorted[i];
		const PxU32 index = sortedRowIndices[i];
		PxgBlockConstraint1DParameters& rp = rowParameters[index];
		PxgBlockConstraint1DVelocities& rv = rowVelocities[index];

		const float4 c_linear0XYZ_geometricErrorW = rv.linear0XYZ_geometricErrorW[threadIndex];
		const float4 c_linear1XYZ_minImpulseW = rv.linear1XYZ_minImpulseW[threadIndex];
		const float4 c_angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 c_angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];

		const PxVec3 clin0(c_linear0XYZ_geometricErrorW.x, c_linear0XYZ_geometricErrorW.y, c_linear0XYZ_geometricErrorW.z);
		const PxVec3 clin1(c_linear1XYZ_minImpulseW.x, c_linear1XYZ_minImpulseW.y, c_linear1XYZ_minImpulseW.z);
		const PxVec3 cang0(c_angular0XYZ_velocityTargetW.x, c_angular0XYZ_velocityTargetW.y, c_angular0XYZ_velocityTargetW.z);
		const PxVec3 cang1(c_angular1XYZ_maxImpulseW.x, c_angular1XYZ_maxImpulseW.y, c_angular1XYZ_maxImpulseW.z);
		const PxVec3 ang0 = angSqrtInvInertia0[i];
		const PxVec3 ang1 = angSqrtInvInertia1[i];


		// Drive limits may be authored as forces; convert to impulses if so.
		PxReal minImpulse;
		PxReal maxImpulse;
		{
			const bool hasDriveLimit = rp.flags[threadIndex] & Px1DConstraintFlag::eHAS_DRIVE_LIMIT;
			const bool driveLimitsAreForces = constraintData->mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES;
			Dy::computeMinMaxImpulseOrForceAsImpulse(
				c_linear1XYZ_minImpulseW.w, c_angular1XYZ_maxImpulseW.w,
				hasDriveLimit, driveLimitsAreForces, dt,
				minImpulse, maxImpulse);
		}

		// Keep the raw (world-space) angular Jacobian for force write-back.
		cmod.ang0Writeback[threadIndex] = cang0;

		const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

		// Unit responses: mass-scaled linear part plus sqrt-inertia-space angular part.
		PxReal resp0 = clin0.magnitudeSquared() * sBodyData0_initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x + ang0.magnitudeSquared() * lin0_ang0_lin1_ang1.y;
		PxReal resp1 = clin1.magnitudeSquared() * sBodyData1_initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z + ang1.magnitudeSquared() * lin0_ang0_lin1_ang1.w;

		const PxReal initJointSpeed = sBodyData0->projectVelocity(clin0, cang0) - sBodyData1->projectVelocity(clin1, cang1);

		// Following the previous implementation, cfm is not used in unitResponse, thus it is set to 0.
		intializeBlock1D(rv, rp, initJointSpeed, initJointSpeed, resp0, resp1, erp, dt, recipdt, ccon, cmod, clin0, clin1, ang0, ang1, minImpulse, maxImpulse, 0.0f, threadIndex);

		if(rp.flags[threadIndex] & Px1DConstraintFlag::eOUTPUT_FORCE)
			cmod.flags[threadIndex] |= DY_SC_FLAG_OUTPUT_FORCE;
	}
}
|
||||
|
||||
|
||||
// Top-level per-thread setup of one block joint constraint: fills the 1D
// constraint header (row count, write-back offset, mass-scaled inverse
// masses/inertias, breakage thresholds), then runs preprocessRows() and
// setUp1DConstraintBlock() using per-thread shared-memory scratch.
// NbThreads must match the launch's blockDim.x (scratch is indexed by
// threadIdx.x); shared-memory use is
// NbThreads * MAX_CONSTRAINT_ROWS * (sizeof(PxU32) + 2 * sizeof(PxVec3)).
template<int NbThreads>
static __device__ void setupSolverConstraintBlockGPU(PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	const physx::PxgSolverBodyPrepData* sBodyData0, const physx::PxgSolverBodyPrepData* sBodyData1, PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	float dt, float recipdt, PxgBlockConstraintBatch& batch,
	const PxU32 threadIndex, PxgBlockSolverConstraint1DHeader* header, PxgBlockSolverConstraint1DCon* rowsCon, PxgBlockSolverConstraint1DMod* rowsMod,
	const PxgSolverConstraintManagerConstants& managerConstants)
{
	using namespace physx;

	//distance constraint might have zero number of rows
	header->rowCounts[threadIndex] = PxU8(constraintData->mNumRows[threadIndex]);

	header->writeBackOffset[threadIndex] = managerConstants.mConstraintWriteBackIndex;

	const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

	const float4 initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	// Break forces are converted to per-step impulses by multiplying with dt.
	const float4 raWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndex];
	const float4 rbWorld_angBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndex];
	const float linBreakImpulse = raWorld_linBreakForce.w * dt;
	const float angBreakForce = rbWorld_angBreakForce.w;
	const float angBreakImpulse = angBreakForce * dt;
	header->body0WorldOffset_linBreakImpulse[threadIndex] = make_float4(raWorld_linBreakForce.x, raWorld_linBreakForce.y, raWorld_linBreakForce.z, linBreakImpulse);
	header->angBreakImpulse[threadIndex] = angBreakImpulse;

	header->invMass0D0[threadIndex] = initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x;
	header->invMass1D1[threadIndex] = initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z;
	header->invInertiaScale0[threadIndex] = lin0_ang0_lin1_ang1.y;
	header->invInertiaScale1[threadIndex] = lin0_ang0_lin1_ang1.w;

	// A finite break force on either axis makes the joint breakable.
	header->breakable[threadIndex] = PxU8((raWorld_linBreakForce.w != PX_MAX_F32) || (angBreakForce != PX_MAX_F32));

	// Per-thread scratch; indexed by threadIdx.x, so NbThreads must equal blockDim.x.
	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia0[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia1[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	preprocessRows(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x], sBodyData0, sBodyData1, txIData0, txIData1, threadIndex, !!(constraintData->mFlags[threadIndex] & PxConstraintFlag::eDISABLE_PREPROCESSING));
	setUp1DConstraintBlock(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x], rowsCon, rowsMod, dt, recipdt, sBodyData0, sBodyData1, threadIndex);
}
|
||||
|
||||
|
||||
#endif
|
||||
321
engine/third_party/physx/source/gpusolver/src/CUDA/jointConstraintBlockPrepTGS.cuh
vendored
Normal file
321
engine/third_party/physx/source/gpusolver/src/CUDA/jointConstraintBlockPrepTGS.cuh
vendored
Normal file
@@ -0,0 +1,321 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __JOINT_CONSTRAINT_BLOCK_PREP_TGS_CUH__
|
||||
#define __JOINT_CONSTRAINT_BLOCK_PREP_TGS_CUH__
|
||||
|
||||
#include "PxConstraintDesc.h"
|
||||
#include "PxConstraint.h"
|
||||
#include "DySolverConstraintTypes.h"
|
||||
#include "DyCpuGpu1dConstraint.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgSolverConstraintBlock1D.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgConstraintPrep.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "MemoryAllocator.cuh"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
#include "jointConstraintBlockPrep.cuh"
|
||||
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Initializes one TGS 1D solver constraint row for lane 'threadIndex'.
// Computes the bias/velocity-multiplier solver constants via the shared
// CPU/GPU Dy:: helpers, packs them into the w-components of the four Jacobian
// float4s, converts drive limits to impulses, and raises the internal solver
// flags. Returns 1 when this row is a rotational-equality row that was stored
// into the header's ortho-axis slot 'eqCount' (so the caller can accumulate the
// ortho axis count), 0 otherwise.
// NOTE: name typo ("intialize") is kept — callers in this file use it.
static __device__ PxU32 intializeBlock1DTGS
(const physx::PxgBlockConstraint1DVelocities& rv, const physx::PxgBlockConstraint1DParameters& rp, const PxgBlockConstraint1DData& constraintData,
const PxReal jointSpeedForRestitutionBounce, const PxReal initJointSpeed,
const PxReal unitResponse, const PxReal minRowResponse,
const PxReal erp, const PxReal lengthScale,
const PxReal stepDt, const PxReal simDt, const PxReal recipStepDt, const PxReal recipSimDt,
const PxVec3& angSqrtInvInertia0, const PxVec3& angSqrtInvInertia1,
const PxReal invInertiaScale0, const PxReal invInertiaScale1,
const PxU32 eqCount, const PxU32 threadIndex, const bool disablePreprocessing,
PxgTGSBlockSolverConstraint1DHeader& hdr,
PxgTGSBlockSolverConstraint1DCon& scon)
{
	using namespace physx;

	//Copy min and max impulse because the convention of
	//function inputs and outputs is very confusing.

	PxReal maxBiasVelocity = 0.0f;
	PxReal recipUnitResponse = 0.0f;
	// Positional error of this row (w of the linear0 Jacobian float4).
	const PxReal geometricError = rv.linear0XYZ_geometricErrorW[threadIndex].w;
	Dy::Constraint1dSolverConstantsTGS desc = {0.0f, 0.0f, 0.0f, 0.0f};
	{
		const PxU16 flags = PxU16(rp.flags[threadIndex]);
		const PxReal stiffness = rp.mods.spring.stiffness[threadIndex];
		const PxReal damping = rp.mods.spring.damping[threadIndex];
		const PxReal restitution = rp.mods.bounce.restitution[threadIndex];
		const PxReal bounceVelocityThreshold = rp.mods.bounce.velocityThreshold[threadIndex];
		const PxReal velocityTarget = rv.angular0XYZ_velocityTargetW[threadIndex].w;

		maxBiasVelocity = Dy::computeMaxBiasVelocityTGS(flags, jointSpeedForRestitutionBounce, bounceVelocityThreshold,
			restitution, geometricError, false, lengthScale, recipSimDt);

		recipUnitResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);

		// Derives biasScale / error / velMultiplier / targetVel for this row.
		desc = Dy::compute1dConstraintSolverConstantsTGS(
			flags,
			stiffness, damping,
			restitution, bounceVelocityThreshold,
			geometricError, velocityTarget,
			jointSpeedForRestitutionBounce, initJointSpeed,
			unitResponse, recipUnitResponse,
			erp,
			stepDt, recipStepDt);
	}

	//Write to the w-components of each float4.
	//set the biasScale
	float4 lin1XYZ_biasScale = rv.linear1XYZ_minImpulseW[threadIndex];
	lin1XYZ_biasScale.w = desc.biasScale;
	//set the initBias
	float4 lin0XYZ_initBiasW = rv.linear0XYZ_geometricErrorW[threadIndex];
	lin0XYZ_initBiasW.w = desc.error;
	//set the velMultiplier
	float4 ang0XYZ_velMultiplierW = rv.angular0XYZ_velocityTargetW[threadIndex];
	ang0XYZ_velMultiplierW.w = desc.velMultiplier;
	//set the velTarget
	float4 ang1XYZ_velTargetW = rv.angular1XYZ_maxImpulseW[threadIndex];
	ang1XYZ_velTargetW.w = desc.targetVel;

	// Purely linear rows contribute no angular Jacobian and no angular error.
	PxReal angularErrorScale = 1.f;
	if (!(rp.flags[threadIndex] & Px1DConstraintFlag::eANGULAR_CONSTRAINT))
	{
		ang0XYZ_velMultiplierW.x = ang0XYZ_velMultiplierW.y = ang0XYZ_velMultiplierW.z = 0.f;
		ang1XYZ_velTargetW.x = ang1XYZ_velTargetW.y = ang1XYZ_velTargetW.z = 0.f;
		angularErrorScale = 0.f;
	}

	scon.lin0XYZ_initBiasOrCoeff0[threadIndex] = lin0XYZ_initBiasW;
	scon.lin1XYZ_biasScaleOrCoeff1[threadIndex] = lin1XYZ_biasScale;
	scon.ang0XYZ_velMultiplierOrCoeff2[threadIndex] = ang0XYZ_velMultiplierW;
	scon.ang1XYZ_velTargetOrCoeff3[threadIndex] = ang1XYZ_velTargetW;

	scon.maxBias[threadIndex] = maxBiasVelocity;
	scon.angularErrorScale[threadIndex] = angularErrorScale;
	scon.appliedForce[threadIndex] = 0.f;
	scon.residual[threadIndex] = 0.0f;

	// Drive limits may be authored as forces; convert to impulses (* simDt) when so.
	const bool hasDriveLimit = rp.flags[threadIndex] & Px1DConstraintFlag::eHAS_DRIVE_LIMIT;
	const bool driveLimitsAreForces = constraintData.mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES;
	Dy::computeMinMaxImpulseOrForceAsImpulse(
		rv.linear1XYZ_minImpulseW[threadIndex].w, rv.angular1XYZ_maxImpulseW[threadIndex].w,
		hasDriveLimit, driveLimitsAreForces, simDt,
		scon.minImpulse[threadIndex], scon.maxImpulse[threadIndex]);

	PxU32 outFlags = 0;
	const PxU32 solveHint = rp.solveHint[threadIndex];
	Dy::raiseInternalFlagsTGS(rp.flags[threadIndex], solveHint, outFlags);

	PxU32 ret = 0;
	if (!disablePreprocessing)
	{
		// Rotational-equality rows are stored as ortho axes in the header so
		// later rows can be orthogonalized against them.
		if (solveHint == PxConstraintSolveHint::eROTATIONAL_EQUALITY)
		{
			ret = 1;
			outFlags |= DY_SC_FLAG_ROT_EQ;

			hdr.angOrthoAxis0_recipResponseW[eqCount][threadIndex] = make_float4(angSqrtInvInertia0.x * invInertiaScale0, angSqrtInvInertia0.y * invInertiaScale0,
				angSqrtInvInertia0.z * invInertiaScale0, recipUnitResponse);
			hdr.angOrthoAxis1_ErrorW[eqCount][threadIndex] = make_float4(angSqrtInvInertia1.x * invInertiaScale1, angSqrtInvInertia1.y * invInertiaScale1,
				angSqrtInvInertia1.z * invInertiaScale1, geometricError);
		}
		else if(solveHint & PxConstraintSolveHint::eEQUALITY)
			outFlags |= DY_SC_FLAG_ORTHO_TARGET;
	}

	scon.flags[threadIndex] = outFlags;

	return ret;
}
|
||||
|
||||
|
||||
// Builds all TGS solver rows for one joint (lane 'threadIndex'): iterates the
// preprocessed/sorted rows, computes each row's unit response and joint speeds,
// and delegates per-row packing to intializeBlock1DTGS. Unused ortho-axis slots
// in the header (up to 3) are zero-filled. Returns the number of
// rotational-equality (ortho) axes written.
// Preconditions: sortedRowIndices/angSqrtInvInertias0/1 were filled by
// preprocessRows for this thread and are indexed by sorted position i.
static __device__ PxU32 setUp1DConstraintBlockTGS
(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
PxVec3* angSqrtInvInertias0, PxVec3* angSqrtInvInertias1, PxgTGSBlockSolverConstraint1DHeader& header, PxgTGSBlockSolverConstraint1DCon* constraintsCon,
float stepDt, float recipStepDt, float simDt, float recipSimDt, float biasCoefficient, const physx::PxgSolverBodyData* sBodyData0, const physx::PxgSolverBodyData* sBodyData1,
const PxU32 threadIndex, const PxReal lengthScale, bool disablePreprocessing)
{
	using namespace physx;

	//PxU32 stride = sizeof(PxgSolverConstraint1D);

	// Error-correction factor; halved per convention of the TGS solver setup.
	const PxReal erp = 0.5f * biasCoefficient;
	const float4 sBodyData0_initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 sBodyData1_initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	//distance constraint might have zero number of rows
	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	const bool isKinematic0 = !!(sBodyData0->flags & PxRigidBodyFlag::eKINEMATIC);
	const bool isKinematic1 = !!(sBodyData1->flags & PxRigidBodyFlag::eKINEMATIC);

	PxU32 eqCount = 0;
	for(PxU32 i=0;i<numRows;i++)
	{
		// Output row i corresponds to sorted source row sortedRowIndices[i].
		PxgTGSBlockSolverConstraint1DCon& ccon = constraintsCon[i];
		//Pxg1DConstraintBlock& c = *sorted[i];
		const PxU32 index = sortedRowIndices[i];
		PxgBlockConstraint1DParameters& rp = rowParameters[index];
		PxgBlockConstraint1DVelocities& rv = rowVelocities[index];

		const float4 c_linear0XYZ_geometricErrorW = rv.linear0XYZ_geometricErrorW[threadIndex];
		const float4 c_linear1XYZ_minImpulseW = rv.linear1XYZ_minImpulseW[threadIndex];
		const float4 c_angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 c_angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];

		// Jacobian axes for both bodies (linear and angular parts).
		const PxVec3 clin0(c_linear0XYZ_geometricErrorW.x, c_linear0XYZ_geometricErrorW.y, c_linear0XYZ_geometricErrorW.z);
		const PxVec3 clin1(c_linear1XYZ_minImpulseW.x, c_linear1XYZ_minImpulseW.y, c_linear1XYZ_minImpulseW.z);
		const PxVec3 cang0(c_angular0XYZ_velocityTargetW.x, c_angular0XYZ_velocityTargetW.y, c_angular0XYZ_velocityTargetW.z);
		const PxVec3 cang1(c_angular1XYZ_maxImpulseW.x, c_angular1XYZ_maxImpulseW.y, c_angular1XYZ_maxImpulseW.z);
		// Note: sqrt-inertia scratch is indexed by sorted position i, not by 'index'.
		const PxVec3 angSqrtInvInertia0 = angSqrtInvInertias0[i];
		const PxVec3 angSqrtInvInertia1 = angSqrtInvInertias1[i];

		const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

		// unitResponse = J * M^-1 * J^T for this row, with mass/inertia scales applied.
		PxReal unitResponse;
		{
			const PxReal resp0 = clin0.magnitudeSquared() * sBodyData0_initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x + angSqrtInvInertia0.magnitudeSquared() * lin0_ang0_lin1_ang1.y;
			const PxReal resp1 = clin1.magnitudeSquared() * sBodyData1_initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z + angSqrtInvInertia1.magnitudeSquared() * lin0_ang0_lin1_ang1.w;
			unitResponse = resp0 + resp1;
		}

		// Relative joint speed along this row's axes; kinematic bodies are
		// treated specially inside computeJointSpeedTGS.
		PxReal jointSpeedForRestitutionBounce;
		PxReal initJointSpeed;
		{
			const float vel0 = sBodyData0->projectVelocity(clin0, cang0);
			const float vel1 = sBodyData1->projectVelocity(clin1, cang1);
			Dy::computeJointSpeedTGS(
				vel0, isKinematic0, vel1, isKinematic1,
				jointSpeedForRestitutionBounce, initJointSpeed);
		}

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_MIN_RESPONSE;

		// Returns 1 when the row was stored as an ortho axis at slot eqCount.
		eqCount += intializeBlock1DTGS(
			rv, rp, *constraintData,
			jointSpeedForRestitutionBounce, initJointSpeed,
			unitResponse, minRowResponse, erp, lengthScale,
			stepDt, simDt, recipStepDt, recipSimDt,
			angSqrtInvInertia0, angSqrtInvInertia1,
			lin0_ang0_lin1_ang1.y, lin0_ang0_lin1_ang1.w,
			eqCount, threadIndex, disablePreprocessing,
			header, ccon);
	}

	// Zero the remaining (unused) ortho-axis slots so later orthogonalization
	// reads well-defined data.
	for (PxU32 i = eqCount; i < 3; ++i)
	{
		header.angOrthoAxis0_recipResponseW[i][threadIndex] = make_float4(0.f);
		header.angOrthoAxis1_ErrorW[i][threadIndex] = make_float4(0.f);
	}

	return eqCount;
}
|
||||
|
||||
|
||||
// Prepares the TGS block-solver representation of one joint's 1D constraint
// rows for a single lane (threadIndex) of a blocked constraint batch.
// - Fills the per-batch header: writeback offset, break impulses (break force
//   * totalDt), world offsets packed with scaled inverse masses, inertia scales.
// - Promotes equality/inequality solve hints on angular rows to their
//   rotational variants (TGS treats angular rows differently).
// - Preprocesses rows into per-thread shared-memory scratch, then builds the
//   per-row solver data via setUp1DConstraintBlockTGS.
// Shared memory: 3 arrays of NbThreads x Dy::MAX_CONSTRAINT_ROWS entries, one
// slot per thread (indexed by threadIdx.x) — no cross-thread sharing.
// NOTE(review): 'batch', 'dt' naming — 'dt' here is the TGS sub-step (paired
// with recipdt), 'totalDt' the full sim step; 'batch' is unused in this body.
template<int NbThreads>
static __device__ void setupSolverConstraintBlockGPUTGS(PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	const physx::PxgSolverBodyData* sBodyData0, const physx::PxgSolverBodyData* sBodyData1, PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	float dt, float recipdt, float totalDt, float recipTotalDt, float lengthScale, float biasCoefficient, PxgBlockConstraintBatch& batch,
	const PxU32 threadIndex, PxgTGSBlockSolverConstraint1DHeader* header, PxgTGSBlockSolverConstraint1DCon* rowsCon, const PxgSolverConstraintManagerConstants& managerConstants)
{
	using namespace physx;

	//distance constraint might have zero number of rows
	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	// Packed as x = row count, y = breakable flag, z = ortho axis count,
	// w = unused. FIX: w was previously left uninitialized, storing an
	// indeterminate byte into the header (flagged by compute-sanitizer
	// initcheck); zero it so the stored value is deterministic.
	uchar4 rowCounts_breakable_orthoAxisCount;
	rowCounts_breakable_orthoAxisCount.x = PxU8(numRows);
	rowCounts_breakable_orthoAxisCount.w = 0;

	header->writeBackOffset[threadIndex] = managerConstants.mConstraintWriteBackIndex;

	// Per-body mass/inertia scales packed as (lin0, ang0, lin1, ang1).
	const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

	const float4 initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	// xyz = world-space body offsets, w = break force thresholds.
	const float4 rAWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndex];
	const float4 rBWorld_AngBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndex];

	// Break forces are converted to impulses over the full sim step.
	const float linBreakImpulse = rAWorld_linBreakForce.w * totalDt;
	const float angBreakImpulse = rBWorld_AngBreakForce.w * totalDt;
	header->linBreakImpulse[threadIndex] = linBreakImpulse;
	header->angBreakImpulse[threadIndex] = angBreakImpulse;

	// Inverse masses pre-multiplied by the per-body dominance/mass scales.
	const float invMass0 = initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x;
	const float invMass1 = initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z;

	header->rAWorld_invMass0D0[threadIndex] = make_float4(rAWorld_linBreakForce.x, rAWorld_linBreakForce.y, rAWorld_linBreakForce.z, invMass0);
	header->rBWorld_invMass1D1[threadIndex] = make_float4(rBWorld_AngBreakForce.x, rBWorld_AngBreakForce.y, rBWorld_AngBreakForce.z, invMass1);
	header->invInertiaScale0[threadIndex] = lin0_ang0_lin1_ang1.y;
	header->invInertiaScale1[threadIndex] = lin0_ang0_lin1_ang1.w;

	// Breakable iff either break threshold is finite (PX_MAX_F32 = unbreakable).
	rowCounts_breakable_orthoAxisCount.y = PxU8((rAWorld_linBreakForce.w != PX_MAX_F32) || (rBWorld_AngBreakForce.w != PX_MAX_F32));

	// Promote angular rows' solve hints to the rotational variants expected by
	// the TGS preprocessing/orthogonalization path.
	for (PxU32 i = 0; i < numRows; ++i)
	{
		if (rowParameters[i].flags[threadIndex] & Px1DConstraintFlag::eANGULAR_CONSTRAINT)
		{
			PxU32 hint = rowParameters[i].solveHint[threadIndex];
			if (hint == PxConstraintSolveHint::eEQUALITY)
				hint = PxConstraintSolveHint::eROTATIONAL_EQUALITY;
			else if (hint == PxConstraintSolveHint::eINEQUALITY)
				hint = PxConstraintSolveHint::eROTATIONAL_INEQUALITY;

			rowParameters[i].solveHint[threadIndex] = hint;
		}
	}

	// Per-thread scratch for row sorting and the angular sqrt-inertia terms.
	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia0[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia1[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	bool disablePreprocessing = !!(constraintData->mFlags[threadIndex] & PxConstraintFlag::eDISABLE_PREPROCESSING);

	preprocessRows(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		sBodyData0, sBodyData1, txIData0, txIData1, threadIndex, disablePreprocessing);
	rowCounts_breakable_orthoAxisCount.z = setUp1DConstraintBlockTGS(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		*header, rowsCon, dt, recipdt, totalDt, recipTotalDt, biasCoefficient, sBodyData0, sBodyData1, threadIndex, lengthScale, disablePreprocessing);

	header->rowCounts_breakable_orthoAxisCount[threadIndex] = rowCounts_breakable_orthoAxisCount;
}
|
||||
|
||||
|
||||
#endif
|
||||
64
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegration.cu
vendored
Normal file
64
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegration.cu
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "preIntegration.cuh"
|
||||
|
||||
// Intentionally empty host stub — presumably referenced so this translation
// unit's kernels are linked/registered by the module loader; confirm against
// the kernel-registration machinery before removing.
extern "C" __host__ void initSolverKernels4() {}
|
||||
|
||||
// Kernel entry point: forwards one thread per solver body (starting at
// 'offset') to preIntegration() in preIntegration.cuh, which applies gravity,
// damping and velocity clamping and writes the solver body/velocity pools.
// Bounds checking against nbSolverBodies is handled inside preIntegration.
extern "C" __global__ void preIntegrationLaunch(
const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool, PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
const PxgBodySim* PX_RESTRICT bodySimPool, const PxNodeIndex* PX_RESTRICT islandNodeIndices,
PxAlignedTransform* gTransforms, float4* gOutVelocityPool, PxU32* solverBodyIndices)
{
	preIntegration(offset, nbSolverBodies, dt, gravity, solverBodyDataPool, solverBodySleepDataPool, solverTxIDataPool,
		bodySimPool, islandNodeIndices, gTransforms, gOutVelocityPool, solverBodyIndices);
}
|
||||
|
||||
// Initializes solver state for static/kinematic bodies: one thread per entry
// in activeNodeIndices[0..nbStaticKinematics). Copies pose and initial
// velocities into the output pools (angular velocities live nbSolverBodies
// after the linear ones), resets the per-body delta transform / sqrt inertia,
// and records the solver-body index for non-static nodes.
extern "C" __global__ void initStaticKinematics(
const uint32_t nbStaticKinematics, const uint32_t nbSolverBodies, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverTxIData* PX_RESTRICT solverTxIDataPool, PxAlignedTransform* gTransforms, float4* gOutVelocityPool,
PxNodeIndex* activeNodeIndices, PxU32* solverBodyIndices)
{
	const uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid >= nbStaticKinematics)
		return;

	//KS - TODO - Optimize these reads/writes
	const PxNodeIndex nodeIndex = activeNodeIndices[tid];
	// Only non-static (i.e. kinematic) nodes have a meaningful island node
	// index to map back to this solver-body slot.
	if (!nodeIndex.isStaticBody())
		solverBodyIndices[nodeIndex.index()] = tid;

	const PxgSolverBodyData& bodyData = solverBodyDataPool[tid];
	gTransforms[tid] = bodyData.body2World;
	gOutVelocityPool[tid] = bodyData.initialLinVelXYZ_invMassW;
	gOutVelocityPool[tid + nbSolverBodies] = bodyData.initialAngVelXYZ_penBiasClamp;

	PxgSolverTxIData& txIData = solverTxIDataPool[tid];
	txIData.deltaBody2World = PxTransform(PxIdentity);
	txIData.sqrtInvInertia = PxMat33(PxZero);
}
|
||||
439
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegration.cuh
vendored
Normal file
439
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegration.cuh
vendored
Normal file
@@ -0,0 +1,439 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxvDynamics.h"
|
||||
#include "PxsRigidBody.h"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
#include "stdio.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Component-wise square root that maps 0 to 0 explicitly, avoiding any
// dependence on PxSqrt(0) behavior for zero inertia components.
__device__ __forceinline__ PxVec3 computeSafeSqrtInertia(const PxVec3& v)
{
	const float sx = (v.x == 0.f) ? 0.f : PxSqrt(v.x);
	const float sy = (v.y == 0.f) ? 0.f : PxSqrt(v.y);
	const float sz = (v.z == 0.f) ? 0.f : PxSqrt(v.z);
	return PxVec3(sx, sy, sz);
}
|
||||
|
||||
// Computes the symmetric product mIInv = M * diag(invD) * M^T into an aligned
// 3x3 matrix. Only six distinct elements are evaluated; the off-diagonals are
// mirrored. Expressions are kept term-for-term identical to the scalar form to
// preserve bit-exact float results.
__device__ __forceinline__ void transformInertiaTensor(const PxVec3& invD, const PxMat33& M, PxAlignedMat33& mIInv)
{
	// Column j of M scaled by invD[j] (intermediate products of M * diag(invD)).
	const float d0r0 = invD.x*M(0, 0), d0r1 = invD.x*M(1, 0), d0r2 = invD.x*M(2, 0);
	const float d1r0 = invD.y*M(0, 1), d1r1 = invD.y*M(1, 1), d1r2 = invD.y*M(2, 1);
	const float d2r0 = invD.z*M(0, 2), d2r1 = invD.z*M(1, 2), d2r2 = invD.z*M(2, 2);

	// Mirrored off-diagonal elements.
	mIInv(0, 1) = mIInv(1, 0) = d0r0*M(1, 0) + d1r0*M(1, 1) + d2r0*M(1, 2);
	mIInv(0, 2) = mIInv(2, 0) = d0r0*M(2, 0) + d1r0*M(2, 1) + d2r0*M(2, 2);
	mIInv(1, 2) = mIInv(2, 1) = d0r1*M(2, 0) + d1r1*M(2, 1) + d2r1*M(2, 2);

	// Diagonal elements.
	mIInv(0, 0) = d0r0*M(0, 0) + d1r0*M(0, 1) + d2r0*M(0, 2);
	mIInv(1, 1) = d0r1*M(1, 0) + d1r1*M(1, 1) + d2r1*M(1, 2);
	mIInv(2, 2) = d0r2*M(2, 0) + d1r2*M(2, 1) + d2r2*M(2, 2);
}
|
||||
|
||||
// PxMat33 overload of the symmetric product mIInv = M * diag(invD) * M^T.
// Six distinct elements computed; off-diagonals mirrored. Term order matches
// the aligned-matrix overload exactly for bit-identical float results.
__device__ __forceinline__ void transformInertiaTensor(const PxVec3& invD, const PxMat33& M, PxMat33& mIInv)
{
	// Column j of M scaled by invD[j] (intermediate products of M * diag(invD)).
	const float d0r0 = invD.x*M(0, 0), d0r1 = invD.x*M(1, 0), d0r2 = invD.x*M(2, 0);
	const float d1r0 = invD.y*M(0, 1), d1r1 = invD.y*M(1, 1), d1r2 = invD.y*M(2, 1);
	const float d2r0 = invD.z*M(0, 2), d2r1 = invD.z*M(1, 2), d2r2 = invD.z*M(2, 2);

	// Mirrored off-diagonal elements.
	mIInv(0, 1) = mIInv(1, 0) = d0r0*M(1, 0) + d1r0*M(1, 1) + d2r0*M(1, 2);
	mIInv(0, 2) = mIInv(2, 0) = d0r0*M(2, 0) + d1r0*M(2, 1) + d2r0*M(2, 2);
	mIInv(1, 2) = mIInv(2, 1) = d0r1*M(2, 0) + d1r1*M(2, 1) + d2r1*M(2, 2);

	// Diagonal elements.
	mIInv(0, 0) = d0r0*M(0, 0) + d1r0*M(0, 1) + d2r0*M(0, 2);
	mIInv(1, 1) = d0r1*M(1, 0) + d1r1*M(1, 1) + d2r1*M(1, 2);
	mIInv(2, 2) = d0r2*M(2, 0) + d1r2*M(2, 1) + d2r2*M(2, 2);
}
|
||||
|
||||
|
||||
// Returns the gravity-induced velocity delta for one step (gravity * accelScale
// * dt), or the zero vector when gravity is disabled for this body.
PX_FORCE_INLINE __device__ PxVec3 getGravityAcceleration(const PxU32 disableGravity, const PxReal accelScale, const PxVec3& gravity, const PxReal dt)
{
	return disableGravity ? PxVec3(0) : gravity * accelScale * dt;
}
|
||||
|
||||
|
||||
// Integrates one body's unconstrained velocities for a step of length dt:
// applies gravity + external accelerations, linear/angular damping, clamps to
// the body's max linear/angular speeds, and zeroes any axes locked via
// PxRigidDynamicLockFlag. Velocities are updated in place in the xyz of the
// two packed float4s (their w components — inverse mass and max pen bias —
// are left untouched).
// Parameter packing: 'temp' below carries (maxLinVel^2, maxAngVel^2,
// linearDamping, angularDamping) in xyzw.
__device__ __forceinline__ void bodyCoreComputeUnconstrainedVelocity(const float4& maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW,
	float4& linearVelocityXYZ_inverseMassW, float4& angularVelocityXYZ_maxPenBiasW,
	const PxU32 lockFlags, const PxU32 disableGravity, const PxReal accelScale, const PxVec3& gravity,
	const float4& linearAccel, const float4& angularAccel, const PxReal dt)
{
	PxVec3 linearVelocity(linearVelocityXYZ_inverseMassW.x, linearVelocityXYZ_inverseMassW.y, linearVelocityXYZ_inverseMassW.z);
	PxVec3 angularVelocity(angularVelocityXYZ_maxPenBiasW.x, angularVelocityXYZ_maxPenBiasW.y, angularVelocityXYZ_maxPenBiasW.z);
	const float4 temp = maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW;

	//Multiply everything that needs multiplied by dt to improve code generation.
	const PxVec3 linearAccelTimesDT = getGravityAcceleration(disableGravity, accelScale, gravity, dt) + PxVec3(linearAccel.x, linearAccel.y, linearAccel.z) * dt;
	const PxVec3 angularAccelTimesDT = PxVec3(angularAccel.x, angularAccel.y, angularAccel.z) * dt;
	//const PxVec3 angularAccelTimesDT = PxVec3(0.f);
	const PxReal linearDampingTimesDT = temp.z*dt;
	const PxReal angularDampingTimesDT = temp.w*dt;
	const PxReal oneMinusLinearDampingTimesDT = 1.0f - linearDampingTimesDT;
	const PxReal oneMinusAngularDampingTimesDT = 1.0f - angularDampingTimesDT;

	//TODO context-global gravity
	linearVelocity += linearAccelTimesDT;
	angularVelocity += angularAccelTimesDT;

	//Apply damping.
	// fsel clamps the multiplier at 0 so large damping*dt cannot reverse the velocity.
	const PxReal linVelMultiplier = physx::intrinsics::fsel(oneMinusLinearDampingTimesDT, oneMinusLinearDampingTimesDT, 0.0f);
	const PxReal angVelMultiplier = physx::intrinsics::fsel(oneMinusAngularDampingTimesDT, oneMinusAngularDampingTimesDT, 0.0f);
	linearVelocity *= linVelMultiplier;
	angularVelocity *= angVelMultiplier;

	// Clamp velocity
	// Compare squared magnitudes against the squared limits; rescale only when over.
	const PxReal angVelSq = angularVelocity.magnitudeSquared();
	if (angVelSq > temp.y)
	{
		angularVelocity *= PxSqrt(temp.y / angVelSq);
	}

	const PxReal linVelSq = linearVelocity.magnitudeSquared();
	if (linVelSq > temp.x)
	{
		linearVelocity *= PxSqrt(temp.x / linVelSq);
	}

	//printf("%i, LV = (%f, %f, %f)\n", threadIdx.x, linearVelocity.x, linearVelocity.y, linearVelocity.z);
	//printf("%i, AV = (%f, %f, %f)\n", threadIdx.x, angularVelocity.x, angularVelocity.y, angularVelocity.z);

	// Zero any per-axis locked degrees of freedom; w components preserved.
	linearVelocityXYZ_inverseMassW.x = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : linearVelocity.x;
	linearVelocityXYZ_inverseMassW.y = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : linearVelocity.y;
	linearVelocityXYZ_inverseMassW.z = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : linearVelocity.z;
	angularVelocityXYZ_maxPenBiasW.x = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : angularVelocity.x;
	angularVelocityXYZ_maxPenBiasW.y = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : angularVelocity.y;
	angularVelocityXYZ_maxPenBiasW.z = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : angularVelocity.z;
}
|
||||
|
||||
//Reads a 4 byte element from shared buffer, where data is swizzled to avoid bank conflicts
|
||||
//Reads a 4 byte element from shared buffer, where data is swizzled to avoid bank conflicts
// Layout: the buffer is organized in 32-quadword "pages" of 128 words each;
// within a page, word lane 'elemInQuadword' of every quadword is stored in a
// contiguous run of 32 words. Precondition: sizeof(ContainedClass) is a
// multiple of sizeof(uint4) — TODO confirm at the call sites.
template<typename RetType, typename ContainedClass>
PX_FORCE_INLINE PX_CUDA_CALLABLE RetType readSwizzledWord(const uint* sharedBuffer, uint element, uint quadwordWithinElem, uint elemInQuadword)
{
	//Which quadword within the structure am I reading?
	const uint qwIndex = (element * sizeof(ContainedClass) / sizeof(uint4)) + quadwordWithinElem;

	const uint pageIndex = qwIndex / 32;	//Which page is it in?
	const uint qwInPage = qwIndex & 31;		//Which quadword within the page is it in?

	const uint wordAddress = pageIndex * 128 + elemInQuadword * 32 + qwInPage;

	return reinterpret_cast<const RetType*>(sharedBuffer)[wordAddress];
}
|
||||
|
||||
// Reads a full float4 from the swizzled shared buffer: the four lanes of one
// quadword live 32 words apart within a 128-word page (see readSwizzledWord).
// Precondition: sizeof(ContainedClass) is a multiple of sizeof(uint4) —
// TODO confirm at the call sites.
template<typename ContainedClass>
PX_FORCE_INLINE PX_CUDA_CALLABLE float4 readSwizzledFloat4(const uint* sharedBuffer, uint element, uint quadwordWithinElem)
{
	//Which quadword within the structure am I reading?
	const uint qwIndex = (element * sizeof(ContainedClass) / sizeof(uint4)) + quadwordWithinElem;

	const uint pageIndex = qwIndex / 32;	//Which page is it in?
	const uint qwInPage = qwIndex & 31;		//Which quadword within the page is it in?

	const uint baseAddress = pageIndex * 128 + qwInPage;
	const float* words = reinterpret_cast<const float*>(sharedBuffer);

	// Lanes x/y/z/w are strided 32 words apart within the page.
	return make_float4(words[baseAddress], words[baseAddress + 32], words[baseAddress + 64],
		words[baseAddress + 96]);
}
|
||||
|
||||
//fill in all the dynamic bodies data
// Pre-integrates every dynamic rigid body of an island:
//	1) stages PxgBodySim structures through a swizzled shared-memory buffer so
//	   that global loads are coalesced and shared reads avoid bank conflicts,
//	2) applies gravity/acceleration, damping and velocity clamping
//	   (bodyCoreComputeUnconstrainedVelocity), honouring per-axis lock flags,
//	3) derives the sqrt-inertia tensor and an optional gyroscopic torque term,
//	4) scatters PxgSolverBodyData / PxgSolverTxIData back to global memory and
//	   records linear velocity plus angular velocity pre-multiplied by the
//	   sqrt-inertia ("momocity" form) in gOutVelocityPool.
// Launch assumption: 1D launch with blockDim.x == PxgKernelBlockDim::PRE_INTEGRATION
// (the shared-memory arrays below are sized for that block width) -- the host-side
// launch code is not visible here, so confirm when changing block dimensions.
static __device__ void preIntegration(const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity,
	PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
	PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool,
	PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
	const PxgBodySim* PX_RESTRICT bodySimPool,
	const PxNodeIndex* PX_RESTRICT islandNodeIndices,
	PxAlignedTransform* PX_RESTRICT gTransforms,
	float4* PX_RESTRICT gOutVelocityPool,
	PxU32* PX_RESTRICT solverBodyIndices,
	bool skipGravityApplication = false)
{
	const uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
	// Global solver-body index handled by this thread.
	const uint32_t a = idx + offset;

	// const uint32_t warpStartIdx = offset + (idx&(~31));

	// Size of PxgBodySim measured in 16-byte (float4/uint4) quadwords.
	const PxU32 BodySimSize = sizeof(PxgBodySim) / sizeof(float4);

	// Per-warp staging buffer for swizzled PxgBodySim data (and later reused to
	// stage PxgSolverBodyData/PxgSolverTxIData output): one row per warp.
	__shared__ uint sharedBufferSpace[PxgKernelBlockDim::PRE_INTEGRATION / 32][16 * 33];

	// Island node index of each body handled by this warp (one slot per lane).
	__shared__ PxU32 sharedIslandNodeIndices[PxgKernelBlockDim::PRE_INTEGRATION / 32][32];

	const PxU32 warpIndex = threadIdx.x / 32;
	const PxU32 threadIndexInWarp = threadIdx.x & 31;

	if (a < nbSolverBodies)
	{
		PxU32 index = islandNodeIndices[a].index();
		sharedIslandNodeIndices[warpIndex][threadIndexInWarp] = index;
		// Reverse mapping: island node index -> solver body index.
		solverBodyIndices[index] = a;
		//printf("%i: SharedIslandNodeIndices[%i][%i] = %i, %i\n", a, warpIndex, threadIndexInWarp, sharedIslandNodeIndices[warpIndex][threadIndexInWarp], islandNodeIndices[a]);
	}

	// Publish sharedIslandNodeIndices to the rest of the warp before the
	// staging loop reads other lanes' slots.
	__syncwarp();

	PX_COMPILE_TIME_ASSERT((sizeof(uint4) * 4 * 4) >= BodySimSize);

	// First body index served by this warp, and how many of its 32 lanes map to
	// valid bodies (the tail warp may be partially populated).
	const PxU32 startReadIndex = a - threadIndexInWarp;

	const PxU32 NbToRead = startReadIndex < nbSolverBodies ? PxMin(32u, nbSolverBodies - startReadIndex) : 0;

	// Per-body values unpacked from the staged PxgBodySim (valid only for lanes
	// with a < nbSolverBodies; names encode the packing of each float4).
	float4 linearVelocityXYZ_inverseMassW;
	float4 angularVelocityXYZ_maxPenBiasW;
	float4 sleepAngVelAccXYZ_accelScaleW;
	float4 inverseInertiaXYZ_contactReportThresholdW;
	float4 maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW;
	float4 linearAccel;
	float4 angularAccel;
	float maxImpulse;
	PxAlignedTransform body2World;
	PxMat33 sqrtInvInertia;
	PxU32 internalFlags;
	PxU16 lockFlags;
	PxU16 disableGravity;
	PxReal offsetSlop;

	// Stage and unpack PxgBodySim data 8 bodies at a time: all 32 lanes
	// cooperate on the coalesced copy, then the 8 owning lanes unpack.
	for (PxU32 i = 0; i < NbToRead; i += 8)
	{
		const PxU32 TotalUint4ToRead = BodySimSize * PxMin(NbToRead - i, 8u);

		for (PxU32 j = threadIndexInWarp, iter = 0; j < TotalUint4ToRead; j += 32, iter++)
		{
			// Which of the (up to 8) bodies this quadword belongs to.
			const PxU32 ind = j / BodySimSize;
			const PxU32 nodeIndex = sharedIslandNodeIndices[warpIndex][ind + i];
			// NOTE: this local shadows the function parameter `offset`; here it
			// is the quadword offset within the body's PxgBodySim.
			const PxU32 offset = j - (ind*BodySimSize);

			const uint4* src = reinterpret_cast<const uint4*>(&bodySimPool[nodeIndex]);
			const uint4 val = src[offset];
			// Swizzled store: the 4 lanes of a quadword go 32 words apart so a
			// later per-lane read is bank-conflict free.
			sharedBufferSpace[warpIndex][iter * 128 + threadIndexInWarp] = val.x;
			sharedBufferSpace[warpIndex][iter * 128 + 32 + threadIndexInWarp] = val.y;
			sharedBufferSpace[warpIndex][iter * 128 + 64 + threadIndexInWarp] = val.z;
			sharedBufferSpace[warpIndex][iter * 128 + 96 + threadIndexInWarp] = val.w;
		}

		// Staged data must be visible to the unpacking lanes below.
		__syncwarp();

		// Only the 8 lanes owning the just-staged bodies unpack them.
		if (threadIndexInWarp >= i && threadIndexInWarp < (i + 8) && a < nbSolverBodies)
		{
			const uint* bSims = reinterpret_cast<uint*>(sharedBufferSpace[warpIndex]);

			const PxU32 readIndex = threadIndexInWarp & 7;

			// Quadword indices below (0..13) mirror the field layout of
			// PxgBodySim -- keep in sync with that struct.
			linearVelocityXYZ_inverseMassW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 0);
			angularVelocityXYZ_maxPenBiasW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 1);
			maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 2);
			inverseInertiaXYZ_contactReportThresholdW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 3);
			sleepAngVelAccXYZ_accelScaleW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 5);

			float4 q = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 7);
			float4 p = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 8);
			body2World.p = make_float4(p.x, p.y, p.z, 0.f);
			body2World.q = PxAlignedQuat(q.x, q.y, q.z, q.w);
			maxImpulse = readSwizzledWord<float, PxgBodySim>(bSims, readIndex, 10, 3);
			internalFlags = readSwizzledWord<PxU32, PxgBodySim>(bSims, readIndex, 11, 1);

			// lockFlags and disableGravity are packed into one 32-bit word.
			ushort2 tmp = readSwizzledWord<ushort2, PxgBodySim>(bSims, readIndex, 11, 2);

			lockFlags = tmp.x;
			disableGravity = tmp.y;
			offsetSlop = readSwizzledWord<PxReal, PxgBodySim>(bSims, readIndex, 11, 3);

			linearAccel = skipGravityApplication ? make_float4(0.0f) : readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 12);
			angularAccel = skipGravityApplication ? make_float4(0.0f) : readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 13);
		}

		// Keep the warp converged before the next staging pass overwrites the
		// shared buffer.
		__syncwarp();
	}

	if (a < nbSolverBodies)
	{
		// Integrate external accelerations and apply damping/clamping in place
		// on the velocity registers.
		bodyCoreComputeUnconstrainedVelocity(maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW, linearVelocityXYZ_inverseMassW, angularVelocityXYZ_maxPenBiasW,
			lockFlags, disableGravity || skipGravityApplication, sleepAngVelAccXYZ_accelScaleW.w, gravity, linearAccel, angularAccel, dt);

		//initialize solver bodyData
		//const float4 inverseInertiaXYZ_contactReportThresholdW = bodySim.inverseInertiaXYZ_contactReportThresholdW;
		const PxVec3 inverseInertia(inverseInertiaXYZ_contactReportThresholdW.x, inverseInertiaXYZ_contactReportThresholdW.y, inverseInertiaXYZ_contactReportThresholdW.z);
		const PxVec3 safeSqrtInvInertia = computeSafeSqrtInertia(inverseInertia);
		const PxMat33 rotation(reinterpret_cast<PxQuat&>(body2World.q));
		// World-space sqrt inverse inertia, written out later via PxgSolverTxIData.
		transformInertiaTensor(safeSqrtInvInertia, rotation, sqrtInvInertia);

		gOutVelocityPool[a] = linearVelocityXYZ_inverseMassW;

		//KS - to make this compatible with the rigid body particle system, we store the angular velocity in the gOutVelocityPool
		//in momocity format!
		const PxVec3 sqrtInertiaV(safeSqrtInvInertia.x == 0.f ? 0.f : 1.f / safeSqrtInvInertia.x, safeSqrtInvInertia.y == 0.f ? 0.f : 1.f / safeSqrtInvInertia.y,
			safeSqrtInvInertia.z == 0.f ? 0.f : 1.f / safeSqrtInvInertia.z);

		PxMat33 sqrtInertia;
		transformInertiaTensor(sqrtInertiaV, rotation, sqrtInertia);

		PxVec3 angVel(angularVelocityXYZ_maxPenBiasW.x, angularVelocityXYZ_maxPenBiasW.y, angularVelocityXYZ_maxPenBiasW.z);

		if (internalFlags & PxsRigidBody::eENABLE_GYROSCOPIC)
		{
			// Implicit gyroscopic update in the body frame: advance angular
			// momentum by the gyroscopic torque, rescale to conserve its
			// magnitude, then convert back to a world-space velocity delta.
			const PxVec3 localInertia(
				inverseInertia.x == 0.f ? 0.f : 1.f / inverseInertia.x,
				inverseInertia.y == 0.f ? 0.f : 1.f / inverseInertia.y,
				inverseInertia.z == 0.f ? 0.f : 1.f / inverseInertia.z);

			const PxVec3 localAngVel = body2World.q.rotateInv(angVel);
			const PxVec3 origMom = localInertia.multiply(localAngVel);
			const PxVec3 torque = -localAngVel.cross(origMom);
			PxVec3 newMom = origMom + torque * dt;
			const PxReal denom = newMom.magnitude();
			PxReal ratio = denom > 0.f ? origMom.magnitude() / denom : 0.f;
			newMom *= ratio;
			PxVec3 newDeltaAngVel = body2World.q.rotate(inverseInertia.multiply(newMom) - localAngVel);

			angVel += newDeltaAngVel;
		}

		angularVelocityXYZ_maxPenBiasW.x = angVel.x; angularVelocityXYZ_maxPenBiasW.y = angVel.y; angularVelocityXYZ_maxPenBiasW.z = angVel.z;

		// Convert to momocity (sqrtInertia * angVel) for the output pool.
		angVel = sqrtInertia * (angVel);

		gOutVelocityPool[a + nbSolverBodies] = make_float4(angVel.x, angVel.y, angVel.z, angularVelocityXYZ_maxPenBiasW.w);

		//KS - TODO - coalesce theses, probably by writing out 2x float4.
		gTransforms[a] = body2World;
	}

	// Scatter PxgSolverBodyData and PxgSolverTxIData 16 bodies at a time,
	// reusing sharedBufferSpace as an unswizzled staging area so the global
	// writes are coalesced across the warp.
	for (PxU32 i = 0; i < NbToRead; i += 16)
	{
		if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
		{
			PxgSolverBodyData* bData = reinterpret_cast<PxgSolverBodyData*>(&sharedBufferSpace[warpIndex][0]);

			const PxU32 nodeIndex = sharedIslandNodeIndices[warpIndex][threadIndexInWarp];

			PxgSolverBodyData& data = bData[threadIndexInWarp & 15];
			//PxgSolverBodyData& data = solverBodyDataPool[a];

			//data.sqrtInvInertia = sqrtInvInertia;
			data.body2World = body2World;
			data.initialLinVelXYZ_invMassW = linearVelocityXYZ_inverseMassW;
			data.initialAngVelXYZ_penBiasClamp = angularVelocityXYZ_maxPenBiasW;
			data.offsetSlop = offsetSlop;

			data.reportThreshold = inverseInertiaXYZ_contactReportThresholdW.w;

			data.islandNodeIndex = PxNodeIndex(nodeIndex);
			//data.inverseInertia = make_float4(bodySim.inverseInertiaXYZ_contactReportThresholdW.x,bodySim.inverseInertiaXYZ_contactReportThresholdW.y, bodySim.inverseInertiaXYZ_contactReportThresholdW.z, 0.f);

			data.maxImpulse = maxImpulse; //KS - can this be read in in a more efficient/better way?
			// Translate internal PxsRigidBody flags to the public flag bits the
			// solver consumes.
			PxU32 flags = 0;
			if (internalFlags & PxsRigidBody::eSPECULATIVE_CCD)
				flags |= PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD;
			if (internalFlags & PxsRigidBody::eENABLE_GYROSCOPIC)
				flags |= PxRigidBodyFlag::eENABLE_GYROSCOPIC_FORCES;
			data.flags = flags;
		}

		// Staged solver-body data must be complete before the cooperative copy.
		__syncwarp();

		const PxU32 solverBodyDataSize = sizeof(PxgSolverBodyData) / sizeof(uint);

		PxU32 TotalUintToWrite = solverBodyDataSize * PxMin(NbToRead - i, 16u);

		uint* dst = reinterpret_cast<uint*>(&solverBodyDataPool[startReadIndex + i]);
		uint* src = reinterpret_cast<uint*>(&sharedBufferSpace[warpIndex][0]);

		for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
		{
			dst[j] = src[j];
		}
		__syncwarp();

		//if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
		//{
		//	//KS - TODO - is this necessary? We now have PxgBodySim, which stores all this data.
		//	//This entire sleep data thing looks redundant now!
		//	PxgSolverBodySleepData* sData = reinterpret_cast<PxgSolverBodySleepData*>(sharedBufferSpace[warpIndex]);
		//	PxgSolverBodySleepData& sleepData = sData[threadIndexInWarp & 15];
		//	//PxgSolverBodySleepData& sleepData = solverBodySleepDataPool[a];
		//	//initialize solver body sleep data
		//	sleepData.freezeCount = sleepLinVelAccXYZ_freezeCountW.w;
		//	sleepData.sleepLinVelAcc = *reinterpret_cast<PxVec3*>(&(make_float3(sleepLinVelAccXYZ_freezeCountW).x));
		//	sleepData.accelScale = sleepAngVelAccXYZ_accelScaleW.w;
		//	sleepData.sleepAngVelAcc = *reinterpret_cast<PxVec3*>(&(make_float3(sleepAngVelAccXYZ_accelScaleW).x));

		//	sleepData.freezeThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x;
		//	sleepData.wakeCounter = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y;
		//	sleepData.sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;

		//	sleepData.internalFlags = internalFlags;
		//}

		//__syncwarp();

		//const PxU32 sleepDataSize = sizeof(PxgSolverBodySleepData) / sizeof(uint);

		//TotalUintToWrite = sleepDataSize * PxMin(NbToRead - i, 16u);
		//dst = reinterpret_cast<uint*>(&solverBodySleepDataPool[startReadIndex + i]);

		//for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
		//{
		//	dst[j] = sharedBufferSpace[warpIndex][j];
		//}

		//__syncwarp();

		if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
		{
			PxgSolverTxIData* bData = reinterpret_cast<PxgSolverTxIData*>(&sharedBufferSpace[warpIndex][0]);

			PxgSolverTxIData& data = bData[threadIndexInWarp & 15];
			//PxgSolverBodyData& data = solverBodyDataPool[a];

			data.sqrtInvInertia = sqrtInvInertia;
			data.deltaBody2World = PxTransform(PxIdentity);
		}
		__syncwarp();

		const PxU32 txISize = sizeof(PxgSolverTxIData) / sizeof(uint);

		TotalUintToWrite = txISize * PxMin(NbToRead - i, 16u);
		dst = reinterpret_cast<uint*>(&solverTxIDataPool[startReadIndex + i]);

		for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
		{
			dst[j] = sharedBufferSpace[warpIndex][j];
		}

		__syncwarp();

	}
}
|
||||
|
||||
43
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegrationTGS.cu
vendored
Normal file
43
engine/third_party/physx/source/gpusolver/src/CUDA/preIntegrationTGS.cu
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#define IS_TGS_SOLVER
|
||||
|
||||
#include "preIntegration.cuh"
|
||||
|
||||
// Empty host-side symbol; presumably referenced by the module loader to force
// this translation unit (and its kernels) to be linked -- TODO confirm against
// the host-side kernel registration code.
extern "C" __host__ void initSolverKernels9() {}
|
||||
|
||||
// TGS-solver pre-integration entry point: thin wrapper forwarding to the shared
// preIntegration() implementation in preIntegration.cuh (this translation unit
// compiles it with IS_TGS_SOLVER defined).
// Launch assumption: 1D launch; preIntegration() sizes its shared memory for
// blockDim.x == PxgKernelBlockDim::PRE_INTEGRATION -- confirm against the
// host-side launch code when changing launch dimensions.
extern "C" __global__ void preIntegrationLaunchTGS(
	const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
	PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool, PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
	const PxgBodySim* const PX_RESTRICT bodySimPool, const PxNodeIndex* const PX_RESTRICT islandNodeIndices,
	PxAlignedTransform* gTransforms, float4* gOutVelocityPool, PxU32* solverBodyIndices, bool skipGravityApplication)
{
	preIntegration(offset, nbSolverBodies, dt, gravity, solverBodyDataPool, solverBodySleepDataPool, solverTxIDataPool,
		bodySimPool, islandNodeIndices, gTransforms, gOutVelocityPool, solverBodyIndices, skipGravityApplication);
}
|
||||
266
engine/third_party/physx/source/gpusolver/src/CUDA/solver.cu
vendored
Normal file
266
engine/third_party/physx/source/gpusolver/src/CUDA/solver.cu
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverConstraintBlock1D.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgConstraintBlock.h"
|
||||
#include "PxgIslandContext.h"
|
||||
#include "PxgIntrinsics.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "solver.cuh"
|
||||
#include "solverBlock.cuh"
|
||||
#include "PxgArticulation.h"
|
||||
#include "assert.h"
|
||||
#include "PxgArticulationCoreDesc.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side symbol; presumably referenced by the module loader to force
// this translation unit (and its kernels) to be linked -- TODO confirm against
// the host-side kernel registration code.
extern "C" __host__ void initSolverKernels7() {}
|
||||
|
||||
// Solves one partition of articulation constraint batches (contacts and 1D
// joint constraints) for the given island. One warp processes one batch; lane
// threadIdx.x addresses one constraint "slot" within the batch. Velocities are
// read from / written back to the per-batch slots of solverBodyVelPool for
// rigid bodies, while articulation impulses are accumulated into deferredZ and
// the owning articulation blocks are flagged dirty.
// Launch assumption: blockDim.x == 32 (one warp per batch row), batches indexed
// by blockIdx.x * blockDim.y + threadIdx.y.
// Fix vs. previous revision: removed the redundant direct store
// `deferredZ[index] = impulse1;` in the body-B articulation path --
// storeSpatialVector() performs the same write, mirroring the body-A path.
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 8)
void artiSolveBlockPartition(PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 partitionIndex, bool doFriction_, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	const PxU32 warpIndex = threadIdx.y;
	PxU32 globalWarpIndex = blockIdx.x * blockDim.y + warpIndex;

	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];

	PxgArticulationBlockData* gArticulations = artiDesc->mArticulationBlocks;

	const PxU32 maxLinks = artiDesc->mMaxLinksPerArticulation;

	// Per-link deferred impulse accumulator for articulations.
	Cm::UnAlignedSpatialVector* deferredZ = sharedDesc->articulationDeferredZ;

	const PxU32 startPartitionIndex = island.mStartPartitionIndex;

	// artiConstraintsPerPartition holds exclusive prefix boundaries; partition 0
	// starts at the island's first articulation batch.
	PxU32 startIndex = partitionIndex == 0 ? island.mArtiBatchStartIndex : solverDesc->artiConstraintsPerPartition[partitionIndex + startPartitionIndex - 1];

	// Articulation batches are stored after the rigid-body batches.
	const PxU32 articulationBatchOffset = solverDesc->islandContextPool->mBatchCount;

	PxU32 endIndex = solverDesc->artiConstraintsPerPartition[partitionIndex + startPartitionIndex] + articulationBatchOffset;

	uint2* isSlabDirty = artiDesc->slabHasChanges;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x;

	// Global batch index handled by this warp.
	uint k = startIndex + globalWarpIndex + articulationBatchOffset;

	PxgErrorAccumulator error;
	const bool accumulateError = solverDesc->contactErrorAccumulator.mCounter >= 0;

	if (k < endIndex)
	{
		const bool doFriction = doFriction_;

		const IterativeSolveData& msIterativeData = sharedDesc->iterativeData;

		const PxgBlockConstraintBatch& batch = msIterativeData.blockConstraintBatch[k];

		// NOTE(review): lanes beyond the batch stride return before the
		// accumulateErrorGlobalFullWarp() call at the end -- confirm that the
		// accumulator tolerates a partial warp, as the existing behavior relies
		// on it.
		if(threadIndexInWarp >= batch.mDescStride)
			return;

		const PxU32 nbArticulations = solverDesc->islandContextPool->mArticulationCount;

		const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
		const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];

		const PxU32 slabId = batch.slabId[threadIndexInWarp];

		const PxU32 solverBodyId0 = batch.bodyAIndex[threadIndexInWarp];
		const PxU32 solverBodyId1 = batch.bodyBIndex[threadIndexInWarp];

		// Each batch owns 128 velocity slots: bodyA lin/ang at +0/+32, bodyB
		// lin/ang at +64/+96.
		const PxU32 readIndex = k * 128 + threadIndexInWarp;

		PxgArticulationBlockResponse* responses = sharedDesc->iterativeData.artiResponse;
		const PxU32 responseIndex = batch.mArticulationResponseIndex;

		PxU32 linkIndexA = igNodeIndexA.articulationLinkId();
		PxU32 linkIndexB = igNodeIndexB.articulationLinkId();

		// Load current spatial velocities (angular in top, linear in bottom).
		Cm::UnAlignedSpatialVector vel0, vel1;
		{
			float4 lin = Pxldcg(msIterativeData.solverBodyVelPool[readIndex]);
			float4 ang = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 32]);
			vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
		}

		{
			float4 lin = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 64]);
			float4 ang = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 96]);
			vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
		}

		Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();

		// Mass-splitting reference counts (number of active slabs sharing each
		// body); 1 for static/kinematic references.
		PxReal curRef0 = 1.f;
		PxReal curRef1 = 1.f;

		const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
		const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!

		const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
		const PxU32 numSolverBodies = bodyOffset + numDynamicBodies + numArticulations;

		const PxU32* const PX_RESTRICT encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;

		if(igNodeIndexA.isArticulation())
		{
			const PxU32 articulationBodyIdA = batch.remappedBodyAIndex[threadIndexInWarp];

			// Articulation IDs are at the back of rigid body IDs.
			const PxU32 globalBodyIdA = articulationBodyIdA + numDynamicBodies + bodyOffset;

			// Counting the number of active slabs
			curRef0 = static_cast<PxReal>(countActiveSlabs(globalBodyIdA, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
		}
		else if(solverBodyId0 >= bodyOffset)
		{
			// Counting the number of active slabs
			curRef0 = static_cast<PxReal>(countActiveSlabs(solverBodyId0, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
		}

		if(igNodeIndexB.isArticulation())
		{
			const PxU32 articulationBodyIdB = batch.remappedBodyBIndex[threadIndexInWarp];

			// Articulation IDs are at the back of rigid body IDs.
			const PxU32 globalBodyIdB = articulationBodyIdB + numDynamicBodies + bodyOffset;

			// Counting the number of active slabs
			curRef1 = static_cast<PxReal>(countActiveSlabs(globalBodyIdB, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
		}
		else if(solverBodyId1 >= bodyOffset)
		{
			// Counting the number of active slabs
			curRef1 = static_cast<PxReal>(countActiveSlabs(solverBodyId1, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
		}

		// Solve the batch: updates vel0/vel1 in place and accumulates the
		// spatial impulses applied to each articulation root.
		if(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
		{
			solveExtContactsBlock(batch, vel0, vel1, doFriction, msIterativeData.blockContactHeaders,
				msIterativeData.blockFrictionHeaders, msIterativeData.blockContactPoints,
				msIterativeData.blockFrictions, msIterativeData.artiResponse, impulse0, impulse1,
				threadIndexInWarp, accumulateError ? &error : NULL, curRef0, curRef1);
		}
		else
		{
			assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);
			solveExt1DBlock(batch, vel0, vel1, threadIndexInWarp, msIterativeData.blockJointConstraintHeaders,
				msIterativeData.blockJointConstraintRowsCon, msIterativeData.blockJointConstraintRowsMod,
				&responses[responseIndex], impulse0, impulse1,
				solverDesc->contactErrorAccumulator.mCounter >= 0, curRef0, curRef1);
		}

		// Write back body A: rigid bodies store updated velocities to their
		// remapped slots; articulations defer the impulse and flag the block.
		if (!igNodeIndexA.isArticulation())
		{
			const PxU32 outIndex = (batch.remappedBodyAIndex[threadIndexInWarp]);
			{
				msIterativeData.solverBodyVelPool[outIndex] = make_float4(vel0.bottom.x, vel0.bottom.y, vel0.bottom.z, 0.f);
				msIterativeData.solverBodyVelPool[outIndex+32] = make_float4(vel0.top.x, vel0.top.y, vel0.top.z, 0.f);
			}
		}
		else
		{
			//Write to GPU articulation!
			const PxU32 index = computeDeltaVIndex(nbArticulations, maxLinks, solverBodyId0, linkIndexA, slabId);
			storeSpatialVector(deferredZ + index, impulse0);

			//KS - TODO - let's see if we can skip *all* of this code below. It should be avoidable!
			isSlabDirty[solverBodyId0 + slabId*nbArticulations].x = linkIndexA; // this works because the articulations are enumerated first in solverBodyIds
			PxU32 articBlockId = solverBodyId0 / 32;
			gArticulations[articBlockId].mStateDirty[solverBodyId0&31] = PxgArtiStateDirtyFlag::eHAS_IMPULSES | PxgArtiStateDirtyFlag::eVEL_DIRTY;
		}

		// Write back body B, mirroring the body-A path.
		if (!igNodeIndexB.isArticulation())
		{
			const PxU32 outIndex = (batch.remappedBodyBIndex[threadIndexInWarp]);
			{
				msIterativeData.solverBodyVelPool[outIndex] = make_float4(vel1.bottom.x, vel1.bottom.y, vel1.bottom.z, 0.f);
				msIterativeData.solverBodyVelPool[outIndex + 32] = make_float4(vel1.top.x, vel1.top.y, vel1.top.z, 0.f);
			}
		}
		else
		{
			//Write to GPU articulation!
			const PxU32 index = computeDeltaVIndex(nbArticulations, maxLinks, solverBodyId1, linkIndexB, slabId);
			// storeSpatialVector performs the deferredZ write; the previous
			// direct assignment `deferredZ[index] = impulse1;` duplicated it
			// (the body-A path has the analogous assignment commented out).
			storeSpatialVector(deferredZ + index, impulse1);
			isSlabDirty[solverBodyId1 + slabId*nbArticulations].y = linkIndexB;
			PxU32 articBlockId = solverBodyId1 / 32;
			gArticulations[articBlockId].mStateDirty[solverBodyId1 & 31] = PxgArtiStateDirtyFlag::eHAS_IMPULSES | PxgArtiStateDirtyFlag::eVEL_DIRTY;
		}
	}

	if (accumulateError)
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
}
|
||||
834
engine/third_party/physx/source/gpusolver/src/CUDA/solver.cuh
vendored
Normal file
834
engine/third_party/physx/source/gpusolver/src/CUDA/solver.cuh
vendored
Normal file
@@ -0,0 +1,834 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __SOLVER_CUH__
|
||||
#define __SOLVER_CUH__
|
||||
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverConstraint1D.h"
|
||||
#include "PxgSolverConstraintBlock1D.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgConstraintBlock.h"
|
||||
#include "PxgSolverContext.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "PxgIntrinsics.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "solverResidual.cuh"
|
||||
#include "constraintPrepShared.cuh"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// This function is for contacts involving articulations.
// To apply mass-splitting, different data is stored and used when computing impulses.
// Apart from mass-splitting, the formulation is the same as the previous implementation, see "setupFinalizeExtSolverConstraintsBlock"
//
// Parameters:
//  - batch:             per-warp constraint batch; supplies the indices into the header/point/response arrays.
//  - vel0/vel1:         in/out spatial velocities of the two bodies; updated in place at the end.
//  - doFriction:        when false, the friction rows are skipped entirely (only normal rows are solved).
//  - contactHeaders/frictionHeaders/contactPoints/frictionPoints:
//                       SoA block-solver constraint data, read per-lane via threadIndexInWarp.
//  - responses:         articulation delta-velocity responses; consumed sequentially, one entry per
//                       constraint row (the resp pointer is advanced once per normal row and once per
//                       friction row, in order).
//  - impulse0/impulse1: out — accumulated raw impulses, scaled by the mass-splitting/mass terms on exit.
//  - error:             optional residual accumulator; may be NULL.
//  - ref0/ref1:         mass-splitting reference coefficients for body 0/1 (1.f == no splitting).
static __device__ void solveExtContactsBlock(const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0,
Cm::UnAlignedSpatialVector& vel1, const bool doFriction, PxgBlockSolverContactHeader* contactHeaders,
PxgBlockSolverFrictionHeader* frictionHeaders, PxgBlockSolverContactPoint* contactPoints,
PxgBlockSolverContactFriction* frictionPoints, const PxgArticulationBlockResponse* const PX_RESTRICT responses, Cm::UnAlignedSpatialVector& impulse0,
Cm::UnAlignedSpatialVector& impulse1, const PxU32 threadIndexInWarp, PxgErrorAccumulator* error,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
    // Raw (unscaled) impulse accumulators for each body; mass/ref scaling is applied once at the end.
    Cm::UnAlignedSpatialVector imp0(PxVec3(0.f), PxVec3(0.f));
    Cm::UnAlignedSpatialVector imp1(PxVec3(0.f), PxVec3(0.f));

    PxgBlockSolverContactHeader& contactHeader = contactHeaders[batch.mConstraintBatchIndex];
    PxgBlockSolverFrictionHeader& frictionHeader = frictionHeaders[batch.mConstraintBatchIndex];

    // Pxldcg/Pxstcg are load/store wrappers from PxgIntrinsics.h
    // (presumably cache-global ld/st variants — see that header for exact semantics).
    const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader.invMass0_1_angDom0_1[threadIndexInWarp]);

    const PxgArticulationBlockResponse* resp = &responses[batch.mArticulationResponseIndex];

    const uint numNormalConstr = Pxldcg(contactHeader.numNormalConstr[threadIndexInWarp]);
    const uint numFrictionConstr = Pxldcg(frictionHeader.numFrictionConstr[threadIndexInWarp]);

    const float restitution = contactHeader.restitution[threadIndexInWarp];
    // Fixed friction relaxation factor used in the friction velMultiplier below
    // (presumably an under-relaxation constant for stability — TODO confirm).
    const float p8 = 0.8f;
    const float cfm = contactHeader.cfm[threadIndexInWarp];
    const PxU8 flags = contactHeader.flags[threadIndexInWarp];

    PxgBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex];

    PxReal accumulatedNormalImpulse = 0.f;

    // Work on local copies of the velocities; written back to vel0/vel1 at the end.
    Cm::UnAlignedSpatialVector v0 = vel0;
    Cm::UnAlignedSpatialVector v1 = vel1;

    const float4 normal_staticFriction = Pxldcg(contactHeader.normal_staticFriction[threadIndexInWarp]);

    const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
    const PxReal staticCof = normal_staticFriction.w;

    // Software pipelining: pre-load all data for constraint 0 here; inside the loop,
    // the loads for row i+1 (or for the first friction row) are issued while row i is
    // being processed, hiding global-memory latency.
    float4 nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndexInWarp]);
    float4 nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndexInWarp]);
    float nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndexInWarp]);

    float nextResp0 = Pxldcg(contacts[0].resp0[threadIndexInWarp]);
    float nextResp1 = Pxldcg(contacts[0].resp1[threadIndexInWarp]);

    float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndexInWarp]);
    float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndexInWarp]);

    float3 nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
    float3 nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
    float3 nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
    float3 nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));

    PxgBlockSolverContactFriction* frictions = &frictionPoints[batch.startFrictionIndex];

    // ---- Normal (contact) rows -------------------------------------------------
    for (uint i = 0; i < numNormalConstr; i++)
    {
        PxgBlockSolverContactPoint& c = contacts[i];
        // Advance to the next response entry before prefetching; entry k holds the
        // articulation response for constraint row k.
        resp++;

        // Consume the values that were prefetched for this row.
        const float4 raXn_targetVelocity = nextRaxn_extraCoeff;
        const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
        const float appliedForce = nextAppliedForce;
        const float resp0 = nextResp0;
        const float resp1 = nextResp1;
        const float coeff0 = nextCoeff0;
        const float coeff1 = nextCoeff1;

        const float3 deltaRALin = nextDeltaRALin;
        const float3 deltaRAAng = nextDeltaRAAng;
        const float3 deltaRBLin = nextDeltaRBLin;
        const float3 deltaRBAng = nextDeltaRBAng;

        if ((i + 1) < numNormalConstr)
        {
            // Prefetch the next contact row.
            const PxgBlockSolverContactPoint& nextC = contacts[i + 1];

            nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndexInWarp]);
            nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndexInWarp]);
            nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndexInWarp]);

            nextResp0 = Pxldcg(nextC.resp0[threadIndexInWarp]);
            nextResp1 = Pxldcg(nextC.resp1[threadIndexInWarp]);

            nextCoeff0 = Pxldcg(nextC.coeff0[threadIndexInWarp]);
            nextCoeff1 = Pxldcg(nextC.coeff1[threadIndexInWarp]);

            nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
            nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
            nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
            nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
        }
        else if (numFrictionConstr && doFriction)
        {
            // Last normal row: prefetch the first friction row instead, so the friction
            // loop below starts with its data already in flight.
            nextRaxn_extraCoeff = Pxldcg(frictions[0].raXn_bias[threadIndexInWarp]);
            nextRbxn_maxImpulseW = Pxldcg(frictions[0].rbXn_targetVelW[threadIndexInWarp]);
            nextAppliedForce = Pxldcg(frictions[0].appliedForce[threadIndexInWarp]);
            nextResp0 = Pxldcg(frictions[0].resp0[threadIndexInWarp]);
            nextResp1 = Pxldcg(frictions[0].resp1[threadIndexInWarp]);

            nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
            nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
            nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
            nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
        }

        const PxVec3 raXn = PxVec3(raXn_targetVelocity.x, raXn_targetVelocity.y, raXn_targetVelocity.z);
        const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
        const float targetVelocity = raXn_targetVelocity.w;
        const float maxImpulse = rbXn_maxImpulse.w;

        // Mass-splitting: the unit response combines both bodies' responses weighted
        // by the reference coefficients.
        float unitResponse = ref0 * resp0 + ref1 * resp1;
        float recipResponse = (unitResponse > 0.0f) ? 1.0f / (unitResponse + cfm) : 0.0f;
        float velMultiplier = recipResponse;
        float impulseMul = 1.0f;
        float unbiasedError;
        float biasedErr;

        // Adjusts velMultiplier/impulseMul and computes the error terms based on the
        // restitution/compliance flags (see constraintPrepShared.cuh).
        computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0, coeff1,
            velMultiplier, impulseMul, unbiasedError, biasedErr);

        // Articulation delta-velocity responses, scaled by the mass-splitting refs.
        const Cm::UnAlignedSpatialVector deltaVA(ref0 * PxVec3(deltaRAAng.x, deltaRAAng.y, deltaRAAng.z),
            ref0 * PxVec3(deltaRALin.x, deltaRALin.y, deltaRALin.z));

        const Cm::UnAlignedSpatialVector deltaVB(ref1 * PxVec3(deltaRBAng.x, deltaRBAng.y, deltaRBAng.z),
            ref1 * PxVec3(deltaRBLin.x, deltaRBLin.y, deltaRBLin.z));

        // Relative velocity along the contact normal.
        const float v0_ = v0.bottom.dot(normal) + v0.top.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
        const float v1_ = v1.bottom.dot(normal) + v1.top.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
        const float normalVel = v0_ - v1_;

        //KS - clamp the maximum force
        // deltaF is clamped so the total applied force stays within [0, maxImpulse].
        const float tempDeltaF = biasedErr - normalVel * velMultiplier;
        const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
        const float _newForce = appliedForce * impulseMul + _deltaF;
        const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
        const float deltaF = newForce - appliedForce;

        if(error)
            error->accumulateErrorLocal(deltaF, velMultiplier);
        // Persist the new accumulated force for the next solver iteration.
        Pxstcg(&c.appliedForce[threadIndexInWarp], newForce);

        // Accumulate raw impulses (sign convention: body 0 negative, body 1 positive).
        imp0.bottom -= raXn * deltaF;
        imp0.top -= normal * deltaF;
        imp1.bottom += rbXn * deltaF;
        imp1.top += normal * deltaF;

        // Apply the velocity change via the precomputed articulation responses.
        v0 += deltaVA * deltaF;
        v1 += deltaVB * deltaF;

        accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
    }

    //Force a minimum normal force for friction. This is required for articulations with multi-link collisions
    //because often normal force can be solved with just 1 link's collisions. However, this means that other links can slide on
    //a surface friction-free because there was no normal force applied.
    accumulatedNormalImpulse = PxMax(accumulatedNormalImpulse, contactHeader.minNormalForce[threadIndexInWarp]);

    // ---- Friction rows ---------------------------------------------------------
    if (numFrictionConstr && doFriction)
    {

        //printf("FrictionHeader = %i, count = %i\n", batch.startFrictionIndex, numFrictionConstr);
        const float dynamicFrictionCof = frictionHeader.dynamicFriction[threadIndexInWarp];
        // Coulomb cone limits derived from the accumulated normal impulse.
        const float maxFrictionImpulse = staticCof * accumulatedNormalImpulse;
        const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
        //const float negMaxDynFrictionImpulse = -maxDynFrictionImpulse;

        PxU32 broken = 0;


        for (uint i = 0; i < numFrictionConstr; i++)
        {
            PxgBlockSolverContactFriction& f = frictions[i];
            resp++;

            // Friction rows alternate between the two tangent directions (i & 1).
            const float4 frictionNormal = frictionHeader.frictionNormals[i & 1][threadIndexInWarp];

            const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
            const float4 rbXn_targetVelW = nextRbxn_maxImpulseW;
            const float resp0 = nextResp0;
            const float resp1 = nextResp1;

            const float appliedForce = nextAppliedForce;


            const float3 deltaRALin = nextDeltaRALin;
            const float3 deltaRAAng = nextDeltaRAAng;
            const float3 deltaRBLin = nextDeltaRBLin;
            const float3 deltaRBAng = nextDeltaRBAng;

            if ((i + 1) < numFrictionConstr)
            {
                // Prefetch the next friction row (same pipelining scheme as above).
                const PxgBlockSolverContactFriction& f2 = frictions[i + 1];

                nextRaxn_extraCoeff = Pxldcg(f2.raXn_bias[threadIndexInWarp]);
                nextRbxn_maxImpulseW = Pxldcg(f2.rbXn_targetVelW[threadIndexInWarp]);
                nextResp0 = Pxldcg(f2.resp0[threadIndexInWarp]);
                nextResp1 = Pxldcg(f2.resp1[threadIndexInWarp]);

                nextAppliedForce = Pxldcg(f2.appliedForce[threadIndexInWarp]);
                nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
                nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
                nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
                nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
            }

            const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
            const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);

            // NOTE: this local `resp` (a float) shadows the outer response pointer for
            // the rest of the iteration; the pointer was already advanced above.
            const float resp = ref0 * resp0 + ref1 * resp1;
            // Friction uses the p8 relaxation factor (and no cfm term here).
            const float velMultiplier = (resp > PX_EPS_REAL) ? (p8 / resp) : 0.f;
            const float bias = raXn_extraCoeff.w;
            const float targetVel = rbXn_targetVelW.w;

            const Cm::UnAlignedSpatialVector deltaVA(ref0 * PxVec3(deltaRAAng.x, deltaRAAng.y, deltaRAAng.z),
                ref0 * PxVec3(deltaRALin.x, deltaRALin.y, deltaRALin.z));

            const Cm::UnAlignedSpatialVector deltaVB(ref1 * PxVec3(deltaRBAng.x, deltaRBAng.y, deltaRBAng.z),
                ref1 * PxVec3(deltaRBLin.x, deltaRBLin.y, deltaRBLin.z));

            // NOTE: this `normal` is the friction tangent direction, shadowing the
            // contact normal from the outer scope.
            const PxVec3 normal = PxVec3(frictionNormal.x, frictionNormal.y, frictionNormal.z);

            // Relative velocity along the friction direction.
            const float v0_ = v0.top.dot(raXn) + v0.bottom.dot(normal);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
            const float v1_ = v1.top.dot(rbXn) + v1.bottom.dot(normal);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
            const float normalVel = v0_ - v1_;

            const float tmp1 = appliedForce - (bias - targetVel) * velMultiplier;

            const float totalImpulse = tmp1 - normalVel * velMultiplier;

            // Static friction cone check: if exceeded, clamp to the dynamic cone and
            // mark the friction patch as broken.
            const bool clamp = fabsf(totalImpulse) > maxFrictionImpulse;

            const float totalClamped = fminf(maxDynFrictionImpulse, fmaxf(-maxDynFrictionImpulse, totalImpulse));

            const float newAppliedForce = clamp ? totalClamped : totalImpulse;

            float deltaF = newAppliedForce - appliedForce;//FSub(newAppliedForce, appliedForce);

            if (error)
                error->accumulateErrorLocal(deltaF, velMultiplier);

            //printf("v0 = (%f, %f, %f, %f, %f, %f), v1 = (%f, %f, %f, %f, %f, %f)\n", v0.top.x, v0.top.y, v0.top.z, v0.bottom.x, v0.bottom.y, v0.bottom.z,
            //	v1.top.x, v1.top.y, v1.top.z, v1.bottom.x, v1.bottom.y, v1.bottom.z);
            //printf("normal = (%f, %f, %f), raXn = (%f, %f, %f)\n", normal.x, normal.y, normal.z, raXn.x, raXn.y, raXn.z);

            //printf("Friction velMultiplier = %f, normalVel = %f, deltaF = %f\n", velMultiplier, normalVel, deltaF);

            v0 += deltaVA * deltaF;
            v1 += deltaVB * deltaF;

            imp0.bottom -= raXn * deltaF;
            imp0.top -= normal * deltaF;
            imp1.bottom += rbXn * deltaF;
            imp1.top += normal * deltaF;

            //f.appliedForce[threadIndex] = newAppliedForce;
            Pxstcg(&f.appliedForce[threadIndexInWarp], newAppliedForce);
            broken = broken | clamp;
        }
        Pxstcg(&frictionHeader.broken[threadIndexInWarp], broken);
    }

    // Scale the accumulated raw impulses by the mass-splitting refs and the packed
    // inverse-mass / angular-dominance terms before returning them to the caller.
    impulse0 = imp0.scale(ref0 * invMass0_1_angDom0_1.z, ref0 * invMass0_1_angDom0_1.x);
    impulse1 = imp1.scale(ref1 * invMass0_1_angDom0_1.y, ref1 * invMass0_1_angDom0_1.w);

    // Write the updated velocities back to the caller.
    vel0 = v0;
    vel1 = v1;
}
|
||||
|
||||
// A light version of the function "solveExtContactsBlock" to quickly check if there is any active contact.
//
// Runs the same normal-row impulse computation as solveExtContactsBlock (without
// mass-splitting refs and without friction) but writes nothing back: it returns true
// as soon as any row would produce a non-negligible force change, false otherwise.
//
// Parameters mirror solveExtContactsBlock; vel0/vel1 are read-only here.
static __device__ bool checkExtActiveContactBlock(const PxgBlockConstraintBatch& batch, const Cm::UnAlignedSpatialVector& vel0,
const Cm::UnAlignedSpatialVector& vel1, PxgBlockSolverContactHeader* contactHeaders,
PxgBlockSolverContactPoint* contactPoints, const PxgArticulationBlockResponse* const PX_RESTRICT responses,
const PxU32 threadIndexInWarp)
{
    PxgBlockSolverContactHeader& contactHeader = contactHeaders[batch.mConstraintBatchIndex];
    // NOTE(review): loaded but never used in this check-only path.
    const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader.invMass0_1_angDom0_1[threadIndexInWarp]);
    const PxgArticulationBlockResponse* resp = &responses[batch.mArticulationResponseIndex];
    const uint numNormalConstr = Pxldcg(contactHeader.numNormalConstr[threadIndexInWarp]);

    const float restitution = contactHeader.restitution[threadIndexInWarp];
    const float cfm = contactHeader.cfm[threadIndexInWarp];
    const PxU8 flags = contactHeader.flags[threadIndexInWarp];

    PxgBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex];

    // Local copies; never modified below since no impulses are applied in this check.
    Cm::UnAlignedSpatialVector v0 = vel0;
    Cm::UnAlignedSpatialVector v1 = vel1;

    const float4 normal_staticFriction = Pxldcg(contactHeader.normal_staticFriction[threadIndexInWarp]);
    const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

    // Same software-pipelined prefetch scheme as solveExtContactsBlock: data for row 0
    // is loaded up front; each iteration issues the loads for row i+1.
    float4 nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndexInWarp]);
    float4 nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndexInWarp]);
    float nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndexInWarp]);

    float nextResp0 = Pxldcg(contacts[0].resp0[threadIndexInWarp]);
    float nextResp1 = Pxldcg(contacts[0].resp1[threadIndexInWarp]);

    float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndexInWarp]);
    float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndexInWarp]);

    // NOTE(review): these response deltas are prefetched but never consumed in this
    // function — v0/v1 are never updated in the loop below.
    float3 nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
    float3 nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
    float3 nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
    float3 nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));

    for (uint i = 0; i < numNormalConstr; i++)
    {
        resp++;

        // Consume the prefetched values for this row.
        const float4 raXn_targetVelocity = nextRaxn_extraCoeff;
        const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
        const float appliedForce = nextAppliedForce;
        const float resp0 = nextResp0;
        const float resp1 = nextResp1;
        const float coeff0 = nextCoeff0;
        const float coeff1 = nextCoeff1;

        if ((i + 1) < numNormalConstr)
        {
            // Prefetch the next row.
            const PxgBlockSolverContactPoint& nextC = contacts[i + 1];

            nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndexInWarp]);
            nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndexInWarp]);
            nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndexInWarp]);

            nextResp0 = Pxldcg(nextC.resp0[threadIndexInWarp]);
            nextResp1 = Pxldcg(nextC.resp1[threadIndexInWarp]);

            nextCoeff0 = Pxldcg(nextC.coeff0[threadIndexInWarp]);
            nextCoeff1 = Pxldcg(nextC.coeff1[threadIndexInWarp]);

            nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
            nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
            nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
            nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
        }

        const PxVec3 raXn = PxVec3(raXn_targetVelocity.x, raXn_targetVelocity.y, raXn_targetVelocity.z);
        const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
        const float targetVelocity = raXn_targetVelocity.w;
        const float maxImpulse = rbXn_maxImpulse.w;

        // Unlike solveExtContactsBlock, no mass-splitting refs are applied here.
        float unitResponse = resp0 + resp1;
        float recipResponse = (unitResponse > 0.f) ? 1.f / (unitResponse + cfm) : 0.f;
        float velMultiplier = recipResponse;
        float impulseMul = 1.f;
        float unbiasedError;
        float biasedErr;

        computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0, coeff1,
            velMultiplier, impulseMul, unbiasedError, biasedErr);

        // Relative velocity along the contact normal.
        const float v0_ = v0.bottom.dot(normal) + v0.top.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
        const float v1_ = v1.bottom.dot(normal) + v1.top.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
        const float normalVel = v0_ - v1_;

        //KS - clamp the maximum force
        const float tempDeltaF = biasedErr - normalVel * velMultiplier;
        const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
        const float _newForce = appliedForce * impulseMul + _deltaF;
        const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
        const float deltaF = newForce - appliedForce;

        // Check for active contact.
        // Early-out on the first row whose force change exceeds the tolerance.
        if (PxAbs(deltaF) > 1.0e-8f)
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
// To apply mass-splitting, different data is stored and used when computing impulses.
|
||||
// Apart from mass-splitting, the formulation is the same as the previous implementation, see "setupFinalizeExtSolverConstraintsBlock"
|
||||
static __device__ PX_FORCE_INLINE void solveExtContactBlockTGS(const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0, Cm::UnAlignedSpatialVector& vel1, const Cm::UnAlignedSpatialVector& delta0, const Cm::UnAlignedSpatialVector& delta1,
|
||||
const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints, PxgTGSBlockSolverContactFriction* PX_RESTRICT frictionPoints,
|
||||
PxgArticulationBlockResponse* PX_RESTRICT responses, const PxReal elapsedTime, const PxReal minPen,
|
||||
Cm::UnAlignedSpatialVector& impulse0, Cm::UnAlignedSpatialVector& impulse1, PxgErrorAccumulator* error,
|
||||
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
|
||||
{
|
||||
PxVec3 linVel0 = vel0.bottom;
|
||||
PxVec3 linVel1 = vel1.bottom;
|
||||
PxVec3 angVel0 = vel0.top;
|
||||
PxVec3 angVel1 = vel1.top;
|
||||
|
||||
float accumulatedNormalImpulse = 0.f;
|
||||
|
||||
Cm::UnAlignedSpatialVector imp0(PxVec3(0.f), PxVec3(0.f)), imp1(PxVec3(0.f), PxVec3(0.f));
|
||||
{
|
||||
PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
|
||||
PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
|
||||
|
||||
const uint numNormalConstr = contactHeader->numNormalConstr[threadIndex];
|
||||
const uint totalFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];
|
||||
const uint numFrictionConstr = totalFrictionConstr & (~0x1);
|
||||
|
||||
const PxReal maxPenBias = contactHeader->maxPenBias[threadIndex];
|
||||
|
||||
PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
|
||||
|
||||
PxgArticulationBlockResponse* PX_RESTRICT resp = &responses[batch.mArticulationResponseIndex];
|
||||
|
||||
const float4 invMass0_1_angDom0_1 = contactHeader->invMass0_1_angDom0_1[threadIndex];
|
||||
|
||||
const float4 normal_staticFriction = contactHeader->normal_staticFriction[threadIndex];
|
||||
|
||||
const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
|
||||
const float p8 = contactHeader->p8[threadIndex];
|
||||
const float cfm = contactHeader->cfm[threadIndex];
|
||||
const PxU8 flags = (PxU8)contactHeader->flags[threadIndex];
|
||||
|
||||
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
|
||||
|
||||
//Bring forward a read event
|
||||
const float staticFrictionCof = normal_staticFriction.w;
|
||||
|
||||
const PxVec3 relMotion = delta0.bottom - delta1.bottom;
|
||||
|
||||
const float deltaV = normal.dot(relMotion);
|
||||
|
||||
{
|
||||
for (uint i = 0; i < numNormalConstr; i++)
|
||||
{
|
||||
PxgTGSBlockSolverContactPoint& c = contacts[i];
|
||||
PxgArticulationBlockResponse& r = *resp;
|
||||
resp++;
|
||||
|
||||
const float4 raXn_extraCoeff = c.raXn_extraCoeff[threadIndex];
|
||||
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
|
||||
const float compliantContactCoef = raXn_extraCoeff.w;
|
||||
|
||||
const float4 rbXn_targetVelW = c.rbXn_targetVelW[threadIndex];
|
||||
const float appliedForce = c.appliedForce[threadIndex];
|
||||
const float separation = c.separation[threadIndex];
|
||||
|
||||
const float maxImpulse = c.maxImpulse[threadIndex];
|
||||
|
||||
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
|
||||
|
||||
const float targetVel = rbXn_targetVelW.w;
|
||||
|
||||
float biasCoefficient = c.biasCoefficient[threadIndex];
|
||||
|
||||
const float resp0 = ref0 * Pxldcs(c.resp0[threadIndex]);
|
||||
const float resp1 = ref1 * Pxldcs(c.resp1[threadIndex]);
|
||||
|
||||
const float unitResponse = resp0 + resp1;
|
||||
const float recipResponse = (unitResponse > 0.f) ? (1.f / (unitResponse + cfm)) : 0.f;
|
||||
|
||||
float velMultiplier = recipResponse;
|
||||
|
||||
if (restitutionXdt < 0.f)
|
||||
{
|
||||
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, unitResponse, recipResponse,
|
||||
compliantContactCoef, velMultiplier, biasCoefficient);
|
||||
}
|
||||
|
||||
//Compute the normal velocity of the constraint.
|
||||
const PxReal v0 = angVel0.dot(raXn) + linVel0.dot(normal);
|
||||
const PxReal v1 = angVel1.dot(rbXn) + linVel1.dot(normal);
|
||||
const float normalVel = (v0 - v1);
|
||||
|
||||
const PxReal deltaBias = deltaV + delta0.top.dot(raXn) - delta1.top.dot(rbXn) - targetVel * elapsedTime;
|
||||
|
||||
const float sep = PxMax(minPen, separation + deltaBias);
|
||||
|
||||
const PxReal biased = PxMin(-maxPenBias, biasCoefficient * sep);
|
||||
const PxReal tVelBias = recipResponse * biased;
|
||||
|
||||
//KS - clamp the maximum force
|
||||
const float tempDeltaF = tVelBias - (normalVel - targetVel) * velMultiplier;
|
||||
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
|
||||
const float _newForce = appliedForce + _deltaF;
|
||||
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
|
||||
const float deltaF = newForce - appliedForce;
|
||||
|
||||
|
||||
PxVec3 deltaRALin = ref0 * PxVec3(r.deltaRALin_x[threadIndex], r.deltaRALin_y[threadIndex], r.deltaRALin_z[threadIndex]);
|
||||
PxVec3 deltaRAAng = ref0 * PxVec3(r.deltaRAAng_x[threadIndex], r.deltaRAAng_y[threadIndex], r.deltaRAAng_z[threadIndex]);
|
||||
PxVec3 deltaRBLin = ref1 * PxVec3(r.deltaRBLin_x[threadIndex], r.deltaRBLin_y[threadIndex], r.deltaRBLin_z[threadIndex]);
|
||||
PxVec3 deltaRBAng = ref1 * PxVec3(r.deltaRBAng_x[threadIndex], r.deltaRBAng_y[threadIndex], r.deltaRBAng_z[threadIndex]);
|
||||
linVel0 += deltaRALin * deltaF;
|
||||
linVel1 += deltaRBLin * deltaF;
|
||||
angVel0 += deltaRAAng * deltaF;
|
||||
angVel1 += deltaRBAng * deltaF;
|
||||
|
||||
imp0.top -= normal * deltaF;
|
||||
imp0.bottom -= raXn * deltaF;
|
||||
imp1.top += normal * deltaF;
|
||||
imp1.bottom += rbXn * deltaF;
|
||||
|
||||
if(error)
|
||||
error->accumulateErrorLocal(deltaF, velMultiplier);
|
||||
|
||||
c.appliedForce[threadIndex] = newForce;
|
||||
|
||||
accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
|
||||
}
|
||||
|
||||
accumulatedNormalImpulse = PxMax(accumulatedNormalImpulse, contactHeader->minNormalForce[threadIndex]);
|
||||
}
|
||||
|
||||
if (numFrictionConstr)
|
||||
{
|
||||
PxgTGSBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];
|
||||
|
||||
const float biasCoefficient = frictionHeader->biasCoefficient[threadIndex];
|
||||
|
||||
const float dynamicFrictionCof = frictionHeader->dynamicFriction[threadIndex];
|
||||
const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
|
||||
const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
|
||||
|
||||
PxU32 broken = 0;
|
||||
|
||||
const float4 frictionNormal0 = frictionHeader->frictionNormals[0][threadIndex];
|
||||
const float4 frictionNormal1 = frictionHeader->frictionNormals[1][threadIndex];
|
||||
const PxVec3 normal0 = PxVec3(frictionNormal0.x, frictionNormal0.y, frictionNormal0.z);
|
||||
const PxVec3 normal1 = PxVec3(frictionNormal1.x, frictionNormal1.y, frictionNormal1.z);
|
||||
|
||||
const PxReal deltaMotion0 = normal0.dot(relMotion);
|
||||
const PxReal deltaMotion1 = normal1.dot(relMotion);
|
||||
for (uint i = 0; i < numFrictionConstr; i += 2)
|
||||
{
|
||||
PxgTGSBlockSolverContactFriction& f0 = frictions[i];
|
||||
PxgArticulationBlockResponse& r0 = *resp;
|
||||
resp++;
|
||||
PxgTGSBlockSolverContactFriction& f1 = frictions[i + 1];
|
||||
PxgArticulationBlockResponse& r1 = *resp;
|
||||
resp++;
|
||||
|
||||
const float4 raXn_error0 = f0.raXn_error[threadIndex];
|
||||
const float4 rbXn_targetVelW0 = f0.rbXn_targetVelW[threadIndex];
|
||||
const float initialError0 = raXn_error0.w;
|
||||
const float appliedForce0 = f0.appliedForce[threadIndex];
|
||||
const float targetVel0 = rbXn_targetVelW0.w;
|
||||
|
||||
const float4 raXn_error1 = f1.raXn_error[threadIndex];
|
||||
const float4 rbXn_targetVelW1 = f1.rbXn_targetVelW[threadIndex];
|
||||
const float initialError1 = raXn_error1.w;
|
||||
const float appliedForce1 = f1.appliedForce[threadIndex];
|
||||
const float targetVel1 = rbXn_targetVelW1.w;
|
||||
|
||||
const PxVec3 raXn0 = PxVec3(raXn_error0.x, raXn_error0.y, raXn_error0.z);
|
||||
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
|
||||
|
||||
const PxVec3 raXn1 = PxVec3(raXn_error1.x, raXn_error1.y, raXn_error1.z);
|
||||
const PxVec3 rbXn1 = PxVec3(rbXn_targetVelW1.x, rbXn_targetVelW1.y, rbXn_targetVelW1.z);
|
||||
|
||||
const float resp0_0 = ref0 * f0.resp0[threadIndex];
|
||||
const float resp0_1 = ref1 * f0.resp1[threadIndex];
|
||||
const float resp0 = resp0_0 + resp0_1;
|
||||
const float velMultiplier0 = (resp0 > PX_EPS_REAL) ? (p8 / (resp0 + cfm)) : 0.f;
|
||||
|
||||
const float resp1_0 = ref0 * f1.resp0[threadIndex];
|
||||
const float resp1_1 = ref1 * f1.resp1[threadIndex];
|
||||
const float resp1 = resp1_0 + resp1_1;
|
||||
const float velMultiplier1 = (resp1 > PX_EPS_REAL) ? (p8 / (resp1 + cfm)) : 0.f;
|
||||
|
||||
const PxReal v00 = angVel0.dot(raXn0) + linVel0.dot(normal0);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
|
||||
const PxReal v10 = angVel1.dot(rbXn0) + linVel1.dot(normal0);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
|
||||
const float normalVel0 = v00 - v10;
|
||||
|
||||
const PxReal v01 = angVel0.dot(raXn1) + linVel0.dot(normal1);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
|
||||
const PxReal v11 = angVel1.dot(rbXn1) + linVel1.dot(normal1);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
|
||||
const float normalVel1 = v01 - v11;
|
||||
|
||||
const float error0 = initialError0 - targetVel0 * elapsedTime + (raXn0.dot(delta0.top) - rbXn0.dot(delta1.top) + deltaMotion0);
|
||||
const float bias0 = error0 * biasCoefficient;
|
||||
const float tmp10 = appliedForce0 - (bias0 - targetVel0) * velMultiplier0;
|
||||
const float totalImpulse0 = tmp10 - normalVel0 * velMultiplier0;
|
||||
|
||||
const float error1 = initialError1 - targetVel1 * elapsedTime + (raXn1.dot(delta0.top) - rbXn1.dot(delta1.top) + deltaMotion1);
|
||||
const float bias1 = error1 * biasCoefficient;
|
||||
const float tmp11 = appliedForce1 - (bias1 - targetVel1) * velMultiplier1;
|
||||
const float totalImpulse1 = tmp11 - normalVel1 * velMultiplier1;
|
||||
|
||||
const float totalImpulse = PxSqrt(totalImpulse0 * totalImpulse0 + totalImpulse1 * totalImpulse1);
|
||||
|
||||
const bool clamp = totalImpulse > maxFrictionImpulse;
|
||||
|
||||
const float ratio = clamp ? fminf(maxDynFrictionImpulse, totalImpulse) / totalImpulse : 1.f;
|
||||
|
||||
const PxReal newAppliedForce0 = totalImpulse0 * ratio;
|
||||
const PxReal newAppliedForce1 = totalImpulse1 * ratio;
|
||||
|
||||
float deltaF0 = newAppliedForce0 - appliedForce0;
|
||||
float deltaF1 = newAppliedForce1 - appliedForce1;
|
||||
|
||||
if (error)
|
||||
error->accumulateErrorLocal(deltaF0, deltaF1, velMultiplier0, velMultiplier1);
|
||||
|
||||
linVel0 += ref0 * PxVec3(r0.deltaRALin_x[threadIndex], r0.deltaRALin_y[threadIndex], r0.deltaRALin_z[threadIndex]) * deltaF0;
|
||||
linVel1 += ref1 * PxVec3(r0.deltaRBLin_x[threadIndex], r0.deltaRBLin_y[threadIndex], r0.deltaRBLin_z[threadIndex]) * deltaF0;
|
||||
angVel0 += ref0 * PxVec3(r0.deltaRAAng_x[threadIndex], r0.deltaRAAng_y[threadIndex], r0.deltaRAAng_z[threadIndex]) * deltaF0;
|
||||
angVel1 += ref1 * PxVec3(r0.deltaRBAng_x[threadIndex], r0.deltaRBAng_y[threadIndex], r0.deltaRBAng_z[threadIndex]) * deltaF0;
|
||||
|
||||
linVel0 += ref0 * PxVec3(r1.deltaRALin_x[threadIndex], r1.deltaRALin_y[threadIndex], r1.deltaRALin_z[threadIndex]) * deltaF1;
|
||||
linVel1 += ref1 * PxVec3(r1.deltaRBLin_x[threadIndex], r1.deltaRBLin_y[threadIndex], r1.deltaRBLin_z[threadIndex]) * deltaF1;
|
||||
angVel0 += ref0 * PxVec3(r1.deltaRAAng_x[threadIndex], r1.deltaRAAng_y[threadIndex], r1.deltaRAAng_z[threadIndex]) * deltaF1;
|
||||
angVel1 += ref1 * PxVec3(r1.deltaRBAng_x[threadIndex], r1.deltaRBAng_y[threadIndex], r1.deltaRBAng_z[threadIndex]) * deltaF1;
|
||||
|
||||
f0.appliedForce[threadIndex] = newAppliedForce0;
|
||||
f1.appliedForce[threadIndex] = newAppliedForce1;
|
||||
broken = broken | clamp;
|
||||
|
||||
imp0.top -= normal0 * deltaF0;
|
||||
imp0.bottom -= raXn0 * deltaF0;
|
||||
imp1.top += normal0 * deltaF0;
|
||||
imp1.bottom += rbXn0 * deltaF0;
|
||||
|
||||
imp0.top -= normal1 * deltaF1;
|
||||
imp0.bottom -= raXn1 * deltaF1;
|
||||
imp1.top += normal1 * deltaF1;
|
||||
imp1.bottom += rbXn1 * deltaF1;
|
||||
}
|
||||
|
||||
if (numFrictionConstr < totalFrictionConstr)
|
||||
{
|
||||
//We have a torsional friction constraint
|
||||
|
||||
const PxReal frictionScale = frictionHeader->torsionalFrictionScale[threadIndex];
|
||||
|
||||
PxgTGSBlockSolverContactFriction& f0 = frictions[numFrictionConstr];
|
||||
PxgArticulationBlockResponse& r0 = *resp;
|
||||
resp++;
|
||||
|
||||
const float4 raXn_error0 = f0.raXn_error[threadIndex];
|
||||
const float4 rbXn_targetVelW0 = f0.rbXn_targetVelW[threadIndex];
|
||||
const float appliedForce0 = f0.appliedForce[threadIndex];
|
||||
const float targetVel0 = rbXn_targetVelW0.w;
|
||||
|
||||
const PxVec3 raXn0 = PxVec3(raXn_error0.x, raXn_error0.y, raXn_error0.z);
|
||||
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
|
||||
|
||||
const float resp0_0 = ref0 * f0.resp0[threadIndex];
|
||||
const float resp0_1 = ref1 * f0.resp1[threadIndex];
|
||||
const float resp0 = resp0_0 + resp0_1;
|
||||
const float velMultiplier0 = (resp0 > 0.f) ? (p8 / (resp0 + cfm)) : 0.f;
|
||||
|
||||
const PxReal v00 = angVel0.dot(raXn0);
|
||||
const PxReal v10 = angVel1.dot(rbXn0);
|
||||
const float normalVel0 = v00 - v10;
|
||||
|
||||
const float tmp10 = appliedForce0 - (-targetVel0) * velMultiplier0;
|
||||
const float totalImpulse = tmp10 - normalVel0 * velMultiplier0;
|
||||
|
||||
const bool clamp = PxAbs(totalImpulse) > (maxFrictionImpulse * frictionScale);
|
||||
|
||||
const PxReal totalClamped = PxClamp(totalImpulse, -maxDynFrictionImpulse * frictionScale, maxDynFrictionImpulse * frictionScale);
|
||||
|
||||
const PxReal newAppliedForce = clamp ? totalClamped : totalImpulse;
|
||||
|
||||
const PxReal deltaF0 = newAppliedForce - appliedForce0;
|
||||
if (error)
|
||||
error->accumulateErrorLocal(deltaF0, velMultiplier0);
|
||||
|
||||
linVel0 += ref0 * PxVec3(r0.deltaRALin_x[threadIndex], r0.deltaRALin_y[threadIndex], r0.deltaRALin_z[threadIndex]) * deltaF0;
|
||||
linVel1 += ref1 * PxVec3(r0.deltaRBLin_x[threadIndex], r0.deltaRBLin_y[threadIndex], r0.deltaRBLin_z[threadIndex]) * deltaF0;
|
||||
angVel0 += ref0 * PxVec3(r0.deltaRAAng_x[threadIndex], r0.deltaRAAng_y[threadIndex], r0.deltaRAAng_z[threadIndex]) * deltaF0;
|
||||
angVel1 += ref1 * PxVec3(r0.deltaRBAng_x[threadIndex], r0.deltaRBAng_y[threadIndex], r0.deltaRBAng_z[threadIndex]) * deltaF0;
|
||||
|
||||
f0.appliedForce[threadIndex] = newAppliedForce;
|
||||
broken = broken | clamp;
|
||||
|
||||
imp0.bottom -= raXn0 * deltaF0;
|
||||
imp1.bottom += rbXn0 * deltaF0;
|
||||
}
|
||||
|
||||
frictionHeader->broken[threadIndex] = broken;
|
||||
}
|
||||
|
||||
vel0.bottom = linVel0;
|
||||
vel0.top = angVel0;
|
||||
vel1.bottom = linVel1;
|
||||
vel1.top = angVel1;
|
||||
|
||||
impulse0 = imp0.scale(invMass0_1_angDom0_1.x * ref0, invMass0_1_angDom0_1.z * ref0);
|
||||
impulse1 = imp1.scale(invMass0_1_angDom0_1.y * ref1, invMass0_1_angDom0_1.w * ref1);
|
||||
}
|
||||
}
|
||||
|
||||
// A light version of the function "solveExtContactBlockTGS" to quickly check if there is any active contact.
//
// Replays the normal-constraint impulse computation of the full TGS contact solve for an
// articulation pair without applying any impulses or writing any state back. Returns true
// as soon as one contact point would change its applied impulse by more than 1e-8, and
// false if every contact in the batch is inactive.
// - vel0/vel1:   current spatial velocities (top = angular, bottom = linear, see reads below).
// - delta0/delta1: accumulated motion deltas used for the TGS positional bias term.
// - threadIndex: lane used to index the per-thread (SoA) arrays inside each block structure.
// - elapsedTime: accumulated sub-step time, scales the target-velocity part of the bias.
// - minPen:      lower clamp applied to the biased separation.
static __device__ PX_FORCE_INLINE bool checkExtActiveContactBlockTGS(const PxgBlockConstraintBatch& batch,
	const Cm::UnAlignedSpatialVector& vel0, const Cm::UnAlignedSpatialVector& vel1, const Cm::UnAlignedSpatialVector& delta0, const Cm::UnAlignedSpatialVector& delta1,
	const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints,
	PxgArticulationBlockResponse* PX_RESTRICT responses, const PxReal elapsedTime, const PxReal minPen)
{
	PxVec3 linVel0 = vel0.bottom;
	PxVec3 linVel1 = vel1.bottom;
	PxVec3 angVel0 = vel0.top;
	PxVec3 angVel1 = vel1.top;

	{
		PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		const uint numNormalConstr = contactHeader->numNormalConstr[threadIndex];
		const PxReal maxPenBias = contactHeader->maxPenBias[threadIndex];
		PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
		PxgArticulationBlockResponse* PX_RESTRICT resp = &responses[batch.mArticulationResponseIndex];

		const float4 normal_staticFriction = contactHeader->normal_staticFriction[threadIndex];

		const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
		const float cfm = contactHeader->cfm[threadIndex];
		const PxU8 flags = (PxU8)contactHeader->flags[threadIndex];

		// xyz = contact normal; w (static friction coefficient) is not needed for this check.
		const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

		// Bring forward a read event
		// Relative linear motion along the normal, shared by all contact points of the batch.
		const PxVec3 relMotion = delta0.bottom - delta1.bottom;
		const float deltaV = normal.dot(relMotion);

		for(uint i = 0; i < numNormalConstr; i++)
		{
			PxgTGSBlockSolverContactPoint& c = contacts[i];
			// Cursor advanced to mirror the full solver's traversal; the response data
			// itself is never dereferenced in this lightweight check.
			resp++;

			const float4 raXn_extraCoeff = c.raXn_extraCoeff[threadIndex];
			const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
			const float compliantContactCoef = raXn_extraCoeff.w;

			const float4 rbXn_targetVelW = c.rbXn_targetVelW[threadIndex];
			const float appliedForce = c.appliedForce[threadIndex];
			const float separation = c.separation[threadIndex];

			const float maxImpulse = c.maxImpulse[threadIndex];

			const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);

			const float targetVel = rbXn_targetVelW.w;

			float biasCoefficient = c.biasCoefficient[threadIndex];

			const float resp0 = Pxldcs(c.resp0[threadIndex]);
			const float resp1 = Pxldcs(c.resp1[threadIndex]);

			const float unitResponse = resp0 + resp1;
			const float recipResponse = (unitResponse > 0.f) ? (1.f / (unitResponse + cfm)) : 0.f;

			float velMultiplier = recipResponse;

			// Negative restitutionXdt encodes a compliant contact; recompute the
			// velocity multiplier and bias coefficient accordingly.
			if(restitutionXdt < 0.f)
			{
				computeCompliantContactCoefficientsTGS(flags, restitutionXdt, unitResponse, recipResponse,
					compliantContactCoef, velMultiplier, biasCoefficient);
			}

			// Compute the normal velocity of the constraint.
			const PxReal v0 = angVel0.dot(raXn) + linVel0.dot(normal);
			const PxReal v1 = angVel1.dot(rbXn) + linVel1.dot(normal);
			const float normalVel = (v0 - v1);

			// TGS positional correction accumulated so far, minus the motion the
			// target velocity should have produced over the elapsed sub-step time.
			const PxReal deltaBias = deltaV + delta0.top.dot(raXn) - delta1.top.dot(rbXn) - targetVel * elapsedTime;

			const float sep = PxMax(minPen, separation + deltaBias);

			const PxReal biased = PxMin(-maxPenBias, biasCoefficient * sep);
			const PxReal tVelBias = recipResponse * biased;

			// KS - clamp the maximum force
			const float tempDeltaF = tVelBias - (normalVel - targetVel) * velMultiplier;
			const float _deltaF = fmaxf(tempDeltaF, -appliedForce); // FMax(FNegScaleSub(normalVel, velMultiplier,
																	// biasedErr), FNeg(appliedForce));
			const float _newForce = appliedForce + _deltaF;
			const float newForce = fminf(_newForce, maxImpulse); // FMin(_newForce, maxImpulse);
			const float deltaF = newForce - appliedForce;

			// Check for active contact.
			if(PxAbs(deltaF) > 1.0e-8f)
			{
				return true;
			}
		}
	}

	return false;
}
|
||||
|
||||
|
||||
#endif
|
||||
849
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlock.cuh
vendored
Normal file
849
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlock.cuh
vendored
Normal file
@@ -0,0 +1,849 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __SOLVER_BLOCK_CUH__
|
||||
#define __SOLVER_BLOCK_CUH__
|
||||
|
||||
#include "common/PxPhysXCommonConfig.h"
|
||||
#include <cuda.h>
|
||||
#include <sm_35_intrinsics.h>
|
||||
#include "PxgSolverBody.h"
|
||||
//#include "PxgSolverConstraint1D.h"
|
||||
#include "PxgSolverConstraintBlock1D.h"
|
||||
#include "PxgSolverConstraintDesc.h"
|
||||
#include "PxgConstraint.h"
|
||||
#include "PxgConstraintBlock.h"
|
||||
#include "PxgIslandContext.h"
|
||||
#include "PxgSolverContext.h"
|
||||
#include "cutil_math.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "DyThresholdTable.h"
|
||||
#include "PxgFrictionPatch.h"
|
||||
#include "foundation/PxUtilities.h"
|
||||
#include "PxgConstraintWriteBack.h"
|
||||
#include "PxgSolverFlags.h"
|
||||
#include "PxgIntrinsics.h"
|
||||
#include "stdio.h"
|
||||
#include "assert.h"
|
||||
#include "solverResidual.cuh"
|
||||
#include "constraintPrepShared.cuh"
|
||||
|
||||
#include "solverBlockCommon.cuh"
|
||||
#include "PxgDynamicsConfiguration.h"
|
||||
#include "DyCpuGpu1dConstraint.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Rounds x up via bit smearing: OR-ing in all right-shifted copies sets every bit
// below the highest set bit, so adding 1 yields a power of two. Note the edge
// semantics: an exact power of two maps to the NEXT larger one (e.g. 8 -> 16),
// and 0 maps to 1.
PX_FORCE_INLINE static __device__ uint32_t nextPowerOfTwo(uint32_t x)
{
	// Shifts 1, 2, 4, 8, 16 cover all 32 bits.
	for (uint32_t shift = 1; shift <= 16; shift <<= 1)
		x |= (x >> shift);
	return x + 1;
}
|
||||
|
||||
// True when at most one bit of x is set: x & (x - 1) clears the lowest set bit,
// so the result is zero exactly for powers of two. Note that x == 0 also passes
// this test.
PX_FORCE_INLINE static __device__ bool isPowerOfTwo(uint32_t x)
{
	const uint32_t lowestBitCleared = x & (x - 1u);
	return lowestBitCleared == 0u;
}
|
||||
|
||||
// TGS bodies occupy three float4s (linVel, angVel, plus deltas) per body, PGS bodies
// two (linVel, angVel); the default float4sPerBody = 2 therefore corresponds to PGS.
// Bodies are laid out in warp-sized (32) groups: the group base is scaled by the
// per-body float4 count while the lane offset within the group is kept as-is.
PX_FORCE_INLINE static __device__ PxU32 ComputeAverageBodyBatchStartIndex(const PxU32 bodyIndex, const PxU32 float4sPerBody = 2)
{
	const PxU32 groupBase = bodyIndex & (~31); // first body of this group of 32
	const PxU32 lane = bodyIndex & 31;         // position inside the group
	return float4sPerBody * groupBase + lane;
}
|
||||
|
||||
// Counts how many solver slabs reference the body at "index" by population-counting
// the per-body bit words stored in "encodedReferenceCounts" (one 32-bit word per body
// for each group of 32 slabs, strided by numSolverBodies). Clamped to a minimum of 1
// so the result is always safe to use as a divisor/reference count.
PX_FORCE_INLINE static __device__ PxU32 countActiveSlabs(PxU32 index, PxU32 numSlabs, PxU32 numSolverBodies,
	const PxU32* const encodedReferenceCounts)
{
	const PxU32 numWords = (numSlabs + 31) / 32; // In case more than 32 slabs are used.

	PxU32 activeCount = 0;
	for (PxU32 word = 0; word < numWords; ++word)
	{
		const PxU32 bits = encodedReferenceCounts[index + word * numSolverBodies];
		activeCount += static_cast<PxU32>(__popc(bits));
	}

	return PxMax(1u, activeCount);
}
|
||||
|
||||
// Zeroes the slab reference words of the body at "index". The storage layout is the
// same one read by countActiveSlabs: one 32-bit word per body for every group of
// 32 slabs, strided by numSolverBodies.
PX_FORCE_INLINE static __device__ void resetSlabCount(PxU32 index, PxU32 numSlabs, PxU32 numSolverBodies,
	PxU32* PX_RESTRICT encodedReferenceCounts)
{
	const PxU32 numWords = (numSlabs + 31) / 32; // In case more than 32 slabs are used.
	for (PxU32 word = 0; word < numWords; ++word)
		encodedReferenceCounts[index + word * numSolverBodies] = 0u;
}
|
||||
|
||||
// Mass-splitting version of 1D constraints; mass-related terms are computed at every sub-timestep. See "setupSolverConstraintBlockGPU".
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
//
// Solves every 1D constraint row of one block batch for a rigid-rigid pair, updating
// the body velocities in place.
// - b0LinVel/b0AngVel/b1LinVel/b1AngVel: body velocities, read on entry and written back on exit.
// - threadIndex: lane used to index the per-thread (SoA) arrays inside each block structure.
// - residualReportingEnabled: when true, per-row solver residuals are stored for error reporting.
// - ref0/ref1: mass-splitting scale factors applied to each body's inverse mass, inverse
//   inertia scale and unit-response contribution (1.f = no splitting).
static __device__ void solve1DBlock(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel, const PxU32 threadIndex,
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
	PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod, bool residualReportingEnabled,
	PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	using namespace physx;

	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT baseMod = &rowsMod[batch.startConstraintIndex];

	// Work on local copies; written back once at the end.
	PxVec3 linVel0 = b0LinVel;
	PxVec3 linVel1 = b1LinVel;
	PxVec3 angVel0 = b0AngVel;
	PxVec3 angVel1 = b1AngVel;

	// Mass-split inverse masses/inertia scales for this reference of the bodies.
	float invMass0 = ref0 * header->invMass0D0[threadIndex];
	float invMass1 = ref1 * header->invMass1D1[threadIndex];

	float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
	float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];

	for (PxU32 i = 0; i < header->rowCounts[threadIndex]; ++i)
	{
		PxgBlockSolverConstraint1DCon& ccon = baseCon[i];
		PxgBlockSolverConstraint1DMod& cmod = baseMod[i];

		// Jacobian rows packed as float4: xyz = axis, w = extra scalar (min/max impulse
		// or per-body response, as named).
		const float4 _clinVel0_minImpulse = ccon.lin0XYZ_minImpulse[threadIndex];
		const float4 _clinVel1_maxImpulse = ccon.lin1XYZ_maxImpulse[threadIndex];
		const float4 _cangVel0_resp0 = ccon.ang0XYZ_resp0[threadIndex];
		const float4 _cangVel1_resp1 = ccon.ang1XYZ_resp1[threadIndex];

		const PxVec3 clinVel0(_clinVel0_minImpulse.x, _clinVel0_minImpulse.y, _clinVel0_minImpulse.z);
		const PxVec3 clinVel1(_clinVel1_maxImpulse.x, _clinVel1_maxImpulse.y, _clinVel1_maxImpulse.z);
		const PxVec3 cangVel0(_cangVel0_resp0.x, _cangVel0_resp0.y, _cangVel0_resp0.z);
		const PxVec3 cangVel1(_cangVel1_resp1.x, _cangVel1_resp1.y, _cangVel1_resp1.z);

		// Per-body responses are scaled by the mass-splitting factors before being summed.
		const PxReal resp0 = ref0 * _cangVel0_resp0.w;
		const PxReal resp1 = ref1 * _cangVel1_resp1.w;
		const PxReal initJointSpeed = ccon.initJointSpeed[threadIndex];

		const PxReal coeff0 = cmod.coeff0[threadIndex];
		const PxReal coeff1 = cmod.coeff1[threadIndex];
		const PxU32 flags = cmod.flags[threadIndex];

		const PxReal unitResponse = resp0 + resp1;

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_MIN_RESPONSE;
		const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);

		PxReal constant, unbiasedConstant, vMul, iMul;

		bool isSpring = flags & DY_SC_FLAG_SPRING;
		bool isAccelerationSpring = flags & DY_SC_FLAG_ACCELERATION_SPRING;

		// Derives the PGS solve constants (constant term, velocity multiplier, impulse
		// multiplier) from the row coefficients and the freshly computed response.
		Dy::compute1dConstraintSolverConstantsPGS(isSpring, isAccelerationSpring, coeff0, coeff1, initJointSpeed, unitResponse,
			recipResponse, constant, unbiasedConstant, vMul, iMul);

		// For velocity iterations, "constant" is overwritten by "unbiasedConstant".
		// This is currently done by assigning coeff1 to coeff0 in "conclude1DBlock".

		const float appliedForce = cmod.appliedForce[threadIndex];//FLoad(c.appliedForce);

		const float maxImpulse = _clinVel1_maxImpulse.w;//FLoad(c.maxImpulse);
		const float minImpulse = _clinVel0_minImpulse.w;//FLoad(c.minImpulse);

		// Project the body velocities onto the constraint Jacobian.
		const float v0 = linVel0.dot(clinVel0) + angVel0.dot(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
		const float v1 = linVel1.dot(clinVel1) + angVel1.dot(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));

		const float normalVel = v0 - v1;

		// Accumulated impulse is clamped to [minImpulse, maxImpulse]; only the change
		// (deltaF) is applied to the velocities.
		const float unclampedForce = iMul*appliedForce + (vMul*normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
		const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
		const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);

		cmod.appliedForce[threadIndex] = clampedForce;
		if(residualReportingEnabled)
			cmod.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);

		linVel0 = linVel0 + clinVel0*(deltaF*invMass0);//V3ScaleAdd(clinVel0, FMul(deltaF, invMass0), linVel0);
		linVel1 = linVel1 - clinVel1*(deltaF*invMass1);//V3NegScaleSub(clinVel1, FMul(deltaF, invMass1), linVel1);
		angVel0 = angVel0 + cangVel0*deltaF*invInertiaScale0;//V3ScaleAdd(cangVel0, deltaF, angVel0);
		angVel1 = angVel1 - cangVel1*deltaF*invInertiaScale1;//V3NegScaleSub(cangVel1, deltaF, angVel1);

	}


	// Publish the updated velocities back to the caller.
	b0LinVel = linVel0;
	b0AngVel = angVel0;
	b1LinVel = linVel1;
	b1AngVel = angVel1;

}
|
||||
|
||||
// Mass-splitting version of 1D constraints; mass-related terms are computed at every sub-timestep. See "setupArtiSolverConstraintBlockGPU".
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
//
// Solves every 1D constraint row of one block batch where at least one body is an
// articulation. Velocity changes are applied through precomputed articulation response
// vectors ("artiResponse") rather than plain inverse-mass scaling, and the raw impulses
// are accumulated and returned through impluse0/impluse1 (parameter spelling kept as-is;
// it is part of the existing interface).
// - vel0/vel1: spatial velocities (top = angular, bottom = linear), updated in place.
// - impluse0/impluse1: accumulated spatial impulses, read on entry and written on exit.
// - threadIndex: lane used to index the per-thread (SoA) arrays inside each block structure.
// - ref0/ref1: mass-splitting scale factors for each body's mass/inertia/response terms.
static __device__ void solveExt1DBlock(const PxgBlockConstraintBatch& batch,
	Cm::UnAlignedSpatialVector& vel0,
	Cm::UnAlignedSpatialVector& vel1,
	const PxU32 threadIndex,
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers,
	PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
	PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod,
	PxgArticulationBlockResponse* PX_RESTRICT artiResponse,
	Cm::UnAlignedSpatialVector& impluse0,
	Cm::UnAlignedSpatialVector& impluse1,
	bool residualReportingEnabled,
	PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	using namespace physx;

	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT baseMod = &rowsMod[batch.startConstraintIndex];

	// Local copies of the spatial velocities (bottom = linear, top = angular).
	PxVec3 linVel0 = vel0.bottom;
	PxVec3 linVel1 = vel1.bottom;
	PxVec3 angVel0 = vel0.top;
	PxVec3 angVel1 = vel1.top;

	// Impulse accumulators: li* from .bottom (linear part), ai* from .top (angular part).
	PxVec3 li0 = impluse0.bottom;
	PxVec3 li1 = impluse1.bottom;
	PxVec3 ai0 = impluse0.top;
	PxVec3 ai1 = impluse1.top;

	// Mass-split inverse masses/inertia scales, only used for the final impulse scaling.
	float invMass0 = ref0 * header->invMass0D0[threadIndex];
	float invMass1 = ref1 * header->invMass1D1[threadIndex];

	float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
	float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];

	// Constraint-force-mixing term added to the unit response for articulations.
	const float cfm = header->cfm[threadIndex];

	const PxU32 numRows = header->rowCounts[threadIndex];
	for (PxU32 i = 0; i < numRows; ++i)
	{
		PxgBlockSolverConstraint1DCon& ccon = baseCon[i];
		PxgBlockSolverConstraint1DMod& cmod = baseMod[i];
		PxgArticulationBlockResponse& response = artiResponse[i];

		// Jacobian rows packed as float4: xyz = axis, w = extra scalar (min/max impulse
		// or per-body response, as named).
		const float4 _clinVel0_minImpulse = ccon.lin0XYZ_minImpulse[threadIndex];
		const float4 _clinVel1_maxImpulse = ccon.lin1XYZ_maxImpulse[threadIndex];
		const float4 _cangVel0_resp0 = ccon.ang0XYZ_resp0[threadIndex];
		const float4 _cangVel1_resp1 = ccon.ang1XYZ_resp1[threadIndex];

		// Per-body responses scaled by the mass-splitting factors.
		const PxReal resp0 = ref0 * _cangVel0_resp0.w;
		const PxReal resp1 = ref1 * _cangVel1_resp1.w;
		const PxReal initJointSpeed = ccon.initJointSpeed[threadIndex];

		const PxVec3 clinVel0(_clinVel0_minImpulse.x, _clinVel0_minImpulse.y, _clinVel0_minImpulse.z);
		const PxVec3 clinVel1(_clinVel1_maxImpulse.x, _clinVel1_maxImpulse.y, _clinVel1_maxImpulse.z);
		const PxVec3 cangVel0(_cangVel0_resp0.x, _cangVel0_resp0.y, _cangVel0_resp0.z);
		const PxVec3 cangVel1(_cangVel1_resp1.x, _cangVel1_resp1.y, _cangVel1_resp1.z);

		const PxReal coeff0 = cmod.coeff0[threadIndex];
		const PxReal coeff1 = cmod.coeff1[threadIndex];
		const PxU32 flags = cmod.flags[threadIndex];

		// Unlike the rigid-rigid path, cfm is folded into the unit response here.
		const PxReal unitResponse = resp0 + resp1 + cfm;

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_MIN_RESPONSE;
		const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);

		PxReal constant, unbiasedConstant, vMul, iMul;

		bool isSpring = flags & DY_SC_FLAG_SPRING;
		bool isAccelerationSpring = flags & DY_SC_FLAG_ACCELERATION_SPRING;

		compute1dConstraintSolverConstantsPGS(isSpring, isAccelerationSpring, coeff0, coeff1, initJointSpeed, unitResponse, recipResponse,
			constant, unbiasedConstant, vMul, iMul);

		// For velocity iterations, "constant" is overwritten by "unbiasedConstant".
		// This is currently done by assigning coeff1 to coeff0 in "conclude1DBlock".

		const float appliedForce = cmod.appliedForce[threadIndex];//FLoad(c.appliedForce);
		const float maxImpulse = _clinVel1_maxImpulse.w;//FLoad(c.maxImpulse);
		const float minImpulse = _clinVel0_minImpulse.w;//FLoad(c.minImpulse);

		// Project the body velocities onto the constraint Jacobian.
		const float v0 = linVel0.dot(clinVel0) + angVel0.dot(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
		const float v1 = linVel1.dot(clinVel1) + angVel1.dot(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));

		const float normalVel = v0 - v1;

		// Accumulated impulse is clamped to [minImpulse, maxImpulse]; only the change
		// (deltaF) is applied.
		const float unclampedForce = iMul * appliedForce + (vMul * normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
		const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
		const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);

		cmod.appliedForce[threadIndex] = clampedForce;
		if(residualReportingEnabled)
			cmod.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);

		// Accumulate the raw (unscaled) impulses along the Jacobian axes.
		li0 = clinVel0 * deltaF + li0;
		ai0 = cangVel0 * deltaF + ai0;
		li1 = clinVel1 * deltaF + li1;
		ai1 = cangVel1 * deltaF + ai1;

		// Velocity changes come from the precomputed articulation responses, scaled by
		// the mass-splitting factors and the impulse change.
		PxVec3 linVa = ref0 * PxVec3(response.deltaRALin_x[threadIndex], response.deltaRALin_y[threadIndex], response.deltaRALin_z[threadIndex]);
		PxVec3 angVa = ref0 * PxVec3(response.deltaRAAng_x[threadIndex], response.deltaRAAng_y[threadIndex], response.deltaRAAng_z[threadIndex]);
		PxVec3 linVb = ref1 * PxVec3(response.deltaRBLin_x[threadIndex], response.deltaRBLin_y[threadIndex], response.deltaRBLin_z[threadIndex]);
		PxVec3 angVb = ref1 * PxVec3(response.deltaRBAng_x[threadIndex], response.deltaRBAng_y[threadIndex], response.deltaRBAng_z[threadIndex]);

		linVel0 = linVa * deltaF + linVel0;
		angVel0 = angVa * deltaF + angVel0;

		linVel1 = linVb * deltaF + linVel1;
		angVel1 = angVb * deltaF + angVel1;
	}

	vel0.top = angVel0; vel0.bottom = linVel0;
	vel1.top = angVel1; vel1.bottom = linVel1;

	// NOTE(review): on input, .bottom carried the linear part (li*) and .top the angular
	// part (ai*), but here the mass-scaled linear sum is written to .top and the
	// inertia-scaled angular sum to .bottom — confirm this top/bottom swap matches the
	// convention the caller expects.
	impluse0.top = li0 * invMass0; impluse0.bottom = ai0 * invInertiaScale0;
	impluse1.top = li1 * invMass1; impluse1.bottom = ai1 * invInertiaScale1;
}
|
||||
|
||||
// Finalizes the 1D constraint rows of a batch before the velocity iterations: copying
// coeff1 into coeff0 makes subsequent solves pick up the unbiased constant in place of
// the biased one (see "queryReduced1dConstraintSolverConstantsPGS" and the matching
// comment in solve1DBlock). Spring rows are skipped; their constants already satisfy
// this requirement.
static __device__ void conclude1DBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod)
{
	using namespace physx;
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT hdr = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT rows = &rowsMod[batch.startConstraintIndex];

	const PxU32 rowCount = hdr->rowCounts[threadIndex];
	for (PxU32 row = 0; row < rowCount; ++row)
	{
		PxgBlockSolverConstraint1DMod& mod = rows[row];
		const bool isSpring = (mod.flags[threadIndex] & DY_SC_FLAG_SPRING) != 0;
		if (!isSpring)
			mod.coeff0[threadIndex] = mod.coeff1[threadIndex];
	}
}
|
||||
|
||||
// Mass-splitting version of contact constraints; mass-related terms are computed at every sub-timestep. See "setupFinalizeSolverConstraintsBlock".
|
||||
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
|
||||
static __device__ void solveContactBlock(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel, bool doFriction, const PxU32 threadIndex,
|
||||
PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders, PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictionPoints,
|
||||
PxgErrorAccumulator* error, PxReal ref0 = 1.f, PxReal ref1 = 1.f)
|
||||
{
|
||||
using namespace physx;
|
||||
|
||||
PxVec3 linVel0 = b0LinVel;
|
||||
PxVec3 linVel1 = b1LinVel;
|
||||
PxVec3 angVel0 = b0AngVel;
|
||||
PxVec3 angVel1 = b1AngVel;
|
||||
|
||||
{
|
||||
//printf("Normal batchIndex = %i, startConstraint = %i, startFriction = %i\n", batch.mConstraintBatchIndex, batch.startConstraintIndex, batch.startFrictionIndex);
|
||||
PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
|
||||
PxgBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
|
||||
|
||||
const uint numNormalConstr = Pxldcg(contactHeader->numNormalConstr[threadIndex]);
|
||||
const uint numFrictionConstr = Pxldcg(frictionHeader->numFrictionConstr[threadIndex]);
|
||||
|
||||
PxgBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
|
||||
PxgBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];
|
||||
|
||||
float accumulatedNormalImpulse = 0.f;
|
||||
|
||||
const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader->invMass0_1_angDom0_1[threadIndex]);
|
||||
|
||||
const float invMassA = ref0 * invMass0_1_angDom0_1.x;
|
||||
const float invMassB = ref1 * invMass0_1_angDom0_1.y;
|
||||
|
||||
const float angDom0 = ref0 * invMass0_1_angDom0_1.z;
|
||||
const float angDom1 = ref1 * invMass0_1_angDom0_1.w;
|
||||
|
||||
const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
|
||||
|
||||
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
|
||||
|
||||
const float restitution = contactHeader->restitution[threadIndex];
|
||||
const float p8 = 0.8f;
|
||||
const PxU8 flags = contactHeader->flags[threadIndex];
|
||||
|
||||
const PxVec3 delLinVel0 = normal * invMassA;
|
||||
const PxVec3 delLinVel1 = normal * invMassB;
|
||||
|
||||
//Bring forward a read event
|
||||
const float staticFrictionCof = normal_staticFriction.w;
|
||||
|
||||
float4 nextRaxn_extraCoeff;
|
||||
float4 nextRbxn_maxImpulseW;
|
||||
float nextAppliedForce;
|
||||
|
||||
float nextResp0;
|
||||
float nextResp1;
|
||||
|
||||
{
|
||||
nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndex]);
|
||||
nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndex]);
|
||||
nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndex]);
|
||||
|
||||
nextResp0 = Pxldcg(contacts[0].resp0[threadIndex]);
|
||||
nextResp1 = Pxldcg(contacts[0].resp1[threadIndex]);
|
||||
|
||||
float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndex]);
|
||||
float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndex]);
|
||||
|
||||
for (uint i = 0; i < numNormalConstr; i++)
|
||||
{
|
||||
PxgBlockSolverContactPoint& c = contacts[i];
|
||||
|
||||
const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
|
||||
const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
|
||||
const float appliedForce = nextAppliedForce;
|
||||
|
||||
const float resp0 = nextResp0;
|
||||
const float resp1 = nextResp1;
|
||||
|
||||
const float coeff0 = nextCoeff0;
|
||||
const float coeff1 = nextCoeff1;
|
||||
|
||||
if ((i + 1) < numNormalConstr)
|
||||
{
|
||||
const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
|
||||
nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndex]);
|
||||
nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndex]);
|
||||
nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndex]);
|
||||
|
||||
nextResp0 = Pxldcg(nextC.resp0[threadIndex]);
|
||||
nextResp1 = Pxldcg(nextC.resp1[threadIndex]);
|
||||
|
||||
nextCoeff0 = Pxldcg(nextC.coeff0[threadIndex]);
|
||||
nextCoeff1 = Pxldcg(nextC.coeff1[threadIndex]);
|
||||
}
|
||||
else if (numFrictionConstr && doFriction)
|
||||
{
|
||||
nextRaxn_extraCoeff = Pxldcg(frictions[0].raXn_bias[threadIndex]);
|
||||
nextRbxn_maxImpulseW = Pxldcg(frictions[0].rbXn_targetVelW[threadIndex]);
|
||||
nextAppliedForce = Pxldcg(frictions[0].appliedForce[threadIndex]);
|
||||
|
||||
nextResp0 = Pxldcg(frictions[0].resp0[threadIndex]);
|
||||
nextResp1 = Pxldcg(frictions[0].resp1[threadIndex]);
|
||||
}
|
||||
|
||||
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
|
||||
const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
|
||||
const float targetVelocity = raXn_extraCoeff.w;
|
||||
const float maxImpulse = rbXn_maxImpulse.w;
|
||||
|
||||
const float unitResponse = ref0 * resp0 + ref1 * resp1;
|
||||
const float recipResponse = (unitResponse > 0.f) ? (1.f / unitResponse) : 0.f;
|
||||
|
||||
float velMultiplier = recipResponse;
|
||||
float impulseMul = 1.0f;
|
||||
float unbiasedError = 0.0f;
|
||||
float biasedErr = 0.0f;
|
||||
|
||||
computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0,
|
||||
coeff1, velMultiplier, impulseMul, unbiasedError, biasedErr);
|
||||
|
||||
//Compute the normal velocity of the constraint.
|
||||
const float v0 = linVel0.dot(normal) + angVel0.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
|
||||
const float v1 = linVel1.dot(normal) + angVel1.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
|
||||
const float normalVel = v0 - v1;
|
||||
|
||||
//KS - clamp the maximum force
|
||||
const float tempDeltaF = biasedErr - normalVel * velMultiplier;
|
||||
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
|
||||
const float _newForce = appliedForce * impulseMul + _deltaF;
|
||||
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
|
||||
const float deltaF = newForce - appliedForce;
|
||||
|
||||
linVel0 += delLinVel0 * deltaF;
|
||||
linVel1 -= delLinVel1 * deltaF;
|
||||
angVel0 += raXn * (deltaF * angDom0);
|
||||
angVel1 -= rbXn * (deltaF * angDom1);
|
||||
|
||||
if(error)
|
||||
error->accumulateErrorLocal(deltaF, velMultiplier);
|
||||
|
||||
Pxstcg(&c.appliedForce[threadIndex], newForce);
|
||||
|
||||
accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
|
||||
}
|
||||
}
|
||||
|
||||
if (numFrictionConstr && doFriction)
|
||||
{
|
||||
const float dynamicFrictionCof = Pxldcg(frictionHeader->dynamicFriction[threadIndex]);
|
||||
const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
|
||||
const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
|
||||
|
||||
PxU32 broken = 0;
|
||||
|
||||
for (uint i = 0; i < numFrictionConstr; i++)
|
||||
{
|
||||
PxgBlockSolverContactFriction& f = frictions[i];
|
||||
|
||||
const float4 frictionNormal = Pxldg(frictionHeader->frictionNormals[i & 1][threadIndex]);
|
||||
|
||||
const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
|
||||
const float4 rbXn_targetVelW = nextRbxn_maxImpulseW;
|
||||
|
||||
const float resp0 = nextResp0;
|
||||
const float resp1 = nextResp1;
|
||||
|
||||
const float appliedForce = nextAppliedForce;
|
||||
|
||||
if ((i + 1) < numFrictionConstr)
|
||||
{
|
||||
const PxgBlockSolverContactFriction& f2 = frictions[i + 1];
|
||||
nextRaxn_extraCoeff = Pxldcg(f2.raXn_bias[threadIndex]);
|
||||
nextRbxn_maxImpulseW = Pxldcg(f2.rbXn_targetVelW[threadIndex]);
|
||||
nextAppliedForce = Pxldcg(f2.appliedForce[threadIndex]);
|
||||
|
||||
nextResp0 = Pxldcg(f2.resp0[threadIndex]);
|
||||
nextResp1 = Pxldcg(f2.resp1[threadIndex]);
|
||||
}
|
||||
|
||||
const PxVec3 normal = PxVec3(frictionNormal.x, frictionNormal.y, frictionNormal.z);
|
||||
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
|
||||
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
|
||||
|
||||
const float resp = ref0 * resp0 + ref1 * resp1;
|
||||
const float velMultiplier = (resp > 0.f) ? (p8 / resp) : 0.f;
|
||||
|
||||
const float bias = raXn_extraCoeff.w;
|
||||
|
||||
const PxVec3 delLinVel0 = normal * invMassA;
|
||||
const PxVec3 delLinVel1 = normal * invMassB;
|
||||
|
||||
const float targetVel = rbXn_targetVelW.w;
|
||||
|
||||
const float v0 = angVel0.dot(raXn) + linVel0.dot(normal);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
|
||||
const float v1 = angVel1.dot(rbXn) + linVel1.dot(normal);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
|
||||
const float normalVel = v0 - v1;
|
||||
|
||||
const float tmp1 = appliedForce - (bias - targetVel) * velMultiplier;
|
||||
|
||||
const float totalImpulse = tmp1 - normalVel * velMultiplier;
|
||||
|
||||
const bool clamp = fabsf(totalImpulse) > maxFrictionImpulse;
|
||||
|
||||
const float totalClamped = fminf(maxDynFrictionImpulse, fmaxf(-maxDynFrictionImpulse, totalImpulse));
|
||||
|
||||
const float newAppliedForce = clamp ? totalClamped : totalImpulse;
|
||||
|
||||
float deltaF = newAppliedForce - appliedForce;//FSub(newAppliedForce, appliedForce);
|
||||
|
||||
if (error)
|
||||
error->accumulateErrorLocal(deltaF, velMultiplier);
|
||||
|
||||
linVel0 += delLinVel0 * deltaF;
|
||||
linVel1 -= delLinVel1 * deltaF;
|
||||
angVel0 += raXn * (deltaF * angDom0);
|
||||
angVel1 -= rbXn * (deltaF * angDom1);
|
||||
|
||||
Pxstcg(&f.appliedForce[threadIndex], newAppliedForce);
|
||||
broken = broken | clamp;
|
||||
}
|
||||
Pxstcg(&frictionHeader->broken[threadIndex], broken);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Write back
|
||||
b0LinVel = linVel0;
|
||||
b0AngVel = angVel0;
|
||||
b1LinVel = linVel1;
|
||||
b1AngVel = angVel1;
|
||||
}
|
||||
|
||||
// A light version of the function "solveContactBlock" to quickly check if there is any active contact.
// TODO: Make this even lighter.
//
// Mirrors the normal-constraint loop of the full solver (including the software-pipelined
// "next" loads) but performs no velocity updates and no write-backs. Returns true as soon
// as any contact in the batch would apply a non-negligible impulse (|deltaF| > 1e-8),
// false otherwise.
//
// NOTE(review): like the full solve path, contacts[0] is read before checking
// numNormalConstr — presumably the caller guarantees at least one constraint; confirm.
static __device__ bool checkActiveContactBlock(const PxgBlockConstraintBatch& batch, const PxVec3& linVel0,
	const PxVec3& angVel0, const PxVec3& linVel1, const PxVec3& angVel1, const PxU32 threadIndex,
	PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverContactPoint* contactPoints)
{
	using namespace physx;

	{
		PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		const uint numNormalConstr = Pxldcg(contactHeader->numNormalConstr[threadIndex]);
		PxgBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];

		// Note: unlike solveContactBlock, the inverse-mass header entry is not needed here
		// (no velocity deltas are applied), so it is not loaded.

		const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
		const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

		const float restitution = contactHeader->restitution[threadIndex];
		const PxU8 flags = contactHeader->flags[threadIndex];

		// Software-pipelined loads: constraint i+1 is fetched while constraint i is processed.
		float4 nextRaxn_extraCoeff;
		float4 nextRbxn_maxImpulseW;
		float nextAppliedForce;

		float nextResp0;
		float nextResp1;

		{
			nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndex]);
			nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndex]);
			nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndex]);

			nextResp0 = Pxldcg(contacts[0].resp0[threadIndex]);
			nextResp1 = Pxldcg(contacts[0].resp1[threadIndex]);

			float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndex]);
			float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndex]);

			for (uint i = 0; i < numNormalConstr; i++)
			{
				const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
				const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
				const float appliedForce = nextAppliedForce;

				const float resp0 = nextResp0;
				const float resp1 = nextResp1;

				const float coeff0 = nextCoeff0;
				const float coeff1 = nextCoeff1;

				// Prefetch the next constraint's data while this one is in flight.
				if ((i + 1) < numNormalConstr)
				{
					const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
					nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndex]);
					nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndex]);
					nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndex]);

					nextResp0 = Pxldcg(nextC.resp0[threadIndex]);
					nextResp1 = Pxldcg(nextC.resp1[threadIndex]);

					nextCoeff0 = Pxldcg(nextC.coeff0[threadIndex]);
					nextCoeff1 = Pxldcg(nextC.coeff1[threadIndex]);
				}

				const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
				const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
				const float targetVelocity = raXn_extraCoeff.w;
				const float maxImpulse = rbXn_maxImpulse.w;

				// Unlike solveContactBlock, no mass-splitting reference scales (ref0/ref1)
				// are applied to the responses here.
				const float unitResponse = resp0 + resp1;
				const float recipResponse = (unitResponse > 0.f) ? (1.f / unitResponse) : 0.f;

				float velMultiplier = recipResponse;
				float impulseMul = 1.0f;
				float unbiasedError = 0.0f;
				float biasedErr = 0.0f;

				computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0,
					coeff1, velMultiplier, impulseMul, unbiasedError, biasedErr);

				//Compute the normal velocity of the constraint.
				const float v0 = linVel0.dot(normal) + angVel0.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
				const float v1 = linVel1.dot(normal) + angVel1.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
				const float normalVel = v0 - v1;

				//KS - clamp the maximum force
				const float tempDeltaF = biasedErr - normalVel * velMultiplier;
				const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
				const float _newForce = appliedForce * impulseMul + _deltaF;
				const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
				const float deltaF = newForce - appliedForce;

				// Check for active contact.
				if (PxAbs(deltaF) > 1.0e-8f)
				{
					return true;
				}
			}
		}
	}

	return false;
}
|
||||
|
||||
// Finalizes the contact constraints of one block batch before the concluding
// solver iterations:
//  - for non-compliant contacts (restitution >= 0), the unbiased error term
//    (coeff1) is copied over the biased one (coeff0); compliant contacts
//    (negative restitution) need no adjustment,
//  - the bias term (w component of raXn_bias) of every friction constraint
//    is cleared.
static __device__ void concludeContactBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders,
	PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictions)
{
	using namespace physx;

	const PxgBlockSolverContactHeader* header = &contactHeaders[batch.mConstraintBatchIndex];
	const PxgBlockSolverFrictionHeader* fricHeader = &frictionHeaders[batch.mConstraintBatchIndex];

	const uint32_t contactCount = header->numNormalConstr[threadIndex];
	const uint32_t frictionCount = fricHeader->numFrictionConstr[threadIndex];

	PxgBlockSolverContactPoint* contactBase = &contactPoints[batch.startConstraintIndex];
	if (contactCount)
	{
		const PxReal restitution = header->restitution[threadIndex];

		// Assign the unbiased error (coeff1) to the biased error slot (coeff0).
		// Negative restitution marks a compliant contact, which already enforces
		// this automatically, so it is skipped.
		if (restitution >= 0.f)
		{
			for (uint32_t j = 0; j < contactCount; ++j)
				contactBase[j].coeff0[threadIndex] = contactBase[j].coeff1[threadIndex];
		}
	}

	// Zero the bias term of every friction constraint in the batch.
	PxgBlockSolverContactFriction* fricBase = &frictions[batch.startFrictionIndex];
	for (uint32_t j = 0; j < frictionCount; ++j)
	{
		float4 biasedAxis = fricBase[j].raXn_bias[threadIndex];
		biasedAxis.w = 0.f;
		fricBase[j].raXn_bias[threadIndex] = biasedAxis;
	}
}
|
||||
|
||||
|
||||
|
||||
// Writes back the results of one contact block batch after solving:
//  - per-contact applied normal forces into forcewritebackBuffer (and their sum),
//  - friction anchor impulses into the GPU friction patches
//    (via writeBackContactBlockFriction),
//  - the friction-patch "broken" flag,
//  - a Dy::ThresholdStreamElement when the pair has force thresholds enabled
//    and produced a non-zero normal force.
static __device__ void writeBackContactBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex,
	const PxgSolverBodyData* bodies, Dy::ThresholdStreamElement* thresholdStream,
	PxI32* sharedThresholdStreamIndex, PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders,
	PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictions,
	PxF32* forcewritebackBuffer, PxgBlockFrictionPatch& frictionPatchBlock,
	PxgFrictionPatchGPU* frictionPatches)
{
	const PxU32 bodyAIndex = batch.bodyAIndex[threadIndex];
	const PxU32 bodyBIndex = batch.bodyBIndex[threadIndex];

	const PxgSolverBodyData& bd0 = bodies[bodyAIndex];
	const PxgSolverBodyData& bd1 = bodies[bodyBIndex];
	bool forceThreshold = false;

	// Sum of all applied normal forces of this batch; fed to threshold reporting below.
	float normalForce = 0.f;

	{
		const PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		const PxgBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];

		PxU32 forceWritebackOffset = contactHeader->forceWritebackOffset[threadIndex];

		forceThreshold = contactHeader->flags[threadIndex] & PxgSolverContactFlags::eHAS_FORCE_THRESHOLDS;

		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];

		const PxU32 numNormalConstr = contactHeader->numNormalConstr[threadIndex];
		// 0xFFFFFFFF marks "no force writeback requested" for this constraint.
		if(forceWritebackOffset!=0xFFFFFFFF)
		{
			PxReal* vForceWriteback = &forcewritebackBuffer[forceWritebackOffset];
			PxgBlockSolverContactPoint* c = &contactPoints[batch.startConstraintIndex];
			for(PxU32 i=0; i<numNormalConstr; i++)
			{
				const PxReal appliedForce = c[i].appliedForce[threadIndex];//FStore(c->getAppliedForce());
				*vForceWriteback++ = appliedForce;
				normalForce += appliedForce;
			}
		}

		// Write per-anchor friction impulses out to the GPU friction patch (if any).
		writeBackContactBlockFriction(threadIndex, numFrictionConstr, frictionHeader,
			frictionPatchBlock, frictions + batch.startFrictionIndex, frictionPatches);

		if(numFrictionConstr && frictionHeader->broken[threadIndex])
		{
			frictionPatchBlock.broken[threadIndex] = 1;
		}
	}

	float reportThreshold0 = bd0.reportThreshold;
	float reportThreshold1 = bd1.reportThreshold;

	// Emit a threshold-stream event only when thresholds are enabled for this pair,
	// a non-zero force was produced, and at least one body has a finite threshold.
	if((forceThreshold && normalForce !=0 && (reportThreshold0 < PX_MAX_REAL || reportThreshold1 < PX_MAX_REAL)))
	{
		//ToDo : support PxgThresholdStreamElement
		Dy::ThresholdStreamElement elt;
		elt.normalForce = normalForce;
		elt.threshold = PxMin<float>(reportThreshold0, reportThreshold1);

		elt.nodeIndexA = bd0.islandNodeIndex;
		elt.nodeIndexB = bd1.islandNodeIndex;
		elt.shapeInteraction = batch.shapeInteraction[threadIndex];
		// Canonical ordering so each pair has a unique representation in the stream.
		PxOrder(elt.nodeIndexA, elt.nodeIndexB);
		assert(elt.nodeIndexA < elt.nodeIndexB);

		// Atomically claim the next slot in the shared threshold stream.
		PxI32 index = atomicAdd(sharedThresholdStreamIndex, 1);

		//KS - force a 16-byte coalesced write
		//((float4*)thresholdStream)[index] = *((float4*)&elt);
		thresholdStream[index] = elt;
	}
}
|
||||
|
||||
// Writes back joint (1D constraint) results for one block batch: the accumulated
// linear/angular applied impulses, the squared residual error, and the joint
// "broken" flag for breakable constraints.
static __device__ void writeBack1DBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers,
	PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon, PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod,PxgConstraintWriteback* constraintWriteBacks)
{
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DCon* conBase = &rowsCon[batch.startConstraintIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT modBase = &rowsMod[batch.startConstraintIndex];

	PxU32 forceWritebackOffset = header->writeBackOffset[threadIndex];

	const PxU8 breakable = header->breakable[threadIndex];

	const PxU32 numRows = header->rowCounts[threadIndex];

	// 0xFFFFFFFF marks "no writeback requested" for this constraint.
	if (forceWritebackOffset != 0xFFFFFFFF)
	{
		PxgConstraintWriteback& writeback = constraintWriteBacks[forceWritebackOffset];

		// Accumulated applied impulse (linear and angular parts) across all rows.
		PxVec3 linVel(0), angVel(0);
		PxReal constraintErrorSq = 0.0f;
		for (PxU32 i = 0; i < numRows; ++i)
		{
			PxgBlockSolverConstraint1DCon& con = conBase[i];
			PxgBlockSolverConstraint1DMod& mod = modBase[i];

			// Only rows flagged for force output contribute to the reported impulse.
			if (mod.flags[threadIndex] & DY_SC_FLAG_OUTPUT_FORCE)
			{
				const float4 lin0XYZ_minImpulse = con.lin0XYZ_minImpulse[threadIndex];
				const PxVec3 lin0(lin0XYZ_minImpulse.x, lin0XYZ_minImpulse.y, lin0XYZ_minImpulse.z);
				const PxVec3 ang0WriteBack = mod.ang0Writeback[threadIndex];
				const PxReal appliedForce = mod.appliedForce[threadIndex];
				linVel += lin0 * appliedForce;
				angVel += ang0WriteBack *appliedForce;
			}

			// Sum of squared residuals over all rows, reported for error metrics.
			PxReal err = mod.residual[threadIndex];
			constraintErrorSq += err * err;
		}

		// Re-express the angular impulse relative to body0's world offset
		// (shift of the torque reference point by the anchor offset).
		const float4 body0WorldOffset_linBreakImpulse = header->body0WorldOffset_linBreakImpulse[threadIndex];
		const PxVec3 body0WorldOffset(body0WorldOffset_linBreakImpulse.x, body0WorldOffset_linBreakImpulse.y, body0WorldOffset_linBreakImpulse.z);
		angVel -= body0WorldOffset.cross(linVel);

		// Breakable joints break when either accumulated impulse magnitude exceeds
		// its break threshold (linear threshold packed in .w, angular in the header).
		const PxU32 broken = breakable ? PxU32((linVel.magnitude() > body0WorldOffset_linBreakImpulse.w) || (angVel.magnitude() > header->angBreakImpulse[threadIndex])) : 0;
		writeback.angularImpulse_residual = make_float4(angVel.x, angVel.y, angVel.z, constraintErrorSq);
		// The broken flag is encoded in the sign bit of w: -0.0f = broken, 0.0f = intact.
		writeback.linearImpulse_broken = make_float4(linVel.x, linVel.y, linVel.z, broken ? -0.0f : 0.0f);
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
67
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlockCommon.cuh
vendored
Normal file
67
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlockCommon.cuh
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __SOLVER_BLOCK_COMMON_CUH__
|
||||
#define __SOLVER_BLOCK_COMMON_CUH__
|
||||
|
||||
// Copies the solved friction impulses of one block batch out to the persistent
// GPU friction patch, two friction rows (one per tangent axis) per anchor point.
// Does nothing when the patch index is invalid (0xFFFFFFFF).
template <typename FRICTION_HEADER, typename FRICTION>
static __device__ void writeBackContactBlockFriction(
	const PxU32 threadIndex, PxU32 numFrictionConstr, const FRICTION_HEADER* PX_RESTRICT frictionHeader,
	PxgBlockFrictionPatch& frictionPatchBlock, FRICTION* fric, PxgFrictionPatchGPU* frictionPatches
)
{
	const PxU32 patchIndex = frictionPatchBlock.patchIndex[threadIndex];
	if (patchIndex == 0xFFFFFFFF)
		return;

	PxgFrictionPatchGPU& patch = frictionPatches[patchIndex];

	// The two tangent axes shared by all anchors of this patch.
	const float4 n0 = frictionHeader->frictionNormals[0][threadIndex];
	const float4 n1 = frictionHeader->frictionNormals[1][threadIndex];
	const PxVec3 tangent0(n0.x, n0.y, n0.z);
	const PxVec3 tangent1(n1.x, n1.y, n1.z);

	// Two friction constraints per anchor.
	patch.anchors = numFrictionConstr / 2;

	// Up to two anchors; anchor a consumes friction rows 2a and 2a+1.
	for (PxU32 a = 0; a < 2; ++a)
	{
		if (numFrictionConstr < 2u * (a + 1u))
			break;

		const float4 anchor = frictionPatchBlock.anchorPoints[a][threadIndex];
		patch.points[a] = PxVec3(anchor.x, anchor.y, anchor.z);

		const PxReal impulseT0 = fric[2 * a].appliedForce[threadIndex];
		const PxReal impulseT1 = fric[2 * a + 1].appliedForce[threadIndex];
		patch.impulses[a] = tangent0 * impulseT0 + tangent1 * impulseT1;
	}
}
|
||||
|
||||
#endif
|
||||
1180
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlockTGS.cuh
vendored
Normal file
1180
engine/third_party/physx/source/gpusolver/src/CUDA/solverBlockTGS.cuh
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1096
engine/third_party/physx/source/gpusolver/src/CUDA/solverMultiBlock.cu
vendored
Normal file
1096
engine/third_party/physx/source/gpusolver/src/CUDA/solverMultiBlock.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1928
engine/third_party/physx/source/gpusolver/src/CUDA/solverMultiBlockTGS.cu
vendored
Normal file
1928
engine/third_party/physx/source/gpusolver/src/CUDA/solverMultiBlockTGS.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2928
engine/third_party/physx/source/gpusolver/src/PxgConstraintPartition.cpp
vendored
Normal file
2928
engine/third_party/physx/source/gpusolver/src/PxgConstraintPartition.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2509
engine/third_party/physx/source/gpusolver/src/PxgContext.cpp
vendored
Normal file
2509
engine/third_party/physx/source/gpusolver/src/PxgContext.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1851
engine/third_party/physx/source/gpusolver/src/PxgCudaSolverCore.cpp
vendored
Normal file
1851
engine/third_party/physx/source/gpusolver/src/PxgCudaSolverCore.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
120
engine/third_party/physx/source/gpusolver/src/PxgDynamicsContext.cpp
vendored
Normal file
120
engine/third_party/physx/source/gpusolver/src/PxgDynamicsContext.cpp
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgDynamicsContext.h"
|
||||
#include "PxgKernelWrangler.h"
|
||||
#include "PxgArticulationCore.h"
|
||||
#include "PxgCudaSolverCore.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// Constructs the GPU dynamics context: initializes the implicit static "world"
// body, creates the GPU articulation and solver cores, acquires the CUDA context
// while creating streams / threshold streams / pinned host allocators, wires up
// the contact/patch/force/friction-patch stream pools, and releases the CUDA
// context at the end.
PxgDynamicsContext::PxgDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
	const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
	bool enableStabilization, bool useEnhancedDeterminism,
	PxReal maxBiasCoefficient,
	PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
	bool frictionEveryIteration, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled)
	:
	PxgGpuContext(flushPool, islandManager, maxNumPartitions, maxNumStaticPartitions, enableStabilization, useEnhancedDeterminism, maxBiasCoefficient, simStats, heapMemoryManager, lengthScale, enableDirectGPUAPI, contextID, isResidualReportingEnabled, false)
{
	// The "world" solver body is the immovable anchor used for static interactions:
	// zero velocity, zero inverse mass, and thresholds/clamps set so it never
	// triggers reports or bias clamping.
	mWorldSolverBody.linearVelocity = PxVec3(0);
	mWorldSolverBody.angularVelocity = PxVec3(0);
	mWorldSolverBodyData.invMass = 0;
	mWorldSolverBodyData.reportThreshold = PX_MAX_REAL;
	mWorldSolverBodyData.maxImpulse = PX_MAX_REAL;
	mWorldSolverBodyData.penBiasClamp = -PX_MAX_REAL;
	mWorldSolverBodyData.initialAngVel = mWorldSolverBodyData.initialLinVel = PxVec3(0.f);
	mWorldSolverBodyData.body2World = PxAlignedTransform(PxIdentity);
	mWorldSolverBodyData.islandNodeIndex = PxNodeIndex(PX_INVALID_NODE);
	mWorldSolverBodyData.offsetSlop = 0.f;

	// Identity transform / zero inertia for the world body's TxI data.
	mWorldTxIData.sqrtInvInertia = PxMat33(PxZero);
	mWorldTxIData.deltaBody2World = PxTransform(PxIdentity);

	{
		// Create the GPU cores before any CUDA work; the articulation core needs a
		// back-pointer to this context.
		mGpuArticulationCore = PX_NEW(PxgArticulationCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, heapMemoryManager);

		mGpuSolverCore = PX_NEW(PxgCudaSolverCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, this, heapMemoryManager, config, frictionEveryIteration);

		mGpuArticulationCore->setGpuContext(this);
	}

	// All CUDA resource creation below happens inside an acquire/release bracket
	// on the solver core's CUDA context.
	mGpuSolverCore->acquireContext();

	mGpuSolverCore->createStreams();

	createThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	createForceChangeThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);

	// Pinned (page-locked) host allocators, sized from the user-supplied GPU
	// dynamics memory config.
	mPinnedMemoryAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.tempBufferCapacity);

	// Double-buffered contact/patch streams; mCurrentContactStream selects the
	// active buffer.
	mCurrentContactStream = 0;
	mContactStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mContactStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));

	mPatchStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	mPatchStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));

	// Force stream holds two PxReal values per contact.
	mForceStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxReal) * 2);

	mFrictionPatchStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxFrictionPatch));

	// Point each stream pool at its backing allocation and reset the shared
	// (CPU and GPU side) allocation cursors.
	mContactStreamPool.mDataStream = mContactStreamAllocators[mCurrentContactStream]->mStart;
	mContactStreamPool.mDataStreamSize = (PxU32)mContactStreamAllocators[mCurrentContactStream]->mTotalSize;
	mContactStreamPool.mSharedDataIndex = 0;
	mContactStreamPool.mSharedDataIndexGPU = 0;

	mPatchStreamPool.mDataStream = mPatchStreamAllocators[mCurrentContactStream]->mStart;
	mPatchStreamPool.mDataStreamSize = (PxU32)mPatchStreamAllocators[mCurrentContactStream]->mTotalSize;
	mPatchStreamPool.mSharedDataIndex = 0;
	mPatchStreamPool.mSharedDataIndexGPU = 0;

	mForceStreamPool.mDataStream = mForceStreamAllocator->mStart;
	mForceStreamPool.mDataStreamSize = (PxU32)mForceStreamAllocator->mTotalSize;
	mForceStreamPool.mSharedDataIndex = 0;
	mForceStreamPool.mSharedDataIndexGPU = 0;

	mFrictionPatchStreamPool.mDataStream = mFrictionPatchStreamAllocator->mStart;
	mFrictionPatchStreamPool.mDataStreamSize = PxTo32(mFrictionPatchStreamAllocator->mTotalSize);
	mFrictionPatchStreamPool.mSharedDataIndex = 0;
	mFrictionPatchStreamPool.mSharedDataIndexGPU = 0;

	//Arbitrarily-large number to reserve to minimize allocation churn.
	mConstraintsPerPartition.reserve(1024);

	mArtiConstraintsPerPartition.reserve(1024);

	mGpuSolverCore->releaseContext();
}
|
||||
|
||||
// Manual destroy idiom: run the destructor explicitly, then release the raw
// allocation via PX_FREE_THIS (presumably pairing the PX_NEW-style allocation
// of this object — confirm against the PhysX allocation macros).
void PxgDynamicsContext::destroy()
{
	this->~PxgDynamicsContext();
	PX_FREE_THIS;
}
|
||||
}
|
||||
66
engine/third_party/physx/source/gpusolver/src/PxgSolver.cpp
vendored
Normal file
66
engine/third_party/physx/source/gpusolver/src/PxgSolver.cpp
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgBroadPhase.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
// Forward declarations of the per-module kernel registration entry points.
// Each initSolverKernelsN() is defined in a separately compiled GPU module.
// NOTE(review): indices 8 and 12 are absent — presumably those module numbers
// have no solver kernels; verify against the gpusolver CUDA module list
// before "fixing" this, as referencing a nonexistent symbol breaks linking.
extern "C" void initSolverKernels0();
extern "C" void initSolverKernels1();
extern "C" void initSolverKernels2();
extern "C" void initSolverKernels3();
extern "C" void initSolverKernels4();
extern "C" void initSolverKernels5();
extern "C" void initSolverKernels6();
extern "C" void initSolverKernels7();
extern "C" void initSolverKernels9();
extern "C" void initSolverKernels10();
extern "C" void initSolverKernels11();
extern "C" void initSolverKernels13();

// Referencing the init functions from here forces the linker to keep the
// kernel modules when PhysXGpuSolver is consumed as a static library
// (otherwise unreferenced object files would be dropped). When building the
// GPU DLL itself (PX_PHYSX_GPU_EXPORTS) this is unnecessary.
void createPxgSolver()
{
#if !PX_PHYSX_GPU_EXPORTS
	//this call is needed to force PhysXGpuSolver linkage as Static Library!
	initSolverKernels0();
	initSolverKernels1();
	initSolverKernels2();
	initSolverKernels3();
	initSolverKernels4();
	initSolverKernels5();
	initSolverKernels6();
	initSolverKernels7();
	initSolverKernels9();
	initSolverKernels10();
	initSolverKernels11();
	initSolverKernels13();
#endif
}
|
||||
|
||||
}
|
||||
754
engine/third_party/physx/source/gpusolver/src/PxgSolverCore.cpp
vendored
Normal file
754
engine/third_party/physx/source/gpusolver/src/PxgSolverCore.cpp
vendored
Normal file
@@ -0,0 +1,754 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgSolverCore.h"
|
||||
#include "cudamanager/PxCudaContextManager.h"
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
#include "PxgSimulationController.h"
|
||||
#include "PxgSimulationCore.h"
|
||||
#include "common/PxProfileZone.h"
|
||||
#include "PxgCudaUtils.h"
|
||||
#include "PxgKernelWrangler.h"
|
||||
#include "PxgKernelIndices.h"
|
||||
#include "CudaKernelWrangler.h"
|
||||
#include "PxgContext.h"
|
||||
#include "PxgArticulationCore.h"
|
||||
#include "PxgSolverKernelIndices.h"
|
||||
#include "PxgDynamicsConfiguration.h"
|
||||
#include "PxgFrictionPatch.h"
|
||||
#include "PxgDynamicsContext.h"
|
||||
#include "PxgArticulationCoreKernelIndices.h"
|
||||
#include "DyConstraintPrep.h"
|
||||
#include "PxgIslandContext.h"
|
||||
|
||||
#define GPU_CORE_DEBUG 0
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Ping-pong key/rank buffers plus a radix-count buffer for the GPU radix
// sort. All allocations are tagged under the solver heap statistics bucket;
// actual sizing happens later in allocate().
PxgRadixSortBuffers::PxgRadixSortBuffers(PxgHeapMemoryAllocatorManager* heapMemoryManager) :
	mInputKeys(heapMemoryManager, PxsHeapStats::eSOLVER),
	mInputRanks(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutputKeys(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutputRanks(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRadixCounts(heapMemoryManager, PxsHeapStats::eSOLVER)
{
}
|
||||
|
||||
// Fills the two radix-sort pass descriptors. The passes ping-pong between
// the same two key/rank buffer pairs: pass 1 consumes what pass 0 produced
// and writes back into pass 0's input buffers. Both passes share one
// per-block radix count buffer.
void PxgRadixSortBuffers::constructRadixSortDesc(PxgRadixSortDesc* rsDesc) const
{
	// Resolve each device pointer once instead of repeating the casts.
	PxU32* keysA = reinterpret_cast<PxU32*>(mInputKeys.getDevicePtr());
	PxU32* ranksA = reinterpret_cast<PxU32*>(mInputRanks.getDevicePtr());
	PxU32* keysB = reinterpret_cast<PxU32*>(mOutputKeys.getDevicePtr());
	PxU32* ranksB = reinterpret_cast<PxU32*>(mOutputRanks.getDevicePtr());
	PxU32* counts = reinterpret_cast<PxU32*>(mRadixCounts.getDevicePtr());

	rsDesc[0].inputKeys = keysA;
	rsDesc[0].inputRanks = ranksA;
	rsDesc[0].outputKeys = keysB;
	rsDesc[0].outputRanks = ranksB;
	rsDesc[0].radixBlockCounts = counts;

	rsDesc[1].inputKeys = keysB;
	rsDesc[1].inputRanks = ranksB;
	rsDesc[1].outputKeys = keysA;
	rsDesc[1].outputRanks = ranksA;
	rsDesc[1].radixBlockCounts = counts;
}
|
||||
|
||||
// Sizes the sort buffers for the given number of contact batches. Each batch
// expands to 32 entries, with one PxU32 key and one PxU32 rank per entry on
// both ping-pong sides; the count buffer has a fixed 32*16 PxU32 footprint.
void PxgRadixSortBuffers::allocate(PxU32 totalContactBatches)
{
	const size_t entryBytes = sizeof(PxU32) * totalContactBatches * 32;
	mInputKeys.allocate(entryBytes, PX_FL);
	mInputRanks.allocate(entryBytes, PX_FL);
	mOutputKeys.allocate(entryBytes, PX_FL);
	mOutputRanks.allocate(entryBytes, PX_FL);
	mRadixCounts.allocate(sizeof(PxU32) * 32 * 16, PX_FL);
}
|
||||
|
||||
// Constructs the GPU solver core. Only wires up the CUDA/kernel plumbing and
// tags every device buffer against the solver heap-stats bucket; no device
// memory is sized here — the allocate*() methods below do that per frame.
// NOTE: member initializer order must match the member declaration order in
// the header; keep any additions consistent with that.
PxgSolverCore::PxgSolverCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, PxgGpuContext* dynamicContext, PxgHeapMemoryAllocatorManager* heapMemoryManager) :
	// CUDA / context plumbing
	mGpuKernelWranglerManager(gpuKernelWrangler),
	mCudaContextManager(cudaContextManager),
	mCudaContext(cudaContextManager->getCudaContext()),
	mGpuContext(dynamicContext),
	mHeapMemoryManager(heapMemoryManager),
	// Descriptors are filled in later by the construct*Desc() methods.
	mSolverCoreDesc(NULL),
	mPrepareDesc(NULL),
	mPrePrepDesc(NULL),
	mRsDesc(NULL),
	// Per-frame counters
	mNbStaticRigidSlabs(0),
	mMaxNumStaticPartitions(0),
	mTotalContactManagers(0),
	mNbPrevExceededForceElements(0),
	mNbArticSlabs(0),
	// Block-format constraint prep streams
	mContactHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mContactBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mJointHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mJointRowBlockStreamCon(heapMemoryManager, PxsHeapStats::eSOLVER),
	mJointRowBlockStreamMod(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraintContactPrepBlockPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraint1DPrepBlockPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraint1DPrepBlockPoolVel(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraint1DPrepBlockPoolPar(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Non-block 1D constraint data (rigid and articulation variants)
	mConstraintDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraintRowPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiConstraintDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiConstraintRowPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Solver body state and readback buffers
	mSolverBodyPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mTempStaticBodyOutputPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mIslandNodeIndices2(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverBodyIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutVelocityPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutBody2WorldPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverBodyDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverBodySleepDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutArtiVelocityPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverTxIDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Partitioning / batching buffers
	mConstraintsPerPartition(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiConstraintsPerPartition(heapMemoryManager, PxsHeapStats::eSOLVER),
	mMotionVelocityArray(heapMemoryManager, PxsHeapStats::eSOLVER),
	mBlockConstraintBatches(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiOrderedStaticConstraints(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiOrderedStaticContacts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverBodyReferences(heapMemoryManager, PxsHeapStats::eSOLVER),
	mBlockWorkUnits(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionIndexData(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionNodeData(heapMemoryManager, PxsHeapStats::eSOLVER),
	mSolverConstantData(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionStartBatchIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionArticulationStartBatchIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionJointBatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mPartitionArtiJointBatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mDestroyedEdgeIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mNpIndexArray(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Contact stream mirrors and writeback buffers
	mGpuContactBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mDataBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mCompressedContacts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mCompressedPatches(heapMemoryManager, PxsHeapStats::eSOLVER),
	mConstraintWriteBackBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mForceBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionPatches(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Articulation-static / rigid-static interaction bookkeeping
	mArtiStaticContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiStaticJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiStaticContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiStaticJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticContactStartIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRigidStaticJointStartIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Temporary scratch used by batch accumulation kernels
	mTempContactUniqueIndicesBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mTempConstraintUniqueIndicesBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mTempContactHeaderBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	mTempConstraintHeaderBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Articulation self-interaction bookkeeping
	mArtiSelfContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiSelfJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiSelfContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mArtiSelfJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mNodeInteractionCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Double-buffered (ping-pong via mCurrentIndex) friction patch streams
	mFrictionPatchBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionAnchorPatchBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionIndexStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionPatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionPatchStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	mFrictionAnchorPatchStream(heapMemoryManager, PxsHeapStats::eSOLVER),
	// Scalar state
	mCurrentIndex(0),
	mPinnedEvent(NULL),
	mCpuIslandNodeIndices(NULL),
	mSolverBodyOutputVelocityOffset(0),
	mRadixSort(heapMemoryManager)
{}
|
||||
|
||||
// These two structures must have the same layout
|
||||
PX_COMPILE_TIME_ASSERT(sizeof(PxgFrictionPatchGPU) == sizeof(PxFrictionPatch));
|
||||
|
||||
// Sizes the current-index block friction patch and friction anchor streams.
// Rigid and articulation contact batches share the same streams, so the
// combined count drives both allocations.
void PxgSolverCore::allocateFrictionPatchStream(PxI32 numContactBatches, PxI32 numArtiContactBatches)
{
	const PxI32 totalBatches = numContactBatches + numArtiContactBatches;

	mFrictionPatchBlockStream[mCurrentIndex].allocate(sizeof(PxgBlockFrictionPatch) * totalBatches, PX_FL);
	mFrictionAnchorPatchBlockStream[mCurrentIndex].allocate(sizeof(PxgBlockFrictionAnchorPatch) * totalBatches, PX_FL);

	/*frictionPatchStream[currentIndex].allocate(sizeof(PxgFrictionPatch) * numArtiContactBatches);
	frictionAnchorPatchStream[currentIndex].allocate(sizeof(PxgFrictionAnchorPatch) * numArtiContactBatches);*/
}
|
||||
|
||||
// Sizes the double-buffered per-edge friction patch counts. The previous
// side (1 - mCurrentIndex) is grown while preserving its contents via an
// async copy on mStream; the current side is a plain allocation that gets
// rewritten this frame.
void PxgSolverCore::allocateFrictionCounts(PxU32 totalEdges)
{
	const size_t byteSize = totalEdges * sizeof(PxU32);

	mFrictionPatchCounts[1 - mCurrentIndex].allocateCopyOldDataAsync(byteSize, mCudaContext, mStream, PX_FL);
	mFrictionPatchCounts[mCurrentIndex].allocate(byteSize, PX_FL);
}
|
||||
|
||||
// Grows the current-index friction index stream to hold
// totalFrictionPatchCount entries, preserving existing entries via an async
// copy on mStream, and returns the (possibly relocated) device base pointer.
PxgBlockFrictionIndex* PxgSolverCore::allocateFrictionPatchIndexStream(PxU32 totalFrictionPatchCount)
{
	auto& indexStream = mFrictionIndexStream[mCurrentIndex];
	indexStream.allocateCopyOldDataAsync(sizeof(PxgBlockFrictionIndex) * totalFrictionPatchCount, mCudaContext, mStream, PX_FL);
	return reinterpret_cast<PxgBlockFrictionIndex*>(indexStream.getDevicePtr());
}
|
||||
|
||||
// Sizes the device buffer holding one PxU32 interaction count per island node.
void PxgSolverCore::allocateNodeInteractionCounts(PxU32 nbNodes)
{
	mNodeInteractionCounts.allocate(nbNodes * sizeof(PxU32), PX_FL);
}
|
||||
|
||||
// Asynchronously DMAs the host-side per-node interaction counts into the
// device buffer on mStream. The copy is async: the caller must keep the
// source array alive until the stream has consumed it.
void PxgSolverCore::uploadNodeInteractionCounts(const PxU32* nodeInteractionCounts, PxU32 nbNodes)
{
	mCudaContext->memcpyHtoDAsync(mNodeInteractionCounts.getDevicePtr(), nodeInteractionCounts, sizeof(PxU32) * nbNodes, mStream);
}
|
||||
|
||||
// DMAs solver body results (velocities, transforms, sleep data) back to the
// host after the solve, then launches a tiny signal kernel that notifies the
// CPU through a mapped pinned flag.
// When the direct-GPU API is enabled the readback is skipped entirely —
// state stays resident on the device.
// NOTE(review): the velocity copy sizes the transfer as
// sizeof(PxgSolverBody) * nbSolverBodies into a float4* destination; this
// relies on sizeof(PxgSolverBody) matching the per-body float4 payload —
// confirm against the PxgSolverBody definition.
void PxgSolverCore::gpuMemDMAbackSolverBodies(float4* solverBodyPool, PxU32 nbSolverBodies,
	PxPinnedArray<PxAlignedTransform>& body2WorldPool,
	PxPinnedArray<PxgSolverBodySleepData>& solverBodySleepDataPool,
	const bool enableDirectGPUAPI)
{
	PX_PROFILE_ZONE("GpuDynamics.DMABackBodies", 0);

	if (!enableDirectGPUAPI)
	{
		// All three copies are async on mStream; the pinned destinations make
		// the transfers truly asynchronous.
		mCudaContext->memcpyDtoHAsync(solverBodyPool, mOutVelocityPool.getDevicePtr(), sizeof(PxgSolverBody) * nbSolverBodies, mStream);
		mCudaContext->memcpyDtoHAsync(body2WorldPool.begin(), mOutBody2WorldPool.getDevicePtr(), sizeof(PxAlignedTransform) * nbSolverBodies, mStream);
		mCudaContext->memcpyDtoHAsync(solverBodySleepDataPool.begin(), mSolverBodySleepDataPool.getDevicePtr(), sizeof(PxgSolverBodySleepData) * nbSolverBodies, mStream);
	}

	// Make mStream wait for work previously recorded on mStream2 (via
	// mIntegrateEvent) before the completion signal below.
	synchronizeStreams(mCudaContext, mStream2, mStream, mIntegrateEvent);

	CUfunction signalFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::BP_SIGNAL_COMPLETE);

	// Clear the host-visible flag, then launch a 1x1x1 kernel with the mapped
	// device alias of the same memory; the kernel presumably sets the flag so
	// the CPU can poll for completion without a blocking sync — confirm in
	// the BP_SIGNAL_COMPLETE kernel source.
	*mPinnedEvent = 0;

	void* devicePtr = getMappedDevicePtr(mCudaContext, mPinnedEvent);
	PxCudaKernelParam signalParams[] =
	{
		PX_CUDA_KERNEL_PARAM(devicePtr)
	};

	mCudaContext->launchKernel(signalFunction, 1, 1, 1, 1, 1, 1, 0, mStream, signalParams, sizeof(signalParams), 0, PX_FL);
}
|
||||
|
||||
// Sizes the per-frame solver body buffers shared by PGS and TGS, uploads the
// island node indices, and initializes the body index map. Allocations must
// precede the memset/memcpy calls recorded on mStream below.
void PxgSolverCore::allocateSolverBodyBuffersCommon(PxU32 numSolverBodies, PxPinnedArray<PxNodeIndex>& islandNodeIndices)
{
	// Two float4 entries (linear + angular) per body.
	mMotionVelocityArray.allocate(sizeof(float4) * numSolverBodies * 2, PX_FL);

	mSolverBodyIndices.allocate(sizeof(PxU32) * numSolverBodies, PX_FL);

	//allocate enough solver body data space(static + kinematic + dynamic), but we just need to dma static and kinematic solver body and preIntegration kernel will
	//fill in dynamic solver body data
	mSolverBodyDataPool.allocate(sizeof(PxgSolverBodyData) * numSolverBodies, PX_FL);
	mSolverBodySleepDataPool.allocate(sizeof(PxgSolverBodySleepData) * numSolverBodies, PX_FL);
	mSolverTxIDataPool.allocate(sizeof(PxgSolverTxIData) * numSolverBodies, PX_FL);
	mOutVelocityPool.allocate(sizeof(float4)*numSolverBodies * 2, PX_FL); //Output buffer to read back solver body velocities
	mOutBody2WorldPool.allocate(sizeof(PxAlignedTransform)*numSolverBodies, PX_FL); //output buffer to read back solver body transform

	//allocate enough memory for numArticulations * maxLinks
	//mOutArtiVelocityPool.allocate(sizeof(float4)*numActiveActiculations*maxArticulationLinks * 2);

	mIslandNodeIndices2.allocate(sizeof(PxNodeIndex) * islandNodeIndices.size(), PX_FL);

	// Fill the body index map with 0xFFFFFFFF (presumably an "unassigned"
	// sentinel — confirm against the kernels that consume it), then upload
	// the island node indices; both are async on mStream.
	mCudaContext->memsetD32Async(mSolverBodyIndices.getDevicePtr(), 0xFFffFFff, numSolverBodies, mStream);
	mCudaContext->memcpyHtoDAsync(mIslandNodeIndices2.getDevicePtr(), islandNodeIndices.begin(), sizeof(PxNodeIndex) *islandNodeIndices.size(), mStream);

	// Order the articulation core's stream behind the uploads above.
	synchronizeStreams(mCudaContext, mStream, mGpuContext->getArticulationCore()->getStream());

	// Keep a host-side alias of the island node indices; the pinned array
	// must outlive any use of this pointer.
	mCpuIslandNodeIndices = islandNodeIndices.begin();
}
|
||||
|
||||
// Populates the pre-prep descriptor consumed by the constraint pre-prepare
// kernels: batch counts from the partitioning pass, device pointers for the
// block-format constraint/contact pools, joint data, partition tables, and
// the friction ping-pong buffers. Pure pointer/value wiring — no allocation
// and no GPU work is issued here.
void PxgSolverCore::constructConstraintPrePrepDesc(PxgPrePrepDesc& preDesc, PxU32 numBatches, PxU32 numStaticBatches, PxU32 numArtiBatches, PxU32 numArtiStaticBatches,
	PxU32 numArtiSelfBatches, const PxgPartitionData& pData, PxContact* cpuCompressedContactsBase, PxContactPatch* cpuCompressedPatchesBase, PxReal* cpuForceBufferBase,
	PxU32 nbD6RigidJoint, PxU32 nbD6ArtiJoint, PxU32 nbTotalArtiJoints,
	PxsContactManagerOutputIterator& outputIterator, PxU32 maxConstraintPartitions, PxU32 totalActiveBodies, PxU32 nbArticulations,
	PxU32 activeBodyStartOffset, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances, PxsTorsionalFrictionData* torsionalData,
	PxU32 nbElementsPerBody, PxU32 numSlabs)
{
	// --- Batch headers and counts ---
	preDesc.blockBatches = reinterpret_cast<PxgBlockConstraintBatch*>(mBlockConstraintBatches.getDevicePtr());
	preDesc.numBatches = numBatches;
	preDesc.numStaticBatches = numStaticBatches;
	preDesc.numArtiBatches = numArtiBatches;
	preDesc.numArtiStaticBatches = numArtiStaticBatches; //this is just estimation. we write the actually numArticStaticBatches in artiSumInternalContactAndJointBatches2
	preDesc.numArtiSelfBatches = numArtiSelfBatches; //this is also just an estimation.
	preDesc.blockWorkUnit = reinterpret_cast<PxgBlockWorkUnit*>(mBlockWorkUnits.getDevicePtr());

	// --- Totals from the partitioning pass (rigid then articulation) ---
	preDesc.numTotalContacts = pData.numTotalContacts;
	preDesc.numTotalConstraints = pData.numTotalConstraints;
	preDesc.numTotalStaticConstraints = pData.numTotalStaticConstraints;
	preDesc.numTotalStaticContacts = pData.numTotalStaticContacts;

	preDesc.numTotalArtiContacts = pData.numTotalArtiContacts;
	preDesc.numTotalArtiConstraints = pData.numTotalArtiConstraints;
	preDesc.numTotalStaticArtiContacts = pData.numTotalArtiStaticContacts;
	preDesc.numTotalStaticArtiConstraints = pData.numTotalArtiStaticConstraints;
	preDesc.numTotalSelfArtiContacts = pData.numTotalArtiSelfContacts;
	preDesc.numTotalSelfArtiConstraints = pData.numTotalArtiSelfConstraints;

	preDesc.artiStaticConstraintBatchOffset = pData.artiStaticConstraintBatchOffset;
	preDesc.artiStaticContactBatchOffset = pData.artiStaticContactBatchOffset;

	// --- Contact stream device pointers ---
	preDesc.blockContactData = reinterpret_cast<PxgBlockContactData*>(mConstraintContactPrepBlockPool.getDevicePtr());
	preDesc.blockContactPoints = reinterpret_cast<PxgBlockContactPoint*>(mGpuContactBlockBuffer.getDevicePtr());
	preDesc.compressedContacts = reinterpret_cast<PxContact*>(mCompressedContacts.getDevicePtr());
	preDesc.compressedPatches = reinterpret_cast<PxContactPatch*>(mCompressedPatches.getDevicePtr());
	preDesc.forceBuffer = reinterpret_cast<PxU8*>(mForceBuffer.getDevicePtr());

	// --- Joint counts; sharedJointRowIndex is an accumulator reset per frame ---
	preDesc.sharedJointRowIndex = 0;
	preDesc.nbD6RigidJoints = nbD6RigidJoint;
	preDesc.nbD6ArtiJoints = nbD6ArtiJoint;
	preDesc.nbTotalArtiJoints = nbTotalArtiJoints;

	// --- Block-format 1D constraint prep pools ---
	preDesc.blockPrepData = reinterpret_cast<PxgBlockConstraint1DData*>(mConstraint1DPrepBlockPool.getDevicePtr());
	preDesc.blockPrepVelocityData = reinterpret_cast<PxgBlockConstraint1DVelocities*>(mConstraint1DPrepBlockPoolVel.getDevicePtr());
	preDesc.blockPrepParameterData = reinterpret_cast<PxgBlockConstraint1DParameters*>(mConstraint1DPrepBlockPoolPar.getDevicePtr());

	//this is the first pass of constraint 1D data which filled in by the GPU for D6 joint. After that, if we have other joint type, which is filled in by CPU, we need to append
	//the CPU result in this buffer and do the second pass of data filling for the block format in GPU
	preDesc.constraintData = reinterpret_cast<PxgConstraintData*>(mConstraintDataPool.getDevicePtr());
	preDesc.constraintRows = reinterpret_cast<Px1DConstraint*>(mConstraintRowPool.getDevicePtr());

	preDesc.artiConstraintData = reinterpret_cast<PxgConstraintData*>(mArtiConstraintDataPool.getDevicePtr());
	preDesc.artiConstraintRows = reinterpret_cast<Px1DConstraint*>(mArtiConstraintRowPool.getDevicePtr());

	// --- D6 joint buffers owned by the simulation core ---
	PxgSimulationCore* simCore = mGpuContext->getSimulationCore();
	preDesc.rigidJointData = reinterpret_cast<PxgD6JointData*>(simCore->getD6RigidJointBuffer().getDevicePtr());//reinterpret_cast<PxgD6JointData*>(mD6JointDataPool.getDevicePtr(0));
	preDesc.rigidConstraintPrePrep = reinterpret_cast<PxgConstraintPrePrep*>(simCore->getD6RigidJointPrePreBuffer().getDevicePtr());//reinterpret_cast<PxgConstraintPrePrep*>(mD6JointPrePrepPool.getDevicePtr(0));

	preDesc.artiJointData = reinterpret_cast<PxgD6JointData*>(simCore->getD6ArtiJointBuffer().getDevicePtr());
	preDesc.artiConstraintPrePrep = reinterpret_cast<PxgConstraintPrePrep*>(simCore->getD6ArtiJointPrePreBuffer().getDevicePtr());

	// --- Host-side mirror base pointers for CPU-filled joint/contact data ---
	preDesc.cpuCompressedContactsBase = cpuCompressedContactsBase;
	preDesc.cpuCompressedPatchesBase = cpuCompressedPatchesBase;
	preDesc.cpuForceBufferBase = cpuForceBufferBase;

	// --- Shared accumulators (reset each frame) and body index map ---
	preDesc.contactManagerOutputBase = reinterpret_cast<PxsContactManagerOutput*>(mGpuContactManagerOutputBase);
	preDesc.sharedFrictionConstraintIndex = 0;
	preDesc.sharedContactConstraintIndex = 0;
	preDesc.sharedArticulationResponseIndex = 0;
	preDesc.solverBodyIndices = reinterpret_cast<PxU32*>(mSolverBodyIndices.getDevicePtr());

	// --- Partition tables and friction ping-pong counts ---
	preDesc.mPartitionIndices = reinterpret_cast<PartitionIndexData*>(mPartitionIndexData.getDevicePtr());
	preDesc.mPartitionstartBatchIndices = reinterpret_cast<PxU32*>(mPartitionStartBatchIndices.getDevicePtr());
	preDesc.mPartitionArtiStartBatchIndices = reinterpret_cast<PxU32*>(mPartitionArticulationStartBatchIndices.getDevicePtr());
	preDesc.mPartitionJointCounts = reinterpret_cast<PxU32*>(mPartitionJointBatchCounts.getDevicePtr());
	preDesc.mPartitionArtiJointCounts = reinterpret_cast<PxU32*>(mPartitionArtiJointBatchCounts.getDevicePtr());
	preDesc.currFrictionPatchCount = reinterpret_cast<PxU32*>(mFrictionPatchCounts[mCurrentIndex].getDevicePtr());
	preDesc.prevFrictionPatchCount = reinterpret_cast<PxU32*>(mFrictionPatchCounts[1 - mCurrentIndex].getDevicePtr());

	preDesc.mNpOutputIndices = reinterpret_cast<PxU32*>(mNpIndexArray.getDevicePtr());

	// Per-bucket narrowphase output offsets; the second half of the iterator's
	// index range is used here (i + eCount) — see PxsContactManagerOutputIterator.
	for (PxU32 i = 0; i < GPU_BUCKET_ID::eCount; ++i)
		preDesc.mCmOutputOffsets[i] = outputIterator.getIndex(i + GPU_BUCKET_ID::eCount);

	preDesc.mSolverBodyData = reinterpret_cast<PxgSolverBodyData*>(mSolverBodyDataPool.getDevicePtr());
	preDesc.mPartitionNodeData = reinterpret_cast<PartitionNodeData*>(mPartitionNodeData.getDevicePtr());

	preDesc.mContactConstantData = reinterpret_cast<PxgSolverConstraintManagerConstants*>(mSolverConstantData.getDevicePtr());

	// --- Batch headers / unique index lists (raw member pointers, not buffers) ---
	preDesc.mBatchHeaders = reinterpret_cast<PxgConstraintBatchHeader*>(mConstraintBatchHeaders);
	preDesc.mContactUniqueIndices = reinterpret_cast<PxU32*>(mContactUniqueIndices);
	preDesc.mConstraintUniqueIndices = reinterpret_cast<PxU32*>(mConstraintUniqueIndices);
	preDesc.mArtiConstraintUniqueIndices = reinterpret_cast<PxU32*>(mArtiConstraintUniqueIndices);
	preDesc.mArtiContactUniqueIndices = reinterpret_cast<PxU32*>(mArtiContactUniqueIndices);

	// --- Slab / body bookkeeping and per-contact auxiliary arrays ---
	preDesc.mSolverBodyReferences = reinterpret_cast<PxgSolverReferences*>(mSolverBodyReferences.getDevicePtr());
	preDesc.mMaxConstraintPartitions = maxConstraintPartitions;
	preDesc.mTotalSlabs = numSlabs;
	preDesc.mTotalActiveBodies = totalActiveBodies;
	preDesc.mTotalActiveArticulations = nbArticulations;
	preDesc.mActiveBodyStartOffset = activeBodyStartOffset;
	preDesc.nbElementsPerBody = nbElementsPerBody;
	preDesc.mRestDistances = restDistances;
	preDesc.mTorsionalFrictionData = torsionalData;
	preDesc.mShapeInteractions = shapeInteractions;

	// --- Articulation-static interactions ---
	preDesc.mArtiStaticContactIndices = reinterpret_cast<PxU32*>(mArtiStaticContactIndices.getDevicePtr());
	preDesc.mArtiStaticConstraintIndices = reinterpret_cast<PxU32*>(mArtiStaticJointIndices.getDevicePtr());
	preDesc.mArtiStaticContactCounts = reinterpret_cast<PxU32*>(mArtiStaticContactCounts.getDevicePtr());
	preDesc.mArtiStaticConstraintCounts = reinterpret_cast<PxU32*>(mArtiStaticJointCounts.getDevicePtr());

	// --- Articulation self interactions ---
	preDesc.mArtiSelfContactIndices = reinterpret_cast<PxU32*>(mArtiSelfContactIndices.getDevicePtr());
	preDesc.mArtiSelfConstraintIndices = reinterpret_cast<PxU32*>(mArtiSelfJointIndices.getDevicePtr());
	preDesc.mArtiSelfContactCounts = reinterpret_cast<PxU32*>(mArtiSelfContactCounts.getDevicePtr());
	preDesc.mArtiSelfConstraintCounts = reinterpret_cast<PxU32*>(mArtiSelfJointCounts.getDevicePtr());

	// --- Rigid-static interactions ---
	preDesc.mRigidStaticContactIndices = reinterpret_cast<PxU32*>(mRigidStaticContactIndices.getDevicePtr());
	preDesc.mRigidStaticConstraintIndices = reinterpret_cast<PxU32*>(mRigidStaticJointIndices.getDevicePtr());
	preDesc.mRigidStaticContactCounts = reinterpret_cast<PxU32*>(mRigidStaticContactCounts.getDevicePtr());
	preDesc.mRigidStaticConstraintCounts = reinterpret_cast<PxU32*>(mRigidStaticJointCounts.getDevicePtr());

	preDesc.mRigidStaticContactStartIndices = reinterpret_cast<PxU32*>(mRigidStaticContactStartIndices.getDevicePtr());
	preDesc.mRigidStaticConstraintStartIndices = reinterpret_cast<PxU32*>(mRigidStaticJointStartIndices.getDevicePtr());

	// --- Scratch buffers for batch accumulation kernels ---
	preDesc.mTempContactUniqueIndices = reinterpret_cast<PxU32*>(mTempContactUniqueIndicesBlockBuffer.getDevicePtr());
	preDesc.mTempConstraintUniqueIndices = reinterpret_cast<PxU32*>(mTempConstraintUniqueIndicesBlockBuffer.getDevicePtr());
	preDesc.mTempContactBlockHeader = reinterpret_cast<PxU32*>(mTempContactHeaderBlockBuffer.getDevicePtr());
	preDesc.mTempConstraintBlockHeader = reinterpret_cast<PxU32*>(mTempConstraintHeaderBlockBuffer.getDevicePtr());
}
|
||||
|
||||
// Fills the PGS/TGS-shared portion of the solver descriptor: timestep
// constants, the current/previous friction patch ping-pong pointers, and the
// articulation buffers owned by the simulation core. Pointer/value wiring
// only — no allocation or GPU work.
void PxgSolverCore::constructSolverSharedDescCommon(PxgSolverSharedDescBase& sharedDesc, const PxgConstantData& cData,
	Cm::UnAlignedSpatialVector* deferredZ, PxU32* articulationDirty, uint4* articulationSlabMask)
{
	sharedDesc.dt = cData.dt;
	sharedDesc.invDtF32 = cData.invDtF32;

	// Ping-pong: mCurrentIndex selects this frame's buffers, the complement
	// selects last frame's.
	const PxU32 curr = mCurrentIndex;
	const PxU32 prev = 1 - mCurrentIndex;

	sharedDesc.blockCurrentFrictionPatches = reinterpret_cast<PxgBlockFrictionPatch*>(mFrictionPatchBlockStream[curr].getDevicePtr());
	sharedDesc.blockPreviousFrictionPatches = reinterpret_cast<PxgBlockFrictionPatch*>(mFrictionPatchBlockStream[prev].getDevicePtr());

	sharedDesc.currentFrictionPatches = reinterpret_cast<PxgFrictionPatch*>(mFrictionPatchStream[curr].getDevicePtr());
	sharedDesc.previousFrictionPatches = reinterpret_cast<PxgFrictionPatch*>(mFrictionPatchStream[prev].getDevicePtr());

	// Articulation state lives in the simulation core.
	PxgSimulationCore* simCore = mGpuContext->getSimulationCore();
	sharedDesc.mBodySimBufferDeviceData = simCore->getBodySimBufferDeviceData().getPointer();
	sharedDesc.articulations = reinterpret_cast<PxgArticulation*>(simCore->getArticulationBuffer().getDevicePtr());
	sharedDesc.articulationDeferredZ = deferredZ;
	sharedDesc.articulationDirty = articulationDirty;
	sharedDesc.articulationSlabMask = articulationSlabMask;

	sharedDesc.deltaOutOffset = mSolverBodyOutputVelocityOffset;
}
|
||||
|
||||
// PT: I don't understand the existing code. We already have a constructSolverSharedDescCommon function above, working on a
|
||||
// PxgSolverSharedDescBase structure. But there is still plenty of "constructSolverDesc" code that could be shared between
|
||||
// PGS and TGS when we initialize PxgSolverCoreDesc (which doesn't inherit from PxgSolverSharedDescBase). I just started moving
|
||||
// that shared code here, without touching the other bits.
|
||||
// Fills in the parts of PxgSolverCoreDesc shared between the PGS and TGS solvers:
// device pointers for solver body data, constraint write-back, friction patches,
// partition tables, and per-island/per-slab launch counters. Only host-side pointer
// and counter setup happens here; no GPU work is launched.
//
// Parameters:
//   scDesc                  - descriptor to populate (later DMA'd to the device).
//   numIslands              - number of solver islands this frame.
//   numSolverBodies         - number of rigid solver bodies.
//   numConstraintBatchHeader- number of rigid constraint batches.
//   numArticConstraints     - number of articulation constraint batches.
//   numSlabs                - number of solver slabs.
//   enableStabilization     - passes the stabilization flag through to the GPU solver.
void PxgSolverCore::constructSolverDesc(PxgSolverCoreDesc& scDesc, PxU32 numIslands, PxU32 numSolverBodies, PxU32 numConstraintBatchHeader, PxU32 numArticConstraints, PxU32 numSlabs, bool enableStabilization)
{
	CUdeviceptr islandContextPoold = mIslandContextPool;//mIslandContextPool.getDevicePtr(0);
	CUdeviceptr motionVelocityArrayd = mMotionVelocityArray.getDevicePtr();
	CUdeviceptr constraintsPerPartitiond = mConstraintsPerPartition.getDevicePtr();

	// Output buffers written by the solver kernels.
	scDesc.outSolverVelocity = reinterpret_cast<float4*>(mOutVelocityPool.getDevicePtr());
	scDesc.outBody2World = reinterpret_cast<PxAlignedTransform*>(mOutBody2WorldPool.getDevicePtr());
	scDesc.solverBodyDataPool = reinterpret_cast<PxgSolverBodyData*>(mSolverBodyDataPool.getDevicePtr());
	scDesc.solverBodyTxIDataPool = reinterpret_cast<PxgSolverTxIData*>(mSolverTxIDataPool.getDevicePtr());
	scDesc.solverBodySleepDataPool = reinterpret_cast<PxgSolverBodySleepData*>(mSolverBodySleepDataPool.getDevicePtr());

	scDesc.outArtiVelocity = reinterpret_cast<float4*>(mOutArtiVelocityPool.getDevicePtr());

	// Constraint write-back / force reporting buffers.
	scDesc.constraintWriteBack = reinterpret_cast<PxgConstraintWriteback*>(mConstraintWriteBackBuffer.getDevicePtr());
	scDesc.forceBuffer = reinterpret_cast<PxF32*>(mForceBuffer.getDevicePtr());
	scDesc.frictionPatches = reinterpret_cast<PxFrictionPatch*>(mFrictionPatches.getDevicePtr());

	scDesc.solverBodyReferences = reinterpret_cast<PxgSolverReferences*>(mSolverBodyReferences.getDevicePtr());

	scDesc.contactManagerOutputBase = reinterpret_cast<PxsContactManagerOutput*>(mGpuContactManagerOutputBase);

	// Partition bookkeeping for rigid and articulation constraints.
	scDesc.islandContextPool = reinterpret_cast<PxgIslandContext*>(islandContextPoold);
	scDesc.motionVelocityArray = reinterpret_cast<float4*>(motionVelocityArrayd);
	scDesc.constraintsPerPartition = reinterpret_cast<PxU32*>(constraintsPerPartitiond);
	scDesc.artiConstraintsPerPartition = reinterpret_cast<PxU32*>(mArtiConstraintsPerPartition.getDevicePtr());
	PxgSimulationCore* simulationCore = mGpuContext->getSimulationCore();
	scDesc.mBodySimBufferDeviceData = simulationCore->getBodySimBufferDeviceData().getPointer();
	scDesc.mBodySimPrevVelocitiesBufferDeviceData = simulationCore->getBodySimPrevVelocitiesBufferDeviceData().getPointer();

	// Static-contact and static-joint counting buffers (per rigid body).
	scDesc.mRigidStaticContactCounts = reinterpret_cast<PxU32*>(mRigidStaticContactCounts.getDevicePtr());
	scDesc.mRigidStaticContactStartIndices = reinterpret_cast<PxU32*>(mRigidStaticContactStartIndices.getDevicePtr());

	scDesc.mRigidStaticJointCounts = reinterpret_cast<PxU32*>(mRigidStaticJointCounts.getDevicePtr());
	scDesc.mRigidStaticJointStartIndices = reinterpret_cast<PxU32*>(mRigidStaticJointStartIndices.getDevicePtr());

	// Frame-wide counts used to size kernel launches.
	scDesc.numIslands = numIslands;
	scDesc.numSolverBodies = numSolverBodies;
	scDesc.numBatches = numConstraintBatchHeader;
	scDesc.numArticBatches = numArticConstraints;

	scDesc.accumulatedBodyDeltaVOffset = mSolverBodyOutputVelocityOffset;

	scDesc.numSlabs = numSlabs;
	scDesc.maxLinksPerArticulation = mGpuContext->getSimulationCore()->getMaxArticulationLinks();

	// Threshold-stream counters reset each frame; only the previous frame's
	// exceeded-force count carries over.
	scDesc.sharedThresholdStreamIndex = 0;
	scDesc.nbForceChangeElements = 0;
	scDesc.nbExceededThresholdElements = 0;
	scDesc.nbPrevExceededThresholdElements = mNbPrevExceededForceElements;
	scDesc.enableStabilization = enableStabilization;
}
|
||||
|
||||
void PxgSolverCore::gpuMemDMAUpJointData(const PxPinnedArray<PxgConstraintData>& cpuJointDataPool, const PxPinnedArray<Px1DConstraint>& cpuJointRowPool,
|
||||
PxU32 nbCpuJoints, PxU32 nbGpuJoints, PxU32 totalCpuRows)
|
||||
{
|
||||
CUdeviceptr startPtr = mConstraintDataPool.getDevicePtr() + nbGpuJoints * sizeof(PxgConstraintData);
|
||||
CUdeviceptr startRowPtr = mConstraintRowPool.getDevicePtr() + nbGpuJoints * sizeof(Px1DConstraint)*Dy::MAX_CONSTRAINT_ROWS;
|
||||
mCudaContext->memcpyHtoDAsync(startPtr, cpuJointDataPool.begin(), nbCpuJoints * sizeof(PxgConstraintData), mStream);
|
||||
mCudaContext->memcpyHtoDAsync(startRowPtr, cpuJointRowPool.begin(), totalCpuRows * sizeof(Px1DConstraint), mStream);
|
||||
|
||||
#if GPU_CORE_DEBUG
|
||||
CUresult result = mCudaContext->streamSynchronize(mStream);
|
||||
if (result != CUDA_SUCCESS)
|
||||
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU DMA up cpu joint data fail!!\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
void PxgSolverCore::gpuMemDMAUpArtiJointData(const PxPinnedArray<PxgConstraintData>& cpuArtiJointDataPool, const PxPinnedArray<Px1DConstraint>& cpuArtiJointRowPool,
|
||||
PxU32 nbCpuArtiJoints, PxU32 nbGpuArtiJoints, PxU32 totalArtiRows)
|
||||
{
|
||||
CUdeviceptr startPtr = mArtiConstraintDataPool.getDevicePtr() + nbGpuArtiJoints * sizeof(PxgConstraintData);
|
||||
CUdeviceptr startRowPtr = mArtiConstraintRowPool.getDevicePtr() + nbGpuArtiJoints * sizeof(Px1DConstraint)*Dy::MAX_CONSTRAINT_ROWS;
|
||||
|
||||
mCudaContext->memcpyHtoDAsync(startPtr, cpuArtiJointDataPool.begin(), nbCpuArtiJoints * sizeof(PxgConstraintData), mStream);
|
||||
mCudaContext->memcpyHtoDAsync(startRowPtr, cpuArtiJointRowPool.begin(), totalArtiRows * sizeof(Px1DConstraint), mStream);
|
||||
|
||||
#if GPU_CORE_DEBUG
|
||||
CUresult result = mCudaContext->streamSynchronize(mStream);
|
||||
if (result != CUDA_SUCCESS)
|
||||
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU DMA up articulation joint data fail!!\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Launches the constraint pre-preparation pipeline on the solver stream:
//   1) RIGID_SUM_STATIC_CONTACT1 / RIGID_SUM_STATIC_CONTACT2 - set up static rigid
//      body contact buffers before the contact pre-prep pass.
//   2) CONTACT_CONSTRAINT_PREPREP_BLOCK - blockwise contact-constraint pre-prep,
//      sized from nbConstraintBatches.
//   3) JOINT_CONSTRAINT_PREPREP - non-block joint pre-prep, sized from nbD6Joints.
// All launches are asynchronous on mStream; with GPU_CORE_DEBUG each stage is
// synchronized and failures are reported.
//
// Fix: the debug error message in the second static-contact block previously said
// "rigidSumInternalContactAndJointBatches1" (copy-paste from the first block)
// even though it guards the RIGID_SUM_STATIC_CONTACT2 launch; it now says "...Batches2".
void PxgSolverCore::constraintPrePrepParallel(PxU32 nbConstraintBatches, PxU32 nbD6Joints, PxU32 numBodies)
{
	PX_PROFILE_ZONE("GpuDynamics.ConstraintPrePrepParallel", 0);

	///////////////////////////////////////
	//New step here!!!
	//We need to prep up the static rigid body contact buffers prior to contact pre-prep

	{
		const CUfunction staticKernel1 = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::RIGID_SUM_STATIC_CONTACT1);

		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(mPrePrepDescd),
			PX_CUDA_KERNEL_PARAM(numBodies)
		};

		// Fixed grid size; the kernel iterates over bodies internally.
		const PxU32 nbBlocksRequired = 32;

		CUresult launchResult = mCudaContext->launchKernel(staticKernel1, nbBlocksRequired, 1, 1, PxgKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);

#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU rigidSumInternalContactAndJointBatches1 kernel fail!\n");
#endif
	}

	{
		const CUfunction staticKernel2 = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::RIGID_SUM_STATIC_CONTACT2);

		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(mPrePrepDescd),
			PX_CUDA_KERNEL_PARAM(mSolverCoreDescd),
			PX_CUDA_KERNEL_PARAM(mPrepareDescd),
			PX_CUDA_KERNEL_PARAM(numBodies)
		};

		// Fixed grid size; the kernel iterates over bodies internally.
		const PxU32 nbBlocksRequired = 32;

		CUresult launchResult = mCudaContext->launchKernel(staticKernel2, nbBlocksRequired, 1, 1, PxgKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);

#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU rigidSumInternalContactAndJointBatches2 kernel fail!\n");
#endif
	}

	//////////////////////////////////////

	CUfunction kernelFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::CONTACT_CONSTRAINT_PREPREP_BLOCK);

	CUdeviceptr descd = mPrePrepDescd;
	CUdeviceptr shDescd = mSharedDescd;
	// Shared by both the contact and joint pre-prep launches below.
	PxCudaKernelParam kernelParams[] =
	{
		PX_CUDA_KERNEL_PARAM(descd),
		PX_CUDA_KERNEL_PARAM(shDescd)
	};

	// Ceil-div: one thread per batch slot (PXG_BATCH_SIZE threads per batch).
	const PxU32 nbBlocksRequired = (nbConstraintBatches*PXG_BATCH_SIZE + PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK - 1) / PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK;

	if (nbBlocksRequired > 0)
	{
		CUresult launchResult = mCudaContext->launchKernel(kernelFunction, nbBlocksRequired, 1, 1, PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);

#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU constraintContactBlockPrePrepLaunch kernel fail!\n");
#endif
	}

	// Ceil-div: one thread per D6 joint.
	const PxU32 nbD6JointsBlocks = (nbD6Joints + PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK - 1) / PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK;
	if (nbD6JointsBlocks > 0)
	{
		//non-block joint constraint pre-prepare
		kernelFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::JOINT_CONSTRAINT_PREPREP);

		CUresult result = mCudaContext->launchKernel(kernelFunction, nbD6JointsBlocks, 1, 1, PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);

		PX_ASSERT(result == CUDA_SUCCESS);
		PX_UNUSED(result);

#if GPU_CORE_DEBUG
		result = mCudaContext->streamSynchronize(mStream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU constraintPrePrepare kernel fail!\n");
#endif
	}
}
|
||||
|
||||
// Launches the zero-bodies kernel on the solver stream to clear per-body solver
// velocity state before the solve. Selects the TGS or PGS variant of the kernel
// based on isTGS; both take the same descriptor parameters.
void PxgSolverCore::resetVelocities(bool isTGS)
{
	PX_PROFILE_ZONE("GpuDynamics.ZeroBodies", 0);

	{
		// Pick the TGS or PGS flavor of the zero-bodies kernel.
		CUfunction zeroBodiesFunction =
			isTGS ? mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::ZERO_BODIES_TGS)
			: mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::ZERO_BODIES);
		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(mSolverCoreDescd),
			PX_CUDA_KERNEL_PARAM(mSharedDescd)
		};
		CUresult result = mCudaContext->launchKernel(zeroBodiesFunction, PxgKernelGridDim::ZERO_BODIES, 1, 1, PxgKernelBlockDim::ZERO_BODIES, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU zero bodies fail to launch kernel!!\n");

		// NOTE(review): this uses GPU_DEBUG while the DMA/pre-prep functions in this
		// file use GPU_CORE_DEBUG - confirm whether both macros are intended here.
#if GPU_DEBUG
		result = mCudaContext->streamSynchronize(mStream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU zero bodies kernel fail!\n");
#endif
	}
}
|
||||
|
||||
// Launches the mark-active-slab kernel (TGS or PGS variant) for one island so that
// slab reference counts are precomputed before solving. The grid covers the last
// partition's rigid and articulation batches (blockIdx.y == 1 handles articulation
// batches when any exist); the kernel runs all partition iterations in one launch.
//
// Refactor: the TGS and PGS branches previously duplicated ~40 lines of identical
// block-count computation, launch, and error/debug handling, differing only in the
// kernel id and its parameter list. That shared logic now lives in a local lambda;
// behavior is unchanged.
//
// Parameters:
//   islandContext               - per-island context array; islandIndex selects the island.
//   constraintsPerPartition     - rigid constraint counts per partition (host-pinned).
//   artiConstraintsPerPartition - articulation constraint counts per partition (host-pinned).
//   isTGS                       - selects the TGS kernel (which also takes minPen/elapsedTime).
//   minPen, elapsedTime         - TGS-only kernel parameters; ignored for PGS.
void PxgSolverCore::precomputeReferenceCount(PxgIslandContext* islandContext, PxU32 islandIndex, PxInt32ArrayPinned& constraintsPerPartition,
	PxInt32ArrayPinned& artiConstraintsPerPartition, bool isTGS, PxReal minPen, PxReal elapsedTime)
{
	PX_PROFILE_ZONE("GpuDynamics.precomputeReferenceCount", 0);
	{
		PxgIslandContext& context = islandContext[islandIndex];

		if(context.mNumPartitions)
		{
			const PxU32 numThreadsPerWarp = WARP_SIZE;
			PxU32 numWarpsPerBlock =
				PxgArticulationCoreKernelBlockDim::COMPUTE_UNCONSTRAINED_VELOCITES / numThreadsPerWarp;

			// Pass the last partition so that all the partition iterations can be
			// run in a single kernel.
			const PxU32 lastPartition = context.mNumPartitions - 1;
			CUdeviceptr artiDescd = mGpuContext->getArticulationCore()->getArticulationCoreDescd();

			// Shared launch path for both solver variants. Mark slabs that are active.
			// Loosely following solveBlockUnified launches.
			auto launchMarkActiveSlab = [&](CUfunction kernelFunction, PxCudaKernelParam* kernelParams, size_t kernelParamsSize)
			{
				PxU32 nbBlocks = (constraintsPerPartition[lastPartition] * PXG_BATCH_SIZE +
				                  PxgKernelBlockDim::SOLVE_BLOCK_PARTITION - 1) /
				                 PxgKernelBlockDim::SOLVE_BLOCK_PARTITION;

				PxU32 nbArtiBlocks = (artiConstraintsPerPartition[lastPartition] * PXG_BATCH_SIZE +
				                      PxgKernelBlockDim::SOLVE_BLOCK_PARTITION - 1) /
				                     PxgKernelBlockDim::SOLVE_BLOCK_PARTITION;

				const PxU32 maxBlocks = PxMax(nbBlocks, nbArtiBlocks);

				if (maxBlocks)
				{
					// Second grid row only when there are articulation batches to process.
					const PxU32 blockY = nbArtiBlocks > 0 ? 2 : 1;
					CUresult result = mCudaContext->launchKernel(kernelFunction, maxBlocks, blockY, 1,
						numThreadsPerWarp, numWarpsPerBlock, 1, 0, mStream,
						kernelParams, kernelParamsSize, 0, PX_FL);

					if (result != CUDA_SUCCESS)
						PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL,
							"GPU markActiveSlab fail to launch kernel!!\n");

#if GPU_DEBUG
					result = mCudaContext->streamSynchronize(mStream);
					if (result != CUDA_SUCCESS)
						PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL,
							"GPU markActiveSlab kernel fail!\n");
#endif
				}
			};

			if (isTGS)
			{
				CUfunction markActiveSlabTGS = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(
					PxgKernelIds::MARK_ACTIVE_SLAB_TGS);

				PxCudaKernelParam kernelParamsTGS[] = {
					PX_CUDA_KERNEL_PARAM(mSolverCoreDescd), PX_CUDA_KERNEL_PARAM(mSharedDescd),
					PX_CUDA_KERNEL_PARAM(islandIndex), PX_CUDA_KERNEL_PARAM(lastPartition),
					PX_CUDA_KERNEL_PARAM(minPen), PX_CUDA_KERNEL_PARAM(elapsedTime),
					PX_CUDA_KERNEL_PARAM(artiDescd)
				};

				launchMarkActiveSlab(markActiveSlabTGS, kernelParamsTGS, sizeof(kernelParamsTGS));
			}
			else
			{
				CUfunction markActiveSlabPGS = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(
					PxgKernelIds::MARK_ACTIVE_SLAB_PGS);

				PxCudaKernelParam kernelParamsPGS[] = {
					PX_CUDA_KERNEL_PARAM(mSolverCoreDescd), PX_CUDA_KERNEL_PARAM(mSharedDescd),
					PX_CUDA_KERNEL_PARAM(islandIndex), PX_CUDA_KERNEL_PARAM(lastPartition),
					PX_CUDA_KERNEL_PARAM(artiDescd)
				};

				launchMarkActiveSlab(markActiveSlabPGS, kernelParamsPGS, sizeof(kernelParamsPGS));
			}
		}
	}
}
|
||||
1960
engine/third_party/physx/source/gpusolver/src/PxgTGSCudaSolverCore.cpp
vendored
Normal file
1960
engine/third_party/physx/source/gpusolver/src/PxgTGSCudaSolverCore.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
122
engine/third_party/physx/source/gpusolver/src/PxgTGSDynamicsContext.cpp
vendored
Normal file
122
engine/third_party/physx/source/gpusolver/src/PxgTGSDynamicsContext.cpp
vendored
Normal file
@@ -0,0 +1,122 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgTGSDynamicsContext.h"
|
||||
#include "PxgKernelWrangler.h"
|
||||
#include "PxgArticulationCore.h"
|
||||
#include "PxgTGSCudaSolverCore.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// Constructs the TGS GPU dynamics context: initializes the world (static anchor)
// solver body data, creates the articulation and TGS solver cores, acquires the
// CUDA context to create streams and pinned host allocators, and wires up the
// contact/patch/force/friction stream pools used for contact reporting.
PxgTGSDynamicsContext::PxgTGSDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
	const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
	bool enableStabilization, bool useEnhancedDeterminism,
	PxReal maxBiasCoefficient,
	PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
	bool externalForcesEveryTgsIterationEnabled, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled) :
	PxgGpuContext(flushPool, islandManager, maxNumPartitions, maxNumStaticPartitions, enableStabilization, useEnhancedDeterminism,
		maxBiasCoefficient, simStats, heapMemoryManager, lengthScale, enableDirectGPUAPI, contextID, isResidualReportingEnabled, true)
{
	// World solver body: immovable anchor (zero velocity, zero inverse mass,
	// unbounded impulse limits) used for constraints against the static environment.
	mWorldSolverBody.linearVelocity = PxVec3(0);
	mWorldSolverBody.angularVelocity = PxVec3(0);
	mWorldSolverBodyData.invMass = 0;
	mWorldSolverBodyData.reportThreshold = PX_MAX_REAL;
	mWorldSolverBodyData.maxImpulse = PX_MAX_REAL;
	mWorldSolverBodyData.penBiasClamp = -PX_MAX_REAL;
	mWorldSolverBodyData.initialAngVel = mWorldSolverBodyData.initialLinVel = PxVec3(0.f);
	mWorldSolverBodyData.body2World = PxAlignedTransform(PxIdentity);
	mWorldSolverBodyData.islandNodeIndex = PxNodeIndex(PX_INVALID_NODE);
	mWorldSolverBodyData.offsetSlop = 0.f;

	mWorldTxIData.sqrtInvInertia = PxMat33(PxZero);
	mWorldTxIData.deltaBody2World = PxTransform(PxIdentity);

	{
		// Create the articulation core and the TGS-specific solver core, then link
		// the articulation core back to this context.
		mGpuArticulationCore = PX_NEW(PxgArticulationCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, heapMemoryManager);

		mGpuSolverCore = PX_NEW(PxgTGSCudaSolverCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, this, heapMemoryManager, config);

		mGpuArticulationCore->setGpuContext(this);
	}

	// The CUDA context must be current on this thread for stream/allocator creation;
	// released at the end of the constructor.
	mGpuSolverCore->acquireContext();

	mGpuSolverCore->createStreams();

	createThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	createForceChangeThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);

	mPinnedMemoryAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.tempBufferCapacity);

	// Contact and patch streams are double-buffered; mCurrentContactStream selects
	// the active buffer below.
	mCurrentContactStream = 0;
	mContactStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mContactStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));

	mPatchStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	mPatchStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));

	// NOTE(review): force stream sized as 2 floats per contact - presumably
	// force + an auxiliary value per contact; confirm against the force-stream consumers.
	mForceStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxReal) * 2);

	mFrictionPatchStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxFrictionPatch));

	// Wire each stream pool to its allocator's pinned buffer and reset the
	// shared CPU/GPU allocation cursors.
	mContactStreamPool.mDataStream = mContactStreamAllocators[mCurrentContactStream]->mStart;
	mContactStreamPool.mDataStreamSize = (PxU32)mContactStreamAllocators[mCurrentContactStream]->mTotalSize;
	mContactStreamPool.mSharedDataIndex = 0;
	mContactStreamPool.mSharedDataIndexGPU = 0;

	mPatchStreamPool.mDataStream = mPatchStreamAllocators[mCurrentContactStream]->mStart;
	mPatchStreamPool.mDataStreamSize = (PxU32)mPatchStreamAllocators[mCurrentContactStream]->mTotalSize;
	mPatchStreamPool.mSharedDataIndex = 0;
	mPatchStreamPool.mSharedDataIndexGPU = 0;

	mForceStreamPool.mDataStream = mForceStreamAllocator->mStart;
	mForceStreamPool.mDataStreamSize = (PxU32)mForceStreamAllocator->mTotalSize;
	mForceStreamPool.mSharedDataIndex = 0;
	mForceStreamPool.mSharedDataIndexGPU = 0;

	mFrictionPatchStreamPool.mDataStream = mFrictionPatchStreamAllocator->mStart;
	mFrictionPatchStreamPool.mDataStreamSize = PxTo32(mFrictionPatchStreamAllocator->mTotalSize);
	mFrictionPatchStreamPool.mSharedDataIndex = 0;
	mFrictionPatchStreamPool.mSharedDataIndexGPU = 0;

	//Arbitrarily-large number to reserve to minimize allocation churn.
	mConstraintsPerPartition.reserve(1024);

	mArtiConstraintsPerPartition.reserve(1024);

	mGpuSolverCore->releaseContext();
	mIsExternalForcesEveryTgsIterationEnabled = externalForcesEveryTgsIterationEnabled;
}
|
||||
|
||||
// Destroys this context: runs the destructor explicitly, then frees the storage
// with PX_FREE_THIS. NOTE(review): this assumes the instance was allocated through
// the PhysX allocator (not plain operator new) - confirm against the creation site.
void PxgTGSDynamicsContext::destroy()
{
	this->~PxgTGSDynamicsContext();
	PX_FREE_THIS;
}
|
||||
}
|
||||
Reference in New Issue
Block a user