feat(physics): wire physx sdk into build

This commit is contained in:
2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions

View File

@@ -0,0 +1,848 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxPreprocessor.h"
#include "foundation/PxSimpleTypes.h"
#include "PxgSolverCoreDesc.h"
#include "PxgRadixSortDesc.h"
#include "DyThresholdTable.h"
#include "RadixSort.cuh"
#include "PxgRadixSortDesc.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include <stdio.h>
#include "PxgSolverKernelIndices.h"
using namespace physx;
extern "C" __host__ void initSolverKernels0() {}
extern "C" __global__ void bodyInputAndRanksSingleBlockLaunch(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
uint4* gInputKeys = reinterpret_cast<uint4*>(desc->inputKeys);
uint4* gInputRanks = reinterpret_cast<uint4*>(desc->inputRanks);
PxU32* gRadixCount = desc->radixBlockCounts;
const PxU32 gNumOfKeys = solverDesc->sharedThresholdStreamIndex;
radixSortSingleBlock<PxgKernelBlockDim::RADIXSORT/WARP_SIZE>(gInputKeys, gInputRanks, gNumOfKeys, gStartBit, gRadixCount);
}
extern "C" __global__ void bodyInputAndRanksBlocksLaunch(const PxgSolverCoreDesc* solverDesc, PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
uint4* gInputKeys = reinterpret_cast<uint4*>(desc->inputKeys);
uint4* gInputRanks = reinterpret_cast<uint4*>(desc->inputRanks);
PxU32* gOutputKeys = desc->outputKeys;
PxU32* gOutputRanks = desc->outputRanks;
PxU32* gRadixCount = desc->radixBlockCounts;
const PxU32 gNumOfKeys = solverDesc->sharedThresholdStreamIndex;
radixSortCalculateRanks<PxgKernelBlockDim::RADIXSORT / WARP_SIZE>( gInputKeys, gInputRanks, gNumOfKeys, gStartBit, gRadixCount, gOutputKeys, gOutputRanks);
}
extern "C" __global__ void initialRanksAndBodyIndexB(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
PxU32* gInputKeys = rsDesc->inputKeys;
PxU32* gInputRanks = rsDesc->inputRanks;
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x*blockIdx.x;
for(PxU32 i=globalThreadIndex; i<nbThresholdElements; i+=blockDim.x*gridDim.x)
{
Dy::ThresholdStreamElement& elements = thresholdStream[i];
gInputKeys[i] = elements.nodeIndexB.index();
gInputRanks[i] = i;
}
//we need to pad the handles to the multiply of 4 for the radix sort
const PxU32 remainingThresholdElements = (4 - (nbThresholdElements & 3)) & 3;
for(PxU32 i=globalThreadIndex; i < remainingThresholdElements; i+=blockDim.x*gridDim.x)
{
const PxU32 index = i + nbThresholdElements;
gInputKeys[index] = 0xffffffff;
gInputRanks[index] = index;
}
}
extern "C" __global__ void initialRanksAndBodyIndexA(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
//we need to use the inputRanks from the bodyAIndex to reorganize the threshold stream
PxU32* gInputKeys = rsDesc->inputKeys;
PxU32* gInputRanks = rsDesc->inputRanks;
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x*blockIdx.x;
for(PxU32 i=globalThreadIndex; i<nbThresholdElements; i+=blockDim.x*gridDim.x)
{
Dy::ThresholdStreamElement& elements = thresholdStream[gInputRanks[i]];
gInputKeys[i] = elements.nodeIndexA.index();
}
//we need to pad the handles to the multiply of 4 for the radix sort
const PxU32 remainingThresholdElements = (4 - (nbThresholdElements & 3)) & 3;
for(PxU32 i=globalThreadIndex; i < remainingThresholdElements; i+=blockDim.x*gridDim.x)
{
const PxU32 index = i + nbThresholdElements;
gInputKeys[index] = 0xffffffff;
}
}
extern "C" __global__ void reorganizeThresholdElements(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
{
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
Dy::ThresholdStreamElement* tmpThresholdStream = solverDesc->tmpThresholdStream;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
const PxU32* gInputRanks = rsDesc->inputRanks;
PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
for(PxU32 i = globalThreadIdx/8; i < nbThresholdElements; i+=((blockDim.x*gridDim.x)/8))
{
PxU32* dest = reinterpret_cast<PxU32*>(thresholdStream + i);
PxU32* src = reinterpret_cast<PxU32*>(tmpThresholdStream + gInputRanks[i]);
dest[threadIdx.x&7] = src[threadIdx.x&7];
}
}
extern "C" __global__ void computeAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
{
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM / WARP_SIZE;
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
assert(WARP_PERBLOCK_SIZE == (1 << LOG2_WARP_PERBLOCK_SIZE));
__shared__ PxReal sWarpAccumulator[WARP_PERBLOCK_SIZE];
__shared__ PxReal sBlockAccumulator;
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
__shared__ PxU32 sBlockPairsAccumulator;
//Each body can be made of multiple shapes, therefore, we need to accumulated difference forces from the shapes to the body pairs. In this case, we will have thresholdStreams have
//same bodyAIndex and bodyBIndex
//The threshold stream has been sorted based on the bodyAIndex and bodyBIndex, therefore, if pairs have the same bodyAIndex and bodyBIndex, they will laied in continuously memory
Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;
PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
const PxU32 idx = threadIdx.x;
if(threadIdx.x == 0)
{
sBlockAccumulator = 0;
sBlockPairsAccumulator = 0;
}
__syncthreads();
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
PxReal val = 0.f;
bool isNewPair = false;
PxNodeIndex nodeIndexA(PX_INVALID_NODE);
PxNodeIndex nodeIndexB(PX_INVALID_NODE);
if(workIndex < nbThresholdElements)
{
val = gThresholdStream[workIndex].normalForce;
nodeIndexA = gThresholdStream[workIndex].nodeIndexA;
nodeIndexB = gThresholdStream[workIndex].nodeIndexB;
if(workIndex+1 < nbThresholdElements)
{
Dy::ThresholdStreamElement& nElement = gThresholdStream[workIndex+1];
if(!(nodeIndexA == nElement.nodeIndexA && nodeIndexB == nElement.nodeIndexB))
{
isNewPair = true;
}
}
else
{
isNewPair = true;
}
}
//warpScan is inclusive add but the accumVal is exclusive add result
const PxReal accumVal = warpScan<AddOpPxReal, PxReal>(FULL_MASK, val) - val;
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isNewPair)&threadMask);
if(threadIndexInWarp == (WARP_SIZE-1))
{
sWarpAccumulator[warpIndex] = accumVal + val;
sWarpPairsAccumulator[warpIndex] = accumPairs + isNewPair;
}
const PxReal prevBlockAccumulator = sBlockAccumulator;
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
__syncthreads();
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
if(idx < WARP_PERBLOCK_SIZE)
{
PxReal forceVal = sWarpAccumulator[idx];
const PxReal accumulatedForce = warpScan<AddOpPxReal, PxReal, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, forceVal) - forceVal;
sWarpAccumulator[idx] = accumulatedForce;
PxU32 pairVal = sWarpPairsAccumulator[idx];
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
sWarpPairsAccumulator[idx] = accumulatedPairs;
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
{
sBlockAccumulator += (accumulatedForce + forceVal);
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
}
}
__syncthreads();
if(workIndex < nbThresholdElements)
{
//accumVal is exclusive result within a warp and sWarpAccumulator is the exclusive result within a block
gThresholdStreamAccumulatedForce[workIndex] = val + accumVal + prevBlockAccumulator + sWarpAccumulator[warpIndex]; //this is inclusive
gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
gThresholdStreamWriteable[workIndex] = isNewPair;
}
}
if(threadIdx.x == 0)
{
gThresholdStreamAccumulatedForceBetweenBlocks[blockIdx.x] = sBlockAccumulator;
gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
}
}
extern "C" __global__ void outputAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
{
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_ACCUMULATED_THRESHOLDSTREAM;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
__shared__ PxReal sBlockForceAccum[nbBlocks];
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
const PxU32 idx = threadIdx.x;
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
PxReal val = 0;
PxReal res = 0;
PxU32 pairIndex = 0;
PxU32 pairRes = 0;
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
if(idx < nbBlocks)
{
val = gThresholdStreamAccumulatedForceBetweenBlocks[idx];
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
res = warpScan<AddOpPxReal, PxReal>(mask_idx, val) - val;
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
sBlockForceAccum[idx] = res;
sBlockWriteIndexAccum[idx] = pairRes;
}
__syncthreads();
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxReal blockForceAccum = sBlockForceAccum[blockIdx.x];
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
//accumulate normal force between blocks
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
if(workIndex < nbThresholdElements)
{
gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
gThresholdStreamAccumulatedForce[workIndex] = gThresholdStreamAccumulatedForce[workIndex] + blockForceAccum;
}
}
}
extern "C" __global__ void writeoutAccumulatedForcePerObject(PxgSolverCoreDesc* solverDesc)
{
PxReal* gAccumulatedForces = solverDesc->thresholdStreamAccumulatedForce;
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
//accumulate normal force between blocks
for(PxU32 workIndex = globalThreadIdx; workIndex < nbThresholdElements; workIndex+=(blockDim.x*gridDim.x))
{
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
bool isNewPairs = gThresholdStreamWriteable[workIndex];
if(isNewPairs)
{
gAccumulatedForceObjectPairs[writeIndex] = gAccumulatedForces[workIndex];
}
}
}
extern "C" __global__ void computeExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc,
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE / WARP_SIZE;
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
__shared__ PxU32 sBlockPairsAccumulator;
const PxReal dt = sharedDesc->dt;
Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
const PxU32 idx = threadIdx.x;
if(threadIdx.x == 0)
{
sBlockPairsAccumulator = 0;
}
__syncthreads();
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
bool isExceededForce = false;
if(workIndex < nbThresholdElements)
{
Dy::ThresholdStreamElement& element = gThresholdStream[workIndex];
//we are reusing the write index buffer. However, because the work index is the same, so as long as we read before we write, it should be safe
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
PxReal accumulatedForce = gAccumulatedForceObjectPairs[writeIndex];
if(writeIndex > 0)
{
accumulatedForce -= gAccumulatedForceObjectPairs[writeIndex-1];
}
//write back the accumulated force
element.accumulatedForce = accumulatedForce;
isExceededForce = accumulatedForce > (element.threshold * dt);
}
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isExceededForce)&threadMask);
if(threadIndexInWarp == (WARP_SIZE-1))
{
sWarpPairsAccumulator[warpIndex] = accumPairs + isExceededForce;
}
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
__syncthreads();
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
if(idx < WARP_PERBLOCK_SIZE)
{
PxU32 pairVal = sWarpPairsAccumulator[idx];
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
sWarpPairsAccumulator[idx] = accumulatedPairs;
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
{
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
}
}
__syncthreads();
if(workIndex < nbThresholdElements)
{
gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
gThresholdStreamWriteable[workIndex] = isExceededForce;
}
}
if(threadIdx.x == 0)
{
gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
}
}
extern "C" __global__ void outputExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc)
{
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE/WARP_SIZE;
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
Dy::ThresholdStreamElement* gThresholdElements = solverDesc->thresholdStream;
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
const PxU32 idx = threadIdx.x;
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
PxU32 pairIndex = 0;
PxU32 pairRes = 0;
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
if(idx < nbBlocks)
{
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
sBlockWriteIndexAccum[idx] = pairRes;
}
__syncthreads();
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
//accumulate normal force between blocks
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
if(workIndex < nbThresholdElements)
{
///gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
bool isExceededForce = gThresholdStreamWriteable[workIndex];
if(isExceededForce)
{
Dy::ThresholdStreamElement& element = gThresholdElements[workIndex];
Dy::ThresholdStreamElement tempElement;
tempElement.shapeInteraction = element.shapeInteraction;
tempElement.nodeIndexA = element.nodeIndexA;
tempElement.nodeIndexB = element.nodeIndexB;
tempElement.normalForce = element.normalForce;
tempElement.accumulatedForce = element.accumulatedForce;
tempElement.threshold = element.threshold;
gExceededForceElements[writeIndex] = tempElement;
}
//last element
if(workIndex == nbThresholdElements -1)
{
solverDesc->nbExceededThresholdElements = isExceededForce ? (writeIndex + 1) : writeIndex;
}
}
}
}
//We ensure nbPrevExceededThresholdPairs > 0 on the CPU, and all pair masks have been initialized to 1.
//The data layout in gExceededForceElementMask is: previous exceeded threshold element masks first, then current exceeded threshold element masks,
//then persistent exceeded threshold element masks last. A persistent exceeded element must appear in both the previous and the current
//exceeded threshold elements arrays, so the number of persistent exceeded elements is less than or equal to the number of previous
//exceeded threshold elements. Therefore the persistent exceeded threshold element mask has the same size as the previous exceeded
//threshold mask and corresponds to the same elements as the previous exceeded force elements.
extern "C" __global__ void setThresholdElementsMask(PxgSolverCoreDesc* solverDesc)
{
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
for(PxU32 workIndex = globalThreadIdx; workIndex < nbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
{
Dy::ThresholdStreamElement& element = gExceededForceElements[workIndex];
//this will find the last element match the element if value exist in the array
PxU32 pos = binarySearch<Dy::ThresholdStreamElement>(gPrevExceededForceElements, nbPrevExceededThresholdElements, element);
Dy::ThresholdStreamElement* prePair = &gPrevExceededForceElements[pos];
bool done = false;
while (!done)
{
done = true;
if (prePair->nodeIndexA == element.nodeIndexA && prePair->nodeIndexB == element.nodeIndexB)
{
if (prePair->shapeInteraction == element.shapeInteraction)
{
//found a pair, raise 0 in the masks so that we won't generate any force change event. Because the mask array store previous and current exceeded force pairs, we need to
// raise 0 in two position: one for the previous mask and one for the current mask
gExceededForceElementMask[pos] = 0;
gExceededForceElementMask[nbPrevExceededThresholdElements + workIndex] = 0;
}
else if (pos > 1)
{
pos = pos - 1;
prePair = &gPrevExceededForceElements[pos];
done = false;
}
}
}
}
}
// Fills the persistent section of the mask array (which starts after the
// previous and current sections). An entry of the previous section whose mask
// was cleared to 0 by setThresholdElementsMask (i.e. the pair is still
// exceeding this frame) gets a 1 in the persistent section, and vice versa.
__device__ void setPersistentForceElementMask(PxgSolverCoreDesc* solverDesc)
{
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
// Layout: [previous | current | persistent]; persistent starts after the first two.
const PxU32 persistentExceededStart = nbPrevExceededThresholdElements + nbExceededThresholdElements;
for (PxU32 workIndex = globalThreadIdx; workIndex < nbPrevExceededThresholdElements; workIndex += (blockDim.x*gridDim.x))
{
//based on the previous exceeded force elements
gExceededForceElementMask[persistentExceededStart + workIndex] = !gExceededForceElementMask[workIndex];
}
}
extern "C" __global__ void computeThresholdElementMaskIndices(PxgSolverCoreDesc* solverDesc)
{
//this function should be called in setThresholdElementsMask. However, if there are no preExceededThresholdElements(which we will know in the CPU code), we don't
//kick of setThresholdElementMask kernel at all so the persistentExceededThresholdElementMask is still set to be one. Therefore, we need to call the setPersistentForceElement
//method in here
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES / WARP_SIZE;
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
setPersistentForceElementMask(solverDesc);
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
__shared__ PxU32 sBlockPairsAccumulator;
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
//prev, current and persistent
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
const PxU32 idx = threadIdx.x;
if(threadIdx.x == 0)
{
sBlockPairsAccumulator = 0;
}
__syncthreads();
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
PxU32 forceChangeMask = 0;
if(workIndex < totalNbExceededThresholdElements)
{
forceChangeMask = gExceededForceElementMask[workIndex];
}
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, forceChangeMask)&threadMask);
if(threadIndexInWarp == (WARP_SIZE-1))
{
sWarpPairsAccumulator[warpIndex] = accumPairs + forceChangeMask;
}
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
__syncthreads();
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
if(idx < WARP_PERBLOCK_SIZE)
{
PxU32 pairVal = sWarpPairsAccumulator[idx];
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
sWarpPairsAccumulator[idx] = accumulatedPairs;
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
{
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
}
}
__syncthreads();
if(workIndex < totalNbExceededThresholdElements)
{
gExceededForceElementMask[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
gThresholdStreamWriteable[workIndex] = !!(forceChangeMask);
}
}
if(threadIdx.x == 0)
{
gExceededForceElementMaskBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
}
}
extern "C" __global__ void outputThresholdPairsMaskIndices(PxgSolverCoreDesc* solverDesc)
{
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES;
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
//previous, current and persistent
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
const PxU32 idx = threadIdx.x;
PxU32 pairIndex = 0;
PxU32 pairRes = 0;
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
if(idx < nbBlocks)
{
pairIndex = gExceededForceElementMaskBetweenBlocks[idx];
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
sBlockWriteIndexAccum[idx] = pairRes;
}
__syncthreads();
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
//accumulate normal force between blocks
for(PxU32 i=0; i<numIterationPerBlock; ++i)
{
const PxU32 workIndex = i*PxgKernelBlockDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
if(workIndex < totalNbExceededThresholdElements)
{
gExceededForceElementMask[workIndex] = gExceededForceElementMask[workIndex] + blockWriteIndexAccum;
}
}
}
extern "C" __global__ void createForceChangeThresholdElements(PxgSolverCoreDesc* solverDesc)
{
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
Dy::ThresholdStreamElement* gForceChangeElements = solverDesc->forceChangeThresholdElements;
//we copy the original mask value to thresholdStreamWriteable in computeThresholdElementMaskIndices so it corresponding with mask
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
//previous, current and persistent
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
const PxU32 persistentExceededStart = nbPrevExceededThresholdElements + nbExceededThresholdElements;
for(PxU32 workIndex = globalThreadIdx; workIndex < totalNbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
{
const bool hasForceChangeOrPersistent = gThresholdStreamWriteable[workIndex];
const PxU32 writeIndex = gExceededForceElementMask[workIndex];
if (hasForceChangeOrPersistent)
{
bool lostPair = workIndex < nbPrevExceededThresholdElements;
bool foundPair = (workIndex < persistentExceededStart) && !lostPair;
Dy::ThresholdStreamElement* pair = NULL;
if (lostPair)
{
pair = &gPrevExceededForceElements[workIndex];
}
else if (foundPair)
{
pair = &gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
}
else
{
//persistent pair
pair = &gPrevExceededForceElements[workIndex - persistentExceededStart];
}
//Dy::ThresholdStreamElement& pair = lostPair ? gPrevExceededForceElements[workIndex] : gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
Dy::ThresholdStreamElement tempPair;
tempPair.shapeInteraction = pair->shapeInteraction;
tempPair.nodeIndexA = pair->nodeIndexA;
tempPair.nodeIndexB = pair->nodeIndexB;
tempPair.normalForce = pair->normalForce;
tempPair.accumulatedForce = lostPair ? 0.f : pair->accumulatedForce;
tempPair.threshold = pair->threshold;
gForceChangeElements[writeIndex] = tempPair;
}
if(workIndex == totalNbExceededThresholdElements-1)
{
solverDesc->nbForceChangeElements = hasForceChangeOrPersistent ? (writeIndex + 1) : writeIndex;
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CONSTRAINT_CUH__
#define __CONSTRAINT_CUH__

#include "PxgSolverCoreDesc.h"

// Constant-memory copy of the solver core descriptor shared by the constraint
// solver kernels in this module. NOTE(review): the host-side upload (presumably
// cudaMemcpyToSymbol before launch) is not visible here -- confirm against the
// launcher code. NOTE(review): the guard macro uses a reserved identifier
// (leading double underscore) per the file's existing convention.
__constant__ PxgSolverCoreDesc constraintSolverCoreDescC;

#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,494 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgBodySim.h"
#include "PxgArticulation.h"
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgFrictionPatch.h"
#include "PxgConstraintPrep.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgSolverCoreDesc.h"
#include "PxgCudaMemoryAllocator.h"
#include "PxgArticulationCoreKernelIndices.h"
#include "DySolverConstraintTypes.h"
#include "DyConstraintPrep.h"
#include "PxNodeIndex.h"
#include "PxContact.h"
#include "PxsContactManagerState.h"
#include "contactConstraintBlockPrep.cuh"
#include "contactConstraintPrep.cuh"
#include "jointConstraintBlockPrep.cuh"
#include "constant.cuh"
#include "constraintPrepShared.cuh"
#include <assert.h>
#include "stdio.h"
using namespace physx;
extern "C" __host__ void initSolverKernels1() {}
// Compile-time switch for the cooperative warp-staged body-data loader below.
// Currently disabled (0): the kernels in this file read PxgSolverBodyData
// directly from global memory instead.
#define LOAD_BODY_DATA 0

#if LOAD_BODY_DATA
//Enough memory to fit 32 warps and load 11 solver body data objects per-pass, i.e. load solverBodyData for all 32 warps in 3 passes.
//Note, we can +1 on the size to avoid bank conflicts but then 16 byte aligned structs won't be aligned anymore
#define BODIES_PER_BLOCK 11u

// Per-warp staging buffer: [warp][body slot][raw bytes of one PxgSolverBodyPrepData].
volatile __shared__ PxU8 bodyLoadData[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL/32][BODIES_PER_BLOCK][sizeof(PxgSolverBodyPrepData)];

// Cooperatively loads one PxgSolverBodyPrepData per lane of the calling warp:
// each pass the whole warp streams up to BODIES_PER_BLOCK bodies into shared
// memory word-by-word, then each lane whose body landed this pass copies its
// struct out to 'outBodyPrepData'.
// NOTE(review): this disabled code references 'threadCounts', which is not
// declared anywhere in this file's visible scope -- it would not compile if
// LOAD_BODY_DATA were enabled without restoring that shared array.
static __device__ void loadBodyData(const PxgSolverBodyData* PX_RESTRICT datas, const PxU32 batchStride, const PxU32 bodyIndex, const PxU32 threadIndexInWarp, const PxU32 warpIndex,
	PxgSolverBodyPrepData& outBodyPrepData/*float4& initialLinVelXYZ_invMassW, float4& initialAngVelXYZ_penBiasClamp, PxAlignedMat33& sqrtInvInertia, PxAlignedTransform& body2World*/)
{
	//Iterate through the body datas, pulling in the data we need, then index into shared data, pull out the solver body data and return it by value to store on stack (either in register or in local mem).

	// Publish each lane's requested body index so every lane can see it.
	threadCounts[threadIdx.x] = bodyIndex;

	const PxU32 solverPrepDataWords = sizeof(PxgSolverBodyPrepData)/4;

	PxU32 warpStartIndex = warpIndex*32;

	for(PxU32 a = 0; a < batchStride; a+=BODIES_PER_BLOCK)
	{
		PxU32 remainder = PxMin(batchStride - a, BODIES_PER_BLOCK);
		for(PxU32 b = 0; b < remainder; ++b)
		{
			// NOTE(review): this local shadows the 'bodyIndex' parameter.
			PxU32 bodyIndex = threadCounts[warpStartIndex + a + b]; //KS - potentially can use SM3.0 shuffle instead

			const PxU32* PX_RESTRICT sourceData = reinterpret_cast<const PxU32*>(datas + bodyIndex);
			volatile PxU32* bodyData = reinterpret_cast<volatile PxU32*>(&bodyLoadData[warpIndex][b][0]);

			// Whole warp streams this body's struct in 32-word strides.
			for(PxU32 i = threadIndexInWarp; i < solverPrepDataWords; i+=32)
			{
				bodyData[i] = sourceData[i];
			}
		}

		// Lanes whose body was staged in this pass copy it out.
		if((threadIndexInWarp - a) < BODIES_PER_BLOCK)
		{
			volatile PxgSolverBodyPrepData& data = reinterpret_cast<volatile PxgSolverBodyPrepData&>(bodyLoadData[warpIndex][threadIndexInWarp-a][0]);
			/*initialLinVelXYZ_invMassW = make_float4(data.initialLinVelXYZ_invMassW.x, data.initialLinVelXYZ_invMassW.y, data.initialLinVelXYZ_invMassW.z,
				initialLinVelXYZ_invMassW.w);
			initialAngVelXYZ_penBiasClamp = make_float4(data.initialAngVelXYZ_penBiasClamp.x, data.initialAngVelXYZ_penBiasClamp.y, data.initialAngVelXYZ_penBiasClamp.z,
				data.initialAngVelXYZ_penBiasClamp.w);

			body2World.p = make_float4(data.body2World.p.x, data.body2World.p.y, data.body2World.p.z, data.body2World.p.w);
			body2World.q = make_float4(data.body2World.q.q.x, data.body2World.q.q.y, data.body2World.q.q.z, data.body2World.q.q.w);
			sqrtInvInertia = (PxAlignedMat33&)data.sqrtInvInertia;*/
			outBodyPrepData = (PxgSolverBodyPrepData&)data;
			/*PxU32* outPrepDataU32 = reinterpret_cast<PxU32*>(&outPrepData);
			for(PxU32 i = 0; i < solverPrepDataWords; ++i)
			{
				outPrepDataU32[i] = bodyLoadData[warpIndex][threadIndexInWarp - a][i];
			}*/
		}
	}

	threadCounts[threadIdx.x] = 0; //Reset thread counts to 0 because they're used for accumulators in later code
}
#endif
// Prepares 1D (joint) constraint blocks for the PGS solver.
// Work distribution: every warp across the whole grid owns one constraint
// batch; each lane within the warp handles one joint of that batch (batches
// may be partially filled, so surplus lanes idle). Static joint batches are
// appended after the dynamic ones in the batch-index list.
extern "C" __global__ void jointConstraintBlockPrepareParallelLaunch(
	PxgConstraintPrepareDesc* solverDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	const PxU32 warpSize = 32;

	// Flatten (block, warp-in-block) into a grid-wide warp index that selects the batch.
	const PxU32 warpsPerBlock = blockDim.x / warpSize;
	const PxU32 globalWarpIndex = blockIdx.x * warpsPerBlock + threadIdx.x / warpSize;

	// Lane index within the warp selects the joint slot inside the batch.
	const PxU32 lane = threadIdx.x & (warpSize - 1);

	const PxU32 nbJointBatches = solverDesc->num1dConstraintBatches + solverDesc->numStatic1dConstraintBatches;

	if (globalWarpIndex < nbJointBatches)
	{
		const PxU32 batchIndex = solverDesc->jointConstraintBatchIndices[globalWarpIndex];
		PxgBlockConstraintBatch& batch = sharedDesc->iterativeData.blockConstraintBatch[batchIndex];

		const PxU32 bodyA = batch.bodyAIndex[lane];
		const PxU32 bodyB = batch.bodyBIndex[lane];
		const PxU32 constraintBatchIndex = batch.mConstraintBatchIndex;

		// mDescStride may be less than a full warp; guard surplus lanes.
		if (lane < batch.mDescStride)
		{
			// For joints, the constraint batch index doubles as the index into the prep pools.
			PxgBlockConstraint1DData& prepData = solverDesc->blockJointPrepPool[constraintBatchIndex];
			PxgBlockConstraint1DVelocities* rowVelocities = solverDesc->blockJointPrepPool0 + constraintBatchIndex * Dy::MAX_CONSTRAINT_ROWS;
			PxgBlockConstraint1DParameters* rowParameters = solverDesc->blockJointPrepPool1 + constraintBatchIndex * Dy::MAX_CONSTRAINT_ROWS;

			PxgSolverBodyData* bodyDataPool = solverDesc->solverBodyDataPool;
			PxgSolverTxIData* txIDataPool = solverDesc->solverBodyTxIDataPool;

			const PxU32 uniqueIndex = solverDesc->constraintUniqueIndices[batch.mStartPartitionIndex + lane];

			setupSolverConstraintBlockGPU<PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL>(
				&prepData, rowVelocities, rowParameters,
				&bodyDataPool[bodyA], &bodyDataPool[bodyB],
				&txIDataPool[bodyA], &txIDataPool[bodyB],
				sharedDesc->dt, sharedDesc->invDtF32, batch, lane,
				&sharedDesc->iterativeData.blockJointConstraintHeaders[constraintBatchIndex],
				&sharedDesc->iterativeData.blockJointConstraintRowsCon[batch.startConstraintIndex],
				&sharedDesc->iterativeData.blockJointConstraintRowsMod[batch.startConstraintIndex],
				solverDesc->solverConstantData[uniqueIndex]);
		}
	}
}
// Prepares contact constraint blocks for the PGS solver: one warp per contact
// batch, one lane per contact manager within the batch. The two bodies' sqrt
// inverse inertia tensors are staged through volatile shared memory (two lanes
// cooperate per tensor), then each active lane runs the contact finalize/prep
// routine and records friction correlation data for the next frame.
// NOTE(review): assumes blockDim.x is a multiple of WARP_SIZE and equals
// PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL (the shared-memory
// staging buffer is sized from that constant) -- confirm against the launcher.
extern "C" __global__ void contactConstraintBlockPrepareParallelLaunch(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	//threadCounts[threadIdx.x] = 0;
	//__syncthreads();

	PxgBlockWorkUnit* workUnits = constraintPrepDesc->blockWorkUnit;

	const PxU32 warpSize = WARP_SIZE;

	const PxU32 blockStride = blockDim.x/warpSize;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/warpSize;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(warpSize-1);

	//total numbers of warps in all blocks
	//const PxU32 totalNumWarps = blockStride * gridDim.x;

	//PxF32* baseForceStream = constraintPrepDesc->forceBuffer;

	const PxU32 totalPreviousEdges = constraintPrepDesc->totalPreviousEdges;
	const PxU32 totalCurrentEdges = constraintPrepDesc->totalCurrentEdges;

	// Static contact batches are appended after the dynamic ones.
	const PxU32 nbContactBatches = constraintPrepDesc->numContactBatches + constraintPrepDesc->numStaticContactBatches;

	/*if (warpIndex == 0 && threadIndexInWarp == 0)
	{
		printf("NumBatches = %i, numContactBatches = %i, numStaticContactBatches = %i %p\n", nbContactBatches,
			constraintPrepDesc->numContactBatches, constraintPrepDesc->numStaticContactBatches, constraintPrepDesc);
	}*/

	// Descriptor-derived pointers are loaded once by thread 0 and published to
	// the whole block by the __syncthreads() below.
	__shared__ PxgSolverBodyData* solverBodyDatas;
	__shared__ PxgSolverTxIData* solverTxIDatas;
	__shared__ PxgBlockSolverContactHeader* contactHeaders;
	__shared__ PxgBlockSolverFrictionHeader* frictionHeaders;
	__shared__ PxgBlockSolverContactPoint* contactPoints;
	__shared__ PxgBlockSolverContactFriction* frictions;
	__shared__ PxU32* batchIndices;
	__shared__ PxgBlockFrictionIndex* frictionIndices;
	__shared__ PxgBlockFrictionIndex* prevFrictionIndices;
	__shared__ PxgBlockContactPoint* contactBase;
	__shared__ PxgBlockConstraintBatch* constraintBatch;
	__shared__ PxgBlockContactData* contactCurrentPrepPool;
	__shared__ PxgBlockFrictionPatch* prevFrictionPatches;
	__shared__ PxgBlockFrictionPatch* currFrictionPatches;
	__shared__ PxgBlockFrictionAnchorPatch* prevFrictionAnchors;
	__shared__ PxgBlockFrictionAnchorPatch* currFrictionAnchors;
	__shared__ PxAlignedTransform* bodyFrames;

	// Raw byte buffer viewed as one PxMat33 slot per thread in the block.
	volatile __shared__ char sInertias[sizeof(PxMat33) * (PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize) * warpSize];
	//volatile __shared__ PxMat33 inertias[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize][warpSize];
	volatile PxMat33* inertias = reinterpret_cast<volatile PxMat33*>(sInertias);

	if(threadIdx.x == 0)
	{
		solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
		solverTxIDatas = constraintPrepDesc->solverBodyTxIDataPool;
		contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
		frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
		contactPoints = sharedDesc->iterativeData.blockContactPoints;
		frictions = sharedDesc->iterativeData.blockFrictions;
		batchIndices = constraintPrepDesc->contactConstraintBatchIndices;
		frictionIndices = constraintPrepDesc->blockCurrentFrictionIndices;
		prevFrictionIndices = constraintPrepDesc->blockPreviousFrictionIndices;
		contactBase = constraintPrepDesc->blockContactPoints;
		constraintBatch = sharedDesc->iterativeData.blockConstraintBatch;
		contactCurrentPrepPool = constraintPrepDesc->blockContactCurrentPrepPool;
		currFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
		prevFrictionPatches = sharedDesc->blockPreviousFrictionPatches;
		prevFrictionAnchors = constraintPrepDesc->blockPreviousAnchorPatches;
		currFrictionAnchors = constraintPrepDesc->blockCurrentAnchorPatches;
		bodyFrames = constraintPrepDesc->body2WorldPool;
	}

	__syncthreads();

	// Grid-wide warp index selects the contact batch this warp prepares.
	PxU32 i = warpIndex;
	//unsigned mask_nbContactBatches = __ballot_sync(FULL_MASK, i < nbContactBatches);
	if(i < nbContactBatches)
	{
		const PxU32 batchIndex = batchIndices[i];

		//if (batchIndex >= totalBatches)
		//{
		//	if(batchIndices[i-1] < totalBatches)
		//	assert(batchIndex < totalBatches); //Ensure we are not shooting past the max number of batches...
		//}

		PxgBlockConstraintBatch& batch = constraintBatch[batchIndex];

		const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
		const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];

		const PxU32 descIndexBatch = batch.mConstraintBatchIndex;
		const PxU32 descStride = batch.mDescStride;

		//PxgSolverBodyPrepData bodyData0, bodyData1;
#if LOAD_BODY_DATA
		loadBodyData(solverBodyDatas, descStride, bodyAIndex, threadIndexInWarp, warpIndexInBlock, bodyData0.initialLinVelXYZ_invMassW, bodyData0.initialAngVelXYZ_penBiasClamp,
			bodyData0.sqrtInvInertia, bodyData0.body2World);
		loadBodyData(solverBodyDatas, descStride, bodyBIndex, threadIndexInWarp, warpIndexInBlock, bodyData1.initialLinVelXYZ_invMassW, bodyData1.initialAngVelXYZ_penBiasClamp,
			bodyData1.sqrtInvInertia, bodyData1.body2World);
#endif

		//Read in 16 bytes at a time. Two lanes cooperate per inertia tensor
		//(bodyToLoad = idx/2, offset = idx&1), so up to 32 tensors are staged
		//per pass of this loop.
		//NOTE(review): the original comment said "3 threads ... 3 passes";
		//the code as written uses 2 lanes per tensor -- confirm intent.
		const PxU32 descStride2 = descStride*2;

		// Stage body A's sqrt-inv-inertia tensors: each pair of lanes reads the
		// leading float (column0.x) plus two float4s covering the remaining 8
		// floats of the 3x3 matrix.
		// NOTE(review): this inner loop variable shadows the outer batch index 'i'.
		for (PxU32 i = 0; i < descStride2; i += 32)
		{
			PxU32 idx = i + threadIndexInWarp;
			PxU32 bodyToLoad = idx/2;
			// Full-warp shuffle executed outside the guard so every lane participates.
			PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyAIndex, bodyToLoad);

			if (idx < descStride2)
			{
				PxU32 offset = idx &1;
				// Treat the 8 floats from column0.y onward as two float4s.
				// NOTE(review): assumes &column0.y is 16-byte aligned -- confirm PxgSolverTxIData layout.
				float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);

				const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;

				//volatile float* sh = reinterpret_cast<volatile float*>(&inertias[threadIdx.x / 32][bodyToLoad]);
				volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);

				float4 v = val[offset];
				float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
				sh[1 + offset * 4] = v.x;
				sh[2 + offset * 4] = v.y;
				sh[3 + offset * 4] = v.z;
				sh[4 + offset * 4] = v.w;
				if(offset == 0)
					sh[offset*4] = v0;
			}
		}

		// Warp barrier: shared-memory writes above must be visible before the reads below.
		__syncwarp();

		PxMat33 invInertia0;

		const PxU32 index = (threadIdx.x / warpSize) * warpSize + threadIndexInWarp;

		// Each active lane copies its own body-A tensor out of shared memory.
		if (threadIndexInWarp < descStride)
		{
			invInertia0.column0.x = inertias[index].column0.x;
			invInertia0.column0.y = inertias[index].column0.y;
			invInertia0.column0.z = inertias[index].column0.z;
			invInertia0.column1.x = inertias[index].column1.x;
			invInertia0.column1.y = inertias[index].column1.y;
			invInertia0.column1.z = inertias[index].column1.z;
			invInertia0.column2.x = inertias[index].column2.x;
			invInertia0.column2.y = inertias[index].column2.y;
			invInertia0.column2.z = inertias[index].column2.z;

			//printf("%i: (%f, %f, %f) (%f, %f, %f) (%f, %f, %f)\n", threadIdx.x, invInertia0.column0.x, invInertia0.column0.y, invInertia0.column0.z, invInertia0.column1.x, invInertia0.column1.y, invInertia0.column1.z, invInertia0.column2.x, invInertia0.column2.y, invInertia0.column2.z);
		}

		__syncwarp(); //Required (racecheck confirmed) because inertias (Ptr sh points to inertias) is written below and read above

		// Same staging pass for body B, reusing the same shared-memory slots.
		// NOTE(review): this inner loop variable also shadows the outer batch index 'i'.
		for (PxU32 i = 0; i < descStride2; i += 32)
		{
			PxU32 idx = i + threadIndexInWarp;
			PxU32 bodyToLoad = idx / 2;
			PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyBIndex, bodyToLoad);

			if (idx < descStride2)
			{
				PxU32 offset = idx & 1;
				float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
				const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
				volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);
				float4 v = val[offset];
				float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
				sh[1 + offset * 4] = v.x;
				sh[2 + offset * 4] = v.y;
				sh[3 + offset * 4] = v.z;
				sh[4 + offset * 4] = v.w;
				if (offset == 0)
					sh[offset * 4] = v0;
			}
		}

		__syncwarp();

		PxMat33 invInertia1;
		if (threadIndexInWarp < descStride)
		{
			invInertia1.column0.x = inertias[index].column0.x;
			invInertia1.column0.y = inertias[index].column0.y;
			invInertia1.column0.z = inertias[index].column0.z;
			invInertia1.column1.x = inertias[index].column1.x;
			invInertia1.column1.y = inertias[index].column1.y;
			invInertia1.column1.z = inertias[index].column1.z;
			invInertia1.column2.x = inertias[index].column2.x;
			invInertia1.column2.y = inertias[index].column2.y;
			invInertia1.column2.z = inertias[index].column2.z;
		}

		//mDescStride might less than 32, we need to guard against it
		if(threadIndexInWarp < descStride)
		{
			//port contact code
			PxgBlockContactData& contactData = contactCurrentPrepPool[descIndexBatch];
			PxgBlockContactPoint* baseContact = contactBase + batch.blockContactIndex;

			PxgBlockFrictionPatch& frictionPatch = currFrictionPatches[descIndexBatch];
			PxgBlockFrictionAnchorPatch& fAnchor = currFrictionAnchors[descIndexBatch];

			//Fill in correlation information for next frame...
			PxgBlockWorkUnit& unit = workUnits[descIndexBatch];

			// NOTE(review): this 'index' shadows the shared-memory slot index
			// declared above; from here on 'index' is the friction patch index.
			PxgBlockFrictionIndex index;
			index.createPatchIndex(descIndexBatch, threadIndexInWarp);
			//PxU32 frictionIndex = unit.mFrictionIndex[threadIndexInWarp];
			PxU32 edgeIndex = unit.mEdgeIndex[threadIndexInWarp];
			// Friction indices are laid out edge-major: one slot per (edge, patch) pair.
			PxU32 frictionIndex = edgeIndex + totalCurrentEdges * unit.mPatchIndex[threadIndexInWarp];
			PxgBlockFrictionIndex* targetIndex = &frictionIndices[frictionIndex];
			// 8-byte store of the packed friction index.
			*reinterpret_cast<uint2*>(targetIndex) = reinterpret_cast<uint2&>(index);

			//KS - todo - get some of this in shared memory/registers as quickly as possible...
			PxgSolverBodyData* bodyData0 = &solverBodyDatas[bodyAIndex];
			PxgSolverBodyData* bodyData1 = &solverBodyDatas[bodyBIndex];
			//PxgSolverTxIData* txIData0 = &solverTxIDatas[bodyAIndex];
			//PxgSolverTxIData* txIData1 = &solverTxIDatas[bodyBIndex];

			const PxAlignedTransform bodyFrame0 = bodyFrames[bodyAIndex];
			const PxAlignedTransform bodyFrame1 = bodyFrames[bodyBIndex];

			//KS - temporarily read the velocities the "slow" way so we can store the inertia-scaled velocities
			//in velocities buffer for now. We can then switch over later when we create the new prep code for the
			//TGS solver and leave the PGS solver as-is
#if 0
			const float4 linVel_invMass0 = velocities[bodyAIndex];
			const float4 angVelXYZ_penBiasClamp0 = velocities[bodyAIndex + totalBodies];
			const float4 linVel_invMass1 = velocities[bodyBIndex];
			const float4 angVelXYZ_penBiasClamp1 = velocities[bodyBIndex + totalBodies];
#else
			const float4 linVel_invMass0 = bodyData0->initialLinVelXYZ_invMassW;
			const float4 angVelXYZ_penBiasClamp0 = bodyData0->initialAngVelXYZ_penBiasClamp;
			const float4 linVel_invMass1 = bodyData1->initialLinVelXYZ_invMassW;
			const float4 angVelXYZ_penBiasClamp1 = bodyData1->initialAngVelXYZ_penBiasClamp;
#endif

			// The more permissive (larger) of the two bodies' offset slops wins.
			const PxReal solverOffsetSlop = PxMax(bodyData0->offsetSlop, bodyData1->offsetSlop);

			/*if (i >= constraintPrepDesc->numContactBatches)
			{
				if(bodyBIndex != )
			}*/

			PxU32 offset = unit.mWriteback[threadIndexInWarp];

			createFinalizeSolverContactsBlockGPU(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices, *bodyData0, *bodyData1,
				invInertia0, invInertia1, bodyFrame0, bodyFrame1, linVel_invMass0, angVelXYZ_penBiasClamp0, linVel_invMass1, angVelXYZ_penBiasClamp1,
				sharedDesc->invDtF32, sharedDesc->dt, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
				threadIndexInWarp, offset, &contactHeaders[descIndexBatch], &frictionHeaders[descIndexBatch], &contactPoints[batch.startConstraintIndex],
				&frictions[batch.startFrictionIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, solverOffsetSlop);

			frictionPatch.patchIndex[threadIndexInWarp] = unit.mFrictionPatchIndex[threadIndexInWarp];

			// Cache world-space anchor positions (body-A frame transformed) for
			// next frame's friction correlation.
			PxgBlockFrictionPatch& fpatch = frictionPatch;
			if (fpatch.anchorCount[threadIndexInWarp] >= 1)
				fpatch.anchorPoints[0][threadIndexInWarp] = PxSave3(bodyFrame0.transform(PxLoad3(fAnchor.body0Anchors[0][threadIndexInWarp])));
			if (fpatch.anchorCount[threadIndexInWarp] == 2)
				fpatch.anchorPoints[1][threadIndexInWarp] = PxSave3(bodyFrame0.transform(PxLoad3(fAnchor.body0Anchors[1][threadIndexInWarp])));
		}
	}
}

View File

@@ -0,0 +1,410 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#define IS_TGS_SOLVER
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgFrictionPatch.h"
#include "PxgConstraintPrep.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgSolverCoreDesc.h"
#include "DyConstraintPrep.h"
#include "contactConstraintBlockPrep.cuh"
#include "jointConstraintBlockPrepTGS.cuh"
using namespace physx;
extern "C" __host__ void initSolverKernels10() {}
// Prepares 1D (joint) constraint blocks for the TGS solver.
// Work distribution mirrors the PGS variant: every warp across the grid owns
// one constraint batch and each lane handles one joint of that batch; static
// joint batches follow the dynamic ones in the batch-index list.
extern "C" __global__ void jointConstraintBlockPrepareParallelLaunchTGS( PxgConstraintPrepareDesc* solverDesc,
	PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
{
	const PxU32 warpSize = 32;

	// Flatten (block, warp-in-block) into a grid-wide warp index that selects the batch.
	const PxU32 warpsPerBlock = blockDim.x / warpSize;
	const PxU32 globalWarpIndex = blockIdx.x * warpsPerBlock + threadIdx.x / warpSize;

	// Lane index within the warp selects the joint slot inside the batch.
	const PxU32 lane = threadIdx.x & (warpSize - 1);

	const PxU32 nbJointBatches = solverDesc->num1dConstraintBatches + solverDesc->numStatic1dConstraintBatches;

	if (globalWarpIndex < nbJointBatches)
	{
		const PxU32 batchIndex = solverDesc->jointConstraintBatchIndices[globalWarpIndex];
		PxgBlockConstraintBatch& batch = sharedDesc->iterativeData.blockConstraintBatch[batchIndex];

		const PxU32 bodyA = batch.bodyAIndex[lane];
		const PxU32 bodyB = batch.bodyBIndex[lane];
		const PxU32 constraintBatchIndex = batch.mConstraintBatchIndex;

		// mDescStride may be less than a full warp; guard surplus lanes.
		if (lane < batch.mDescStride)
		{
			// For joints, the constraint batch index doubles as the index into the prep pools.
			PxgBlockConstraint1DData& prepData = solverDesc->blockJointPrepPool[constraintBatchIndex];
			PxgBlockConstraint1DVelocities* rowVelocities = solverDesc->blockJointPrepPool0 + constraintBatchIndex * Dy::MAX_CONSTRAINT_ROWS;
			PxgBlockConstraint1DParameters* rowParameters = solverDesc->blockJointPrepPool1 + constraintBatchIndex * Dy::MAX_CONSTRAINT_ROWS;

			PxgSolverBodyData* bodyDataPool = solverDesc->solverBodyDataPool;
			PxgSolverTxIData* txIDataPool = solverDesc->solverBodyTxIDataPool;

			const PxU32 uniqueIndex = solverDesc->constraintUniqueIndices[batch.mStartPartitionIndex + lane];

			// TGS takes per-substep dt/invDt plus full-step dt/invDt, length scale
			// and bias coefficient; there is no separate "Mod" row array here.
			setupSolverConstraintBlockGPUTGS<PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL>(
				&prepData, rowVelocities, rowParameters,
				&bodyDataPool[bodyA], &bodyDataPool[bodyB],
				&txIDataPool[bodyA], &txIDataPool[bodyB],
				sharedDesc->stepDt, sharedDesc->stepInvDtF32, sharedDesc->dt, sharedDesc->invDtF32, sharedDesc->lengthScale,
				solverDesc->biasCoefficient, batch, lane,
				&sharedDesc->iterativeData.blockJointConstraintHeaders[constraintBatchIndex],
				&sharedDesc->iterativeData.blockJointConstraintRowsCon[batch.startConstraintIndex],
				solverDesc->solverConstantData[uniqueIndex]);
		}
	}
}
extern "C" __global__ void contactConstraintBlockPrepareParallelLaunchTGS(
PxgConstraintPrepareDesc* constraintPrepDesc,
PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
{
//threadCounts[threadIdx.x] = 0;
//__syncthreads();
PxgBlockWorkUnit* workUnits = constraintPrepDesc->blockWorkUnit;
const PxU32 warpSize = 32;
const PxU32 blockStride = blockDim.x/warpSize;
//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
//and we are going to index the work based on that
const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/warpSize;
//This identifies which thread within a warp a specific thread is
const PxU32 threadIndexInWarp = threadIdx.x&(warpSize-1);
const PxU32 totalPreviousEdges = constraintPrepDesc->totalPreviousEdges;
const PxU32 totalCurrentEdges = constraintPrepDesc->totalCurrentEdges;
const PxU32 nbContactBatches = constraintPrepDesc->numContactBatches + constraintPrepDesc->numStaticContactBatches;
__shared__ PxgSolverBodyData* solverBodyDatas;
__shared__ PxgSolverTxIData* solverTxIDatas;
__shared__ PxgTGSBlockSolverContactHeader* contactHeaders;
__shared__ PxgTGSBlockSolverFrictionHeader* frictionHeaders;
__shared__ PxgTGSBlockSolverContactPoint* contactPoints;
__shared__ PxgTGSBlockSolverContactFriction* frictions;
__shared__ PxU32* batchIndices;
__shared__ PxgBlockFrictionIndex* frictionIndices;
__shared__ PxgBlockFrictionIndex* prevFrictionIndices;
__shared__ PxgBlockContactPoint* contactBase;
__shared__ PxgBlockConstraintBatch* constraintBatch;
__shared__ PxgBlockContactData* contactCurrentPrepPool;
__shared__ PxgBlockFrictionPatch* prevFrictionPatches;
__shared__ PxgBlockFrictionPatch* currFrictionPatches;
__shared__ PxgBlockFrictionAnchorPatch* prevFrictionAnchors;
__shared__ PxgBlockFrictionAnchorPatch* currFrictionAnchors;
__shared__ PxAlignedTransform* bodyFrames;
volatile __shared__ char sInertias[sizeof(PxMat33) * (PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / warpSize) * warpSize];
volatile PxMat33* inertias = reinterpret_cast<volatile PxMat33*>(sInertias);
//volatile __shared__ PxMat33 inertias[PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL / 32][32];
if(threadIdx.x == 0)
{
solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
solverTxIDatas = constraintPrepDesc->solverBodyTxIDataPool;
contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
contactPoints = sharedDesc->iterativeData.blockContactPoints;
frictions = sharedDesc->iterativeData.blockFrictions;
batchIndices = constraintPrepDesc->contactConstraintBatchIndices;
frictionIndices = constraintPrepDesc->blockCurrentFrictionIndices;
prevFrictionIndices = constraintPrepDesc->blockPreviousFrictionIndices;
contactBase = constraintPrepDesc->blockContactPoints;
constraintBatch = sharedDesc->iterativeData.blockConstraintBatch;
contactCurrentPrepPool = constraintPrepDesc->blockContactCurrentPrepPool;
currFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
prevFrictionPatches = sharedDesc->blockPreviousFrictionPatches;
prevFrictionAnchors = constraintPrepDesc->blockPreviousAnchorPatches;
currFrictionAnchors = constraintPrepDesc->blockCurrentAnchorPatches;
bodyFrames = constraintPrepDesc->body2WorldPool;
}
__syncthreads();
PxU32 i = warpIndex;
//unsigned mask_nbContactBatches = __ballot_sync(FULL_MASK, i < nbContactBatches);
if(i < nbContactBatches)
{
/*if (threadIndexInWarp == 0)
printf("Processing batch %i\n", i);*/
const PxU32 batchIndex = batchIndices[i];
PxgBlockConstraintBatch& batch = constraintBatch[batchIndex];
const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];
/*if (threadIndexInWarp == 0)
printf("Processing batchIndex %i\n", batchIndex);*/
const PxU32 descIndexBatch = batch.mConstraintBatchIndex;
const PxU32 descStride = batch.mDescStride;
//PxgSolverBodyPrepData bodyData0, bodyData1;
#if LOAD_BODY_DATA
loadBodyData(solverBodyDatas, descStride, bodyAIndex, threadIndexInWarp, warpIndexInBlock, bodyData0.initialLinVelXYZ_invMassW, bodyData0.initialAngVelXYZ_penBiasClamp,
bodyData0.sqrtInvInertia, bodyData0.body2World);
loadBodyData(solverBodyDatas, descStride, bodyBIndex, threadIndexInWarp, warpIndexInBlock, bodyData1.initialLinVelXYZ_invMassW, bodyData1.initialAngVelXYZ_penBiasClamp,
bodyData1.sqrtInvInertia, bodyData1.body2World);
#endif
//Read in 16 bytes at a time, we take 3 threads to read in a single inertia tensor, and we have some spare bandwidth. We can read
//32 inertia tensors in 3 passes
const PxU32 descStride2 = descStride*2;
/*if(threadIndexInWarp == 0)
printf("Loading first txIData\n");*/
for (PxU32 i = 0; i < descStride2; i += 32)
{
PxU32 idx = i + threadIndexInWarp;
PxU32 bodyToLoad = idx/2;
PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyAIndex, bodyToLoad);
if (idx < descStride2)
{
PxU32 offset = idx & 1;
float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
//volatile float* sh = reinterpret_cast<volatile float*>(&inertias[threadIdx.x / 32][bodyToLoad]);
const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);
float4 v = val[offset];
float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
sh[1 + offset * 4] = v.x;
sh[2 + offset * 4] = v.y;
sh[3 + offset * 4] = v.z;
sh[4 + offset * 4] = v.w;
if (offset == 0)
sh[offset * 4] = v0;
}
}
__syncwarp();
PxMat33 invInertia0;
const PxU32 index = (threadIdx.x / warpSize) * warpSize + threadIndexInWarp;
if (threadIndexInWarp < descStride)
{
invInertia0.column0.x = inertias[index].column0.x;
invInertia0.column0.y = inertias[index].column0.y;
invInertia0.column0.z = inertias[index].column0.z;
invInertia0.column1.x = inertias[index].column1.x;
invInertia0.column1.y = inertias[index].column1.y;
invInertia0.column1.z = inertias[index].column1.z;
invInertia0.column2.x = inertias[index].column2.x;
invInertia0.column2.y = inertias[index].column2.y;
invInertia0.column2.z = inertias[index].column2.z;
//printf("%i: (%f, %f, %f) (%f, %f, %f) (%f, %f, %f)\n", threadIdx.x, invInertia0.column0.x, invInertia0.column0.y, invInertia0.column0.z, invInertia0.column1.x, invInertia0.column1.y, invInertia0.column1.z, invInertia0.column2.x, invInertia0.column2.y, invInertia0.column2.z);
}
__syncwarp(); //Required (racecheck confirmed) because inertias (Ptr sh points to inertias) is written below and read above
/*if (threadIndexInWarp == 0)
printf("Loading second txIData\n");*/
for (PxU32 i = 0; i < descStride2; i += 32)
{
PxU32 idx = i + threadIndexInWarp;
PxU32 bodyToLoad = idx / 2;
PxU32 bodyIdx = __shfl_sync(FULL_MASK, bodyBIndex, bodyToLoad);
if (idx < descStride2)
{
PxU32 offset = idx & 1;
float4* val = reinterpret_cast<float4*>(&solverTxIDatas[bodyIdx].sqrtInvInertia.column0.y);
const PxU32 ind = (threadIdx.x / warpSize) * warpSize + bodyToLoad;
volatile float* sh = reinterpret_cast<volatile float*>(&inertias[ind]);
float4 v = val[offset];
float v0 = solverTxIDatas[bodyIdx].sqrtInvInertia.column0.x;
sh[1 + offset * 4] = v.x;
sh[2 + offset * 4] = v.y;
sh[3 + offset * 4] = v.z;
sh[4 + offset * 4] = v.w;
if (offset == 0)
sh[offset * 4] = v0;
}
}
__syncwarp();
/*if (threadIndexInWarp == 0)
printf("Loaded second txIData\n");*/
PxMat33 invInertia1;
if (threadIndexInWarp < descStride)
{
invInertia1.column0.x = inertias[index].column0.x;
invInertia1.column0.y = inertias[index].column0.y;
invInertia1.column0.z = inertias[index].column0.z;
invInertia1.column1.x = inertias[index].column1.x;
invInertia1.column1.y = inertias[index].column1.y;
invInertia1.column1.z = inertias[index].column1.z;
invInertia1.column2.x = inertias[index].column2.x;
invInertia1.column2.y = inertias[index].column2.y;
invInertia1.column2.z = inertias[index].column2.z;
}
//mDescStride might less than 32, we need to guard against it
if(threadIndexInWarp < descStride)
{
//port contact code
PxgBlockContactData& contactData = contactCurrentPrepPool[descIndexBatch];
PxgBlockContactPoint* baseContact = contactBase + batch.blockContactIndex;
PxgBlockFrictionPatch& frictionPatch = currFrictionPatches[descIndexBatch];
PxgBlockFrictionAnchorPatch& fAnchor = currFrictionAnchors[descIndexBatch];
//Fill in correlation information for next frame...
PxgBlockWorkUnit& unit = workUnits[descIndexBatch];
PxgBlockFrictionIndex index;
index.createPatchIndex(descIndexBatch, threadIndexInWarp);
//PxU32 frictionIndex = unit.mFrictionIndex[threadIndexInWarp];
PxU32 edgeIndex = unit.mEdgeIndex[threadIndexInWarp];
PxU32 frictionIndex = edgeIndex + totalCurrentEdges * unit.mPatchIndex[threadIndexInWarp];
PxgBlockFrictionIndex* targetIndex = &frictionIndices[frictionIndex];
*reinterpret_cast<uint2*>(targetIndex) = reinterpret_cast<uint2&>(index);
//KS - todo - get some of this in shared memory/registers as quickly as possible...
PxgSolverBodyData* bodyData0 = &solverBodyDatas[bodyAIndex];
PxgSolverBodyData* bodyData1 = &solverBodyDatas[bodyBIndex];
//PxgSolverTxIData* txIData0 = &solverTxIDatas[bodyAIndex];
//PxgSolverTxIData* txIData1 = &solverTxIDatas[bodyBIndex];
const PxAlignedTransform bodyFrame0 = bodyFrames[bodyAIndex];
const PxAlignedTransform bodyFrame1 = bodyFrames[bodyBIndex];
//KS - temporarily read the velocities the "slow" way so we can store the inertia-scaled velocities
//in velocities buffer for now. We can then switch over later when we create the new prep code for the
//TGS solver and leave the PGS solver as-is
#if 0
const float4 linVel_invMass0 = velocities[bodyAIndex];
const float4 angVelXYZ_penBiasClamp0 = velocities[bodyAIndex + totalBodies];
const float4 linVel_invMass1 = velocities[bodyBIndex];
const float4 angVelXYZ_penBiasClamp1 = velocities[bodyBIndex + totalBodies];
#else
//We use these velocities because these are not multiplied by sqrtInertia. This is a bit slower to read but
//means we can treat kinematics and statics the same in the below code.
const float4 linVel_invMass0 = bodyData0->initialLinVelXYZ_invMassW;
const float4 angVelXYZ_penBiasClamp0 = bodyData0->initialAngVelXYZ_penBiasClamp;
const float4 linVel_invMass1 = bodyData1->initialLinVelXYZ_invMassW;
const float4 angVelXYZ_penBiasClamp1 = bodyData1->initialAngVelXYZ_penBiasClamp;
#endif
const PxReal offsetSlop = PxMax(bodyData0->offsetSlop, bodyData1->offsetSlop);
PxU32 offset = unit.mWriteback[threadIndexInWarp];
const float2 torsionalData = unit.mTorsionalFrictionData[threadIndexInWarp];
createFinalizeSolverContactsBlockGPUTGS(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices, *bodyData0, *bodyData1,
invInertia0, invInertia1, bodyFrame0, bodyFrame1, linVel_invMass0, angVelXYZ_penBiasClamp0, linVel_invMass1, angVelXYZ_penBiasClamp1,
sharedDesc->stepInvDtF32, sharedDesc->stepDt, sharedDesc->dt, sharedDesc->invDtF32, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
constraintPrepDesc->biasCoefficient, threadIndexInWarp, offset, &contactHeaders[descIndexBatch], &frictionHeaders[descIndexBatch], &contactPoints[batch.startConstraintIndex],
&frictions[batch.startFrictionIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, offsetSlop,
torsionalData);
frictionPatch.patchIndex[threadIndexInWarp] = unit.mFrictionPatchIndex[threadIndexInWarp];
PxgBlockFrictionPatch& fpatch = frictionPatch;
if (fpatch.anchorCount[threadIndexInWarp] >= 1)
fpatch.anchorPoints[0][threadIndexInWarp] = bodyFrame0.transform(fAnchor.body0Anchors[0][threadIndexInWarp]);
if (fpatch.anchorCount[threadIndexInWarp] == 2)
fpatch.anchorPoints[1][threadIndexInWarp] = bodyFrame0.transform(fAnchor.body0Anchors[1][threadIndexInWarp]);
}
}
}

View File

@@ -0,0 +1,378 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CONSTRAINT_PREP_SHARED_CUH__
#define __CONSTRAINT_PREP_SHARED_CUH__
#include "PxgBodySim.h"
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgConstraintPrep.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgSolverCoreDesc.h"
#include "DySolverConstraintTypes.h"
#include "PxNodeIndex.h"
#include "PxgArticulation.h"
#include "PxgEdgeType.h"
#include "PxgDynamicsConfiguration.h"
#include "stdio.h"
#include "utils.cuh"
#include "PxgSolverFlags.h"
#include "DyCpuGpu1dConstraint.h"
#include "PxgPartitionNode.h"
#define PXC_SAME_NORMAL 0.999f
// Computes the remapped solver-space velocity index for one rigid body referenced by a
// constraint batch entry.
// Up to 32 pairs of rigid bodies are referenced by each constraint batch. Within each
// partition, joint batches precede contact batches; articulation-attached batches follow all
// rigid batches. Bodies whose partition crosses a slab boundary get an extra "reference" slot
// appended after all batch outputs so their per-slab velocities can be averaged.
// Returns an index < solverBodyOutputVelocityOffset (asserted).
// - isSecondBody: true when remapping body B of the pair (second half of the batch slots)
// - indexData: partition/entry/type of this constraint edge
// - solverBodyReferences: out - records the remapped index when a new slab reference is made
// - nbElemsPerBody: number of velocity elements stored per body slot
static __device__ physx::PxU32 computeRemapIndexRigidBody(bool isSecondBody,
	const physx::PxU32* const PX_RESTRICT partitionStartIndices,
	const physx::PxU32* const PX_RESTRICT partitionArticStartIndices,
	const physx::PxU32* const PX_RESTRICT partitionJointCounts,
	const physx::PxU32* const PX_RESTRICT partitionArticulationJointCounts,
	const physx::PartitionIndexData& indexData,
	physx::PxgSolverReferences* solverBodyReferences,
	physx::PxU32 currPartition,
	physx::PxU32 maxNbPartitions,
	const physx::PxU32 totalActiveBodyCount,
	const physx::PxU32 bodyId,
	const physx::PxU32 activeBodyOffset,
	const physx::PxU32 totalRigidBatches,
	const physx::PxU32 totalArticBatches,
	const physx::PxU32 nbElemsPerBody,
	const physx::PxU32 nbSlabs,
	const physx::PxU32 solverBodyOutputVelocityOffset //Only used for assert
)
{
	using namespace physx;

	const PxU32 partitionIndex = indexData.mPartitionIndex;
	const PxU32 partitionEntryIndex = indexData.mPartitionEntryIndex;

	// Batch index within the partitioned constraint list. Initialized to 0 so an unexpected
	// edge type cannot leave it indeterminate (the original default: arm assigned nothing).
	PxU32 index = 0;
	switch (indexData.mCType)
	{
	case PxgEdgeType::eCONTACT_MANAGER:
		// Contact batches follow the joint batches within the partition.
		index = partitionStartIndices[partitionIndex] + partitionJointCounts[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eCONSTRAINT:
		index = partitionStartIndices[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eARTICULATION_CONTACT:
		// Articulation batches are appended after all rigid batches.
		index = totalRigidBatches + partitionArticStartIndices[partitionIndex] + partitionArticulationJointCounts[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	case PxgEdgeType::eARTICULATION_CONSTRAINT:
		index = totalRigidBatches + partitionArticStartIndices[partitionIndex] + partitionEntryIndex / PXG_BATCH_SIZE;
		break;
	default:
		assert(0); // unexpected edge type - index deliberately stays 0
		break;
	}

	{
		// Expand the batch index into a per-lane element slot: each batch owns
		// PXG_BATCH_SIZE * 2 body slots (body A slots first, then body B slots).
		const PxU32 batchMask = PXG_BATCH_SIZE - 1;
		index = (index * PXG_BATCH_SIZE * 2) * nbElemsPerBody + (partitionEntryIndex & batchMask);
		if (isSecondBody)
			index += PXG_BATCH_SIZE * nbElemsPerBody;
	}

	if (bodyId >= activeBodyOffset)
	{
		if ((partitionIndex & (~(maxNbPartitions - 1))) != (currPartition & (~(maxNbPartitions - 1))) || partitionIndex <= currPartition)
		{
			//We changed slabs, so we need to introduce a new solver reference
			PxU32 slabId = partitionIndex / maxNbPartitions;
			PxU32 referenceId = nbSlabs * (bodyId - activeBodyOffset) + slabId;
			solverBodyReferences[referenceId].mRemappedBodyIndex = index;
			// Re-layout the reference id so outputs that need averaging are coalesced when loaded.
			referenceId = (referenceId & (~31)) * nbElemsPerBody + (referenceId & 31);
			// There are totalBatches * PXG_BATCH_SIZE * 2 velocity slots for the solver, then
			// nbSlabs * bodyCount slots for averaging - this reference lives in the latter region.
			index = (totalArticBatches + totalRigidBatches) * PXG_BATCH_SIZE * 2 * nbElemsPerBody + referenceId;
		}
	}
	assert(index < solverBodyOutputVelocityOffset);
	return index;
}
// Returns true when the two friction anchors, once both are expressed in body0 space,
// lie within correlDist of each other along the given axis.
static __device__ PX_FORCE_INLINE bool pointsAreClose(const physx::PxAlignedTransform& body1ToBody0,
	const float4& localAnchor0, const float4& localAnchor1,
	const float4& axis, float correlDist)
{
	using namespace physx;
	// Bring body1's anchor into body0's frame, then measure the separation along the axis.
	const float4 anchor1InBody0 = body1ToBody0.transform(localAnchor1);
	const float4 delta = localAnchor0 - anchor1InBody0;
	const float separationAlongAxis = dot3(delta, axis);
	return PxAbs(separationAlongAxis) < correlDist;
}
// Decides whether the damper should be active. Damping is suppressed while the contact is
// separated and not expected to close this step (no approaching relative velocity), because
// the damper could otherwise produce repulsive forces before the contact is closed.
static __device__ PX_FORCE_INLINE PxReal computeCompliantDamping(bool isSeparated, bool collidingWithVrel, PxReal damping)
{
	if (isSeparated && !collidingWithVrel)
		return 0.0f;
	return damping;
}
// Stores the minimal pair of coefficients from which "compute1dConstraintSolverConstantsPGS"
// later reconstructs constant, unbiasedConstant, velMultiplier and impulseMultiplier.
// Spring rows:     coeff0 = a, coeff1 = b (implicit spring terms).
// Non-spring rows: coeff0 = biased constant, coeff1 = unbiased constant
//                  (both still to be scaled by recipUnitResponse).
static __device__ PX_FORCE_INLINE void queryReduced1dConstraintSolverConstantsPGS(
	const PxU16 constraintFlags, const PxReal springStiffness, const PxReal springDamping, const PxReal restitution,
	const PxReal bounceThreshold, const PxReal geometricError, const PxReal velocityTarget,
	const PxReal jointSpeedForRestitutionBounce, const PxReal erp, const PxReal simDt, const PxReal recipSimDt,
	PxReal& coeff0, PxReal& coeff1)
{
	const bool isSpring = (constraintFlags & Px1DConstraintFlag::eSPRING) != 0;
	if (isSpring)
	{
		// Implicit spring terms: a folds stiffness and damping, b folds the targets.
		coeff0 = simDt * simDt * springStiffness + simDt * springDamping;
		coeff1 = simDt * (springDamping * velocityTarget - springStiffness * geometricError);
	}
	else
	{
		const PxReal bounceVel = Dy::computeBounceVelocity(constraintFlags, jointSpeedForRestitutionBounce, bounceThreshold, restitution, geometricError);
		if (bounceVel != 0.0f)
		{
			// Restitution bounce overrides both the biased and unbiased targets.
			coeff0 = bounceVel;
			coeff1 = bounceVel;
		}
		else
		{
			const PxReal scaledGeomError = geometricError * erp;
			coeff0 = velocityTarget - scaledGeomError * recipSimDt;
			// eKEEPBIAS keeps the geometric-error bias in the unbiased constant too.
			coeff1 = (constraintFlags & Px1DConstraintFlag::eKEEPBIAS) ? coeff0 : velocityTarget;
		}
	}
}
// Reconstructs constant, unbiasedConstant, velMultiplier and impulseMultiplier from the
// reduced coefficients produced by "queryReduced1dConstraintSolverConstantsPGS".
// coeff2 carries the initial joint speed, folded into both constants at the end.
static __device__ PX_FORCE_INLINE void compute1dConstraintSolverConstantsPGS
(bool isSpring, bool isAccelerationSpring, PxReal coeff0, PxReal coeff1, PxReal coeff2,
const PxReal unitResponse, const PxReal recipUnitResponse,
PxReal& constant, PxReal& unbiasedConstant, PxReal& velMultiplier, PxReal& impulseMultiplier)
{
	if (!isSpring)
	{
		// Non-spring row: coeff0/coeff1 are the biased/unbiased constants, pre response scaling.
		velMultiplier = -recipUnitResponse;
		impulseMultiplier = 1.0f;
		constant = coeff0 * recipUnitResponse;
		unbiasedConstant = coeff1 * recipUnitResponse;
	}
	else if (isAccelerationSpring)
	{
		// Acceleration spring: response is folded into the spring terms via recipUnitResponse.
		const PxReal x = 1.0f / (1.0f + coeff0);
		constant = x * recipUnitResponse * coeff1;
		unbiasedConstant = constant;
		velMultiplier = -x * recipUnitResponse * coeff0;
		impulseMultiplier = 1.0f - x;
	}
	else
	{
		// Force spring: the unit response scales the stiffness term instead.
		const PxReal x = 1.0f / (1.0f + coeff0 * unitResponse);
		constant = x * coeff1;
		unbiasedConstant = constant;
		velMultiplier = -x * coeff0;
		impulseMultiplier = 1.0f - x;
	}
	// coeff2: initial joint speed; bias both constants by its contribution.
	const PxReal velBias = coeff2 * velMultiplier;
	constant += velBias;
	unbiasedConstant += velBias;
}
// Computes the implicit-spring solver terms for a compliant contact. Writes velMultiplier,
// impulseMultiplier and the (identical) biased/unbiased error terms; returns the compliance
// coefficient 'a' so callers can cache it for later iterations.
static __device__ PX_FORCE_INLINE PxReal
computeCompliantContactCoefficients(PxReal dt, PxU8 flags, PxReal restitution, PxReal damping, PxReal unitResponse,
	PxReal recipResponse, PxReal penetration, PxReal targetVelocity, bool isSeparated,
	bool collidingWithVrel, PxReal& velMultiplier, PxReal& impulseMultiplier,
	PxReal& unbiasedErr, PxReal& biasedErr)
{
	// Acceleration springs fold the effective mass (recipResponse) in and drop the response term.
	const bool accelSpring = (flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING) != 0;
	const PxReal massTerm = accelSpring ? recipResponse : 1.0f;
	const PxReal responseTerm = accelSpring ? 1.0f : unitResponse;

	const PxReal nrdt = dt * restitution;
	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);
	const PxReal a = dt * (activeDamping - nrdt);
	const PxReal b = -(nrdt * penetration * massTerm);

	const PxReal x = 1.f / (a * responseTerm + 1.f);
	const PxReal scaledBias = x * b;

	velMultiplier = x * a * massTerm;
	impulseMultiplier = 1.f - x;
	biasedErr = targetVelocity * velMultiplier - scaledBias;
	unbiasedErr = biasedErr;
	return a;
}
// Stores the two coefficients needed to rebuild the compliant-contact solver terms cheaply at
// every sub-timestep/iteration; see "computeContactCoefficients".
// coeff0 = a; coeff1 = -nrdt * penetration (the mass factor is applied when reconstructed).
static __device__ PX_FORCE_INLINE void
queryReducedCompliantContactCoefficients(PxReal dt, PxU8 flags, PxReal restitution, PxReal damping, PxReal penetration,
	PxReal targetVelocity, bool isSeparated, bool collidingWithVrel,
	PxReal& coeff0, PxReal& coeff1)
{
	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);
	const PxReal nrdt = dt * restitution;
	coeff0 = dt * (activeDamping - nrdt);
	coeff1 = -nrdt * penetration;
}
// Rebuilds velMultiplier, impulseMultiplier and the biased/unbiased error terms from the
// reduced coefficients produced by "queryReducedCompliantContactCoefficients".
// A negative restitution marks a compliant (spring) contact; otherwise coeff0/coeff1 hold the
// biased/unbiased error terms pre response scaling.
static __device__ PX_FORCE_INLINE void
computeContactCoefficients(PxU8 flags, PxReal restitution, PxReal unitResponse, PxReal recipResponse, PxReal targetVelocity,
	PxReal coeff0, PxReal coeff1, PxReal& velMultiplier, PxReal& impulseMultiplier,
	PxReal& unbiasedErr, PxReal& biasedErr)
{
	const bool compliant = restitution < 0.f;
	if (!compliant)
	{
		// Rigid contact: simply scale the stored error terms by the response.
		velMultiplier = recipResponse;
		impulseMultiplier = 1.f;
		biasedErr = coeff0 * velMultiplier;
		unbiasedErr = coeff1 * velMultiplier;
		return;
	}

	// Compliant contact: same math as "computeCompliantContactCoefficients".
	const bool accelSpring = (flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING) != 0;
	const PxReal massTerm = accelSpring ? recipResponse : 1.0f;
	const PxReal responseTerm = accelSpring ? 1.0f : unitResponse;

	const PxReal a = coeff0;
	const PxReal b = coeff1 * massTerm;
	const PxReal x = 1.f / (a * responseTerm + 1.f);
	const PxReal scaledBias = x * b;

	velMultiplier = x * a * massTerm;
	impulseMultiplier = 1.f - x;
	biasedErr = targetVelocity * velMultiplier - scaledBias;
	unbiasedErr = biasedErr;
}
// TGS variant: computes velMultiplier and scaledBias for a compliant contact and returns the
// compliance coefficient 'a' so the cheaper overload below can reuse it per iteration.
static __device__ PX_FORCE_INLINE PxReal
computeCompliantContactCoefficientsTGS(PxReal stepDt, PxU8 flags,PxReal restitution, PxReal damping,
	PxReal unitResponse, PxReal recipResponse, bool isSeparated, bool collidingWithVrel,
	PxReal& velMultiplier, PxReal& scaledBias)
{
	// Acceleration springs fold the effective mass (recipResponse) in and drop the response term.
	const bool accelSpring = (flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING) != 0;
	const PxReal massTerm = accelSpring ? recipResponse : 1.0f;
	const PxReal responseTerm = accelSpring ? 1.0f : unitResponse;

	const PxReal activeDamping = computeCompliantDamping(isSeparated, collidingWithVrel, damping);
	const PxReal nrdt = stepDt * restitution;
	const PxReal a = stepDt * (activeDamping - nrdt);

	const PxReal x = 1.f / (a * responseTerm + 1.f);
	velMultiplier = x * a * massTerm;
	scaledBias = nrdt * x * responseTerm;
	return a; // compliant contact coefficient a.
}
// TGS variant taking the precomputed compliance coefficient 'a' (and nrdt = stepDt * restitution)
// from the overload above; recomputes only velMultiplier and scaledBias.
static __device__ PX_FORCE_INLINE void
computeCompliantContactCoefficientsTGS(PxU8 flags, PxReal nrdt, PxReal unitResponse, PxReal recipResponse, PxReal a,
	PxReal& velMultiplier, PxReal& scaledBias)
{
	const bool accelSpring = (flags & PxgSolverContactFlags::eCOMPLIANT_ACCELERATION_SPRING) != 0;
	const PxReal massTerm = accelSpring ? recipResponse : 1.0f;
	const PxReal responseTerm = accelSpring ? 1.0f : unitResponse;

	const PxReal x = 1.f / (a * responseTerm + 1.f);
	velMultiplier = x * a * massTerm;
	scaledBias = nrdt * x * responseTerm;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,368 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CONTACT_CONSTRAINT_PREP_CUH__
#define __CONTACT_CONSTRAINT_PREP_CUH__
#include "PxgSolverBody.h"
#include "PxgConstraintBlock.h"
#include "PxgFrictionPatch.h"
#include "PxgConstraintPrep.h"
#include "PxgSolverConstraintDesc.h"
#include "cutil_math.h"
#include "PxgCudaMemoryAllocator.h"
#include "DySolverConstraintTypes.h"
#include "DyCpuGpuArticulation.h"
#include "PxMaterial.h"
#include "PxgSolverKernelIndices.h"
#include "PxgSolverFlags.h"
#include "MemoryAllocator.cuh"
#include "vector.cuh"
#include "PxgCommonDefines.h"
#include "constraintPrepShared.cuh"
#include "copy.cuh"
using namespace physx;
// Attempts to carry friction correlation state over from the previous frame.
// Scans the previous-frame patches recorded for this edge; the first unbroken patch whose
// body-space normals still agree (within PXC_SAME_NORMAL) and whose anchors are still within
// correlationDistance along the normal is copied into the current patch/anchor structures.
// On a successful match, patchExtents receives the squared distance between the two body0
// anchors (used later to decide whether the patch must be regrown). Always returns true.
static __device__ bool getFrictionPatches(PxgFrictionPatch& frictionPatch,
	PxgFrictionAnchorPatch& anchorPatch,
	const PxgBlockFrictionIndex* PX_RESTRICT prevFrictionIndices,
	const PxU32 prevFrictionStartIndex,
	const PxgFrictionPatch* PX_RESTRICT previousPatches,
	const PxgFrictionAnchorPatch* PX_RESTRICT previousAnchors,
	PxU32 frictionPatchCount,
	const PxAlignedTransform& bodyFrame0,
	const PxAlignedTransform& bodyFrame1,
	PxReal correlationDistance,
	const PxU32 totalNbEdges,
	PxReal& patchExtents,
	const PxU32 threadIndexInWarp)
{
	// No previous-frame friction data for this edge: nothing to correlate.
	if (prevFrictionStartIndex == 0xFFFFFFFF || frictionPatchCount == 0)
		return true;

	PxgFrictionPatch& newPatch = frictionPatch;
	PxgFrictionAnchorPatch& newAnchor = anchorPatch;

	for (PxU32 a = 0; a < frictionPatchCount; a++)
	{
		// Previous-frame indices for this edge are strided by the total edge count.
		const PxU64 index = prevFrictionIndices[prevFrictionStartIndex + a * totalNbEdges].getPatchIndex();
		const PxgFrictionPatch& oldPatch = previousPatches[index];
		const PxgFrictionAnchorPatch& oldAnchor = previousAnchors[index];
		assert(oldPatch.broken == 0 || oldPatch.broken == 1);
		if (!oldPatch.broken)
		{
			const float4 oldBody0Normal = oldPatch.body0Normal;
			if (dot3(oldBody0Normal, newPatch.body0Normal) > PXC_SAME_NORMAL) //TODO - check that they're the same material!
			{
				const PxU8 anchorCount = oldPatch.anchorCount;
				if (anchorCount != 0)
				{
					assert(anchorCount <= 2);
					const PxAlignedTransform body1ToBody0 = bodyFrame0.transformInv(bodyFrame1);
					const float4 oldBody1Normal = oldPatch.body1Normal;
					// Fix: compute the normal agreement once and reuse it (previously the dot
					// product was stored in an unused local and then recomputed in the branch).
					const float normalAgreement = dot3(oldBody0Normal, body1ToBody0.rotate(oldBody1Normal));
					if (normalAgreement > PXC_SAME_NORMAL)
					{
						const float4 body0Anchor0 = oldAnchor.body0Anchors[0];
						const float4 body1Anchor0 = oldAnchor.body1Anchors[0];
						if (pointsAreClose(body1ToBody0, body0Anchor0, body1Anchor0, oldBody0Normal, correlationDistance))
						{
							const float4 body0Anchor1 = oldAnchor.body0Anchors[1];
							const float4 body1Anchor1 = oldAnchor.body1Anchors[1];
							// With a single anchor, the second-anchor test is skipped.
							if (anchorCount < 2 || pointsAreClose(body1ToBody0, body0Anchor1, body1Anchor1, oldBody0Normal, correlationDistance))
							{
								// Still valid: copy the old patch into the current frame's patch.
								newPatch.contactID[0] = 0xff;
								newPatch.contactID[1] = 0xff;
								newPatch.anchorCount = anchorCount;
								newPatch.body0Normal = oldBody0Normal;
								newPatch.body1Normal = oldBody1Normal;
								newAnchor.body0Anchors[0] = body0Anchor0;
								newAnchor.body0Anchors[1] = body0Anchor1;
								newAnchor.body1Anchors[0] = body1Anchor0;
								newAnchor.body1Anchors[1] = body1Anchor1;
								const float4 ext = (body0Anchor0 - body0Anchor1);
								patchExtents = ext.x * ext.x + ext.y * ext.y + ext.z * ext.z;
								return true; //Found a match = terminate!
							}
						}
					}
				}
			}
		}
	}
	return true;
}
// Grows or rebuilds the friction anchor set of a patch from the current frame's contacts.
// Keeps up to two anchors, greedily preferring the pair with the largest separation.
// Warp-cooperative: lanes 0-2 each handle one component (x/y/z) of the vector work through
// the transform/transformInv/negateMagnitudeSquared helpers, and the scratch allocator
// provides warp-shared storage for the candidate world-space anchors.
// NOTE(review): the unguarded __shfl_sync(FULL_MASK, ...)/__syncwarp() calls assume the whole
// warp reaches this function together - confirm at call sites.
// - fp / fAnchor:             friction patch and its body-space anchors, updated in place
// - msContacts / numContacts: current contact points for this patch
// - msBodyFrame0/1:           body transforms used to map anchors world <-> body space
// - frictionOffsetThreshold:  contacts separated by more than this are not anchor candidates
// - anchorSqDistance:         squared distance between the two existing anchors (rebuild test)
// - minimum / maximum:        this lane's component of the patch bounds (lanes 0-2)
static __device__ void growPatches(PxgFrictionPatch& fp, PxgFrictionAnchorPatch& fAnchor,
	const physx::PxgContactPoint* msContacts, const PxU32 numContacts,
	const physx::PxTransform& msBodyFrame0,
	const physx::PxTransform& msBodyFrame1,
	float frictionOffsetThreshold,
	const PxReal anchorSqDistance,
	const float minimum, //PxBounds3
	const float maximum, //PxBounds3
	ScratchMemoryAllocator& sAlloc,
	const PxU32 threadIndexInWarp)
{
	using namespace physx;
	PxU32 oldAnchorCount = fp.anchorCount;
	// Scratch allocations below are released when 'marker' goes out of scope.
	ScratchMemoryMarker marker(sAlloc);
	if (oldAnchorCount == 2)
	{
		// Lanes 0-2 square their component of the patch bounds' extents...
		float dif = 0.f;
		if (threadIndexInWarp < 3)
		{
			dif = maximum - minimum;
			dif = dif * dif;
		}
		// ...then sum across the three lanes to obtain the squared patch diagonal.
		const PxReal frictionPatchDiagonalSq = __shfl_sync(FULL_MASK, dif, 0)
			+ __shfl_sync(FULL_MASK, dif, 1)
			+ __shfl_sync(FULL_MASK, dif, 2);
		//If the squared distance between the anchors is more than a quarter of the patch diagonal, we can keep,
		//otherwise the anchors are potentially clustered around a corner so force a rebuild of the patch
		if ((anchorSqDistance * 4.f) >= frictionPatchDiagonalSq)
			return;
		oldAnchorCount = 0;
	}
	//__shared__ PxVec3 worldAnchors[2];
	//__shared__ PxU32 contactID[2];
	// Warp-shared scratch for up to two candidate world-space anchors and their contact ids.
	PxVec3* msWorldAnchors = sAlloc.allocAligned<PxVec3>(sizeof(PxVec3) * 2);
	PxU32* msContactID = sAlloc.allocAligned<PxU32>(sizeof(PxU32) * 2);
	PxU16 anchorCount = 0;
	PxReal pointDistSq = 0.0f, dist0, dist1;
	// if we have an anchor already, keep it
	if (oldAnchorCount == 1)
	{
		// Lanes 0-2 pick up one component of the existing body0 anchor and transform it to
		// world space as candidate anchor 0. Id 0xFF marks "kept from previous frame".
		float v = 0.f;
		if (threadIndexInWarp < 3)
		{
			float* anchors = reinterpret_cast<float*>(&fAnchor.body0Anchors[0].x);
			v = anchors[threadIndexInWarp];
		}
		transform(v, msBodyFrame0, msWorldAnchors[0], threadIndexInWarp);
		if (threadIndexInWarp == 0)
			msContactID[0] = 0xFF;
		/*const PxVec3 v(fAnchor.body0Anchors[0].x, fAnchor.body0Anchors[0].y, fAnchor.body0Anchors[0].z);
		worldAnchors[0] = bodyFrame0.transform(v);
		contactID[0] = 0xFF;*/
		anchorCount++;
	}
	__syncwarp();
	//PxVec3& msWorldPoint = *sAlloc.alloc<PxVec3>(sizeof(PxVec3));
	// Greedily select up to two anchors from the contact set, maximizing their separation.
	for (PxU32 j = 0; j<numContacts; j++)
	{
		const PxReal separation = msContacts[j].point_separationW.w;
		if (separation < frictionOffsetThreshold)
		{
			//const float* contacts = reinterpret_cast<const float*>(&msContacts[j].point_separationW.x);
			const PxVec3& worldPoint = reinterpret_cast<const PxVec3&>(msContacts[j].point_separationW.x);
			switch (anchorCount)
			{
			case 0:
				// First candidate: accept unconditionally.
				if (threadIndexInWarp < 3)
				{
					msWorldAnchors[0][threadIndexInWarp] = worldPoint[threadIndexInWarp];
					if (threadIndexInWarp == 0)
						msContactID[0] = PxU16(j);
				}
				anchorCount++;
				__syncwarp();
				/*contactID[0] = PxU16(j);
				worldAnchors[0] = worldPoint;
				anchorCount++;*/
				break;
			case 1:
				//pointDistSq = (worldPoint - worldAnchors[0]).magnitudeSquared();
				// Second candidate must not coincide with the first anchor.
				pointDistSq = negateMagnitudeSquared(worldPoint, msWorldAnchors[0], threadIndexInWarp);
				if (pointDistSq > 1e-8f)
				{
					if (threadIndexInWarp < 3)
					{
						msWorldAnchors[1][threadIndexInWarp] = worldPoint[threadIndexInWarp];
						if (threadIndexInWarp == 0)
							msContactID[1] = PxU16(j);
					}
					anchorCount++;
					__syncwarp();
				}
				break;
			default: //case 2
				// Both slots full: replace an anchor only if it increases the pair's squared
				// separation (pointDistSq tracks the current best separation).
				dist0 = negateMagnitudeSquared(worldPoint, msWorldAnchors[0], threadIndexInWarp);
				dist1 = negateMagnitudeSquared(worldPoint, msWorldAnchors[1], threadIndexInWarp);
				//dist0 = (worldPoint - worldAnchors[0]).magnitudeSquared();
				//dist1 = (worldPoint - worldAnchors[1]).magnitudeSquared();
				if (dist0 > dist1)
				{
					if (dist0 > pointDistSq)
					{
						if (threadIndexInWarp < 3)
						{
							msWorldAnchors[1][threadIndexInWarp] = worldPoint[threadIndexInWarp];
							if (threadIndexInWarp == 0)
								msContactID[1] = PxU16(j);
						}
						//contactID[1] = PxU16(j);
						//worldAnchors[1] = worldPoint;
						pointDistSq = dist0;
						__syncwarp();
					}
				}
				else if (dist1 > pointDistSq)
				{
					if (threadIndexInWarp < 3)
					{
						msWorldAnchors[0][threadIndexInWarp] = worldPoint[threadIndexInWarp];
						if (threadIndexInWarp == 0)
							msContactID[0] = PxU16(j);
					}
					/*contactID[0] = PxU16(j);
					worldAnchors[0] = worldPoint;*/
					pointDistSq = dist1;
					__syncwarp();
				}
			}
		}
	}
	// Write the selected world-space anchors back into both bodies' local frames.
	switch (anchorCount)
	{
	case 2:
	{
		//KS - if there is a 2nd anchor, we always write it. If we already had 2 anchors, we would have exited earlier!
		transformInv(msWorldAnchors[1], msBodyFrame0, reinterpret_cast<PxVec3&>(fAnchor.body0Anchors[1]), threadIndexInWarp);
		transformInv(msWorldAnchors[1], msBodyFrame1, reinterpret_cast<PxVec3&>(fAnchor.body1Anchors[1]), threadIndexInWarp);
	}
	// deliberate fall-through: anchor 0 may also need writing
	case 1:
		if (oldAnchorCount == 0)
		{
			// Anchor 0 only needs (re)writing when it was not kept from the previous frame.
			transformInv(msWorldAnchors[0], msBodyFrame0, reinterpret_cast<PxVec3&>(fAnchor.body0Anchors[0]), threadIndexInWarp);
			transformInv(msWorldAnchors[0], msBodyFrame1, reinterpret_cast<PxVec3&>(fAnchor.body1Anchors[0]), threadIndexInWarp);
		}
	default:
		break;
	};
	if (threadIndexInWarp == 0)
		fp.anchorCount = anchorCount;
	__syncwarp();
}
// Resets a friction patch cooperatively across the first three warp lanes:
// lanes 0..2 each store their own component of the two body-space contact
// normals (msBody0Normal/msBody1Normal hold this lane's component), and lane 0
// additionally clears the w components and the anchor/broken bookkeeping.
// No warp sync here — callers are responsible for synchronizing afterwards.
static __device__ void initFrictionPatch(physx::PxgFrictionPatch& p,
	const float msBody0Normal, const float msBody1Normal,
	const PxU32 threadIndexInWarp)
{
	// Only lanes 0..2 participate (one float4 component each).
	if (threadIndexInWarp >= 3)
		return;

	// View the float4 normals as float[4] so each lane can write its slot.
	float* n0 = &p.body0Normal.x;
	float* n1 = &p.body1Normal.x;
	n0[threadIndexInWarp] = msBody0Normal;
	n1[threadIndexInWarp] = msBody1Normal;

	// Lane 0 finalizes the scalar bookkeeping.
	if (threadIndexInWarp == 0)
	{
		n0[3] = 0.f;
		n1[3] = 0.f;
		p.anchorCount = 0;
		p.broken = 0;
	}
}
// Prepares the friction patch for a contact set: rotates the world-space
// normal into each body's local frame (one component per lane, via the
// per-lane rotateInvR helper) and initializes the patch, then syncs the warp.
// NOTE(review): contacts, normalTolerance and sAlloc are not referenced by
// this implementation — presumably kept for interface parity; confirm.
static __device__ void correlatePatches(PxgFrictionPatch& frictionPatch, const physx::PxgContactPoint* contacts, const PxU32 nbContacts,
	const float msNormal, const physx::PxAlignedTransform& msBodyFrame0, const physx::PxAlignedTransform& msBodyFrame1,
	float normalTolerance, ScratchMemoryAllocator& sAlloc, const PxU32 threadIndexInWarp)
{
	using namespace physx;

	// Nothing to correlate without contacts.
	if (nbContacts == 0)
		return;

	const PxQuat& q0 = reinterpret_cast<const PxQuat&>(msBodyFrame0.q);
	const PxQuat& q1 = reinterpret_cast<const PxQuat&>(msBodyFrame1.q);

	// This lane's component of the normal expressed in each body frame.
	const float localNormal0 = rotateInvR(msNormal, q0, threadIndexInWarp);
	const float localNormal1 = rotateInvR(msNormal, q1, threadIndexInWarp);

	initFrictionPatch(frictionPatch, localNormal0, localNormal1, threadIndexInWarp);
	__syncwarp();
}
#endif

View File

@@ -0,0 +1,115 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgBodySim.h"
#include "PxgSolverBody.h"
#include "PxgSolverCoreDesc.h"
#include "DySleepingConfigulation.h"
#include "PxvDynamics.h"
#include "PxsRigidBody.h"
#include "assert.h"
#include "stdio.h"
#include "integration.cuh"
using namespace physx;
// Empty host-side stub; presumably referenced from elsewhere so this
// translation unit's kernels get linked/initialized — TODO confirm.
extern "C" __host__ void initSolverKernels3() {}
// PGS integration kernel: one thread per solver body (body index a = global
// thread index + offset; threads with a >= numSolverBodies exit). For each
// body it:
//   1) loads the accumulated delta velocities from the solver velocity pool,
//   2) calls integrateCore() to finalize velocities, integrate the pose and
//      run the sleep/freeze check,
//   3) writes velocities/pose to the solver output buffers and back into the
//      per-body PxgBodySim (snapshotting the previous velocities first when
//      the eENABLE_BODY_ACCELERATIONS buffer is present).
extern "C" __global__ void integrateCoreParallelLaunch(
const uint32_t offset, const PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
const PxU32* PX_RESTRICT islandIds,
const PxU32* PX_RESTRICT islandStaticTouchCounts,
const PxU32* PX_RESTRICT numCountedInteractions)
{
//integrateCoreParallel(motionVelocity, solverBody, solverBodyData, numBodies, dt);
uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
const float4* PX_RESTRICT motionVelocityArray = solverCoreDesc->motionVelocityArray;
const uint32_t numSolverBodies = solverCoreDesc->numSolverBodies;
const float4* PX_RESTRICT solverBodyVelocity = sharedDesc->iterativeData.solverBodyVelPool;
const PxgSolverTxIData* PX_RESTRICT txIData = solverCoreDesc->solverBodyTxIDataPool;
float4* PX_RESTRICT outSolverVelocity = solverCoreDesc->outSolverVelocity;
PxAlignedTransform* PX_RESTRICT outBody2World = solverCoreDesc->outBody2World;
//for(uint32_t a = idx+offset; a < numSolverBodies; a+=blockSize)
uint32_t a = idx+offset;
// Bounds guard: grid size rarely divides the body count evenly.
if(a < numSolverBodies)
{
const PxgSolverBodyData& data = solverCoreDesc->solverBodyDataPool[a];
const PxU32 nodeIndex = data.islandNodeIndex.index();// >> 2;
PxgBodySim& bodySim = solverCoreDesc->mBodySimBufferDeviceData[nodeIndex];
//KS - TODO - access all data via shared memory
// PT: TODO: TGS version uses a copy here, what's better?
const PxMat33& sqrtInvInertia = txIData[a].sqrtInvInertia;
PxAlignedTransform body2World = bodySim.body2World; // PT: TODO: TGS version uses outBody2World[a] here, why?
const float4 inverseInertia = bodySim.inverseInertiaXYZ_contactReportThresholdW;
// Accumulated delta-velocity slot for this body; the angular part lives
// numSolverBodies entries after the linear part.
const PxU32 index = solverCoreDesc->accumulatedBodyDeltaVOffset + a;
// PT: TODO: TGS version uses tmp v0/v1 values here, why?
float4 linVel = solverBodyVelocity[index];
float4 angVel = solverBodyVelocity[index + numSolverBodies];
const PxU32 staticTouchCount = islandStaticTouchCounts[islandIds[nodeIndex]];
//printf("Integrating %i: index = %i, a = %i\n", nodeIndex, index, a);
//we need to dma the sleep data back for post solver task
PxgSolverBodySleepData& sleepData = solverCoreDesc->solverBodySleepDataPool[a];
// Integrates pose + velocities in place and updates sleep/freeze state.
integrateCore(motionVelocityArray[a], motionVelocityArray[a + numSolverBodies], inverseInertia, linVel, angVel, body2World, data, bodySim, sleepData, sqrtInvInertia,
sharedDesc->dt, sharedDesc->invDtF32, solverCoreDesc->enableStabilization, staticTouchCount != 0, numCountedInteractions[nodeIndex],
nodeIndex);
// PT: TODO: why do we write out the vels & pose to 2 different buffers?
outSolverVelocity[a] = linVel;
outSolverVelocity[a+numSolverBodies] = angVel;
outBody2World[a] = body2World;
// PT: for acceleration getters (eENABLE_BODY_ACCELERATIONS)
PxgBodySimVelocities* prevVelocities = solverCoreDesc->mBodySimPrevVelocitiesBufferDeviceData;
if(prevVelocities)
{
// NOTE: this snapshot must stay before the write-back below, which
// overwrites bodySim's velocity fields.
PxgBodySimVelocities& prev = prevVelocities[nodeIndex];
prev.linearVelocity = bodySim.linearVelocityXYZ_inverseMassW;
prev.angularVelocity = bodySim.angularVelocityXYZ_maxPenBiasW;
}
//write back linear velocity, angular velocity to pxgbodysim
bodySim.linearVelocityXYZ_inverseMassW.x = linVel.x; bodySim.linearVelocityXYZ_inverseMassW.y = linVel.y; bodySim.linearVelocityXYZ_inverseMassW.z = linVel.z;
bodySim.angularVelocityXYZ_maxPenBiasW.x = angVel.x; bodySim.angularVelocityXYZ_maxPenBiasW.y = angVel.y; bodySim.angularVelocityXYZ_maxPenBiasW.z = angVel.z;
bodySim.body2World = body2World;
assert(body2World.isSane());
}
}

View File

@@ -0,0 +1,435 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxSimpleTypes.h"
#include "PxgBodySim.h"
#include "PxgSolverBody.h"
#include "PxvDynamics.h"
#include "PxsRigidBody.h"
#include "PxgSolverKernelIndices.h"
#include "DySleepingConfigulation.h"
#include "stdio.h"
using namespace physx;
// Updates a body's wake counter and sleep/freeze state after integration.
// Outputs:
//   - freeze (out): set true when the stabilization path decides the body
//     should be frozen this frame (caller then skips the pose update).
//   - solverBodyLinVel/solverBodyAngVel: may be damped in place when the body
//     is settling (stabilization path only).
//   - bodySim: internal flags, wake counter, sleep-velocity accumulators and
//     freeze/accel-scale counters are written back; external accelerations are
//     cleared unless eRETAIN_ACCELERATION is set.
//   - sleepData: mirrors the final flags and wake counter (copied back to CPU).
// NOTE(review): invDt and nodeIndex are not referenced by this implementation.
static __device__ void updateWakeCounter(bool& freeze, float4& solverBodyLinVel, float4& solverBodyAngVel, const PxAlignedTransform& body2World,
PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
const float4& inverseInertia, const PxVec3& linearMotionVel, const PxVec3& angularMotionVel, const float invertedMass, const float dt,
const float invDt, const bool enableStabilization, const bool hasStaticTouch, PxU32 numCountedInteractions,
PxU32 nodeIndex)
{
// update the body's sleep state and
PxReal wakeCounterResetTime = 20.0f*0.02f;
// Unpack the packed per-body sleep state (thresholds, wake counter, accumulators).
float4 freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex = bodySim.freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex;
float4 sleepLinVelAccXYZ_freezeCountW = bodySim.sleepLinVelAccXYZ_freezeCountW;
float4 sleepAngVelAccXYZ_accelScaleW = bodySim.sleepAngVelAccXYZ_accelScaleW;
PxReal wc = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y; //wakeCounter;
// Keep only the flags that persist across frames; the *_THIS_FRAME flags are re-derived below.
PxU32 flags = bodySim.internalFlags & (PxsRigidBody::eDISABLE_GRAVITY_GPU | PxsRigidBody::eFROZEN | PxsRigidBody::eENABLE_GYROSCOPIC | PxsRigidBody::eRETAIN_ACCELERATION);
PxReal freezeCount = sleepLinVelAccXYZ_freezeCountW.w;
PxReal accelScale = sleepAngVelAccXYZ_accelScaleW.w;
bool alreadyUpdateWC = false;
PxVec3 sleepLinVelAcc(0.f), sleepAngVelAcc(0.f);
{
// Stabilization path: additionally computes a freeze decision and damps
// nearly-settled bodies.
if (enableStabilization)
{
const PxU32 maxCountedInteractions = 10u; //KS - arbitrary limit to make sure that
//bool freeze = false;
//const PxAlignedTransform& body2World = solverBodyData.body2World;
// calculate normalized energy: kinetic energy divided by mass
const PxVec3 inertia(inverseInertia.x > 0.f ? 1.0f / inverseInertia.x : 1.f, inverseInertia.y > 0.f ? 1.0f / inverseInertia.y : 1.f, inverseInertia.z > 0.f ? 1.0f / inverseInertia.z : 1.f);
sleepLinVelAcc = linearMotionVel;
sleepAngVelAcc = angularMotionVel;
// scale threshold by cluster factor (more contacts => higher sleep threshold)
const PxU32 clusterFactor = PxMin(numCountedInteractions, maxCountedInteractions);
PxReal invMass = invertedMass;// intialVel_invMass.w;
// Guard against kinematic/infinite-mass bodies (invMass == 0).
if (invMass == 0.f)
invMass = 1.f;
const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
const PxReal linear = sleepLinVelAcc.magnitudeSquared();
PxReal frameNormalizedEnergy = 0.5f * (angular + linear);
const PxReal cf = hasStaticTouch ? clusterFactor : 0.f;
const PxReal freezeThresh = cf * freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x;// solverBodySleepData.freezeThreshold;
freezeCount = PxMax(freezeCount - dt, 0.0f);
bool settled = true;
accelScale = PxMin(1.f, accelScale + dt);
// Too much energy this frame -> not settled; restart the freeze interval.
if (frameNormalizedEnergy >= freezeThresh)
{
settled = false;
freezeCount = PXD_FREEZE_INTERVAL;
}
// Freezing only applies to bodies touching static geometry.
if (!hasStaticTouch)
{
accelScale = 1.f;
settled = false;
}
if (settled)
{
//Dampen bodies that are just about to go to sleep
if (cf > 1)
{
const PxReal d = 1.0f - (PXD_SLEEP_DAMPING * dt);
solverBodyLinVel = solverBodyLinVel * d;
solverBodyAngVel = solverBodyAngVel * d;
accelScale = accelScale * 0.75f + 0.25f*PXD_FREEZE_SCALE;
}
// Freeze once the body has been settled for the full interval and its
// energy is well below the freeze threshold.
freeze = freezeCount == 0.f && frameNormalizedEnergy < (freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x * PXD_FREEZE_TOLERANCE);
}
if (freeze)
{
//current flag isn't frozen but freeze flag raise so we need to raise the frozen flag in this frame
bool wasNotFrozen = (flags & PxsRigidBody::eFROZEN) == 0;
flags |= PxsRigidBody::eFROZEN;
if (wasNotFrozen)
{
flags |= PxsRigidBody::eFREEZE_THIS_FRAME;
}
}
else
{
// Not freezing: drop eFROZEN (and report the transition if it was set).
bool wasFrozen = (flags & PxsRigidBody::eFROZEN) != 0;
flags &= (PxsRigidBody::eDISABLE_GRAVITY_GPU | PxsRigidBody::eENABLE_GYROSCOPIC | PxsRigidBody::eRETAIN_ACCELERATION);
if (wasFrozen)
{
flags |= PxsRigidBody::eUNFREEZE_THIS_FRAME;
}
}
/*KS: New algorithm for sleeping when using stabilization:
* Energy *this frame* must be higher than sleep threshold and accumulated energy over previous frames
* must be higher than clusterFactor*energyThreshold.
*/
// Only start accumulating sleep energy once the wake counter is low.
if (wc < wakeCounterResetTime * 0.5f || wc < dt)
{
//Accumulate energy
sleepLinVelAcc.x += sleepLinVelAccXYZ_freezeCountW.x;
sleepLinVelAcc.y += sleepLinVelAccXYZ_freezeCountW.y;
sleepLinVelAcc.z += sleepLinVelAccXYZ_freezeCountW.z;
sleepAngVelAcc.x += sleepAngVelAccXYZ_accelScaleW.x;
sleepAngVelAcc.y += sleepAngVelAccXYZ_accelScaleW.y;
sleepAngVelAcc.z += sleepAngVelAccXYZ_accelScaleW.z;
//If energy this frame is high
const PxReal sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;
if (frameNormalizedEnergy >= sleepThreshold)
{
//Compute energy over sleep preparation time
const PxReal sleepAngular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
const PxReal sleepLinear = sleepLinVelAcc.magnitudeSquared();
PxReal normalizedEnergy = 0.5f * (sleepAngular + sleepLinear);
PxReal sleepClusterFactor = clusterFactor + 1.f;
// scale threshold by cluster factor (more contacts => higher sleep threshold)
const PxReal threshold = sleepClusterFactor * sleepThreshold;
//If energy over sleep preparation time is high
if (normalizedEnergy >= threshold)
{
//Wake up
//assert(isActive());
sleepAngVelAcc = PxVec3(0);
sleepLinVelAcc = PxVec3(0);
const float factor = sleepThreshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
PxReal oldWc = wc;
wc = factor * 0.5f * wakeCounterResetTime + dt * (sleepClusterFactor - 1.0f);
//if (oldWc == 0.0f) // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
//	notifyNotReadyForSleeping(bodyCore.nodeIndex);
if (oldWc == 0.0f)
flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
alreadyUpdateWC = true;
}
}
}
}
else
{
// Non-stabilization path: classic accumulated-energy sleep check only.
if (wc < wakeCounterResetTime * 0.5f || wc < dt)
{
//const PxAlignedTransform& body2World = solverBodyData.body2World;
// calculate normalized energy: kinetic energy divided by mass
const PxVec3 inertia(inverseInertia.x > 0.f ? 1.0f / inverseInertia.x : 1.f, inverseInertia.y > 0.f ? 1.0f / inverseInertia.y : 1.f, inverseInertia.z > 0.f ? 1.0f / inverseInertia.z : 1.f);
sleepLinVelAcc = linearMotionVel;// originalBody->mAcceleration.linear;
// Angular motion is taken in the body frame here (rotateInv), unlike the
// stabilization path above.
sleepAngVelAcc = body2World.q.rotateInv(angularMotionVel);// originalBody->mAcceleration.angular;
sleepLinVelAcc.x += sleepLinVelAccXYZ_freezeCountW.x;
sleepLinVelAcc.y += sleepLinVelAccXYZ_freezeCountW.y;
sleepLinVelAcc.z += sleepLinVelAccXYZ_freezeCountW.z;
sleepAngVelAcc.x += sleepAngVelAccXYZ_accelScaleW.x;
sleepAngVelAcc.y += sleepAngVelAccXYZ_accelScaleW.y;
sleepAngVelAcc.z += sleepAngVelAccXYZ_accelScaleW.z;
PxReal invMass = invertedMass;
if (invMass == 0.f)
invMass = 1.f;
const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
const PxReal linear = sleepLinVelAcc.magnitudeSquared();
PxReal normalizedEnergy = 0.5f * (angular + linear);
// scale threshold by cluster factor (more contacts => higher sleep threshold)
const PxReal clusterFactor = PxReal(1 + numCountedInteractions);
const PxReal sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;
const PxReal threshold = clusterFactor * sleepThreshold;
if (normalizedEnergy >= threshold)
{
//assert(isActive());
sleepLinVelAcc = PxVec3(0);
sleepAngVelAcc = PxVec3(0);
const float factor = threshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
PxReal oldWc = wc;
wc = factor * 0.5f * wakeCounterResetTime + dt * (clusterFactor - 1.0f);
if (oldWc == 0.0f) // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
{
flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
}
alreadyUpdateWC = true;
}
}
}
}
// If nothing above reset the wake counter, tick it down toward zero.
if(!alreadyUpdateWC)
wc = PxMax(wc - dt, 0.0f);
bool wakeCounterZero = (wc == 0.0f);
if (wakeCounterZero)
{
flags |= PxsRigidBody::eDEACTIVATE_THIS_FRAME;
sleepLinVelAcc = PxVec3(0);
sleepAngVelAcc = PxVec3(0);
}
// Write the updated sleep state back to the per-body sim data.
bodySim.internalFlags = flags;
bodySim.freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y = wc;
bodySim.sleepLinVelAccXYZ_freezeCountW = make_float4(sleepLinVelAcc.x, sleepLinVelAcc.y, sleepLinVelAcc.z, freezeCount);
bodySim.sleepAngVelAccXYZ_accelScaleW = make_float4(sleepAngVelAcc.x, sleepAngVelAcc.y, sleepAngVelAcc.z, accelScale);
// Clear one-shot external accelerations unless the user asked to retain them.
if (!(flags & PxsRigidBody::eRETAIN_ACCELERATION))
{
bodySim.externalLinearAcceleration = make_float4(0.f, 0.f, 0.f, 0.f);
bodySim.externalAngularAcceleration = make_float4(0.f, 0.f, 0.f, 0.f);
}
sleepData.internalFlags = flags;
sleepData.wakeCounter = wc;
}
// Thin wrapper over updateWakeCounter() that surfaces the freeze decision as
// the return value instead of an out-parameter. All other side effects
// (velocity damping, bodySim/sleepData write-back) happen inside
// updateWakeCounter().
static __device__ bool sleepCheck(float4& solverBodyLinVel, float4& solverBodyAngVel, const PxAlignedTransform& body2World, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
	const float4& inverseInertia, const PxVec3& linearMotionVel, const PxVec3& angularMotionVel, const float invertedMass, const float dt, const float invDt, const bool enableStabilization,
	const bool hasStaticTouch, const PxU32 numCountedInteractions, PxU32 nodeIndex)
{
	bool shouldFreeze = false;
	updateWakeCounter(shouldFreeze, solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData,
		inverseInertia, linearMotionVel, angularMotionVel, invertedMass,
		dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);
	return shouldFreeze;
}
// Finalizes a body's velocities and integrates its transform over dt (PGS and
// TGS flavors selected at compile time via IS_TGS_SOLVER: the PGS build adds
// the initial velocities and applies sqrtInvInertia here, the TGS build only
// un-preconditions the angular velocity). Applies PxRigidDynamicLockFlag axis
// locks, runs the sleep/freeze check, and — unless the body is frozen —
// advances the position and applies a closed-form quaternion rotation update.
// solverBodyLinVel/solverBodyAngVel and body2World are updated in place.
static __device__ void integrateCore(const float4 motionLinVelXYZW, const float4 motionAngVelXYZW, const float4& inverseInertia, float4& solverBodyLinVel,
float4& solverBodyAngVel, PxAlignedTransform& body2World, const PxgSolverBodyData& solverBodyData, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
const PxMat33& sqrtInvInertia, const float dt, const float invDt, const bool enableStabilization, const bool hasStaticTouch, const PxU32 numCountedInteractions,
const PxU32 nodeIndex)
{
// Integrate linear part
const float4 initialLinVelXYZ_invMassW = solverBodyData.initialLinVelXYZ_invMassW;
const float4 initialAngVelXYZ_penBiasClamp = solverBodyData.initialAngVelXYZ_penBiasClamp;
//ML: solverBodyData.initialLinVelocity store the PxsBodyCore (original )linearVelocity and angularVelocity
const PxVec3 initialLinVel(initialLinVelXYZ_invMassW.x, initialLinVelXYZ_invMassW.y, initialLinVelXYZ_invMassW.z);
const PxVec3 initialAngVel(initialAngVelXYZ_penBiasClamp.x, initialAngVelXYZ_penBiasClamp.y, initialAngVelXYZ_penBiasClamp.z);
PxU32 lockFlags = bodySim.lockFlags;
//update body lin and ang velocity
PxVec3 bodyLinearVelocity(solverBodyLinVel.x, solverBodyLinVel.y, solverBodyLinVel.z);
PxVec3 bodyAngVelocity(solverBodyAngVel.x, solverBodyAngVel.y, solverBodyAngVel.z);
#ifndef IS_TGS_SOLVER
// PGS: solver velocities are deltas; add initial velocities and rotate the
// angular delta out of the preconditioned (sqrtInvInertia) space.
bodyLinearVelocity = initialLinVel + bodyLinearVelocity;
bodyAngVelocity = initialAngVel + sqrtInvInertia * bodyAngVelocity;
#else
bodyAngVelocity = sqrtInvInertia * bodyAngVelocity;
#endif
// Zero any locked axes on the final velocities.
solverBodyLinVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : bodyLinearVelocity.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : bodyLinearVelocity.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : bodyLinearVelocity.z, 0.f);
solverBodyAngVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : bodyAngVelocity.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : bodyAngVelocity.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : bodyAngVelocity.z, 0.f);
//we need to perform sleep check here to decide whether we want to update body2World transform for the body
const PxVec3 motionLinVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : motionLinVelXYZW.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : motionLinVelXYZW.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : motionLinVelXYZW.z);
const PxVec3 motionAngVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : motionAngVelXYZW.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : motionAngVelXYZW.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : motionAngVelXYZW.z);
#ifndef IS_TGS_SOLVER
PxVec3 linearMotionVel = initialLinVel + motionLinVel;
PxVec3 angularMotionVel = initialAngVel + sqrtInvInertia * motionAngVel;
//printf("%i: DeltaLinVel = (%f, %f, %f)\n", nodeIndex, motionLinVel.x, motionLinVel.y, motionLinVel.z);
#else
PxVec3 linearMotionVel = motionLinVel;
PxVec3 angularMotionVel = sqrtInvInertia * motionAngVel;
#endif
// Integrate the rotation using closed form quaternion integrator
PxReal w = angularMotionVel.magnitudeSquared();
w = PxSqrt(w);
// Clamp the angular speed to avoid float overflow in the w*w-scale math below.
const PxReal maxW = 1e+7f; //Should be about sqrt(PX_MAX_REAL/2) or smaller
if (w > maxW)
{
angularMotionVel = angularMotionVel.getNormalized() * maxW;
w = maxW;
}
const bool freeze = sleepCheck(solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData, inverseInertia, linearMotionVel, angularMotionVel, initialLinVelXYZ_invMassW.w,
dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);
// Frozen bodies keep their pose; only unfrozen bodies advance.
if (!freeze)
{
PxVec3 delta = linearMotionVel * dt;
body2World.p.x += delta.x; body2World.p.y += delta.y; body2World.p.z += delta.z;
if (w != 0.0f)
{
// Closed-form quaternion integration: q' = normalize(cos(v)*q + (sin(v)/w)*(omega, 0)*q)
// with v = dt*w/2; __sincosf is the fast-math device intrinsic.
const PxReal v = dt * w * 0.5f;
PxReal s, q;
//s = sin(v);
//q = cos(v);
__sincosf(v, &s, &q);
s /= w;
const PxVec3 pqr = angularMotionVel * s;
const PxAlignedQuat quatVel(pqr.x, pqr.y, pqr.z, 0);
PxAlignedQuat result = quatVel * body2World.q;
result += body2World.q * q;
//ML: solverBodyData store the current transform for PxsBodyCore
body2World.q = result.getNormalized();
}
}
}
// TGS variant of integrateCore(): velocities are finalized the same way
// (sqrtInvInertia un-preconditioning + axis locks + sleep check), but the pose
// update applies a precomputed per-step delta transform (deltaBody2World)
// instead of integrating velocities over dt here.
// solverBodyLinVel/solverBodyAngVel and body2World are updated in place.
static __device__ void integrateCoreTGS(const float4 motionLinVelXYZW, const float4 motionAngVelXYZW, const float4& inverseInertia, float4& solverBodyLinVel,
float4& solverBodyAngVel, PxAlignedTransform& body2World, const PxTransform& deltaBody2World, const PxgSolverBodyData& solverBodyData, PxgBodySim& bodySim, PxgSolverBodySleepData& sleepData,
const PxMat33& sqrtInvInertia, const float dt, const float invDt, const bool enableStabilization, const bool hasStaticTouch, const PxU32 numCountedInteractions,
const PxU32 nodeIndex)
{
const PxU32 lockFlags = bodySim.lockFlags;
//KS - TODO - optimize this away
const float4 initialLinVelXYZ_invMassW = solverBodyData.initialLinVelXYZ_invMassW;
//update body lin and ang velocity
PxVec3 bodyLinearVelocity(solverBodyLinVel.x, solverBodyLinVel.y, solverBodyLinVel.z);
PxVec3 bodyAngVelocity(solverBodyAngVel.x, solverBodyAngVel.y, solverBodyAngVel.z);
// Rotate the angular velocity out of the preconditioned (sqrtInvInertia) space.
bodyAngVelocity = sqrtInvInertia * bodyAngVelocity;
// Zero any locked axes on the final velocities.
solverBodyLinVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : bodyLinearVelocity.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : bodyLinearVelocity.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : bodyLinearVelocity.z, 0.f);
solverBodyAngVel = make_float4(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : bodyAngVelocity.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : bodyAngVelocity.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : bodyAngVelocity.z, 0.f);
//we need to perform sleep check here to decide whether we want to update body2World transform for the body
const PxVec3 motionLinVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : motionLinVelXYZW.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : motionLinVelXYZW.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : motionLinVelXYZW.z);
const PxVec3 motionAngVel(lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : motionAngVelXYZW.x,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : motionAngVelXYZW.y,
lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : motionAngVelXYZW.z);
PxVec3 linearMotionVel = motionLinVel;
PxVec3 angularMotionVel = sqrtInvInertia * motionAngVel;
// Integrate the rotation using closed form quaternion integrator
PxReal w = angularMotionVel.magnitudeSquared();
w = PxSqrt(w);
// Clamp the angular speed to keep the math below inside float range.
const PxReal maxW = 1e+7f; //Should be about sqrt(PX_MAX_REAL/2) or smaller
if (w > maxW)
{
angularMotionVel = angularMotionVel.getNormalized() * maxW;
w = maxW;
}
const bool freeze = sleepCheck(solverBodyLinVel, solverBodyAngVel, body2World, bodySim, sleepData, inverseInertia, linearMotionVel, angularMotionVel, initialLinVelXYZ_invMassW.w,
dt, invDt, enableStabilization, hasStaticTouch, numCountedInteractions, nodeIndex);
// Frozen bodies keep their pose; only unfrozen bodies apply the step delta.
if (!freeze)
{
//printf("DeltaP = (%f, %f, %f)\n", deltaBody2World.p.x, deltaBody2World.p.y, deltaBody2World.p.z);
body2World.p.x += deltaBody2World.p.x; body2World.p.y += deltaBody2World.p.y; body2World.p.z += deltaBody2World.p.z;
// Compose the delta rotation with the current orientation and renormalize.
PxQuat q(body2World.q.q.x, body2World.q.q.y, body2World.q.q.z, body2World.q.q.w);
q = (deltaBody2World.q * q).getNormalized();
body2World.q.q.x = q.x; body2World.q.q.y = q.y; body2World.q.q.z = q.z; body2World.q.q.w = q.w;
}
}

View File

@@ -0,0 +1,140 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#define IS_TGS_SOLVER
#include "PxgBodySim.h"
#include "PxgSolverBody.h"
#include "PxgSolverCoreDesc.h"
#include "DySleepingConfigulation.h"
#include "PxvDynamics.h"
#include "PxsRigidBody.h"
#include "assert.h"
#include "stdio.h"
#include "integration.cuh"
using namespace physx;
// Empty host-side stub; presumably referenced from elsewhere so this
// translation unit's kernels get linked/initialized — TODO confirm.
extern "C" __host__ void initSolverKernels11() {}
//KS - this will change dramatically once we have all the TGS functionality working!
// TGS integration kernel: one thread per solver body (body index a = global
// thread index + offset; threads with a >= numSolverBodies exit). Unlike the
// PGS kernel, it reads the current pose from outBody2World and applies the
// per-step delta transform from txIData. Note the velocity packing in the
// pool: the first float4 holds lin.xyz + ang.x, the second holds ang.yz.
extern "C" __global__ void integrateCoreParallelLaunchTGS(
const uint32_t offset, const PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
const PxgSolverSharedDesc<IterativeSolveDataTGS>* PX_RESTRICT sharedDesc,
const PxU32* PX_RESTRICT islandIds,
const PxU32* PX_RESTRICT islandStaticTouchCounts,
const PxU32* PX_RESTRICT numCountedInteractions)
{
//integrateCoreParallel(motionVelocity, solverBody, solverBodyData, numBodies, dt);
uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
const float4* PX_RESTRICT motionVelocityArray = solverCoreDesc->motionVelocityArray;
const uint32_t numSolverBodies = solverCoreDesc->numSolverBodies;
const float4* PX_RESTRICT solverBodyVelocity = sharedDesc->iterativeData.solverBodyVelPool;
const PxgSolverTxIData* PX_RESTRICT txIDatas = solverCoreDesc->solverBodyTxIDataPool;
float4* PX_RESTRICT outSolverVelocity = solverCoreDesc->outSolverVelocity;
PxAlignedTransform* PX_RESTRICT outBody2World = solverCoreDesc->outBody2World;
//for(uint32_t a = idx+offset; a < numSolverBodies; a+=blockSize)
uint32_t a = idx + offset;
// Bounds guard: grid size rarely divides the body count evenly.
if (a < numSolverBodies)
{
const PxgSolverBodyData& data = solverCoreDesc->solverBodyDataPool[a];
const PxU32 nodeIndex = data.islandNodeIndex.index();// >> 2;
PxgBodySim& bodySim = solverCoreDesc->mBodySimBufferDeviceData[nodeIndex];
// PT: TODO: PGS version uses a reference here, what's better?
const PxMat33 sqrtInvInertia = txIDatas[a].sqrtInvInertia;
const PxTransform deltaTransform = txIDatas[a].deltaBody2World;
PxAlignedTransform body2World = outBody2World[a]; // PT: TODO: PGS version uses bodySim.body2World here, why?
const float4 inverseInertia = bodySim.inverseInertiaXYZ_contactReportThresholdW;
const float4 motionLinVel = motionVelocityArray[a];
const float4 motionAngVel = motionVelocityArray[a + numSolverBodies];
//if (index == PxgSolverBody::InvalidHandle)
//{
//	const float4 zero4 = make_float4(0.f);
//	linVel = motionLinVel;
//	angVel = motionAngVel;
//}
//else
//{
//	//PxU32 ind = 3*(index&(~31)) + (index&31);
//	PxU32 ind = index;
//	float4 linxyz_angx = solverBodyVelocity[ind];
//	float4 angyz_lindxy = solverBodyVelocity[ind + 32];
//	linVel = make_float4(linxyz_angx.x, linxyz_angx.y, linxyz_angx.z, 0.f);
//	angVel = make_float4(linxyz_angx.w, angyz_lindxy.x, angyz_lindxy.y, 0.f);
//}
// Accumulated delta-velocity slot for this body; the second float4 lives
// numSolverBodies entries after the first.
const PxU32 readIndex = solverCoreDesc->accumulatedBodyDeltaVOffset + a;
float4 v0 = solverBodyVelocity[readIndex];
float4 v1 = solverBodyVelocity[readIndex + numSolverBodies];
// PT: TODO: PGS version doesn't use tmp v0/v1 values here, why?
// Unpack: v0 = (lin.x, lin.y, lin.z, ang.x), v1 = (ang.y, ang.z, ...).
float4 linVel = make_float4(v0.x, v0.y, v0.z, 0.f);
float4 angVel = make_float4(v0.w, v1.x, v1.y, 0.f);
const PxU32 staticTouchCount = islandStaticTouchCounts[islandIds[nodeIndex]];
//this array need to be copied back to CPU
PxgSolverBodySleepData& sleepData = solverCoreDesc->solverBodySleepDataPool[a];
// Finalizes velocities, applies the delta pose and updates sleep state.
integrateCoreTGS(motionLinVel, motionAngVel, inverseInertia, linVel, angVel, body2World, deltaTransform, data, bodySim, sleepData, sqrtInvInertia,
sharedDesc->dt, sharedDesc->invDtF32, solverCoreDesc->enableStabilization, staticTouchCount != 0, numCountedInteractions[nodeIndex],
nodeIndex);
// PT: TODO: why do we write out the vels & pose to 2 different buffers?
outSolverVelocity[a] = linVel;
outSolverVelocity[a + numSolverBodies] = angVel;
outBody2World[a] = body2World;
// PT: for acceleration getters (eENABLE_BODY_ACCELERATIONS)
PxgBodySimVelocities* prevVelocities = solverCoreDesc->mBodySimPrevVelocitiesBufferDeviceData;
if(prevVelocities)
{
// NOTE: this snapshot must stay before the write-back below, which
// overwrites bodySim's velocity fields.
PxgBodySimVelocities& prev = prevVelocities[nodeIndex];
prev.linearVelocity = bodySim.linearVelocityXYZ_inverseMassW;
prev.angularVelocity = bodySim.angularVelocityXYZ_maxPenBiasW;
}
//write back linear velocity, angular velocity to pxgbodysim
bodySim.linearVelocityXYZ_inverseMassW.x = linVel.x; bodySim.linearVelocityXYZ_inverseMassW.y = linVel.y; bodySim.linearVelocityXYZ_inverseMassW.z = linVel.z;
bodySim.angularVelocityXYZ_maxPenBiasW.x = angVel.x; bodySim.angularVelocityXYZ_maxPenBiasW.y = angVel.y; bodySim.angularVelocityXYZ_maxPenBiasW.z = angVel.z;
bodySim.body2World = body2World;
assert(body2World.isSane());
}
}

View File

@@ -0,0 +1,409 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __JOINT_CONSTRAINT_BLOCK_PREP_CUH__
#define __JOINT_CONSTRAINT_BLOCK_PREP_CUH__
#include "PxConstraintDesc.h"
#include "PxConstraint.h"
#include "DySolverConstraintTypes.h"
#include "DyCpuGpu1dConstraint.h"
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgCudaMemoryAllocator.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraintPrep.h"
#include "foundation/PxVec4.h"
#include "MemoryAllocator.cuh"
#include "PxgSolverKernelIndices.h"
using namespace physx;
namespace physx
{
	// Combined mass properties for a constrained body pair, with the joint's
	// per-body mass/inertia scale factors already folded in.
	// lin0_ang0_lin1_ang1 packs (lin scale body0, ang scale body0,
	// lin scale body1, ang scale body1) in x/y/z/w.
	struct PxgMassProps
	{
		float invMass0;			// body0 inverse mass * linear mass scale
		float invMass1;			// body1 inverse mass * linear mass scale
		float invInertiaScale0;	// body0 angular (inertia) scale
		float invInertiaScale1;	// body1 angular (inertia) scale

		__device__ PxgMassProps(const PxReal iMass0, const PxReal iMass1, const float4 lin0_ang0_lin1_ang1) :
			invMass0(iMass0 * lin0_ang0_lin1_ang1.x),
			invMass1(iMass1 * lin0_ang0_lin1_ang1.z),
			invInertiaScale0(lin0_ang0_lin1_ang1.y),
			invInertiaScale1(lin0_ang0_lin1_ang1.w)
		{
		}
	};
}
//
// See orthogonalize() in DyConstraintSetup.cpp for a general explanation
//
// Gram-Schmidt-style orthogonalization of constraint rows (GPU blocked layout,
// one data slot per thread, selected by 'threadIndex'). The first eqRowCount
// rows (in sorted order) are equality rows and form the basis; every row is
// made orthogonal — in the mass/inertia-weighted inner product — to the
// equality rows processed before it. 'rps' is accepted for signature symmetry
// with the callers but is not read here.
static __device__ void orthogonalize( PxU32* sortedRowIndices, PxgBlockConstraint1DVelocities* rvs, PxgBlockConstraint1DParameters* rps,
	PxVec3* angSqrtInvInertia0,
	PxVec3* angSqrtInvInertia1,
	PxU32 rowCount,
	PxU32 eqRowCount,
	const physx::PxgMassProps* m,
	const PxU32 threadIndex)
{
	using namespace physx;

	assert(eqRowCount<=6);

	// Basis built from the equality rows:
	//  linX/angX          = orthogonalized row axes,
	//  linXm/angXm        = the same axes pre-multiplied by inverse mass /
	//                       inertia scale and normalized by the row's
	//                       self-response (so the projection coefficient
	//                       below is a plain dot product),
	//  geomErr/velTarget  = the rows' orthogonalized error / target terms.
	PxVec3 lin1m[6], ang1m[6], lin1[6], ang1[6];
	PxVec3 lin0m[6], ang0m[6], lin0[6], ang0[6];
	PxReal geomErr[6];
	PxReal velTarget[6];

	for(PxU32 i=0;i<rowCount;i++)
	{
		const PxU32 index = sortedRowIndices[i];

		// Load row i for this thread's slot.
		const float4 linear0XYZ_geometricErrorW = rvs[index].linear0XYZ_geometricErrorW[threadIndex];
		const float4 linear1XYZ_minImpulseW = rvs[index].linear1XYZ_minImpulseW[threadIndex];
		const float4 angular0XYZ_velocityTargetW = rvs[index].angular0XYZ_velocityTargetW[threadIndex];
		const float4 angular1XYZ_maxImpulseW = rvs[index].angular1XYZ_maxImpulseW[threadIndex];

		PxVec3 l0(linear0XYZ_geometricErrorW.x, linear0XYZ_geometricErrorW.y, linear0XYZ_geometricErrorW.z);
		PxVec3 a0(angular0XYZ_velocityTargetW.x, angular0XYZ_velocityTargetW.y, angular0XYZ_velocityTargetW.z);
		PxVec3 l1(linear1XYZ_minImpulseW.x, linear1XYZ_minImpulseW.y, linear1XYZ_minImpulseW.z);
		PxVec3 a1(angular1XYZ_maxImpulseW.x, angular1XYZ_maxImpulseW.y, angular1XYZ_maxImpulseW.z);

		// Angular axes already pre-multiplied by sqrt inverse inertia
		// (filled in sorted order by preprocessRows()).
		PxVec3 angSqrtL0 = angSqrtInvInertia0[i];
		PxVec3 angSqrtL1 = angSqrtInvInertia1[i];

		PxReal g = linear0XYZ_geometricErrorW.w;	// geometric error
		PxReal T = angular0XYZ_velocityTargetW.w;	// velocity target

		// Row i only needs eliminating against the equality rows that precede
		// it — at most eqRowCount of them.
		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);

		for(PxU32 j=0;j<eliminationRows;j++)
		{
			// t = weighted inner product of row i with basis row j
			// (linear parts use the raw axes against the normalized
			// mass-scaled axes; angular parts use the inertia-premultiplied
			// axes).
			const PxVec3 s0 = l1.multiply(lin1m[j]) + l0.multiply(lin0m[j]);
			const PxVec3 s1 = angSqrtL1.multiply(ang1m[j]) + angSqrtL0.multiply(ang0m[j]);
			const PxVec3 s0s1 = s0+s1;
			float t = s0s1.x + s0s1.y + s0s1.z;

			// Subtract the projection onto basis row j from every component
			// of row i, including its error and target terms.
			l0 = l0 - (lin0[j] * t);
			a0 = a0 - (ang0[j] * t);
			l1 = l1 - (lin1[j] * t);
			a1 = a1 - (ang1[j] * t);
			g = g - (geomErr[j] * t);
			T = T - (velTarget[j] * t);
			angSqrtL0 = angSqrtL0 - (angSqrtInvInertia0[j] * t);
			angSqrtL1 = angSqrtL1 - (angSqrtInvInertia1[j] * t);
		}

		// Store the orthogonalized row back into the blocked row data; the
		// w components (min/max impulse) are preserved unchanged.
		rvs[index].linear0XYZ_geometricErrorW[threadIndex] = make_float4(l0.x, l0.y, l0.z, g);
		rvs[index].angular0XYZ_velocityTargetW[threadIndex] = make_float4(a0.x, a0.y, a0.z, T);
		rvs[index].linear1XYZ_minImpulseW[threadIndex] = make_float4(l1.x, l1.y, l1.z, linear1XYZ_minImpulseW.w);
		rvs[index].angular1XYZ_maxImpulseW[threadIndex] = make_float4(a1.x, a1.y, a1.z, angular1XYZ_maxImpulseW.w);
		angSqrtInvInertia0[i] = angSqrtL0;
		angSqrtInvInertia1[i] = angSqrtL1;

		if(i<eqRowCount)
		{
			// Row i is itself an equality row: append it to the basis.
			lin0[i] = l0;
			ang0[i] = a0;
			geomErr[i] = g;
			velTarget[i] = T;
			lin1[i] = l1;
			ang1[i] = a1;
			// NOTE(review): these two stores duplicate the ones just above —
			// harmless but redundant.
			angSqrtInvInertia0[i] = angSqrtL0;
			angSqrtInvInertia1[i] = angSqrtL1;

			// Mass/inertia-scaled axes...
			const PxVec3 l0m = l0 * m->invMass0;
			const PxVec3 l1m = l1 * m->invMass1;
			const PxVec3 a0m = angSqrtL0 * m->invInertiaScale0;
			const PxVec3 a1m = angSqrtL1 * m->invInertiaScale1;

			// ...normalized by the row's self-response s.
			const PxVec3 s0 = l0.multiply(l0m) + l1.multiply(l1m);
			const PxVec3 s1 = a0m.multiply(angSqrtL0) + a1m.multiply(angSqrtL1);
			const PxVec3 s0s1 = s0 + s1;
			const float s = s0s1.x + s0s1.y + s0s1.z;
			const float a = s > 0 ? 1.f/s : 0.f; // with mass scaling, it's possible for the inner product of a row to be zero

			lin0m[i] = l0m * a;
			ang0m[i] = a0m * a;
			lin1m[i] = l1m * a;
			ang1m[i] = a1m * a;
		}
	}
}
// Shared prep pass for PGS and TGS joint constraint blocks:
//  1. sorts the row indices by ascending solveHint,
//  2. pre-multiplies each row's angular axes by the bodies' sqrt inverse
//     inertia (results stored in sorted order in angSqrtInvInertia0/1),
//  3. unless preprocessing is disabled, orthogonalizes runs of rows that
//     share an equality-type solve-hint group (see orthogonalize() above
//     and DyConstraintSetup.cpp).
static __device__ void preprocessRows(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData,
	PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	PxVec3* angSqrtInvInertia0, PxVec3* angSqrtInvInertia1,
	const physx::PxgSolverBodyPrepData* bd0, const physx::PxgSolverBodyPrepData* bd1,
	PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	const PxU32 threadIndex, bool disablePreprocessing)
{
	using namespace physx;

	//Px1DConstraint* sorted[MAX_CONSTRAINTS];

	// Insertion-sort the row indices by ascending solveHint.
	// j is maxed at 12, typically around 7, so insertion sort is fine
	for(PxU32 i=0; i<constraintData->mNumRows[threadIndex]; i++)
	{
		PxgBlockConstraint1DParameters& r = rowParameters[i];
		PxU32 j = i;
		for(;j>0 && r.solveHint[threadIndex] < rowParameters[sortedRowIndices[j-1]].solveHint[threadIndex]; j--)
			sortedRowIndices[j] = sortedRowIndices[j-1];
		sortedRowIndices[j] = i;
	}

	/*for(PxU32 i=1;i<constraintData->mNumRows[threadIndex];i++)
		assert(sorted[i-1]->solveHint[threadIndex] <= sorted[i]->solveHint[threadIndex]);*/

	// Combined mass properties: bodies' inverse masses (w of
	// initialLinVelXYZ_invMassW) with the joint's mass-scale factors applied.
	PxgMassProps m(bd0->initialLinVelXYZ_invMassW.w, bd1->initialLinVelXYZ_invMassW.w, reinterpret_cast<float4*>(constraintData->mInvMassScale)[threadIndex]);

	const PxMat33 i0 = txIData0->sqrtInvInertia;
	const PxMat33 i1 = txIData1->sqrtInvInertia;

	// Pre-multiply the angular axes by sqrt inverse inertia; note the output
	// arrays are indexed by sorted position i, not by the original row index.
	for(PxU32 i = 0; i < constraintData->mNumRows[threadIndex]; ++i)
	{
		/*const PxVec3 angDelta0 = bd0->sqrtInvInertia * sorted[i]->angular0[threadIndex];
		const PxVec3 angDelta1 = bd1->sqrtInvInertia * sorted[i]->angular1[threadIndex];*/
		PxgBlockConstraint1DVelocities& rv = rowVelocities[sortedRowIndices[i]];
		const float4 angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];
		const PxVec3 angular0(angular0XYZ_velocityTargetW.x, angular0XYZ_velocityTargetW.y, angular0XYZ_velocityTargetW.z);
		const PxVec3 angular1(angular1XYZ_maxImpulseW.x, angular1XYZ_maxImpulseW.y, angular1XYZ_maxImpulseW.z);
		/*const PxVec3 angDelta0 = bd0->sqrtInvInertia * angular0;
		const PxVec3 angDelta1 = bd1->sqrtInvInertia * angular1;
		angSqrtInvInertia0[i] = angDelta0;
		angSqrtInvInertia1[i] = angDelta1;*/
		angSqrtInvInertia0[i] = i0 * angular0;
		angSqrtInvInertia1[i] = i1 * angular1;
	}

	if (!disablePreprocessing)
	{
		//MassProps m(bd0, bd1, ims);

		// Walk runs of sorted rows sharing the same major solve-hint group
		// (solveHint >> 8) and orthogonalize the equality groups.
		for (PxU32 i = 0; i < constraintData->mNumRows[threadIndex];)
		{
			PxgBlockConstraint1DParameters& rp = rowParameters[sortedRowIndices[i]];
			const PxU32 groupMajorId = PxU32(rp.solveHint[threadIndex] >> 8), start = i++;
			while (i < constraintData->mNumRows[threadIndex] && PxU32(rowParameters[sortedRowIndices[i]].solveHint[threadIndex] >> 8) == groupMajorId)
				i++;

			// NOTE(review): group ids 4 and 8 presumably correspond to the
			// equality solve hints (PxConstraintSolveHint value >> 8) —
			// confirm against PxConstraintDesc.h.
			if (groupMajorId == 4 || (groupMajorId == 8))
			{
				PxU32 bCount = start; // count of bilateral constraints
				// Rows whose low hint byte is zero are the bilateral
				// (equality) rows; they come first within the group.
				for (; bCount < i && (rowParameters[sortedRowIndices[bCount]].solveHint[threadIndex] & 255) == 0; bCount++)
					;
				orthogonalize(sortedRowIndices + start, rowVelocities, rowParameters, angSqrtInvInertia0 + start, angSqrtInvInertia1 + start, i - start, bCount - start, &m, threadIndex);
			}
		}
	}
}
// Fills one PGS solver constraint row: packs the Jacobian axes together with
// the impulse limits / unit responses into 'scon', derives the PGS solver
// coefficients from the row's spring & restitution setup into 'smod', and
// seeds the accumulators. 'cfm' is accepted for interface symmetry but,
// matching the previous implementation, is not used here.
static __device__ void intializeBlock1D(const physx::PxgBlockConstraint1DVelocities& rv,
	const physx::PxgBlockConstraint1DParameters& rp,
	float jointSpeedForRestitutionBounce,
	float initJointSpeed,
	float resp0,
	float resp1,
	float erp,
	float dt,
	float recipdt,
	PxgBlockSolverConstraint1DCon& scon,
	PxgBlockSolverConstraint1DMod& smod,
	const PxVec3& _linear0, const PxVec3& _linear1,
	const PxVec3& _angular0, const PxVec3& _angular1,
	const PxReal _minImpulse, const PxReal _maxImpulse,
	const PxReal cfm,
	const PxU32 threadIndex
	)
{
	using namespace physx;

	// Per-thread row inputs needed to derive the solver constants.
	const PxU16 rowFlags = rp.flags[threadIndex];
	const PxReal stiffness = rp.mods.spring.stiffness[threadIndex];
	const PxReal damping = rp.mods.spring.damping[threadIndex];
	const PxReal restitution = rp.mods.bounce.restitution[threadIndex];
	const PxReal velThreshold = rp.mods.bounce.velocityThreshold[threadIndex];
	const PxReal geomError = rv.linear0XYZ_geometricErrorW[threadIndex].w;
	const PxReal targetVel = rv.angular0XYZ_velocityTargetW[threadIndex].w;

	// Solver constants (spring / bounce / hard-constraint dependent).
	PxReal coeff0, coeff1;
	queryReduced1dConstraintSolverConstantsPGS(rowFlags, stiffness, damping, restitution, velThreshold, geomError,
		targetVel, jointSpeedForRestitutionBounce, erp, dt, recipdt, coeff0, coeff1);

	// Jacobian axes packed with impulse limits (w of lin0/lin1) and unit
	// responses (w of ang0/ang1).
	scon.lin0XYZ_minImpulse[threadIndex] = make_float4(_linear0.x, _linear0.y, _linear0.z, _minImpulse);
	scon.lin1XYZ_maxImpulse[threadIndex] = make_float4(_linear1.x, _linear1.y, _linear1.z, _maxImpulse);
	scon.ang0XYZ_resp0[threadIndex] = make_float4(_angular0.x, _angular0.y, _angular0.z, resp0);
	scon.ang1XYZ_resp1[threadIndex] = make_float4(_angular1.x, _angular1.y, _angular1.z, resp1);
	scon.initJointSpeed[threadIndex] = initJointSpeed;

	smod.coeff0[threadIndex] = coeff0;
	smod.coeff1[threadIndex] = coeff1;
	smod.appliedForce[threadIndex] = 0;
	smod.residual[threadIndex] = 0;

	// Instead of setting the flag to zero as in the previous implementation,
	// the flag is used to mark spring and acceleration spring.
	PxU32 modFlags = 0;
	if (rowFlags & Px1DConstraintFlag::eSPRING)
		modFlags |= DY_SC_FLAG_SPRING;
	if (rowFlags & Px1DConstraintFlag::eACCELERATION_SPRING)
		modFlags |= DY_SC_FLAG_ACCELERATION_SPRING;
	smod.flags[threadIndex] = modFlags;
}
// Builds the PGS solver rows for one joint, iterating the rows in sorted
// order (sortedRowIndices and the angSqrtInvInertia arrays come from
// preprocessRows()). Computes each row's unit responses and initial joint
// speed, then delegates the packing to intializeBlock1D().
static __device__ void setUp1DConstraintBlock(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	PxVec3* angSqrtInvInertia0, PxVec3* angSqrtInvInertia1, PxgBlockSolverConstraint1DCon* constraintsCon, PxgBlockSolverConstraint1DMod* constraintsMod,
	float dt, float recipdt, const physx::PxgSolverBodyPrepData* sBodyData0, const physx::PxgSolverBodyPrepData* sBodyData1,
	const PxU32 threadIndex)
{
	using namespace physx;

	//PxU32 stride = sizeof(PxgSolverConstraint1D);

	// Full error correction per step for PGS.
	const PxReal erp = 1.0f;

	const float4 sBodyData0_initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 sBodyData1_initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	for(PxU32 i=0;i<numRows;i++)
	{
		PxgBlockSolverConstraint1DCon& ccon = constraintsCon[i];
		PxgBlockSolverConstraint1DMod& cmod = constraintsMod[i];

		//Pxg1DConstraintBlock& c = *sorted[i];
		// Row i in sorted order; the angSqrtInvInertia arrays are already
		// indexed by sorted position.
		const PxU32 index = sortedRowIndices[i];
		PxgBlockConstraint1DParameters& rp = rowParameters[index];
		PxgBlockConstraint1DVelocities& rv = rowVelocities[index];

		const float4 c_linear0XYZ_geometricErrorW = rv.linear0XYZ_geometricErrorW[threadIndex];
		const float4 c_linear1XYZ_minImpulseW = rv.linear1XYZ_minImpulseW[threadIndex];
		const float4 c_angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 c_angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];

		const PxVec3 clin0(c_linear0XYZ_geometricErrorW.x, c_linear0XYZ_geometricErrorW.y, c_linear0XYZ_geometricErrorW.z);
		const PxVec3 clin1(c_linear1XYZ_minImpulseW.x, c_linear1XYZ_minImpulseW.y, c_linear1XYZ_minImpulseW.z);
		const PxVec3 cang0(c_angular0XYZ_velocityTargetW.x, c_angular0XYZ_velocityTargetW.y, c_angular0XYZ_velocityTargetW.z);
		const PxVec3 cang1(c_angular1XYZ_maxImpulseW.x, c_angular1XYZ_maxImpulseW.y, c_angular1XYZ_maxImpulseW.z);

		// Angular axes pre-multiplied by sqrt inverse inertia (from preprocessRows()).
		const PxVec3 ang0 = angSqrtInvInertia0[i];
		const PxVec3 ang1 = angSqrtInvInertia1[i];

		// Drive limits may be authored as forces; convert to impulses if so.
		PxReal minImpulse;
		PxReal maxImpulse;
		{
			const bool hasDriveLimit = rp.flags[threadIndex] & Px1DConstraintFlag::eHAS_DRIVE_LIMIT;
			const bool driveLimitsAreForces = constraintData->mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES;
			Dy::computeMinMaxImpulseOrForceAsImpulse(
				c_linear1XYZ_minImpulseW.w, c_angular1XYZ_maxImpulseW.w,
				hasDriveLimit, driveLimitsAreForces, dt,
				minImpulse, maxImpulse);
		}

		// The raw (non-inertia-scaled) angular axis is kept for force write-back.
		cmod.ang0Writeback[threadIndex] = cang0;

		// Per-row unit responses, with the joint's mass-scale factors applied.
		const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;
		PxReal resp0 = clin0.magnitudeSquared() * sBodyData0_initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x + ang0.magnitudeSquared() * lin0_ang0_lin1_ang1.y;
		PxReal resp1 = clin1.magnitudeSquared() * sBodyData1_initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z + ang1.magnitudeSquared() * lin0_ang0_lin1_ang1.w;

		// Relative joint-space speed of the two bodies along this row.
		// NOTE(review): the same value is passed for both the restitution
		// speed and the initial joint speed below — confirm this matches the
		// CPU PGS path for kinematic bodies.
		const PxReal initJointSpeed = sBodyData0->projectVelocity(clin0, cang0) - sBodyData1->projectVelocity(clin1, cang1);

		// Following the previous implementation, cfm is not used in unitResponse, thus it is set to 0.
		intializeBlock1D(rv, rp, initJointSpeed, initJointSpeed, resp0, resp1, erp, dt, recipdt, ccon, cmod, clin0, clin1, ang0, ang1, minImpulse, maxImpulse, 0.0f, threadIndex);

		if(rp.flags[threadIndex] & Px1DConstraintFlag::eOUTPUT_FORCE)
			cmod.flags[threadIndex] |= DY_SC_FLAG_OUTPUT_FORCE;
	}
}
// Top-level PGS joint-constraint prep for one blocked batch slot:
// fills the per-joint solver header (masses, break impulses, row count),
// then runs the shared row preprocessing (sort + inertia premultiply +
// optional orthogonalization) and the per-row setup.
// Requires blockDim.x == NbThreads (shared scratch is indexed by threadIdx.x).
template<int NbThreads>
static __device__ void setupSolverConstraintBlockGPU(PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	const physx::PxgSolverBodyPrepData* sBodyData0, const physx::PxgSolverBodyPrepData* sBodyData1, PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	float dt, float recipdt, PxgBlockConstraintBatch& batch,
	const PxU32 threadIndex, PxgBlockSolverConstraint1DHeader* header, PxgBlockSolverConstraint1DCon* rowsCon, PxgBlockSolverConstraint1DMod* rowsMod,
	const PxgSolverConstraintManagerConstants& managerConstants)
{
	using namespace physx;

	// Joint mass-scale factors (lin0/ang0/lin1/ang1) and body inverse masses.
	const float4 massScale = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;
	const float invMass0 = sBodyData0->initialLinVelXYZ_invMassW.w;
	const float invMass1 = sBodyData1->initialLinVelXYZ_invMassW.w;

	// Breakage inputs: xyz = world-space anchor offset, w = break force.
	const float4 raWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndex];
	const float4 rbWorld_angBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndex];
	const float linBreakForce = raWorld_linBreakForce.w;
	const float angBreakForce = rbWorld_angBreakForce.w;

	// Per-joint header. A distance constraint might have zero rows.
	header->rowCounts[threadIndex] = PxU8(constraintData->mNumRows[threadIndex]);
	header->writeBackOffset[threadIndex] = managerConstants.mConstraintWriteBackIndex;
	header->body0WorldOffset_linBreakImpulse[threadIndex] = make_float4(raWorld_linBreakForce.x, raWorld_linBreakForce.y, raWorld_linBreakForce.z, linBreakForce * dt);
	header->angBreakImpulse[threadIndex] = angBreakForce * dt;
	header->invMass0D0[threadIndex] = invMass0 * massScale.x;
	header->invMass1D1[threadIndex] = invMass1 * massScale.z;
	header->invInertiaScale0[threadIndex] = massScale.y;
	header->invInertiaScale1[threadIndex] = massScale.w;
	// Breakable if either break force is finite.
	header->breakable[threadIndex] = PxU8((linBreakForce != PX_MAX_F32) || (angBreakForce != PX_MAX_F32));

	// Per-thread scratch: sorted row order and inertia-premultiplied angular axes.
	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia0[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia1[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	const bool disablePreprocessing = !!(constraintData->mFlags[threadIndex] & PxConstraintFlag::eDISABLE_PREPROCESSING);

	preprocessRows(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters,
		angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		sBodyData0, sBodyData1, txIData0, txIData1, threadIndex, disablePreprocessing);

	setUp1DConstraintBlock(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters,
		angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		rowsCon, rowsMod, dt, recipdt, sBodyData0, sBodyData1, threadIndex);
}
#endif

View File

@@ -0,0 +1,321 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __JOINT_CONSTRAINT_BLOCK_PREP_TGS_CUH__
#define __JOINT_CONSTRAINT_BLOCK_PREP_TGS_CUH__
#include "PxConstraintDesc.h"
#include "PxConstraint.h"
#include "DySolverConstraintTypes.h"
#include "DyCpuGpu1dConstraint.h"
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgCudaMemoryAllocator.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraintPrep.h"
#include "foundation/PxVec4.h"
#include "MemoryAllocator.cuh"
#include "PxgSolverKernelIndices.h"
#include "jointConstraintBlockPrep.cuh"
using namespace physx;
// Initializes one TGS solver constraint row: derives the TGS solver constants
// (bias scale, initial bias, velocity multiplier, target velocity) and packs
// them into the w components of the row's axis float4s in 'scon', converts
// drive limits to impulses, and raises the row's solver flags. When the row's
// solve hint is eROTATIONAL_EQUALITY (and preprocessing is enabled) it also
// stores an orthogonalization axis pair into hdr at slot 'eqCount' and
// returns 1; otherwise returns 0.
static __device__ PxU32 intializeBlock1DTGS
(const physx::PxgBlockConstraint1DVelocities& rv, const physx::PxgBlockConstraint1DParameters& rp, const PxgBlockConstraint1DData& constraintData,
	const PxReal jointSpeedForRestitutionBounce, const PxReal initJointSpeed,
	const PxReal unitResponse, const PxReal minRowResponse,
	const PxReal erp, const PxReal lengthScale,
	const PxReal stepDt, const PxReal simDt, const PxReal recipStepDt, const PxReal recipSimDt,
	const PxVec3& angSqrtInvInertia0, const PxVec3& angSqrtInvInertia1,
	const PxReal invInertiaScale0, const PxReal invInertiaScale1,
	const PxU32 eqCount, const PxU32 threadIndex, const bool disablePreprocessing,
	PxgTGSBlockSolverConstraint1DHeader& hdr,
	PxgTGSBlockSolverConstraint1DCon& scon)
{
	using namespace physx;

	//Copy min and max impulse because the convention of
	//function inputs and outputs is very confusing.
	PxReal maxBiasVelocity = 0.0f;
	PxReal recipUnitResponse = 0.0f;
	const PxReal geometricError = rv.linear0XYZ_geometricErrorW[threadIndex].w;
	Dy::Constraint1dSolverConstantsTGS desc = {0.0f, 0.0f, 0.0f, 0.0f};
	{
		// Derive the TGS solver constants from the row's spring / restitution
		// parameters (shared math with the CPU path in DyCpuGpu1dConstraint).
		const PxU16 flags = PxU16(rp.flags[threadIndex]);
		const PxReal stiffness = rp.mods.spring.stiffness[threadIndex];
		const PxReal damping = rp.mods.spring.damping[threadIndex];
		const PxReal restitution = rp.mods.bounce.restitution[threadIndex];
		const PxReal bounceVelocityThreshold = rp.mods.bounce.velocityThreshold[threadIndex];
		const PxReal velocityTarget = rv.angular0XYZ_velocityTargetW[threadIndex].w;

		maxBiasVelocity = Dy::computeMaxBiasVelocityTGS(flags, jointSpeedForRestitutionBounce, bounceVelocityThreshold,
			restitution, geometricError, false, lengthScale, recipSimDt);

		recipUnitResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);

		desc = Dy::compute1dConstraintSolverConstantsTGS(
			flags,
			stiffness, damping,
			restitution, bounceVelocityThreshold,
			geometricError, velocityTarget,
			jointSpeedForRestitutionBounce, initJointSpeed,
			unitResponse, recipUnitResponse,
			erp,
			stepDt, recipStepDt);
	}

	//Write to the w-components of each float4.
	//set the biasScale
	float4 lin1XYZ_biasScale = rv.linear1XYZ_minImpulseW[threadIndex];
	lin1XYZ_biasScale.w = desc.biasScale;
	//set the initBias
	float4 lin0XYZ_initBiasW = rv.linear0XYZ_geometricErrorW[threadIndex];
	lin0XYZ_initBiasW.w = desc.error;
	//set the velMultiplier
	float4 ang0XYZ_velMultiplierW = rv.angular0XYZ_velocityTargetW[threadIndex];
	ang0XYZ_velMultiplierW.w = desc.velMultiplier;
	//set the velTarget
	float4 ang1XYZ_velTargetW = rv.angular1XYZ_maxImpulseW[threadIndex];
	ang1XYZ_velTargetW.w = desc.targetVel;

	// Purely linear rows: zero the angular axes and disable the angular
	// error contribution.
	PxReal angularErrorScale = 1.f;
	if (!(rp.flags[threadIndex] & Px1DConstraintFlag::eANGULAR_CONSTRAINT))
	{
		ang0XYZ_velMultiplierW.x = ang0XYZ_velMultiplierW.y = ang0XYZ_velMultiplierW.z = 0.f;
		ang1XYZ_velTargetW.x = ang1XYZ_velTargetW.y = ang1XYZ_velTargetW.z = 0.f;
		angularErrorScale = 0.f;
	}

	scon.lin0XYZ_initBiasOrCoeff0[threadIndex] = lin0XYZ_initBiasW;
	scon.lin1XYZ_biasScaleOrCoeff1[threadIndex] = lin1XYZ_biasScale;
	scon.ang0XYZ_velMultiplierOrCoeff2[threadIndex] = ang0XYZ_velMultiplierW;
	scon.ang1XYZ_velTargetOrCoeff3[threadIndex] = ang1XYZ_velTargetW;

	scon.maxBias[threadIndex] = maxBiasVelocity;
	scon.angularErrorScale[threadIndex] = angularErrorScale;
	scon.appliedForce[threadIndex] = 0.f;
	scon.residual[threadIndex] = 0.0f;

	// Drive limits may be authored as forces; convert to impulses over the
	// full simulation step if so.
	const bool hasDriveLimit = rp.flags[threadIndex] & Px1DConstraintFlag::eHAS_DRIVE_LIMIT;
	const bool driveLimitsAreForces = constraintData.mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES;
	Dy::computeMinMaxImpulseOrForceAsImpulse(
		rv.linear1XYZ_minImpulseW[threadIndex].w, rv.angular1XYZ_maxImpulseW[threadIndex].w,
		hasDriveLimit, driveLimitsAreForces, simDt,
		scon.minImpulse[threadIndex], scon.maxImpulse[threadIndex]);

	PxU32 outFlags = 0;
	const PxU32 solveHint = rp.solveHint[threadIndex];
	Dy::raiseInternalFlagsTGS(rp.flags[threadIndex], solveHint, outFlags);

	PxU32 ret = 0;
	if (!disablePreprocessing)
	{
		// Rotational equality rows contribute an ortho axis to the header so
		// the solver can keep later rows orthogonal to them; other equality
		// rows are merely flagged as ortho targets.
		if (solveHint == PxConstraintSolveHint::eROTATIONAL_EQUALITY)
		{
			ret = 1;
			outFlags |= DY_SC_FLAG_ROT_EQ;
			hdr.angOrthoAxis0_recipResponseW[eqCount][threadIndex] = make_float4(angSqrtInvInertia0.x * invInertiaScale0, angSqrtInvInertia0.y * invInertiaScale0,
				angSqrtInvInertia0.z * invInertiaScale0, recipUnitResponse);
			hdr.angOrthoAxis1_ErrorW[eqCount][threadIndex] = make_float4(angSqrtInvInertia1.x * invInertiaScale1, angSqrtInvInertia1.y * invInertiaScale1,
				angSqrtInvInertia1.z * invInertiaScale1, geometricError);
		}
		else if(solveHint & PxConstraintSolveHint::eEQUALITY)
			outFlags |= DY_SC_FLAG_ORTHO_TARGET;
	}

	scon.flags[threadIndex] = outFlags;

	return ret;
}
// Builds the TGS solver rows for one joint, iterating in sorted order
// (sortedRowIndices and the angSqrtInvInertia arrays come from
// preprocessRows()). Computes each row's unit response and joint speeds
// (with kinematic handling), delegates packing to intializeBlock1DTGS(),
// zero-fills the unused ortho-axis slots (up to 3), and returns the number
// of rotational-equality ortho axes recorded in the header.
static __device__ PxU32 setUp1DConstraintBlockTGS
(PxU32* sortedRowIndices, PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	PxVec3* angSqrtInvInertias0, PxVec3* angSqrtInvInertias1, PxgTGSBlockSolverConstraint1DHeader& header, PxgTGSBlockSolverConstraint1DCon* constraintsCon,
	float stepDt, float recipStepDt, float simDt, float recipSimDt, float biasCoefficient, const physx::PxgSolverBodyData* sBodyData0, const physx::PxgSolverBodyData* sBodyData1,
	const PxU32 threadIndex, const PxReal lengthScale, bool disablePreprocessing)
{
	using namespace physx;

	//PxU32 stride = sizeof(PxgSolverConstraint1D);

	// TGS applies only a fraction of the error per substep.
	const PxReal erp = 0.5f * biasCoefficient;

	const float4 sBodyData0_initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 sBodyData1_initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	// Kinematic bodies need special joint-speed handling (their velocity is
	// prescribed, not solved).
	const bool isKinematic0 = !!(sBodyData0->flags & PxRigidBodyFlag::eKINEMATIC);
	const bool isKinematic1 = !!(sBodyData1->flags & PxRigidBodyFlag::eKINEMATIC);

	PxU32 eqCount = 0;

	for(PxU32 i=0;i<numRows;i++)
	{
		PxgTGSBlockSolverConstraint1DCon& ccon = constraintsCon[i];

		//Pxg1DConstraintBlock& c = *sorted[i];
		// Row i in sorted order; the angSqrtInvInertia arrays are already
		// indexed by sorted position.
		const PxU32 index = sortedRowIndices[i];
		PxgBlockConstraint1DParameters& rp = rowParameters[index];
		PxgBlockConstraint1DVelocities& rv = rowVelocities[index];

		const float4 c_linear0XYZ_geometricErrorW = rv.linear0XYZ_geometricErrorW[threadIndex];
		const float4 c_linear1XYZ_minImpulseW = rv.linear1XYZ_minImpulseW[threadIndex];
		const float4 c_angular0XYZ_velocityTargetW = rv.angular0XYZ_velocityTargetW[threadIndex];
		const float4 c_angular1XYZ_maxImpulseW = rv.angular1XYZ_maxImpulseW[threadIndex];

		const PxVec3 clin0(c_linear0XYZ_geometricErrorW.x, c_linear0XYZ_geometricErrorW.y, c_linear0XYZ_geometricErrorW.z);
		const PxVec3 clin1(c_linear1XYZ_minImpulseW.x, c_linear1XYZ_minImpulseW.y, c_linear1XYZ_minImpulseW.z);
		const PxVec3 cang0(c_angular0XYZ_velocityTargetW.x, c_angular0XYZ_velocityTargetW.y, c_angular0XYZ_velocityTargetW.z);
		const PxVec3 cang1(c_angular1XYZ_maxImpulseW.x, c_angular1XYZ_maxImpulseW.y, c_angular1XYZ_maxImpulseW.z);

		// Angular axes pre-multiplied by sqrt inverse inertia (from preprocessRows()).
		const PxVec3 angSqrtInvInertia0 = angSqrtInvInertias0[i];
		const PxVec3 angSqrtInvInertia1 = angSqrtInvInertias1[i];

		const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

		// Unit response of the row, with the joint's mass-scale factors applied.
		PxReal unitResponse;
		{
			const PxReal resp0 = clin0.magnitudeSquared() * sBodyData0_initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x + angSqrtInvInertia0.magnitudeSquared() * lin0_ang0_lin1_ang1.y;
			const PxReal resp1 = clin1.magnitudeSquared() * sBodyData1_initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z + angSqrtInvInertia1.magnitudeSquared() * lin0_ang0_lin1_ang1.w;
			unitResponse = resp0 + resp1;
		}

		// Joint-space speeds along the row, split into the value used for
		// restitution/bounce and the initial speed, with kinematic handling.
		PxReal jointSpeedForRestitutionBounce;
		PxReal initJointSpeed;
		{
			const float vel0 = sBodyData0->projectVelocity(clin0, cang0);
			const float vel1 = sBodyData1->projectVelocity(clin1, cang1);
			Dy::computeJointSpeedTGS(
				vel0, isKinematic0, vel1, isKinematic1,
				jointSpeedForRestitutionBounce, initJointSpeed);
		}

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_MIN_RESPONSE;

		// Returns 1 when the row recorded a rotational-equality ortho axis.
		eqCount += intializeBlock1DTGS(
			rv, rp, *constraintData,
			jointSpeedForRestitutionBounce, initJointSpeed,
			unitResponse, minRowResponse, erp, lengthScale,
			stepDt, simDt, recipStepDt, recipSimDt,
			angSqrtInvInertia0, angSqrtInvInertia1,
			lin0_ang0_lin1_ang1.y, lin0_ang0_lin1_ang1.w,
			eqCount, threadIndex, disablePreprocessing,
			header, ccon);
	}

	// Zero the unused ortho-axis slots so the solver reads well-defined data.
	for (PxU32 i = eqCount; i < 3; ++i)
	{
		header.angOrthoAxis0_recipResponseW[i][threadIndex] = make_float4(0.f);
		header.angOrthoAxis1_ErrorW[i][threadIndex] = make_float4(0.f);
	}

	return eqCount;
}
// Top-level TGS joint-constraint prep for one blocked batch slot: fills the
// per-joint TGS header (masses, break impulses, anchors), promotes angular
// rows' equality/inequality hints to their rotational variants, then runs the
// shared row preprocessing and the TGS per-row setup.
// Requires blockDim.x == NbThreads (shared scratch is indexed by threadIdx.x).
template<int NbThreads>
static __device__ void setupSolverConstraintBlockGPUTGS(PxgBlockConstraint1DData* constraintData, PxgBlockConstraint1DVelocities* rowVelocities, PxgBlockConstraint1DParameters* rowParameters,
	const physx::PxgSolverBodyData* sBodyData0, const physx::PxgSolverBodyData* sBodyData1, PxgSolverTxIData* txIData0, PxgSolverTxIData* txIData1,
	float dt, float recipdt, float totalDt, float recipTotalDt, float lengthScale, float biasCoefficient, PxgBlockConstraintBatch& batch,
	const PxU32 threadIndex, PxgTGSBlockSolverConstraint1DHeader* header, PxgTGSBlockSolverConstraint1DCon* rowsCon, const PxgSolverConstraintManagerConstants& managerConstants)
{
	using namespace physx;

	//distance constraint might have zero number of rows
	const PxU32 numRows = constraintData->mNumRows[threadIndex];

	// Packed header byte fields: x = row count, y = breakable flag,
	// z = ortho-axis count (filled after row setup).
	uchar4 rowCounts_breakable_orthoAxisCount;
	rowCounts_breakable_orthoAxisCount.x = PxU8(numRows);

	header->writeBackOffset[threadIndex] = managerConstants.mConstraintWriteBackIndex;

	// Joint mass-scale factors and body inverse masses.
	const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;
	const float4 initialLinVelXYZ_invMassW0 = sBodyData0->initialLinVelXYZ_invMassW;
	const float4 initialLinVelXYZ_invMassW1 = sBodyData1->initialLinVelXYZ_invMassW;

	// Breakage inputs: xyz = world-space anchor offset, w = break force.
	// Break impulses use the full step duration (totalDt).
	const float4 rAWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndex];
	const float4 rBWorld_AngBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndex];

	const float linBreakImpulse = rAWorld_linBreakForce.w * totalDt;
	const float angBreakImpulse = rBWorld_AngBreakForce.w * totalDt;

	header->linBreakImpulse[threadIndex] = linBreakImpulse;
	header->angBreakImpulse[threadIndex] = angBreakImpulse;

	// Anchors packed with the scaled inverse masses in w.
	const float invMass0 = initialLinVelXYZ_invMassW0.w * lin0_ang0_lin1_ang1.x;
	const float invMass1 = initialLinVelXYZ_invMassW1.w * lin0_ang0_lin1_ang1.z;
	header->rAWorld_invMass0D0[threadIndex] = make_float4(rAWorld_linBreakForce.x, rAWorld_linBreakForce.y, rAWorld_linBreakForce.z, invMass0);
	header->rBWorld_invMass1D1[threadIndex] = make_float4(rBWorld_AngBreakForce.x, rBWorld_AngBreakForce.y, rBWorld_AngBreakForce.z, invMass1);
	header->invInertiaScale0[threadIndex] = lin0_ang0_lin1_ang1.y;
	header->invInertiaScale1[threadIndex] = lin0_ang0_lin1_ang1.w;

	// Breakable if either break force is finite.
	rowCounts_breakable_orthoAxisCount.y = PxU8((rAWorld_linBreakForce.w != PX_MAX_F32) || (rBWorld_AngBreakForce.w != PX_MAX_F32));

	// Promote angular rows' hints to the rotational variants BEFORE sorting,
	// so preprocessRows() groups and orthogonalizes them separately from the
	// linear equality rows.
	for (PxU32 i = 0; i < numRows; ++i)
	{
		if (rowParameters[i].flags[threadIndex] & Px1DConstraintFlag::eANGULAR_CONSTRAINT)
		{
			PxU32 hint = rowParameters[i].solveHint[threadIndex];
			if (hint == PxConstraintSolveHint::eEQUALITY)
				hint = PxConstraintSolveHint::eROTATIONAL_EQUALITY;
			else if (hint == PxConstraintSolveHint::eINEQUALITY)
				hint = PxConstraintSolveHint::eROTATIONAL_INEQUALITY;
			rowParameters[i].solveHint[threadIndex] = hint;
		}
	}

	// Per-thread scratch: sorted row order and inertia-premultiplied angular axes.
	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia0[NbThreads][Dy::MAX_CONSTRAINT_ROWS];
	__shared__ PxVec3 angSqrtInvInertia1[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	bool disablePreprocessing = !!(constraintData->mFlags[threadIndex] & PxConstraintFlag::eDISABLE_PREPROCESSING);

	preprocessRows(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		sBodyData0, sBodyData1, txIData0, txIData1, threadIndex, disablePreprocessing);

	rowCounts_breakable_orthoAxisCount.z = setUp1DConstraintBlockTGS(sortedRowIndices[threadIdx.x], constraintData, rowVelocities, rowParameters, angSqrtInvInertia0[threadIdx.x], angSqrtInvInertia1[threadIdx.x],
		*header, rowsCon, dt, recipdt, totalDt, recipTotalDt, biasCoefficient, sBodyData0, sBodyData1, threadIndex, lengthScale, disablePreprocessing);

	header->rowCounts_breakable_orthoAxisCount[threadIndex] = rowCounts_breakable_orthoAxisCount;
}
#endif

View File

@@ -0,0 +1,64 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "preIntegration.cuh"
// No-op host-side entry point; presumably referenced from CPU code so the linker
// retains this translation unit's kernels in the module — TODO confirm against the loader.
extern "C" __host__ void initSolverKernels4() {}
// PGS pre-integration kernel: forwards directly to the shared preIntegration()
// implementation in preIntegration.cuh, which integrates unconstrained velocities
// (gravity, damping, clamping, lock flags) and initializes per-body solver data.
// Launch layout: 1D grid, one thread per solver body in [offset, nbSolverBodies);
// threads past the body count exit early inside preIntegration().
// Note: skipGravityApplication is left at its default (false) here, so gravity and
// the cached body accelerations are applied (contrast with preIntegrationLaunchTGS).
extern "C" __global__ void preIntegrationLaunch(
const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool, PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
const PxgBodySim* PX_RESTRICT bodySimPool, const PxNodeIndex* PX_RESTRICT islandNodeIndices,
PxAlignedTransform* gTransforms, float4* gOutVelocityPool, PxU32* solverBodyIndices)
{
preIntegration(offset, nbSolverBodies, dt, gravity, solverBodyDataPool, solverBodySleepDataPool, solverTxIDataPool,
bodySimPool, islandNodeIndices, gTransforms, gOutVelocityPool, solverBodyIndices);
}
// Seeds solver-side state for the static/kinematic bodies that occupy the first
// nbStaticKinematics slots of the solver body arrays: copies the pose and initial
// velocities out of PxgSolverBodyData, resets the per-body transform/inertia
// scratch data, and records the solver-body index for non-static nodes.
// Launch layout: 1D grid, one thread per static/kinematic body.
extern "C" __global__ void initStaticKinematics(
const uint32_t nbStaticKinematics, const uint32_t nbSolverBodies, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverTxIData* PX_RESTRICT solverTxIDataPool, PxAlignedTransform* gTransforms, float4* gOutVelocityPool,
PxNodeIndex* activeNodeIndices, PxU32* solverBodyIndices)
{
	const uint32_t globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if(globalThreadIndex >= nbStaticKinematics)
		return;

	//KS - TODO - Optimize these reads/writes
	const PxNodeIndex nodeIndex = activeNodeIndices[globalThreadIndex];

	// Only non-static nodes (i.e. kinematics) have an island-node slot to remap.
	if(!nodeIndex.isStaticBody())
		solverBodyIndices[nodeIndex.index()] = globalThreadIndex;

	const PxgSolverBodyData& bodyData = solverBodyDataPool[globalThreadIndex];
	gTransforms[globalThreadIndex] = bodyData.body2World;

	// Velocity pool layout: linear velocities first, angular velocities start
	// nbSolverBodies entries later.
	gOutVelocityPool[globalThreadIndex] = bodyData.initialLinVelXYZ_invMassW;
	gOutVelocityPool[globalThreadIndex + nbSolverBodies] = bodyData.initialAngVelXYZ_penBiasClamp;

	PxgSolverTxIData& txIData = solverTxIDataPool[globalThreadIndex];
	txIData.deltaBody2World = PxTransform(PxIdentity);
	txIData.sqrtInvInertia = PxMat33(PxZero);
}

View File

@@ -0,0 +1,439 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxSimpleTypes.h"
#include "PxgBodySim.h"
#include "PxgSolverBody.h"
#include "PxvDynamics.h"
#include "PxsRigidBody.h"
#include "PxgSolverKernelIndices.h"
#include "stdio.h"
using namespace physx;
// Component-wise square root of an inverse-inertia vector that maps zero
// components to exactly zero without calling PxSqrt on them.
__device__ __forceinline__ PxVec3 computeSafeSqrtInertia(const PxVec3& v)
{
	const float sx = (v.x == 0.f) ? 0.f : PxSqrt(v.x);
	const float sy = (v.y == 0.f) ? 0.f : PxSqrt(v.y);
	const float sz = (v.z == 0.f) ? 0.f : PxSqrt(v.z);
	return PxVec3(sx, sy, sz);
}
// Computes the symmetric product mIInv = M * diag(invD) * M^T into an aligned
// 3x3 matrix (rotates a diagonal inverse-inertia tensor into another frame).
__device__ __forceinline__ void transformInertiaTensor(const PxVec3& invD, const PxMat33& M, PxAlignedMat33& mIInv)
{
	// Columns of M pre-scaled by the matching diagonal entry of invD.
	const float c0x = invD.x*M(0, 0), c0y = invD.x*M(1, 0), c0z = invD.x*M(2, 0);
	const float c1x = invD.y*M(0, 1), c1y = invD.y*M(1, 1), c1z = invD.y*M(2, 1);
	const float c2x = invD.z*M(0, 2), c2y = invD.z*M(1, 2), c2z = invD.z*M(2, 2);

	// Diagonal entries.
	mIInv(0, 0) = c0x*M(0, 0) + c1x*M(0, 1) + c2x*M(0, 2);
	mIInv(1, 1) = c0y*M(1, 0) + c1y*M(1, 1) + c2y*M(1, 2);
	mIInv(2, 2) = c0z*M(2, 0) + c1z*M(2, 1) + c2z*M(2, 2);

	// Off-diagonal entries; the result is symmetric, so mirror each value.
	mIInv(0, 1) = mIInv(1, 0) = c0x*M(1, 0) + c1x*M(1, 1) + c2x*M(1, 2);
	mIInv(0, 2) = mIInv(2, 0) = c0x*M(2, 0) + c1x*M(2, 1) + c2x*M(2, 2);
	mIInv(1, 2) = mIInv(2, 1) = c0y*M(2, 0) + c1y*M(2, 1) + c2y*M(2, 2);
}
// PxMat33 overload of the above: mIInv = M * diag(invD) * M^T.
__device__ __forceinline__ void transformInertiaTensor(const PxVec3& invD, const PxMat33& M, PxMat33& mIInv)
{
	// Columns of M pre-scaled by the matching diagonal entry of invD.
	const float c0x = invD.x*M(0, 0), c0y = invD.x*M(1, 0), c0z = invD.x*M(2, 0);
	const float c1x = invD.y*M(0, 1), c1y = invD.y*M(1, 1), c1z = invD.y*M(2, 1);
	const float c2x = invD.z*M(0, 2), c2y = invD.z*M(1, 2), c2z = invD.z*M(2, 2);

	// Diagonal entries.
	mIInv(0, 0) = c0x*M(0, 0) + c1x*M(0, 1) + c2x*M(0, 2);
	mIInv(1, 1) = c0y*M(1, 0) + c1y*M(1, 1) + c2y*M(1, 2);
	mIInv(2, 2) = c0z*M(2, 0) + c1z*M(2, 1) + c2z*M(2, 2);

	// Off-diagonal entries; the result is symmetric, so mirror each value.
	mIInv(0, 1) = mIInv(1, 0) = c0x*M(1, 0) + c1x*M(1, 1) + c2x*M(1, 2);
	mIInv(0, 2) = mIInv(2, 0) = c0x*M(2, 0) + c1x*M(2, 1) + c2x*M(2, 2);
	mIInv(1, 2) = mIInv(2, 1) = c0y*M(2, 0) + c1y*M(2, 1) + c2y*M(2, 2);
}
// Returns the gravity-induced velocity delta for one step (gravity * accelScale * dt),
// or a zero vector when gravity is disabled for this body.
PX_FORCE_INLINE __device__ PxVec3 getGravityAcceleration(const PxU32 disableGravity, const PxReal accelScale, const PxVec3& gravity, const PxReal dt)
{
	if(disableGravity)
		return PxVec3(0);
	return gravity * accelScale * dt;
}
// Integrates a body's unconstrained (pre-solver) velocities in place:
//  - adds gravity (scaled by accelScale, unless disableGravity) plus the cached
//    linear/angular accelerations, all multiplied by dt,
//  - applies damping as a (1 - damping*dt) multiplier, clamped via fsel so the
//    multiplier cannot go negative for large damping*dt,
//  - clamps the velocity magnitudes against the squared limits packed in the
//    first parameter (x = max linear vel^2, y = max angular vel^2),
//  - zeroes any axis locked via PxRigidDynamicLockFlag bits in lockFlags.
// Results are written back into linearVelocityXYZ_inverseMassW and
// angularVelocityXYZ_maxPenBiasW (xyz only; the w components are left untouched).
__device__ __forceinline__ void bodyCoreComputeUnconstrainedVelocity(const float4& maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW,
float4& linearVelocityXYZ_inverseMassW, float4& angularVelocityXYZ_maxPenBiasW,
const PxU32 lockFlags, const PxU32 disableGravity, const PxReal accelScale, const PxVec3& gravity,
const float4& linearAccel, const float4& angularAccel, const PxReal dt)
{
PxVec3 linearVelocity(linearVelocityXYZ_inverseMassW.x, linearVelocityXYZ_inverseMassW.y, linearVelocityXYZ_inverseMassW.z);
PxVec3 angularVelocity(angularVelocityXYZ_maxPenBiasW.x, angularVelocityXYZ_maxPenBiasW.y, angularVelocityXYZ_maxPenBiasW.z);
const float4 temp = maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW;
//Multiply everything that needs multiplied by dt to improve code generation.
const PxVec3 linearAccelTimesDT = getGravityAcceleration(disableGravity, accelScale, gravity, dt) + PxVec3(linearAccel.x, linearAccel.y, linearAccel.z) * dt;
const PxVec3 angularAccelTimesDT = PxVec3(angularAccel.x, angularAccel.y, angularAccel.z) * dt;
//const PxVec3 angularAccelTimesDT = PxVec3(0.f);
const PxReal linearDampingTimesDT = temp.z*dt;
const PxReal angularDampingTimesDT = temp.w*dt;
const PxReal oneMinusLinearDampingTimesDT = 1.0f - linearDampingTimesDT;
const PxReal oneMinusAngularDampingTimesDT = 1.0f - angularDampingTimesDT;
//TODO context-global gravity
linearVelocity += linearAccelTimesDT;
angularVelocity += angularAccelTimesDT;
//Apply damping.
// fsel clamps the (1 - damping*dt) multiplier to be non-negative so a large
// damping*dt cannot flip the velocity's direction.
const PxReal linVelMultiplier = physx::intrinsics::fsel(oneMinusLinearDampingTimesDT, oneMinusLinearDampingTimesDT, 0.0f);
const PxReal angVelMultiplier = physx::intrinsics::fsel(oneMinusAngularDampingTimesDT, oneMinusAngularDampingTimesDT, 0.0f);
linearVelocity *= linVelMultiplier;
angularVelocity *= angVelMultiplier;
// Clamp velocity
// Compare squared magnitudes against the squared limits; rescale only on overflow
// so the common in-range case costs no sqrt.
const PxReal angVelSq = angularVelocity.magnitudeSquared();
if (angVelSq > temp.y)
{
angularVelocity *= PxSqrt(temp.y / angVelSq);
}
const PxReal linVelSq = linearVelocity.magnitudeSquared();
if (linVelSq > temp.x)
{
linearVelocity *= PxSqrt(temp.x / linVelSq);
}
//printf("%i, LV = (%f, %f, %f)\n", threadIdx.x, linearVelocity.x, linearVelocity.y, linearVelocity.z);
//printf("%i, AV = (%f, %f, %f)\n", threadIdx.x, angularVelocity.x, angularVelocity.y, angularVelocity.z);
// Locked axes are forced to zero on write-back rather than skipped during
// integration, so a later unlock sees consistent state.
linearVelocityXYZ_inverseMassW.x = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X ? 0.f : linearVelocity.x;
linearVelocityXYZ_inverseMassW.y = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y ? 0.f : linearVelocity.y;
linearVelocityXYZ_inverseMassW.z = lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z ? 0.f : linearVelocity.z;
angularVelocityXYZ_maxPenBiasW.x = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X ? 0.f : angularVelocity.x;
angularVelocityXYZ_maxPenBiasW.y = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y ? 0.f : angularVelocity.y;
angularVelocityXYZ_maxPenBiasW.z = lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z ? 0.f : angularVelocity.z;
}
//Reads one 4-byte word from a shared buffer whose contents are swizzled to avoid
//bank conflicts: data is stored in pages of 32 quadwords (128 uints), with word
//lane k of every quadword packed into the contiguous uint range [k*32, k*32+32).
template<typename RetType, typename ContainedClass>
PX_FORCE_INLINE PX_CUDA_CALLABLE RetType readSwizzledWord(const uint* sharedBuffer, uint element, uint quadwordWithinElem, uint elemInQuadword)
{
	// Linear quadword index of the requested uint4 within the packed element stream.
	const uint globalQuadword = (element * sizeof(ContainedClass) / sizeof(uint4)) + quadwordWithinElem;
	const uint pageIndex = globalQuadword / 32;
	const uint quadwordInPage = globalQuadword & 31;
	const uint wordAddress = pageIndex * 128 + elemInQuadword * 32 + quadwordInPage;
	return reinterpret_cast<const RetType*>(sharedBuffer)[wordAddress];
}
//Reassembles a full float4 from the swizzled shared buffer described above: the
//four lanes of a quadword live 32 uints apart within a 128-uint page.
template<typename ContainedClass>
PX_FORCE_INLINE PX_CUDA_CALLABLE float4 readSwizzledFloat4(const uint* sharedBuffer, uint element, uint quadwordWithinElem)
{
	// Linear quadword index of the requested uint4 within the packed element stream.
	const uint globalQuadword = (element * sizeof(ContainedClass) / sizeof(uint4)) + quadwordWithinElem;
	const uint pageIndex = globalQuadword / 32;
	const uint quadwordInPage = globalQuadword & 31;
	const uint base = pageIndex * 128 + quadwordInPage;
	const float* words = reinterpret_cast<const float*>(sharedBuffer);
	return make_float4(words[base], words[base + 32], words[base + 64], words[base + 96]);
}
//fill in all the dynamic bodies data
//
// Warp-cooperative pre-integration pass. Each thread owns solver body
// a = offset + global thread index. The warp stages all PxgBodySim reads and the
// PxgSolverBodyData/PxgSolverTxIData writes through a per-warp, swizzled
// shared-memory buffer so global-memory traffic stays coalesced; per-field
// extraction uses readSwizzledWord/readSwizzledFloat4 above.
// Per body this: integrates unconstrained velocities (gravity, damping, clamps,
// lock flags), computes the world-space sqrt inverse inertia, optionally applies
// gyroscopic forces, and writes out pose, velocities and solver scratch data.
// When skipGravityApplication is true, gravity and the cached body accelerations
// are not applied (used by the TGS launch path — see preIntegrationLaunchTGS).
static __device__ void preIntegration(const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity,
PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool,
PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
const PxgBodySim* PX_RESTRICT bodySimPool,
const PxNodeIndex* PX_RESTRICT islandNodeIndices,
PxAlignedTransform* PX_RESTRICT gTransforms,
float4* PX_RESTRICT gOutVelocityPool,
PxU32* PX_RESTRICT solverBodyIndices,
bool skipGravityApplication = false)
{
const uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
// a = global solver-body index this thread is responsible for.
const uint32_t a = idx + offset;
// const uint32_t warpStartIdx = offset + (idx&(~31));
// Size of one PxgBodySim measured in 16-byte quadwords.
const PxU32 BodySimSize = sizeof(PxgBodySim) / sizeof(float4);
// Per-warp swizzled staging area (see readSwizzledWord for the layout) and the
// warp's cached island-node indices.
__shared__ uint sharedBufferSpace[PxgKernelBlockDim::PRE_INTEGRATION / 32][16 * 33];
__shared__ PxU32 sharedIslandNodeIndices[PxgKernelBlockDim::PRE_INTEGRATION / 32][32];
const PxU32 warpIndex = threadIdx.x / 32;
const PxU32 threadIndexInWarp = threadIdx.x & 31;
if (a < nbSolverBodies)
{
// Cache this body's island-node index for the warp and publish the reverse
// mapping (node index -> solver body index).
PxU32 index = islandNodeIndices[a].index();
sharedIslandNodeIndices[warpIndex][threadIndexInWarp] = index;
solverBodyIndices[index] = a;
//printf("%i: SharedIslandNodeIndices[%i][%i] = %i, %i\n", a, warpIndex, threadIndexInWarp, sharedIslandNodeIndices[warpIndex][threadIndexInWarp], islandNodeIndices[a]);
}
__syncwarp();
// NOTE(review): this compares 256 (a byte count, sizeof(uint4)*4*4) against
// BodySimSize, which is a quadword count — presumably intended to assert
// sizeof(PxgBodySim) fits the 16-quadword staging slot; verify the units.
PX_COMPILE_TIME_ASSERT((sizeof(uint4) * 4 * 4) >= BodySimSize);
// First body handled by this warp, and how many of its 32 lanes map to real bodies.
const PxU32 startReadIndex = a - threadIndexInWarp;
const PxU32 NbToRead = startReadIndex < nbSolverBodies ? PxMin(32u, nbSolverBodies - startReadIndex) : 0;
// Per-thread copies of this body's PxgBodySim fields, filled from shared memory below.
float4 linearVelocityXYZ_inverseMassW;
float4 angularVelocityXYZ_maxPenBiasW;
float4 sleepAngVelAccXYZ_accelScaleW;
float4 inverseInertiaXYZ_contactReportThresholdW;
float4 maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW;
float4 linearAccel;
float4 angularAccel;
float maxImpulse;
PxAlignedTransform body2World;
PxMat33 sqrtInvInertia;
PxU32 internalFlags;
PxU16 lockFlags;
PxU16 disableGravity;
PxReal offsetSlop;
// Stage and unpack the warp's PxgBodySims in batches of 8 bodies: the whole warp
// cooperatively copies 8 structs into shared memory, then the 8 lanes that own
// those bodies extract their fields via the swizzled readers.
for (PxU32 i = 0; i < NbToRead; i += 8)
{
const PxU32 TotalUint4ToRead = BodySimSize * PxMin(NbToRead - i, 8u);
for (PxU32 j = threadIndexInWarp, iter = 0; j < TotalUint4ToRead; j += 32, iter++)
{
// ind = which of the 8 staged bodies this uint4 belongs to.
const PxU32 ind = j / BodySimSize;
const PxU32 nodeIndex = sharedIslandNodeIndices[warpIndex][ind + i];
// NOTE(review): this local shadows the function parameter 'offset'
// (here it is the quadword offset within the PxgBodySim) — consider renaming.
const PxU32 offset = j - (ind*BodySimSize);
const uint4* src = reinterpret_cast<const uint4*>(&bodySimPool[nodeIndex]);
const uint4 val = src[offset];
// Scatter the four words 32 uints apart (the swizzled layout the readers expect).
sharedBufferSpace[warpIndex][iter * 128 + threadIndexInWarp] = val.x;
sharedBufferSpace[warpIndex][iter * 128 + 32 + threadIndexInWarp] = val.y;
sharedBufferSpace[warpIndex][iter * 128 + 64 + threadIndexInWarp] = val.z;
sharedBufferSpace[warpIndex][iter * 128 + 96 + threadIndexInWarp] = val.w;
}
__syncwarp();
// Only the 8 lanes whose body sits in this batch unpack their fields.
if (threadIndexInWarp >= i && threadIndexInWarp < (i + 8) && a < nbSolverBodies)
{
const uint* bSims = reinterpret_cast<uint*>(sharedBufferSpace[warpIndex]);
const PxU32 readIndex = threadIndexInWarp & 7;
// Quadword/word offsets below mirror the PxgBodySim layout — keep in sync with it.
linearVelocityXYZ_inverseMassW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 0);
angularVelocityXYZ_maxPenBiasW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 1);
maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 2);
inverseInertiaXYZ_contactReportThresholdW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 3);
sleepAngVelAccXYZ_accelScaleW = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 5);
float4 q = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 7);
float4 p = readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 8);
body2World.p = make_float4(p.x, p.y, p.z, 0.f);
body2World.q = PxAlignedQuat(q.x, q.y, q.z, q.w);
maxImpulse = readSwizzledWord<float, PxgBodySim>(bSims, readIndex, 10, 3);
internalFlags = readSwizzledWord<PxU32, PxgBodySim>(bSims, readIndex, 11, 1);
ushort2 tmp = readSwizzledWord<ushort2, PxgBodySim>(bSims, readIndex, 11, 2);
lockFlags = tmp.x;
disableGravity = tmp.y;
offsetSlop = readSwizzledWord<PxReal, PxgBodySim>(bSims, readIndex, 11, 3);
// TGS path: suppress the cached accelerations entirely.
linearAccel = skipGravityApplication ? make_float4(0.0f) : readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 12);
angularAccel = skipGravityApplication ? make_float4(0.0f) : readSwizzledFloat4<PxgBodySim>(bSims, readIndex, 13);
}
__syncwarp();
}
if (a < nbSolverBodies)
{
// Integrate gravity/accelerations, apply damping, clamp, and zero locked axes.
bodyCoreComputeUnconstrainedVelocity(maxLinearVelocitySqX_maxAngularVelocitySqY_linearDampingZ_angularDampingW, linearVelocityXYZ_inverseMassW, angularVelocityXYZ_maxPenBiasW,
lockFlags, disableGravity || skipGravityApplication, sleepAngVelAccXYZ_accelScaleW.w, gravity, linearAccel, angularAccel, dt);
//initialize solver bodyData
//const float4 inverseInertiaXYZ_contactReportThresholdW = bodySim.inverseInertiaXYZ_contactReportThresholdW;
const PxVec3 inverseInertia(inverseInertiaXYZ_contactReportThresholdW.x, inverseInertiaXYZ_contactReportThresholdW.y, inverseInertiaXYZ_contactReportThresholdW.z);
const PxVec3 safeSqrtInvInertia = computeSafeSqrtInertia(inverseInertia);
// Rotate the diagonal sqrt inverse inertia into world space.
const PxMat33 rotation(reinterpret_cast<PxQuat&>(body2World.q));
transformInertiaTensor(safeSqrtInvInertia, rotation, sqrtInvInertia);
gOutVelocityPool[a] = linearVelocityXYZ_inverseMassW;
//KS - to make this compatible with the rigid body particle system, we store the angular velocity in the gOutVelocityPool
//in momocity format!
// Reciprocal of the safe sqrt inverse inertia, with 0 -> 0 preserved.
const PxVec3 sqrtInertiaV(safeSqrtInvInertia.x == 0.f ? 0.f : 1.f / safeSqrtInvInertia.x, safeSqrtInvInertia.y == 0.f ? 0.f : 1.f / safeSqrtInvInertia.y,
safeSqrtInvInertia.z == 0.f ? 0.f : 1.f / safeSqrtInvInertia.z);
PxMat33 sqrtInertia;
transformInertiaTensor(sqrtInertiaV, rotation, sqrtInertia);
PxVec3 angVel(angularVelocityXYZ_maxPenBiasW.x, angularVelocityXYZ_maxPenBiasW.y, angularVelocityXYZ_maxPenBiasW.z);
if (internalFlags & PxsRigidBody::eENABLE_GYROSCOPIC)
{
// Gyroscopic update in body space: advance the angular momentum by the
// gyroscopic torque, rescale so its magnitude is preserved, then convert
// the momentum change back into a world-space angular-velocity delta.
const PxVec3 localInertia(
inverseInertia.x == 0.f ? 0.f : 1.f / inverseInertia.x,
inverseInertia.y == 0.f ? 0.f : 1.f / inverseInertia.y,
inverseInertia.z == 0.f ? 0.f : 1.f / inverseInertia.z);
const PxVec3 localAngVel = body2World.q.rotateInv(angVel);
const PxVec3 origMom = localInertia.multiply(localAngVel);
const PxVec3 torque = -localAngVel.cross(origMom);
PxVec3 newMom = origMom + torque * dt;
const PxReal denom = newMom.magnitude();
PxReal ratio = denom > 0.f ? origMom.magnitude() / denom : 0.f;
newMom *= ratio;
PxVec3 newDeltaAngVel = body2World.q.rotate(inverseInertia.multiply(newMom) - localAngVel);
angVel += newDeltaAngVel;
}
angularVelocityXYZ_maxPenBiasW.x = angVel.x; angularVelocityXYZ_maxPenBiasW.y = angVel.y; angularVelocityXYZ_maxPenBiasW.z = angVel.z;
// Convert to momocity (sqrtInertia * angVel) for the velocity pool.
angVel = sqrtInertia * (angVel);
gOutVelocityPool[a + nbSolverBodies] = make_float4(angVel.x, angVel.y, angVel.z, angularVelocityXYZ_maxPenBiasW.w);
//KS - TODO - coalesce theses, probably by writing out 2x float4.
gTransforms[a] = body2World;
}
// Write-back phase: stage PxgSolverBodyData (then PxgSolverTxIData) for 16 bodies
// at a time in shared memory, then the warp streams each batch to global memory
// with coalesced uint writes.
for (PxU32 i = 0; i < NbToRead; i += 16)
{
if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
{
PxgSolverBodyData* bData = reinterpret_cast<PxgSolverBodyData*>(&sharedBufferSpace[warpIndex][0]);
const PxU32 nodeIndex = sharedIslandNodeIndices[warpIndex][threadIndexInWarp];
PxgSolverBodyData& data = bData[threadIndexInWarp & 15];
//PxgSolverBodyData& data = solverBodyDataPool[a];
//data.sqrtInvInertia = sqrtInvInertia;
data.body2World = body2World;
data.initialLinVelXYZ_invMassW = linearVelocityXYZ_inverseMassW;
data.initialAngVelXYZ_penBiasClamp = angularVelocityXYZ_maxPenBiasW;
data.offsetSlop = offsetSlop;
data.reportThreshold = inverseInertiaXYZ_contactReportThresholdW.w;
data.islandNodeIndex = PxNodeIndex(nodeIndex);
//data.inverseInertia = make_float4(bodySim.inverseInertiaXYZ_contactReportThresholdW.x,bodySim.inverseInertiaXYZ_contactReportThresholdW.y, bodySim.inverseInertiaXYZ_contactReportThresholdW.z, 0.f);
data.maxImpulse = maxImpulse; //KS - can this be read in in a more efficient/better way?
// Translate the internal body-sim flags into the public PxRigidBodyFlag bits
// the solver data expects.
PxU32 flags = 0;
if (internalFlags & PxsRigidBody::eSPECULATIVE_CCD)
flags |= PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD;
if (internalFlags & PxsRigidBody::eENABLE_GYROSCOPIC)
flags |= PxRigidBodyFlag::eENABLE_GYROSCOPIC_FORCES;
data.flags = flags;
}
__syncwarp();
const PxU32 solverBodyDataSize = sizeof(PxgSolverBodyData) / sizeof(uint);
PxU32 TotalUintToWrite = solverBodyDataSize * PxMin(NbToRead - i, 16u);
uint* dst = reinterpret_cast<uint*>(&solverBodyDataPool[startReadIndex + i]);
uint* src = reinterpret_cast<uint*>(&sharedBufferSpace[warpIndex][0]);
for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
{
dst[j] = src[j];
}
__syncwarp();
//if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
//{
//	//KS - TODO - is this necessary? We now have PxgBodySim, which stores all this data.
//	//This entire sleep data thing looks redundant now!
//	PxgSolverBodySleepData* sData = reinterpret_cast<PxgSolverBodySleepData*>(sharedBufferSpace[warpIndex]);
//	PxgSolverBodySleepData& sleepData = sData[threadIndexInWarp & 15];
//	//PxgSolverBodySleepData& sleepData = solverBodySleepDataPool[a];
//	//initialize solver body sleep data
//	sleepData.freezeCount = sleepLinVelAccXYZ_freezeCountW.w;
//	sleepData.sleepLinVelAcc = *reinterpret_cast<PxVec3*>(&(make_float3(sleepLinVelAccXYZ_freezeCountW).x));
//	sleepData.accelScale = sleepAngVelAccXYZ_accelScaleW.w;
//	sleepData.sleepAngVelAcc = *reinterpret_cast<PxVec3*>(&(make_float3(sleepAngVelAccXYZ_accelScaleW).x));
//	sleepData.freezeThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.x;
//	sleepData.wakeCounter = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.y;
//	sleepData.sleepThreshold = freezeThresholdX_wakeCounterY_sleepThresholdZ_bodySimIndex.z;
//	sleepData.internalFlags = internalFlags;
//}
//__syncwarp();
//const PxU32 sleepDataSize = sizeof(PxgSolverBodySleepData) / sizeof(uint);
//TotalUintToWrite = sleepDataSize * PxMin(NbToRead - i, 16u);
//dst = reinterpret_cast<uint*>(&solverBodySleepDataPool[startReadIndex + i]);
//for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
//{
//	dst[j] = sharedBufferSpace[warpIndex][j];
//}
//__syncwarp();
if (threadIndexInWarp >= i && threadIndexInWarp < (i + 16) && a < nbSolverBodies)
{
PxgSolverTxIData* bData = reinterpret_cast<PxgSolverTxIData*>(&sharedBufferSpace[warpIndex][0]);
PxgSolverTxIData& data = bData[threadIndexInWarp & 15];
//PxgSolverBodyData& data = solverBodyDataPool[a];
data.sqrtInvInertia = sqrtInvInertia;
data.deltaBody2World = PxTransform(PxIdentity);
}
__syncwarp();
const PxU32 txISize = sizeof(PxgSolverTxIData) / sizeof(uint);
TotalUintToWrite = txISize * PxMin(NbToRead - i, 16u);
dst = reinterpret_cast<uint*>(&solverTxIDataPool[startReadIndex + i]);
for (PxU32 j = threadIndexInWarp; j < TotalUintToWrite; j += 32)
{
dst[j] = sharedBufferSpace[warpIndex][j];
}
__syncwarp();
}
}

View File

@@ -0,0 +1,43 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#define IS_TGS_SOLVER
#include "preIntegration.cuh"
// No-op host-side entry point; presumably referenced from CPU code so the linker
// retains this translation unit's kernels in the module — TODO confirm against the loader.
extern "C" __host__ void initSolverKernels9() {}
// TGS variant of the pre-integration kernel (this translation unit defines
// IS_TGS_SOLVER before including preIntegration.cuh). Forwards to the shared
// preIntegration() implementation; unlike the PGS launch it exposes
// skipGravityApplication, which zeroes the cached body accelerations and
// suppresses the gravity step inside bodyCoreComputeUnconstrainedVelocity.
// Launch layout: 1D grid, one thread per solver body in [offset, nbSolverBodies).
extern "C" __global__ void preIntegrationLaunchTGS(
const uint32_t offset, const uint32_t nbSolverBodies, const PxReal dt, const PxVec3 gravity, PxgSolverBodyData* PX_RESTRICT solverBodyDataPool,
PxgSolverBodySleepData* PX_RESTRICT solverBodySleepDataPool, PxgSolverTxIData* PX_RESTRICT solverTxIDataPool,
const PxgBodySim* const PX_RESTRICT bodySimPool, const PxNodeIndex* const PX_RESTRICT islandNodeIndices,
PxAlignedTransform* gTransforms, float4* gOutVelocityPool, PxU32* solverBodyIndices, bool skipGravityApplication)
{
preIntegration(offset, nbSolverBodies, dt, gravity, solverBodyDataPool, solverBodySleepDataPool, solverTxIDataPool,
bodySimPool, islandNodeIndices, gTransforms, gOutVelocityPool, solverBodyIndices, skipGravityApplication);
}

View File

@@ -0,0 +1,266 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgSolverBody.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgIntrinsics.h"
#include "PxgSolverCoreDesc.h"
#include "solver.cuh"
#include "solverBlock.cuh"
#include "PxgArticulation.h"
#include "assert.h"
#include "PxgArticulationCoreDesc.h"
using namespace physx;
extern "C" __host__ void initSolverKernels7() {}
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 8)
void artiSolveBlockPartition(PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
const PxU32 islandIndex, const PxU32 partitionIndex, bool doFriction_, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
const PxU32 warpIndex = threadIdx.y;
PxU32 globalWarpIndex = blockIdx.x * blockDim.y + warpIndex;
const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
//PxgBodySim* gBodySims = sharedDesc->bodySims;
PxgArticulationBlockData* gArticulations = artiDesc->mArticulationBlocks;
const PxU32 maxLinks = artiDesc->mMaxLinksPerArticulation;
Cm::UnAlignedSpatialVector* deferredZ = sharedDesc->articulationDeferredZ;
const PxU32 startPartitionIndex = island.mStartPartitionIndex;
PxU32 startIndex = partitionIndex == 0 ? island.mArtiBatchStartIndex : solverDesc->artiConstraintsPerPartition[partitionIndex + startPartitionIndex - 1];
//PxU32 startIndex = solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex];
const PxU32 articulationBatchOffset = solverDesc->islandContextPool->mBatchCount;
//const PxU32 nbArticulations = artiDesc->nbArticulations;
PxU32 endIndex = solverDesc->artiConstraintsPerPartition[partitionIndex + startPartitionIndex] + articulationBatchOffset;
//const PxU32 articOffset = solverDesc->numBatches * 32 * 2 * 2;
uint2* isSlabDirty = artiDesc->slabHasChanges;
//This identifies which thread within a warp a specific thread is
const uint threadIndexInWarp = threadIdx.x;
uint k = startIndex + globalWarpIndex + articulationBatchOffset;
PxgErrorAccumulator error;
const bool accumulateError = solverDesc->contactErrorAccumulator.mCounter >= 0;
if (k < endIndex)
{
/*if(threadIndexInWarp == 0)
printf("arti Contact Batch k = %i, endIndex = %i\n", k, endIndex);*/
/* if (threadIndexInWarp == 0)
printf("============================================\n");*/
const bool doFriction = doFriction_;
const IterativeSolveData& msIterativeData = sharedDesc->iterativeData;
const PxgBlockConstraintBatch& batch = msIterativeData.blockConstraintBatch[k];
if(threadIndexInWarp >= batch.mDescStride)
return;
//printf("threadIndexInWarp %i descStride %i\n", threadIndexInWarp, batch.mDescStride);
//printf("constraintBatchIndex %i startConstraintIndex %i", batch.mConstraintBatchIndex, batch.startConstraintIndex);
const PxU32 nbArticulations = solverDesc->islandContextPool->mArticulationCount;
const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];
const PxU32 slabId = batch.slabId[threadIndexInWarp];
const PxU32 nodeIndexA = igNodeIndexA.index();
const PxU32 nodeIndexB = igNodeIndexB.index();
const PxU32 solverBodyId0 = batch.bodyAIndex[threadIndexInWarp];
const PxU32 solverBodyId1 = batch.bodyBIndex[threadIndexInWarp];
const PxU32 readIndex = k * 128 + threadIndexInWarp;
PxgArticulationBlockResponse* responses = sharedDesc->iterativeData.artiResponse;
const PxU32 responseIndex = batch.mArticulationResponseIndex;
//printf("responseIndex %i\n", responseIndex);
//Cm::UnAlignedSpatialVector v0, v1;
PxU32 linkIndexA = igNodeIndexA.articulationLinkId();
PxU32 linkIndexB = igNodeIndexB.articulationLinkId();
Cm::UnAlignedSpatialVector vel0, vel1;
{
float4 lin = Pxldcg(msIterativeData.solverBodyVelPool[readIndex]);
float4 ang = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 32]);
vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
}
{
float4 lin = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 64]);
float4 ang = Pxldcg(msIterativeData.solverBodyVelPool[readIndex + 96]);
vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
}
Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();
PxReal curRef0 = 1.f;
PxReal curRef1 = 1.f;
const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
const PxU32 numSolverBodies = bodyOffset + numDynamicBodies + numArticulations;
const PxU32* const PX_RESTRICT encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;
if(igNodeIndexA.isArticulation())
{
const PxU32 articulationBodyIdA = batch.remappedBodyAIndex[threadIndexInWarp];
// Articulation IDs are at the back of rigid body IDs.
const PxU32 globalBodyIdA = articulationBodyIdA + numDynamicBodies + bodyOffset;
// Counting the number of active slabs
curRef0 = static_cast<PxReal>(countActiveSlabs(globalBodyIdA, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
}
else if(solverBodyId0 >= bodyOffset)
{
// Counting the number of active slabs
curRef0 = static_cast<PxReal>(countActiveSlabs(solverBodyId0, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
}
if(igNodeIndexB.isArticulation())
{
const PxU32 articulationBodyIdB = batch.remappedBodyBIndex[threadIndexInWarp];
// Articulation IDs are at the back of rigid body IDs.
const PxU32 globalBodyIdB = articulationBodyIdB + numDynamicBodies + bodyOffset;
// Counting the number of active slabs
curRef1 = static_cast<PxReal>(countActiveSlabs(globalBodyIdB, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
}
else if(solverBodyId1 >= bodyOffset)
{
// Counting the number of active slabs
curRef1 = static_cast<PxReal>(countActiveSlabs(solverBodyId1, solverDesc->numSlabs, numSolverBodies, encodedReferenceCount));
}
if(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
{
solveExtContactsBlock(batch, vel0, vel1, doFriction, msIterativeData.blockContactHeaders,
msIterativeData.blockFrictionHeaders, msIterativeData.blockContactPoints,
msIterativeData.blockFrictions, msIterativeData.artiResponse, impulse0, impulse1,
threadIndexInWarp, accumulateError ? &error : NULL, curRef0, curRef1);
}
else
{
assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);
solveExt1DBlock(batch, vel0, vel1, threadIndexInWarp, msIterativeData.blockJointConstraintHeaders,
msIterativeData.blockJointConstraintRowsCon, msIterativeData.blockJointConstraintRowsMod,
&responses[responseIndex], impulse0, impulse1,
solverDesc->contactErrorAccumulator.mCounter >= 0, curRef0, curRef1);
}
//Pull impulse from threads 6-12
if (!igNodeIndexA.isArticulation())
{
const PxU32 outIndex = (batch.remappedBodyAIndex[threadIndexInWarp]);
{
msIterativeData.solverBodyVelPool[outIndex] = make_float4(vel0.bottom.x, vel0.bottom.y, vel0.bottom.z, 0.f);
msIterativeData.solverBodyVelPool[outIndex+32] = make_float4(vel0.top.x, vel0.top.y, vel0.top.z, 0.f);
}
}
else
{
//Write to GPU articulation!
const PxU32 index = computeDeltaVIndex(nbArticulations, maxLinks, solverBodyId0, linkIndexA, slabId);
//deferredZ[index] = impulse0;
//printf("A index %i\n", index);
storeSpatialVector(deferredZ + index, impulse0);
/*printf("A expected final LinkID = %i, articId = %i, vel = (%f, %f, %f, %f, %f, %f)\n", linkIndexA, solverBodyId0,
vel0.top.x, vel0.top.y, vel0.top.z, vel0.bottom.x, vel0.bottom.y, vel0.bottom.z);*/
/*printf("%i: Output ArticA index = %i, articId = %i (%f, %f, %f, %f, %f, %f)\n", threadIndexInWarp, index, solverBodyId0,
impulse0.top.x, impulse0.top.y, impulse0.top.z, impulse0.bottom.x, impulse0.bottom.y, impulse0.bottom.z);*/
//KS - TODO - let's see if we can skip *all* of this code below. It should be avoidable!
isSlabDirty[solverBodyId0 + slabId*nbArticulations].x = linkIndexA; // this works because the articulations are enumerated first in solverBodyIds
PxU32 articBlockId = solverBodyId0 / 32;
gArticulations[articBlockId].mStateDirty[solverBodyId0&31] = PxgArtiStateDirtyFlag::eHAS_IMPULSES | PxgArtiStateDirtyFlag::eVEL_DIRTY;
}
if (!igNodeIndexB.isArticulation())
{
//const PxU32 indexB = (2 * batch.remappedBodyBIndex);
const PxU32 outIndex = (batch.remappedBodyBIndex[threadIndexInWarp]);
{
msIterativeData.solverBodyVelPool[outIndex] = make_float4(vel1.bottom.x, vel1.bottom.y, vel1.bottom.z, 0.f);
msIterativeData.solverBodyVelPool[outIndex + 32] = make_float4(vel1.top.x, vel1.top.y, vel1.top.z, 0.f);
//printf("Output rigid B to %i\n", outIndex);
}
}
else
{
//Write to GPU articulation!
const PxU32 index = computeDeltaVIndex(nbArticulations, maxLinks, solverBodyId1, linkIndexB, slabId);
//printf("B index %i\n", index);
deferredZ[index] = impulse1;
storeSpatialVector(deferredZ + index, impulse1);
isSlabDirty[solverBodyId1 + slabId*nbArticulations].y = linkIndexB;
PxU32 articBlockId = solverBodyId1 / 32;
/*gArticulations[articBlockId].mJointDirty[solverBodyId1&31] = true;
gArticulations[articBlockId].mHasInternalImpulses[solverBodyId1&31] = true;*/
gArticulations[articBlockId].mStateDirty[solverBodyId1 & 31] = PxgArtiStateDirtyFlag::eHAS_IMPULSES | PxgArtiStateDirtyFlag::eVEL_DIRTY;
/*printf("A expected final LinkID = %i, articId = %i, vel = (%f, %f, %f, %f, %f, %f)\n", linkIndexB, solverBodyId0,
vel1.top.x, vel1.top.y, vel1.top.z, vel1.bottom.x, vel1.bottom.y, vel1.bottom.z);*/
//printf("%i: Output ArticB index = %i, articId = %i (%f, %f, %f, %f, %f, %f)\n", threadIndexInWarp, index, solverBodyId1,
// impulse1.top.x, impulse1.top.y, impulse1.top.z, impulse1.bottom.x, impulse1.bottom.y, impulse1.bottom.z);
}
}
if (accumulateError)
error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
}

// ---- (commit-view separator removed: "View File" / "@@ -0,0 +1,834 @@" artifact; a second file, solver.cuh, begins below) ----
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SOLVER_CUH__
#define __SOLVER_CUH__
#include "PxgSolverBody.h"
#include "PxgSolverConstraint1D.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgSolverContext.h"
#include "PxgSolverCoreDesc.h"
#include "PxgCommonDefines.h"
#include "PxgIntrinsics.h"
#include "PxgArticulation.h"
#include "solverResidual.cuh"
#include "constraintPrepShared.cuh"
#include <stdio.h>
#include <assert.h>
using namespace physx;
// This function is for contacts involving articulations.
// To apply mass-splitting, different data is stored and used when computing impulses:
// ref0/ref1 are per-body reference counts (number of active slabs for body A/B) that scale the
// per-row responses and the final output impulses.
// Apart from mass-splitting, the formulation is the same as the previous implementation, see "setupFinalizeExtSolverConstraintsBlock"
//
// In/out:
//   vel0, vel1         - motion velocities of body A/B (top = angular, bottom = linear, per the
//                        dot products below); updated in place with the solved velocity deltas.
//   impulse0, impulse1 - receive the accumulated impulses for body A/B, scaled at the end by the
//                        invMass/angular-dominance terms and by ref0/ref1.
//   error              - optional residual accumulator; pass NULL to skip residual accumulation.
// The "responses" table holds one articulation delta-velocity entry per constraint row
// (normal rows first, then friction rows); it is consumed strictly sequentially via resp++.
static __device__ void solveExtContactsBlock(const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0,
	Cm::UnAlignedSpatialVector& vel1, const bool doFriction, PxgBlockSolverContactHeader* contactHeaders,
	PxgBlockSolverFrictionHeader* frictionHeaders, PxgBlockSolverContactPoint* contactPoints,
	PxgBlockSolverContactFriction* frictionPoints, const PxgArticulationBlockResponse* const PX_RESTRICT responses, Cm::UnAlignedSpatialVector& impulse0,
	Cm::UnAlignedSpatialVector& impulse1, const PxU32 threadIndexInWarp, PxgErrorAccumulator* error,
	PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
	// Raw (unscaled) impulse accumulators; converted to the final impulses at the end.
	Cm::UnAlignedSpatialVector imp0(PxVec3(0.f), PxVec3(0.f));
	Cm::UnAlignedSpatialVector imp1(PxVec3(0.f), PxVec3(0.f));

	PxgBlockSolverContactHeader& contactHeader = contactHeaders[batch.mConstraintBatchIndex];
	PxgBlockSolverFrictionHeader& frictionHeader = frictionHeaders[batch.mConstraintBatchIndex];

	// x = invMass0, y = invMass1, z = angDom0, w = angDom1 (inferred from usage at the end - the
	// component naming follows the member name).
	const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader.invMass0_1_angDom0_1[threadIndexInWarp]);

	// Per-row articulation response cursor, advanced once per normal/friction row (resp++ below).
	const PxgArticulationBlockResponse* resp = &responses[batch.mArticulationResponseIndex];

	const uint numNormalConstr = Pxldcg(contactHeader.numNormalConstr[threadIndexInWarp]);
	const uint numFrictionConstr = Pxldcg(frictionHeader.numFrictionConstr[threadIndexInWarp]);

	const float restitution = contactHeader.restitution[threadIndexInWarp];
	const float p8 = 0.8f; // friction stiffness scale applied to the friction velMultiplier below
	const float cfm = contactHeader.cfm[threadIndexInWarp];
	const PxU8 flags = contactHeader.flags[threadIndexInWarp];

	PxgBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex];

	PxReal accumulatedNormalImpulse = 0.f;

	// Work on local copies of the velocities; written back once at the end.
	Cm::UnAlignedSpatialVector v0 = vel0;
	Cm::UnAlignedSpatialVector v1 = vel1;

	const float4 normal_staticFriction = Pxldcg(contactHeader.normal_staticFriction[threadIndexInWarp]);
	const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
	const PxReal staticCof = normal_staticFriction.w;

	// Software-pipelined loads: the data for row i+1 (or for the first friction row) is fetched
	// while row i is being processed, to hide global-memory latency. The "next*" variables always
	// hold the values for the row about to be processed.
	float4 nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndexInWarp]);
	float4 nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndexInWarp]);
	float nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndexInWarp]);

	float nextResp0 = Pxldcg(contacts[0].resp0[threadIndexInWarp]);
	float nextResp1 = Pxldcg(contacts[0].resp1[threadIndexInWarp]);
	float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndexInWarp]);
	float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndexInWarp]);

	float3 nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
	float3 nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
	float3 nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
	float3 nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));

	PxgBlockSolverContactFriction* frictions = &frictionPoints[batch.startFrictionIndex];

	for (uint i = 0; i < numNormalConstr; i++)
	{
		PxgBlockSolverContactPoint& c = contacts[i];
		// Advance the response cursor so the prefetch below reads the response of the NEXT row.
		resp++;

		// Consume the values prefetched on the previous iteration.
		const float4 raXn_targetVelocity = nextRaxn_extraCoeff;
		const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
		const float appliedForce = nextAppliedForce;
		const float resp0 = nextResp0;
		const float resp1 = nextResp1;
		const float coeff0 = nextCoeff0;
		const float coeff1 = nextCoeff1;
		const float3 deltaRALin = nextDeltaRALin;
		const float3 deltaRAAng = nextDeltaRAAng;
		const float3 deltaRBLin = nextDeltaRBLin;
		const float3 deltaRBAng = nextDeltaRBAng;

		if ((i + 1) < numNormalConstr)
		{
			// Prefetch the next normal constraint row.
			const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
			nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndexInWarp]);
			nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndexInWarp]);
			nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndexInWarp]);
			nextResp0 = Pxldcg(nextC.resp0[threadIndexInWarp]);
			nextResp1 = Pxldcg(nextC.resp1[threadIndexInWarp]);
			nextCoeff0 = Pxldcg(nextC.coeff0[threadIndexInWarp]);
			nextCoeff1 = Pxldcg(nextC.coeff1[threadIndexInWarp]);

			nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
			nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
			nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
			nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
		}
		else if (numFrictionConstr && doFriction)
		{
			// Last normal row: prefetch the first friction row instead (note: no coeff0/coeff1 -
			// friction rows do not use them).
			nextRaxn_extraCoeff = Pxldcg(frictions[0].raXn_bias[threadIndexInWarp]);
			nextRbxn_maxImpulseW = Pxldcg(frictions[0].rbXn_targetVelW[threadIndexInWarp]);
			nextAppliedForce = Pxldcg(frictions[0].appliedForce[threadIndexInWarp]);
			nextResp0 = Pxldcg(frictions[0].resp0[threadIndexInWarp]);
			nextResp1 = Pxldcg(frictions[0].resp1[threadIndexInWarp]);

			nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
			nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
			nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
			nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
		}

		const PxVec3 raXn = PxVec3(raXn_targetVelocity.x, raXn_targetVelocity.y, raXn_targetVelocity.z);
		const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
		const float targetVelocity = raXn_targetVelocity.w;
		const float maxImpulse = rbXn_maxImpulse.w;

		// Mass-splitting: each body's response is scaled by its slab reference count.
		float unitResponse = ref0 * resp0 + ref1 * resp1;
		float recipResponse = (unitResponse > 0.0f) ? 1.0f / (unitResponse + cfm) : 0.0f;

		float velMultiplier = recipResponse;
		float impulseMul = 1.0f;
		float unbiasedError;
		float biasedErr;

		// May overwrite velMultiplier/impulseMul for compliant (restitution-driven) contacts.
		computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0, coeff1,
			velMultiplier, impulseMul, unbiasedError, biasedErr);

		// Delta-velocity responses, scaled by the reference counts (top = angular, bottom = linear,
		// matching the velocity layout).
		const Cm::UnAlignedSpatialVector deltaVA(ref0 * PxVec3(deltaRAAng.x, deltaRAAng.y, deltaRAAng.z),
			ref0 * PxVec3(deltaRALin.x, deltaRALin.y, deltaRALin.z));
		const Cm::UnAlignedSpatialVector deltaVB(ref1 * PxVec3(deltaRBAng.x, deltaRBAng.y, deltaRBAng.z),
			ref1 * PxVec3(deltaRBLin.x, deltaRBLin.y, deltaRBLin.z));

		// Relative velocity along the contact normal.
		const float v0_ = v0.bottom.dot(normal) + v0.top.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
		const float v1_ = v1.bottom.dot(normal) + v1.top.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
		const float normalVel = v0_ - v1_;

		//KS - clamp the maximum force
		// deltaF is clamped so the total applied force stays within [0, maxImpulse].
		const float tempDeltaF = biasedErr - normalVel * velMultiplier;
		const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
		const float _newForce = appliedForce * impulseMul + _deltaF;
		const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
		const float deltaF = newForce - appliedForce;

		if(error)
			error->accumulateErrorLocal(deltaF, velMultiplier);

		Pxstcg(&c.appliedForce[threadIndexInWarp], newForce);

		// Accumulate raw impulses (note: for the impulse accumulators, top carries the linear
		// (normal) part and bottom the angular (rXn) part).
		imp0.bottom -= raXn * deltaF;
		imp0.top -= normal * deltaF;
		imp1.bottom += rbXn * deltaF;
		imp1.top += normal * deltaF;

		v0 += deltaVA * deltaF;
		v1 += deltaVB * deltaF;

		accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
	}

	//Force a minimum normal force for friction. This is required for articulations with multi-link collisions
	//because often normal force can be solved with just 1 link's collisions. However, this means that other links can slide on
	//a surface friction-free because there was no normal force applied.
	accumulatedNormalImpulse = PxMax(accumulatedNormalImpulse, contactHeader.minNormalForce[threadIndexInWarp]);

	if (numFrictionConstr && doFriction)
	{
		//printf("FrictionHeader = %i, count = %i\n", batch.startFrictionIndex, numFrictionConstr);
		const float dynamicFrictionCof = frictionHeader.dynamicFriction[threadIndexInWarp];
		// Coulomb limits derived from the accumulated normal force.
		const float maxFrictionImpulse = staticCof * accumulatedNormalImpulse;
		const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
		//const float negMaxDynFrictionImpulse = -maxDynFrictionImpulse;

		PxU32 broken = 0;

		for (uint i = 0; i < numFrictionConstr; i++)
		{
			PxgBlockSolverContactFriction& f = frictions[i];
			// Friction rows continue consuming the same sequential response table.
			resp++;

			// Friction rows alternate between the two tangent directions stored in the header.
			const float4 frictionNormal = frictionHeader.frictionNormals[i & 1][threadIndexInWarp];

			const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
			const float4 rbXn_targetVelW = nextRbxn_maxImpulseW;
			const float resp0 = nextResp0;
			const float resp1 = nextResp1;
			const float appliedForce = nextAppliedForce;
			const float3 deltaRALin = nextDeltaRALin;
			const float3 deltaRAAng = nextDeltaRAAng;
			const float3 deltaRBLin = nextDeltaRBLin;
			const float3 deltaRBAng = nextDeltaRBAng;

			if ((i + 1) < numFrictionConstr)
			{
				// Prefetch the next friction row (pointer 'resp' is still the cursor here - the
				// shadowing float 'resp' below is declared after these uses).
				const PxgBlockSolverContactFriction& f2 = frictions[i + 1];
				nextRaxn_extraCoeff = Pxldcg(f2.raXn_bias[threadIndexInWarp]);
				nextRbxn_maxImpulseW = Pxldcg(f2.rbXn_targetVelW[threadIndexInWarp]);
				nextResp0 = Pxldcg(f2.resp0[threadIndexInWarp]);
				nextResp1 = Pxldcg(f2.resp1[threadIndexInWarp]);
				nextAppliedForce = Pxldcg(f2.appliedForce[threadIndexInWarp]);

				nextDeltaRALin = make_float3(Pxldcg(resp->deltaRALin_x[threadIndexInWarp]), Pxldcg(resp->deltaRALin_y[threadIndexInWarp]), Pxldcg(resp->deltaRALin_z[threadIndexInWarp]));
				nextDeltaRAAng = make_float3(Pxldcg(resp->deltaRAAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRAAng_z[threadIndexInWarp]));
				nextDeltaRBLin = make_float3(Pxldcg(resp->deltaRBLin_x[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_y[threadIndexInWarp]), Pxldcg(resp->deltaRBLin_z[threadIndexInWarp]));
				nextDeltaRBAng = make_float3(Pxldcg(resp->deltaRBAng_x[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_y[threadIndexInWarp]), Pxldcg(resp->deltaRBAng_z[threadIndexInWarp]));
			}

			const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
			const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);

			// NB: this float 'resp' shadows the response-table pointer 'resp' for the rest of this
			// iteration; all pointer uses for this iteration happen above.
			const float resp = ref0 * resp0 + ref1 * resp1;
			const float velMultiplier = (resp > PX_EPS_REAL) ? (p8 / resp) : 0.f;

			const float bias = raXn_extraCoeff.w;
			const float targetVel = rbXn_targetVelW.w;

			const Cm::UnAlignedSpatialVector deltaVA(ref0 * PxVec3(deltaRAAng.x, deltaRAAng.y, deltaRAAng.z),
				ref0 * PxVec3(deltaRALin.x, deltaRALin.y, deltaRALin.z));
			const Cm::UnAlignedSpatialVector deltaVB(ref1 * PxVec3(deltaRBAng.x, deltaRBAng.y, deltaRBAng.z),
				ref1 * PxVec3(deltaRBLin.x, deltaRBLin.y, deltaRBLin.z));

			// This local 'normal' is the friction (tangent) direction, shadowing the contact normal.
			const PxVec3 normal = PxVec3(frictionNormal.x, frictionNormal.y, frictionNormal.z);

			const float v0_ = v0.top.dot(raXn) + v0.bottom.dot(normal);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
			const float v1_ = v1.top.dot(rbXn) + v1.bottom.dot(normal);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
			const float normalVel = v0_ - v1_;

			const float tmp1 = appliedForce - (bias - targetVel) * velMultiplier;
			const float totalImpulse = tmp1 - normalVel * velMultiplier;

			// Static friction cone check; if exceeded, clamp to the dynamic friction limit and mark
			// the friction patch as broken.
			const bool clamp = fabsf(totalImpulse) > maxFrictionImpulse;

			const float totalClamped = fminf(maxDynFrictionImpulse, fmaxf(-maxDynFrictionImpulse, totalImpulse));

			const float newAppliedForce = clamp ? totalClamped : totalImpulse;

			float deltaF = newAppliedForce - appliedForce;//FSub(newAppliedForce, appliedForce);

			if (error)
				error->accumulateErrorLocal(deltaF, velMultiplier);

			//printf("v0 = (%f, %f, %f, %f, %f, %f), v1 = (%f, %f, %f, %f, %f, %f)\n", v0.top.x, v0.top.y, v0.top.z, v0.bottom.x, v0.bottom.y, v0.bottom.z,
			//	v1.top.x, v1.top.y, v1.top.z, v1.bottom.x, v1.bottom.y, v1.bottom.z);

			//printf("normal = (%f, %f, %f), raXn = (%f, %f, %f)\n", normal.x, normal.y, normal.z, raXn.x, raXn.y, raXn.z);

			//printf("Friction velMultiplier = %f, normalVel = %f, deltaF = %f\n", velMultiplier, normalVel, deltaF);

			v0 += deltaVA * deltaF;
			v1 += deltaVB * deltaF;

			imp0.bottom -= raXn * deltaF;
			imp0.top -= normal * deltaF;
			imp1.bottom += rbXn * deltaF;
			imp1.top += normal * deltaF;

			//f.appliedForce[threadIndex] = newAppliedForce;
			Pxstcg(&f.appliedForce[threadIndexInWarp], newAppliedForce);

			broken = broken | clamp;
		}
		Pxstcg(&frictionHeader.broken[threadIndexInWarp], broken);
	}

	// Convert the raw impulse accumulators to the final applied impulses using the inverse-mass /
	// angular-dominance terms and the mass-splitting reference counts, then write back velocities.
	impulse0 = imp0.scale(ref0 * invMass0_1_angDom0_1.z, ref0 * invMass0_1_angDom0_1.x);
	impulse1 = imp1.scale(ref1 * invMass0_1_angDom0_1.y, ref1 * invMass0_1_angDom0_1.w);

	vel0 = v0;
	vel1 = v1;
}
// A light version of the function "solveExtContactsBlock" to quickly check if there is any active contact.
// Replays the normal-constraint force computation at the current velocities WITHOUT writing anything
// back (no velocity updates, no applied-force stores, no impulse accumulation) and returns true as
// soon as one constraint would change its applied force by more than a small epsilon.
//
// Cleanup vs. the full solver: the invMass/angDom header load, the per-row delta-velocity response
// loads and the response-table cursor were dead here (this check never propagates velocities), so
// they are removed to save global-memory traffic. The 'responses' parameter is retained for
// interface compatibility with the solve path.
// NOTE(review): unlike solveExtContactsBlock, the responses are summed without the slab reference
// counts (ref0/ref1 implicitly 1) - presumably acceptable for an activity check; confirm with callers.
static __device__ bool checkExtActiveContactBlock(const PxgBlockConstraintBatch& batch, const Cm::UnAlignedSpatialVector& vel0,
	const Cm::UnAlignedSpatialVector& vel1, PxgBlockSolverContactHeader* contactHeaders,
	PxgBlockSolverContactPoint* contactPoints, const PxgArticulationBlockResponse* const PX_RESTRICT responses,
	const PxU32 threadIndexInWarp)
{
	PX_UNUSED(responses);	// only needed when velocities are propagated; kept for a uniform signature

	PxgBlockSolverContactHeader& contactHeader = contactHeaders[batch.mConstraintBatchIndex];

	const uint numNormalConstr = Pxldcg(contactHeader.numNormalConstr[threadIndexInWarp]);
	const float restitution = contactHeader.restitution[threadIndexInWarp];
	const float cfm = contactHeader.cfm[threadIndexInWarp];
	const PxU8 flags = contactHeader.flags[threadIndexInWarp];

	PxgBlockSolverContactPoint* contacts = &contactPoints[batch.startConstraintIndex];

	const float4 normal_staticFriction = Pxldcg(contactHeader.normal_staticFriction[threadIndexInWarp]);
	const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);

	// Software-pipelined loads: constraint i+1 is fetched while constraint i is evaluated.
	float4 nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndexInWarp]);
	float4 nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndexInWarp]);
	float nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndexInWarp]);
	float nextResp0 = Pxldcg(contacts[0].resp0[threadIndexInWarp]);
	float nextResp1 = Pxldcg(contacts[0].resp1[threadIndexInWarp]);
	float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndexInWarp]);
	float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndexInWarp]);

	for (uint i = 0; i < numNormalConstr; i++)
	{
		// Consume the values prefetched on the previous iteration.
		const float4 raXn_targetVelocity = nextRaxn_extraCoeff;
		const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
		const float appliedForce = nextAppliedForce;
		const float resp0 = nextResp0;
		const float resp1 = nextResp1;
		const float coeff0 = nextCoeff0;
		const float coeff1 = nextCoeff1;

		if ((i + 1) < numNormalConstr)
		{
			const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
			nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndexInWarp]);
			nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndexInWarp]);
			nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndexInWarp]);
			nextResp0 = Pxldcg(nextC.resp0[threadIndexInWarp]);
			nextResp1 = Pxldcg(nextC.resp1[threadIndexInWarp]);
			nextCoeff0 = Pxldcg(nextC.coeff0[threadIndexInWarp]);
			nextCoeff1 = Pxldcg(nextC.coeff1[threadIndexInWarp]);
		}

		const PxVec3 raXn = PxVec3(raXn_targetVelocity.x, raXn_targetVelocity.y, raXn_targetVelocity.z);
		const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
		const float targetVelocity = raXn_targetVelocity.w;
		const float maxImpulse = rbXn_maxImpulse.w;

		float unitResponse = resp0 + resp1;
		float recipResponse = (unitResponse > 0.f) ? 1.f / (unitResponse + cfm) : 0.f;

		float velMultiplier = recipResponse;
		float impulseMul = 1.f;
		float unbiasedError;
		float biasedErr;
		computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0, coeff1,
			velMultiplier, impulseMul, unbiasedError, biasedErr);

		// Relative normal velocity at the contact; the input velocities are never modified here.
		const float v0_ = vel0.bottom.dot(normal) + vel0.top.dot(raXn);
		const float v1_ = vel1.bottom.dot(normal) + vel1.top.dot(rbXn);
		const float normalVel = v0_ - v1_;

		// Same clamped delta-force computation as the full solver (KS - clamp the maximum force).
		const float tempDeltaF = biasedErr - normalVel * velMultiplier;
		const float _deltaF = fmaxf(tempDeltaF, -appliedForce);
		const float _newForce = appliedForce * impulseMul + _deltaF;
		const float newForce = fminf(_newForce, maxImpulse);
		const float deltaF = newForce - appliedForce;

		// Any non-negligible force change means this contact is still active.
		if (PxAbs(deltaF) > 1.0e-8f)
		{
			return true;
		}
	}
	return false;
}
// To apply mass-splitting, different data is stored and used when computing impulses.
// Apart from mass-splitting, the formulation is the same as the previous implementation, see "setupFinalizeExtSolverConstraintsBlock"
static __device__ PX_FORCE_INLINE void solveExtContactBlockTGS(const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0, Cm::UnAlignedSpatialVector& vel1, const Cm::UnAlignedSpatialVector& delta0, const Cm::UnAlignedSpatialVector& delta1,
const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints, PxgTGSBlockSolverContactFriction* PX_RESTRICT frictionPoints,
PxgArticulationBlockResponse* PX_RESTRICT responses, const PxReal elapsedTime, const PxReal minPen,
Cm::UnAlignedSpatialVector& impulse0, Cm::UnAlignedSpatialVector& impulse1, PxgErrorAccumulator* error,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
PxVec3 linVel0 = vel0.bottom;
PxVec3 linVel1 = vel1.bottom;
PxVec3 angVel0 = vel0.top;
PxVec3 angVel1 = vel1.top;
float accumulatedNormalImpulse = 0.f;
Cm::UnAlignedSpatialVector imp0(PxVec3(0.f), PxVec3(0.f)), imp1(PxVec3(0.f), PxVec3(0.f));
{
PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
PxgTGSBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
const uint numNormalConstr = contactHeader->numNormalConstr[threadIndex];
const uint totalFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];
const uint numFrictionConstr = totalFrictionConstr & (~0x1);
const PxReal maxPenBias = contactHeader->maxPenBias[threadIndex];
PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
PxgArticulationBlockResponse* PX_RESTRICT resp = &responses[batch.mArticulationResponseIndex];
const float4 invMass0_1_angDom0_1 = contactHeader->invMass0_1_angDom0_1[threadIndex];
const float4 normal_staticFriction = contactHeader->normal_staticFriction[threadIndex];
const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
const float p8 = contactHeader->p8[threadIndex];
const float cfm = contactHeader->cfm[threadIndex];
const PxU8 flags = (PxU8)contactHeader->flags[threadIndex];
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
//Bring forward a read event
const float staticFrictionCof = normal_staticFriction.w;
const PxVec3 relMotion = delta0.bottom - delta1.bottom;
const float deltaV = normal.dot(relMotion);
{
for (uint i = 0; i < numNormalConstr; i++)
{
PxgTGSBlockSolverContactPoint& c = contacts[i];
PxgArticulationBlockResponse& r = *resp;
resp++;
const float4 raXn_extraCoeff = c.raXn_extraCoeff[threadIndex];
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
const float compliantContactCoef = raXn_extraCoeff.w;
const float4 rbXn_targetVelW = c.rbXn_targetVelW[threadIndex];
const float appliedForce = c.appliedForce[threadIndex];
const float separation = c.separation[threadIndex];
const float maxImpulse = c.maxImpulse[threadIndex];
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
const float targetVel = rbXn_targetVelW.w;
float biasCoefficient = c.biasCoefficient[threadIndex];
const float resp0 = ref0 * Pxldcs(c.resp0[threadIndex]);
const float resp1 = ref1 * Pxldcs(c.resp1[threadIndex]);
const float unitResponse = resp0 + resp1;
const float recipResponse = (unitResponse > 0.f) ? (1.f / (unitResponse + cfm)) : 0.f;
float velMultiplier = recipResponse;
if (restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, unitResponse, recipResponse,
compliantContactCoef, velMultiplier, biasCoefficient);
}
//Compute the normal velocity of the constraint.
const PxReal v0 = angVel0.dot(raXn) + linVel0.dot(normal);
const PxReal v1 = angVel1.dot(rbXn) + linVel1.dot(normal);
const float normalVel = (v0 - v1);
const PxReal deltaBias = deltaV + delta0.top.dot(raXn) - delta1.top.dot(rbXn) - targetVel * elapsedTime;
const float sep = PxMax(minPen, separation + deltaBias);
const PxReal biased = PxMin(-maxPenBias, biasCoefficient * sep);
const PxReal tVelBias = recipResponse * biased;
//KS - clamp the maximum force
const float tempDeltaF = tVelBias - (normalVel - targetVel) * velMultiplier;
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
const float _newForce = appliedForce + _deltaF;
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
const float deltaF = newForce - appliedForce;
PxVec3 deltaRALin = ref0 * PxVec3(r.deltaRALin_x[threadIndex], r.deltaRALin_y[threadIndex], r.deltaRALin_z[threadIndex]);
PxVec3 deltaRAAng = ref0 * PxVec3(r.deltaRAAng_x[threadIndex], r.deltaRAAng_y[threadIndex], r.deltaRAAng_z[threadIndex]);
PxVec3 deltaRBLin = ref1 * PxVec3(r.deltaRBLin_x[threadIndex], r.deltaRBLin_y[threadIndex], r.deltaRBLin_z[threadIndex]);
PxVec3 deltaRBAng = ref1 * PxVec3(r.deltaRBAng_x[threadIndex], r.deltaRBAng_y[threadIndex], r.deltaRBAng_z[threadIndex]);
linVel0 += deltaRALin * deltaF;
linVel1 += deltaRBLin * deltaF;
angVel0 += deltaRAAng * deltaF;
angVel1 += deltaRBAng * deltaF;
imp0.top -= normal * deltaF;
imp0.bottom -= raXn * deltaF;
imp1.top += normal * deltaF;
imp1.bottom += rbXn * deltaF;
if(error)
error->accumulateErrorLocal(deltaF, velMultiplier);
c.appliedForce[threadIndex] = newForce;
accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
}
accumulatedNormalImpulse = PxMax(accumulatedNormalImpulse, contactHeader->minNormalForce[threadIndex]);
}
if (numFrictionConstr)
{
PxgTGSBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];
const float biasCoefficient = frictionHeader->biasCoefficient[threadIndex];
const float dynamicFrictionCof = frictionHeader->dynamicFriction[threadIndex];
const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
PxU32 broken = 0;
const float4 frictionNormal0 = frictionHeader->frictionNormals[0][threadIndex];
const float4 frictionNormal1 = frictionHeader->frictionNormals[1][threadIndex];
const PxVec3 normal0 = PxVec3(frictionNormal0.x, frictionNormal0.y, frictionNormal0.z);
const PxVec3 normal1 = PxVec3(frictionNormal1.x, frictionNormal1.y, frictionNormal1.z);
const PxReal deltaMotion0 = normal0.dot(relMotion);
const PxReal deltaMotion1 = normal1.dot(relMotion);
for (uint i = 0; i < numFrictionConstr; i += 2)
{
PxgTGSBlockSolverContactFriction& f0 = frictions[i];
PxgArticulationBlockResponse& r0 = *resp;
resp++;
PxgTGSBlockSolverContactFriction& f1 = frictions[i + 1];
PxgArticulationBlockResponse& r1 = *resp;
resp++;
const float4 raXn_error0 = f0.raXn_error[threadIndex];
const float4 rbXn_targetVelW0 = f0.rbXn_targetVelW[threadIndex];
const float initialError0 = raXn_error0.w;
const float appliedForce0 = f0.appliedForce[threadIndex];
const float targetVel0 = rbXn_targetVelW0.w;
const float4 raXn_error1 = f1.raXn_error[threadIndex];
const float4 rbXn_targetVelW1 = f1.rbXn_targetVelW[threadIndex];
const float initialError1 = raXn_error1.w;
const float appliedForce1 = f1.appliedForce[threadIndex];
const float targetVel1 = rbXn_targetVelW1.w;
const PxVec3 raXn0 = PxVec3(raXn_error0.x, raXn_error0.y, raXn_error0.z);
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
const PxVec3 raXn1 = PxVec3(raXn_error1.x, raXn_error1.y, raXn_error1.z);
const PxVec3 rbXn1 = PxVec3(rbXn_targetVelW1.x, rbXn_targetVelW1.y, rbXn_targetVelW1.z);
const float resp0_0 = ref0 * f0.resp0[threadIndex];
const float resp0_1 = ref1 * f0.resp1[threadIndex];
const float resp0 = resp0_0 + resp0_1;
const float velMultiplier0 = (resp0 > PX_EPS_REAL) ? (p8 / (resp0 + cfm)) : 0.f;
const float resp1_0 = ref0 * f1.resp0[threadIndex];
const float resp1_1 = ref1 * f1.resp1[threadIndex];
const float resp1 = resp1_0 + resp1_1;
const float velMultiplier1 = (resp1 > PX_EPS_REAL) ? (p8 / (resp1 + cfm)) : 0.f;
const PxReal v00 = angVel0.dot(raXn0) + linVel0.dot(normal0);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
const PxReal v10 = angVel1.dot(rbXn0) + linVel1.dot(normal0);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
const float normalVel0 = v00 - v10;
const PxReal v01 = angVel0.dot(raXn1) + linVel0.dot(normal1);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
const PxReal v11 = angVel1.dot(rbXn1) + linVel1.dot(normal1);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
const float normalVel1 = v01 - v11;
const float error0 = initialError0 - targetVel0 * elapsedTime + (raXn0.dot(delta0.top) - rbXn0.dot(delta1.top) + deltaMotion0);
const float bias0 = error0 * biasCoefficient;
const float tmp10 = appliedForce0 - (bias0 - targetVel0) * velMultiplier0;
const float totalImpulse0 = tmp10 - normalVel0 * velMultiplier0;
const float error1 = initialError1 - targetVel1 * elapsedTime + (raXn1.dot(delta0.top) - rbXn1.dot(delta1.top) + deltaMotion1);
const float bias1 = error1 * biasCoefficient;
const float tmp11 = appliedForce1 - (bias1 - targetVel1) * velMultiplier1;
const float totalImpulse1 = tmp11 - normalVel1 * velMultiplier1;
const float totalImpulse = PxSqrt(totalImpulse0 * totalImpulse0 + totalImpulse1 * totalImpulse1);
const bool clamp = totalImpulse > maxFrictionImpulse;
const float ratio = clamp ? fminf(maxDynFrictionImpulse, totalImpulse) / totalImpulse : 1.f;
const PxReal newAppliedForce0 = totalImpulse0 * ratio;
const PxReal newAppliedForce1 = totalImpulse1 * ratio;
float deltaF0 = newAppliedForce0 - appliedForce0;
float deltaF1 = newAppliedForce1 - appliedForce1;
if (error)
error->accumulateErrorLocal(deltaF0, deltaF1, velMultiplier0, velMultiplier1);
linVel0 += ref0 * PxVec3(r0.deltaRALin_x[threadIndex], r0.deltaRALin_y[threadIndex], r0.deltaRALin_z[threadIndex]) * deltaF0;
linVel1 += ref1 * PxVec3(r0.deltaRBLin_x[threadIndex], r0.deltaRBLin_y[threadIndex], r0.deltaRBLin_z[threadIndex]) * deltaF0;
angVel0 += ref0 * PxVec3(r0.deltaRAAng_x[threadIndex], r0.deltaRAAng_y[threadIndex], r0.deltaRAAng_z[threadIndex]) * deltaF0;
angVel1 += ref1 * PxVec3(r0.deltaRBAng_x[threadIndex], r0.deltaRBAng_y[threadIndex], r0.deltaRBAng_z[threadIndex]) * deltaF0;
linVel0 += ref0 * PxVec3(r1.deltaRALin_x[threadIndex], r1.deltaRALin_y[threadIndex], r1.deltaRALin_z[threadIndex]) * deltaF1;
linVel1 += ref1 * PxVec3(r1.deltaRBLin_x[threadIndex], r1.deltaRBLin_y[threadIndex], r1.deltaRBLin_z[threadIndex]) * deltaF1;
angVel0 += ref0 * PxVec3(r1.deltaRAAng_x[threadIndex], r1.deltaRAAng_y[threadIndex], r1.deltaRAAng_z[threadIndex]) * deltaF1;
angVel1 += ref1 * PxVec3(r1.deltaRBAng_x[threadIndex], r1.deltaRBAng_y[threadIndex], r1.deltaRBAng_z[threadIndex]) * deltaF1;
f0.appliedForce[threadIndex] = newAppliedForce0;
f1.appliedForce[threadIndex] = newAppliedForce1;
broken = broken | clamp;
imp0.top -= normal0 * deltaF0;
imp0.bottom -= raXn0 * deltaF0;
imp1.top += normal0 * deltaF0;
imp1.bottom += rbXn0 * deltaF0;
imp0.top -= normal1 * deltaF1;
imp0.bottom -= raXn1 * deltaF1;
imp1.top += normal1 * deltaF1;
imp1.bottom += rbXn1 * deltaF1;
}
if (numFrictionConstr < totalFrictionConstr)
{
//We have a torsional friction constraint
const PxReal frictionScale = frictionHeader->torsionalFrictionScale[threadIndex];
PxgTGSBlockSolverContactFriction& f0 = frictions[numFrictionConstr];
PxgArticulationBlockResponse& r0 = *resp;
resp++;
const float4 raXn_error0 = f0.raXn_error[threadIndex];
const float4 rbXn_targetVelW0 = f0.rbXn_targetVelW[threadIndex];
const float appliedForce0 = f0.appliedForce[threadIndex];
const float targetVel0 = rbXn_targetVelW0.w;
const PxVec3 raXn0 = PxVec3(raXn_error0.x, raXn_error0.y, raXn_error0.z);
const PxVec3 rbXn0 = PxVec3(rbXn_targetVelW0.x, rbXn_targetVelW0.y, rbXn_targetVelW0.z);
const float resp0_0 = ref0 * f0.resp0[threadIndex];
const float resp0_1 = ref1 * f0.resp1[threadIndex];
const float resp0 = resp0_0 + resp0_1;
const float velMultiplier0 = (resp0 > 0.f) ? (p8 / (resp0 + cfm)) : 0.f;
const PxReal v00 = angVel0.dot(raXn0);
const PxReal v10 = angVel1.dot(rbXn0);
const float normalVel0 = v00 - v10;
const float tmp10 = appliedForce0 - (-targetVel0) * velMultiplier0;
const float totalImpulse = tmp10 - normalVel0 * velMultiplier0;
const bool clamp = PxAbs(totalImpulse) > (maxFrictionImpulse * frictionScale);
const PxReal totalClamped = PxClamp(totalImpulse, -maxDynFrictionImpulse * frictionScale, maxDynFrictionImpulse * frictionScale);
const PxReal newAppliedForce = clamp ? totalClamped : totalImpulse;
const PxReal deltaF0 = newAppliedForce - appliedForce0;
if (error)
error->accumulateErrorLocal(deltaF0, velMultiplier0);
linVel0 += ref0 * PxVec3(r0.deltaRALin_x[threadIndex], r0.deltaRALin_y[threadIndex], r0.deltaRALin_z[threadIndex]) * deltaF0;
linVel1 += ref1 * PxVec3(r0.deltaRBLin_x[threadIndex], r0.deltaRBLin_y[threadIndex], r0.deltaRBLin_z[threadIndex]) * deltaF0;
angVel0 += ref0 * PxVec3(r0.deltaRAAng_x[threadIndex], r0.deltaRAAng_y[threadIndex], r0.deltaRAAng_z[threadIndex]) * deltaF0;
angVel1 += ref1 * PxVec3(r0.deltaRBAng_x[threadIndex], r0.deltaRBAng_y[threadIndex], r0.deltaRBAng_z[threadIndex]) * deltaF0;
f0.appliedForce[threadIndex] = newAppliedForce;
broken = broken | clamp;
imp0.bottom -= raXn0 * deltaF0;
imp1.bottom += rbXn0 * deltaF0;
}
frictionHeader->broken[threadIndex] = broken;
}
vel0.bottom = linVel0;
vel0.top = angVel0;
vel1.bottom = linVel1;
vel1.top = angVel1;
impulse0 = imp0.scale(invMass0_1_angDom0_1.x * ref0, invMass0_1_angDom0_1.z * ref0);
impulse1 = imp1.scale(invMass0_1_angDom0_1.y * ref1, invMass0_1_angDom0_1.w * ref1);
}
}
// A light version of the function "solveExtContactBlockTGS" to quickly check if there is any active contact.
// Walks the normal-constraint rows of the batch exactly like the full solver, recomputing the impulse change
// (deltaF) each contact point WOULD produce, but applies nothing: velocities are read-only and the
// articulation responses are skipped over. Returns true as soon as any point would produce a
// non-negligible impulse, false otherwise. Friction rows are not inspected.
//   vel0/vel1     - spatial velocities (top = angular, bottom = linear), read only.
//   delta0/delta1 - accumulated spatial deltas for the TGS bias terms (top = angular, bottom = linear).
//   threadIndex   - lane/slot index into the per-thread arrays of the blocked data layout.
//   elapsedTime   - accumulated sub-timestep time used to advance the target-velocity bias.
//   minPen        - lower clamp applied to the biased separation.
static __device__ PX_FORCE_INLINE bool checkExtActiveContactBlockTGS(const PxgBlockConstraintBatch& batch,
const Cm::UnAlignedSpatialVector& vel0, const Cm::UnAlignedSpatialVector& vel1, const Cm::UnAlignedSpatialVector& delta0, const Cm::UnAlignedSpatialVector& delta1,
const PxU32 threadIndex, PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeaders, PxgTGSBlockSolverContactPoint* PX_RESTRICT contactPoints,
PxgArticulationBlockResponse* PX_RESTRICT responses, const PxReal elapsedTime, const PxReal minPen)
{
// Local copies only; this function never writes velocities back.
PxVec3 linVel0 = vel0.bottom;
PxVec3 linVel1 = vel1.bottom;
PxVec3 angVel0 = vel0.top;
PxVec3 angVel1 = vel1.top;
{
PxgTGSBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
const uint numNormalConstr = contactHeader->numNormalConstr[threadIndex];
const PxReal maxPenBias = contactHeader->maxPenBias[threadIndex];
PxgTGSBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
// Kept only to mirror the full solver's per-contact layout; advanced below but never dereferenced here.
PxgArticulationBlockResponse* PX_RESTRICT resp = &responses[batch.mArticulationResponseIndex];
const float4 normal_staticFriction = contactHeader->normal_staticFriction[threadIndex];
const float restitutionXdt = contactHeader->restitutionXdt[threadIndex];
const float cfm = contactHeader->cfm[threadIndex];
const PxU8 flags = (PxU8)contactHeader->flags[threadIndex];
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
// Bring forward a read event
const PxVec3 relMotion = delta0.bottom - delta1.bottom;
const float deltaV = normal.dot(relMotion);
for(uint i = 0; i < numNormalConstr; i++)
{
PxgTGSBlockSolverContactPoint& c = contacts[i];
resp++;
const float4 raXn_extraCoeff = c.raXn_extraCoeff[threadIndex];
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
const float compliantContactCoef = raXn_extraCoeff.w;
const float4 rbXn_targetVelW = c.rbXn_targetVelW[threadIndex];
const float appliedForce = c.appliedForce[threadIndex];
const float separation = c.separation[threadIndex];
const float maxImpulse = c.maxImpulse[threadIndex];
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
const float targetVel = rbXn_targetVelW.w;
float biasCoefficient = c.biasCoefficient[threadIndex];
const float resp0 = Pxldcs(c.resp0[threadIndex]);
const float resp1 = Pxldcs(c.resp1[threadIndex]);
// Combined unit response of both bodies; cfm softens the inverse below.
const float unitResponse = resp0 + resp1;
const float recipResponse = (unitResponse > 0.f) ? (1.f / (unitResponse + cfm)) : 0.f;
float velMultiplier = recipResponse;
// A negative restitutionXdt encodes a compliant (spring-like) contact; recompute the coefficients for it.
if(restitutionXdt < 0.f)
{
computeCompliantContactCoefficientsTGS(flags, restitutionXdt, unitResponse, recipResponse,
compliantContactCoef, velMultiplier, biasCoefficient);
}
// Compute the normal velocity of the constraint.
const PxReal v0 = angVel0.dot(raXn) + linVel0.dot(normal);
const PxReal v1 = angVel1.dot(rbXn) + linVel1.dot(normal);
const float normalVel = (v0 - v1);
// TGS bias: advance the separation by the accumulated deltas and the target velocity over elapsedTime.
const PxReal deltaBias = deltaV + delta0.top.dot(raXn) - delta1.top.dot(rbXn) - targetVel * elapsedTime;
const float sep = PxMax(minPen, separation + deltaBias);
const PxReal biased = PxMin(-maxPenBias, biasCoefficient * sep);
const PxReal tVelBias = recipResponse * biased;
// KS - clamp the maximum force
const float tempDeltaF = tVelBias - (normalVel - targetVel) * velMultiplier;
const float _deltaF = fmaxf(tempDeltaF, -appliedForce); // FMax(FNegScaleSub(normalVel, velMultiplier,
// biasedErr), FNeg(appliedForce));
const float _newForce = appliedForce + _deltaF;
const float newForce = fminf(_newForce, maxImpulse); // FMin(_newForce, maxImpulse);
const float deltaF = newForce - appliedForce;
// Check for active contact.
// Any impulse change above this small epsilon means the contact would still do work.
if(PxAbs(deltaF) > 1.0e-8f)
{
return true;
}
}
}
return false;
}
#endif

// ---- file boundary: diff-viewer artifacts ("View File" / "@@ -0,0 +1,849 @@") replaced with
// this comment; the content below belongs to a second source file in the same change ----
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SOLVER_BLOCK_CUH__
#define __SOLVER_BLOCK_CUH__
#include "common/PxPhysXCommonConfig.h"
#include <cuda.h>
#include <sm_35_intrinsics.h>
#include "PxgSolverBody.h"
//#include "PxgSolverConstraint1D.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "DyThresholdTable.h"
#include "PxgFrictionPatch.h"
#include "foundation/PxUtilities.h"
#include "PxgConstraintWriteBack.h"
#include "PxgSolverFlags.h"
#include "PxgIntrinsics.h"
#include "stdio.h"
#include "assert.h"
#include "solverResidual.cuh"
#include "constraintPrepShared.cuh"
#include "solverBlockCommon.cuh"
#include "PxgDynamicsConfiguration.h"
#include "DyCpuGpu1dConstraint.h"
using namespace physx;
// Rounds x up to a power of two by smearing its set bits downward and adding one.
// Note the exact semantics of this bit-smear form: an input that is already a power of
// two yields the NEXT larger power (e.g. 8 -> 16), and an input of 0 yields 1.
PX_FORCE_INLINE static __device__ uint32_t nextPowerOfTwo(uint32_t x)
{
	// Successive shifts by 1, 2, 4, 8, 16 propagate the most significant set bit
	// into every lower position, producing (2^k - 1); adding one gives 2^k.
	for (uint32_t shift = 1u; shift < 32u; shift <<= 1u)
		x |= (x >> shift);
	return x + 1u;
}
// Returns true if x is a power of two.
// A power of two has exactly one bit set, so clearing the lowest set bit via
// (x & (x - 1)) yields zero. Zero is explicitly rejected: the bare expression
// (x & (x - 1)) == 0 would also accept 0, which is not a power of two.
PX_FORCE_INLINE static __device__ bool isPowerOfTwo(uint32_t x)
{
	return (x != 0u) && ((x & (x - 1u)) == 0u);
}
// TGS uses three float4s to store linVel, angVel, linDelta, angDelta, while PGS uses 2 float4s
// for linVel and angVel. Default: PGS.
// Bodies are laid out in groups of 32: the group-aligned base index is scaled by the number of
// float4 records each body occupies, while the offset within the group is kept unchanged.
PX_FORCE_INLINE static __device__ PxU32 ComputeAverageBodyBatchStartIndex(const PxU32 bodyIndex, const PxU32 float4sPerBody = 2)
{
	const PxU32 groupBase = bodyIndex & ~31u; // index rounded down to a multiple of 32
	const PxU32 lane      = bodyIndex & 31u;  // position within the group of 32
	return groupBase * float4sPerBody + lane;
}
// Counts how many slabs reference the solver body at 'index' by popcounting its per-slab
// reference bitmasks. Each PxU32 word encodes up to 32 slabs; the words belonging to one body
// are strided by numSolverBodies in encodedReferenceCounts. The result is clamped to at least 1.
PX_FORCE_INLINE static __device__ PxU32 countActiveSlabs(PxU32 index, PxU32 numSlabs, PxU32 numSolverBodies,
	const PxU32* const encodedReferenceCounts)
{
	const PxU32 numWords = (numSlabs + 31) / 32; // ceil(numSlabs / 32): one bitmask word per 32 slabs
	PxU32 active = 0;
	for (PxU32 word = 0; word < numWords; ++word)
		active += static_cast<PxU32>(__popc(encodedReferenceCounts[index + word * numSolverBodies]));
	return PxMax(1u, active);
}
// Clears every per-slab reference bitmask word belonging to the solver body at 'index'.
// The words for one body are strided by numSolverBodies (one PxU32 word per 32 slabs),
// mirroring the layout read by countActiveSlabs.
PX_FORCE_INLINE static __device__ void resetSlabCount(PxU32 index, PxU32 numSlabs, PxU32 numSolverBodies,
	PxU32* PX_RESTRICT encodedReferenceCounts)
{
	const PxU32 numWords = (numSlabs + 31) / 32; // ceil(numSlabs / 32)
	for (PxU32 word = 0; word < numWords; ++word)
		encodedReferenceCounts[index + word * numSolverBodies] = 0u;
}
// Mass-splitting version of 1D constraints; mass-related terms are computed at every sub-timestep. See "setupSolverConstraintBlockGPU".
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
//
// Iterates over all rows of the 1D-constraint batch for this thread's slot, computes the clamped
// impulse change (deltaF) for each row and applies it directly to the four velocity accumulators.
//   batch                    - constraint batch descriptor (selects the header and row ranges).
//   b0LinVel/b0AngVel        - body 0 linear/angular velocity, updated in place.
//   b1LinVel/b1AngVel        - body 1 linear/angular velocity, updated in place.
//   threadIndex              - lane/slot index into the per-thread arrays of the blocked data layout.
//   headers/rowsCon/rowsMod  - blocked constraint data (header, constant row data, mutable row data).
//   residualReportingEnabled - when true, a solver residual is stored per row.
//   ref0/ref1                - mass-splitting scale factors applied to body 0/1 inverse mass,
//                              inverse inertia and response terms.
static __device__ void solve1DBlock(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel, const PxU32 threadIndex,
const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod, bool residualReportingEnabled,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
PxgBlockSolverConstraint1DMod* PX_RESTRICT baseMod = &rowsMod[batch.startConstraintIndex];
PxVec3 linVel0 = b0LinVel;
PxVec3 linVel1 = b1LinVel;
PxVec3 angVel0 = b0AngVel;
PxVec3 angVel1 = b1AngVel;
// Mass-splitting: scale the shared inverse mass/inertia by this slab's reference factors.
float invMass0 = ref0 * header->invMass0D0[threadIndex];
float invMass1 = ref1 * header->invMass1D1[threadIndex];
float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];
for (PxU32 i = 0; i < header->rowCounts[threadIndex]; ++i)
{
PxgBlockSolverConstraint1DCon& ccon = baseCon[i];
PxgBlockSolverConstraint1DMod& cmod = baseMod[i];
// Row data packs the constraint axes with min/max impulse and per-body responses in the w lanes.
const float4 _clinVel0_minImpulse = ccon.lin0XYZ_minImpulse[threadIndex];
const float4 _clinVel1_maxImpulse = ccon.lin1XYZ_maxImpulse[threadIndex];
const float4 _cangVel0_resp0 = ccon.ang0XYZ_resp0[threadIndex];
const float4 _cangVel1_resp1 = ccon.ang1XYZ_resp1[threadIndex];
const PxVec3 clinVel0(_clinVel0_minImpulse.x, _clinVel0_minImpulse.y, _clinVel0_minImpulse.z);
const PxVec3 clinVel1(_clinVel1_maxImpulse.x, _clinVel1_maxImpulse.y, _clinVel1_maxImpulse.z);
const PxVec3 cangVel0(_cangVel0_resp0.x, _cangVel0_resp0.y, _cangVel0_resp0.z);
const PxVec3 cangVel1(_cangVel1_resp1.x, _cangVel1_resp1.y, _cangVel1_resp1.z);
const PxReal resp0 = ref0 * _cangVel0_resp0.w;
const PxReal resp1 = ref1 * _cangVel1_resp1.w;
const PxReal initJointSpeed = ccon.initJointSpeed[threadIndex];
const PxReal coeff0 = cmod.coeff0[threadIndex];
const PxReal coeff1 = cmod.coeff1[threadIndex];
const PxU32 flags = cmod.flags[threadIndex];
// Combined unit response of both bodies for this row (already mass-split via ref0/ref1).
const PxReal unitResponse = resp0 + resp1;
//https://omniverse-jirasw.nvidia.com/browse/PX-4383
const PxReal minRowResponse = DY_MIN_RESPONSE;
const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);
PxReal constant, unbiasedConstant, vMul, iMul;
bool isSpring = flags & DY_SC_FLAG_SPRING;
bool isAccelerationSpring = flags & DY_SC_FLAG_ACCELERATION_SPRING;
Dy::compute1dConstraintSolverConstantsPGS(isSpring, isAccelerationSpring, coeff0, coeff1, initJointSpeed, unitResponse,
recipResponse, constant, unbiasedConstant, vMul, iMul);
// For velocity iterations, "constant" is overwritten by "unbiasedConstant".
// This is currently done by assigning coeff1 to coeff0 in "conclude1DBlock".
const float appliedForce = cmod.appliedForce[threadIndex];//FLoad(c.appliedForce);
const float maxImpulse = _clinVel1_maxImpulse.w;//FLoad(c.maxImpulse);
const float minImpulse = _clinVel0_minImpulse.w;//FLoad(c.minImpulse);
// Project the current body velocities onto the row's constraint axes; normalVel is the relative speed.
const float v0 = linVel0.dot(clinVel0) + angVel0.dot(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
const float v1 = linVel1.dot(clinVel1) + angVel1.dot(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));
const float normalVel = v0 - v1;
// Accumulate and clamp the total impulse, then apply only this iteration's change (deltaF).
const float unclampedForce = iMul*appliedForce + (vMul*normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);
cmod.appliedForce[threadIndex] = clampedForce;
if(residualReportingEnabled)
cmod.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
linVel0 = linVel0 + clinVel0*(deltaF*invMass0);//V3ScaleAdd(clinVel0, FMul(deltaF, invMass0), linVel0);
linVel1 = linVel1 - clinVel1*(deltaF*invMass1);//V3NegScaleSub(clinVel1, FMul(deltaF, invMass1), linVel1);
angVel0 = angVel0 + cangVel0*deltaF*invInertiaScale0;//V3ScaleAdd(cangVel0, deltaF, angVel0);
angVel1 = angVel1 - cangVel1*deltaF*invInertiaScale1;//V3NegScaleSub(cangVel1, deltaF, angVel1);
}
// Write the integrated velocities back to the caller's accumulators.
b0LinVel = linVel0;
b0AngVel = angVel0;
b1LinVel = linVel1;
b1AngVel = angVel1;
}
// Mass-splitting version of 1D constraints; mass-related terms are computed at every sub-timestep. See "setupArtiSolverConstraintBlockGPU".
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
//
// Articulation ("ext") variant of solve1DBlock: velocity changes are applied through the
// precomputed articulation response vectors (artiResponse) instead of plain invMass/invInertia,
// and the raw per-row impulses are accumulated and handed back through impluse0/impluse1 [sic].
//   vel0/vel1         - spatial velocities (top = angular, bottom = linear), updated in place.
//   impluse0/impluse1 - accumulated impulses, read and written in place; see NOTE(review) at the end.
//   ref0/ref1         - mass-splitting scale factors for body 0/1 mass, inertia and response terms.
static __device__ void solveExt1DBlock(const PxgBlockConstraintBatch& batch,
Cm::UnAlignedSpatialVector& vel0,
Cm::UnAlignedSpatialVector& vel1,
const PxU32 threadIndex,
const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers,
PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon,
PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod,
PxgArticulationBlockResponse* PX_RESTRICT artiResponse,
Cm::UnAlignedSpatialVector& impluse0,
Cm::UnAlignedSpatialVector& impluse1,
bool residualReportingEnabled,
PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
PxgBlockSolverConstraint1DCon* PX_RESTRICT baseCon = &rowsCon[batch.startConstraintIndex];
PxgBlockSolverConstraint1DMod* PX_RESTRICT baseMod = &rowsMod[batch.startConstraintIndex];
PxVec3 linVel0 = vel0.bottom;
PxVec3 linVel1 = vel1.bottom;
PxVec3 angVel0 = vel0.top;
PxVec3 angVel1 = vel1.top;
// Impulse accumulators: li* collect impulses along the linear axes, ai* along the angular axes.
PxVec3 li0 = impluse0.bottom;
PxVec3 li1 = impluse1.bottom;
PxVec3 ai0 = impluse0.top;
PxVec3 ai1 = impluse1.top;
// Mass-splitting: scale the shared inverse mass/inertia by this slab's reference factors.
float invMass0 = ref0 * header->invMass0D0[threadIndex];
float invMass1 = ref1 * header->invMass1D1[threadIndex];
float invInertiaScale0 = ref0 * header->invInertiaScale0[threadIndex];
float invInertiaScale1 = ref1 * header->invInertiaScale1[threadIndex];
const float cfm = header->cfm[threadIndex];
const PxU32 numRows = header->rowCounts[threadIndex];
for (PxU32 i = 0; i < numRows; ++i)
{
PxgBlockSolverConstraint1DCon& ccon = baseCon[i];
PxgBlockSolverConstraint1DMod& cmod = baseMod[i];
PxgArticulationBlockResponse& response = artiResponse[i];
// Row data packs the constraint axes with min/max impulse and per-body responses in the w lanes.
const float4 _clinVel0_minImpulse = ccon.lin0XYZ_minImpulse[threadIndex];
const float4 _clinVel1_maxImpulse = ccon.lin1XYZ_maxImpulse[threadIndex];
const float4 _cangVel0_resp0 = ccon.ang0XYZ_resp0[threadIndex];
const float4 _cangVel1_resp1 = ccon.ang1XYZ_resp1[threadIndex];
const PxReal resp0 = ref0 * _cangVel0_resp0.w;
const PxReal resp1 = ref1 * _cangVel1_resp1.w;
const PxReal initJointSpeed = ccon.initJointSpeed[threadIndex];
const PxVec3 clinVel0(_clinVel0_minImpulse.x, _clinVel0_minImpulse.y, _clinVel0_minImpulse.z);
const PxVec3 clinVel1(_clinVel1_maxImpulse.x, _clinVel1_maxImpulse.y, _clinVel1_maxImpulse.z);
const PxVec3 cangVel0(_cangVel0_resp0.x, _cangVel0_resp0.y, _cangVel0_resp0.z);
const PxVec3 cangVel1(_cangVel1_resp1.x, _cangVel1_resp1.y, _cangVel1_resp1.z);
const PxReal coeff0 = cmod.coeff0[threadIndex];
const PxReal coeff1 = cmod.coeff1[threadIndex];
const PxU32 flags = cmod.flags[threadIndex];
// Combined unit response plus the constraint-force-mixing term (softens the row's response).
const PxReal unitResponse = resp0 + resp1 + cfm;
//https://omniverse-jirasw.nvidia.com/browse/PX-4383
const PxReal minRowResponse = DY_MIN_RESPONSE;
const PxReal recipResponse = Dy::computeRecipUnitResponse(unitResponse, minRowResponse);
PxReal constant, unbiasedConstant, vMul, iMul;
bool isSpring = flags & DY_SC_FLAG_SPRING;
bool isAccelerationSpring = flags & DY_SC_FLAG_ACCELERATION_SPRING;
compute1dConstraintSolverConstantsPGS(isSpring, isAccelerationSpring, coeff0, coeff1, initJointSpeed, unitResponse, recipResponse,
constant, unbiasedConstant, vMul, iMul);
// For velocity iterations, "constant" is overwritten by "unbiasedConstant".
// This is currently done by assigning coeff1 to coeff0 in "conclude1DBlock".
const float appliedForce = cmod.appliedForce[threadIndex];//FLoad(c.appliedForce);
const float maxImpulse = _clinVel1_maxImpulse.w;//FLoad(c.maxImpulse);
const float minImpulse = _clinVel0_minImpulse.w;//FLoad(c.minImpulse);
// Project the current body velocities onto the row's constraint axes; normalVel is the relative speed.
const float v0 = linVel0.dot(clinVel0) + angVel0.dot(cangVel0);//V3MulAdd(linVel0, clinVel0, V3Mul(angVel0, cangVel0));
const float v1 = linVel1.dot(clinVel1) + angVel1.dot(cangVel1);//V3MulAdd(linVel1, clinVel1, V3Mul(angVel1, cangVel1));
const float normalVel = v0 - v1;
// Accumulate and clamp the total impulse, then apply only this iteration's change (deltaF).
const float unclampedForce = iMul * appliedForce + (vMul * normalVel + constant);//FMulAdd(iMul, appliedForce, FMulAdd(vMul, normalVel, constant));
const float clampedForce = fminf(maxImpulse, fmaxf(minImpulse, unclampedForce));//FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
const float deltaF = clampedForce - appliedForce;//FSub(clampedForce, appliedForce);
cmod.appliedForce[threadIndex] = clampedForce;
if(residualReportingEnabled)
cmod.residual[threadIndex] = PxgErrorAccumulator::calculateResidual(deltaF, vMul);
// Accumulate the raw impulse applied along each axis.
li0 = clinVel0 * deltaF + li0;
ai0 = cangVel0 * deltaF + ai0;
li1 = clinVel1 * deltaF + li1;
ai1 = cangVel1 * deltaF + ai1;
// Apply the impulse through the precomputed articulation response vectors (mass-split by ref0/ref1).
PxVec3 linVa = ref0 * PxVec3(response.deltaRALin_x[threadIndex], response.deltaRALin_y[threadIndex], response.deltaRALin_z[threadIndex]);
PxVec3 angVa = ref0 * PxVec3(response.deltaRAAng_x[threadIndex], response.deltaRAAng_y[threadIndex], response.deltaRAAng_z[threadIndex]);
PxVec3 linVb = ref1 * PxVec3(response.deltaRBLin_x[threadIndex], response.deltaRBLin_y[threadIndex], response.deltaRBLin_z[threadIndex]);
PxVec3 angVb = ref1 * PxVec3(response.deltaRBAng_x[threadIndex], response.deltaRBAng_y[threadIndex], response.deltaRBAng_z[threadIndex]);
linVel0 = linVa * deltaF + linVel0;
angVel0 = angVa * deltaF + angVel0;
linVel1 = linVb * deltaF + linVel1;
angVel1 = angVb * deltaF + angVel1;
}
vel0.top = angVel0; vel0.bottom = linVel0;
vel1.top = angVel1; vel1.bottom = linVel1;
// NOTE(review): the accumulators were initialized as li* = impluse*.bottom and ai* = impluse*.top,
// but the write-back below is swapped (.top receives the linear li*, .bottom the angular ai*).
// This matches the top-linear/bottom-angular impulse convention of the contact solvers in this
// file, yet is asymmetric with the reads above -- confirm the callers' convention before relying on it.
impluse0.top = li0 * invMass0; impluse0.bottom = ai0 * invInertiaScale0;
impluse1.top = li1 * invMass1; impluse1.bottom = ai1 * invInertiaScale1;
}
// Prepares each 1D-constraint row for the concluding (unbiased) solver phase: for non-spring
// rows, the unbiased constant coefficient (coeff1) is copied over the biased one (coeff0), so
// subsequent iterations use "unbiasedConstant" as "constant". Spring rows are skipped because
// for them the condition is automatically satisfied.
// See also "queryReduced1dConstraintSolverConstantsPGS".
static __device__ void conclude1DBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers, PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod)
{
	using namespace physx;
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT hdr = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT rows = &rowsMod[batch.startConstraintIndex];
	const PxU32 rowCount = hdr->rowCounts[threadIndex];
	for (PxU32 row = 0; row < rowCount; ++row)
	{
		PxgBlockSolverConstraint1DMod& mod = rows[row];
		const bool isSpring = (mod.flags[threadIndex] & DY_SC_FLAG_SPRING) != 0;
		if (!isSpring)
		{
			// Overwrite the biased constant with the unbiased one.
			mod.coeff0[threadIndex] = mod.coeff1[threadIndex];
		}
	}
}
// Mass-splitting version of contact constraints; mass-related terms are computed at every sub-timestep. See "setupFinalizeSolverConstraintsBlock".
// Refer to "Mass Splitting for Jitter-Free Parallel Rigid Body Simulation" for the general mass-splitting concept.
static __device__ void solveContactBlock(const PxgBlockConstraintBatch& batch, PxVec3& b0LinVel, PxVec3& b0AngVel, PxVec3& b1LinVel, PxVec3& b1AngVel, bool doFriction, const PxU32 threadIndex,
PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders, PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictionPoints,
PxgErrorAccumulator* error, PxReal ref0 = 1.f, PxReal ref1 = 1.f)
{
using namespace physx;
PxVec3 linVel0 = b0LinVel;
PxVec3 linVel1 = b1LinVel;
PxVec3 angVel0 = b0AngVel;
PxVec3 angVel1 = b1AngVel;
{
//printf("Normal batchIndex = %i, startConstraint = %i, startFriction = %i\n", batch.mConstraintBatchIndex, batch.startConstraintIndex, batch.startFrictionIndex);
PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
PxgBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
const uint numNormalConstr = Pxldcg(contactHeader->numNormalConstr[threadIndex]);
const uint numFrictionConstr = Pxldcg(frictionHeader->numFrictionConstr[threadIndex]);
PxgBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
PxgBlockSolverContactFriction* PX_RESTRICT frictions = &frictionPoints[batch.startFrictionIndex];
float accumulatedNormalImpulse = 0.f;
const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader->invMass0_1_angDom0_1[threadIndex]);
const float invMassA = ref0 * invMass0_1_angDom0_1.x;
const float invMassB = ref1 * invMass0_1_angDom0_1.y;
const float angDom0 = ref0 * invMass0_1_angDom0_1.z;
const float angDom1 = ref1 * invMass0_1_angDom0_1.w;
const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
const float restitution = contactHeader->restitution[threadIndex];
const float p8 = 0.8f;
const PxU8 flags = contactHeader->flags[threadIndex];
const PxVec3 delLinVel0 = normal * invMassA;
const PxVec3 delLinVel1 = normal * invMassB;
//Bring forward a read event
const float staticFrictionCof = normal_staticFriction.w;
float4 nextRaxn_extraCoeff;
float4 nextRbxn_maxImpulseW;
float nextAppliedForce;
float nextResp0;
float nextResp1;
{
nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndex]);
nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndex]);
nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndex]);
nextResp0 = Pxldcg(contacts[0].resp0[threadIndex]);
nextResp1 = Pxldcg(contacts[0].resp1[threadIndex]);
float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndex]);
float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndex]);
for (uint i = 0; i < numNormalConstr; i++)
{
PxgBlockSolverContactPoint& c = contacts[i];
const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
const float appliedForce = nextAppliedForce;
const float resp0 = nextResp0;
const float resp1 = nextResp1;
const float coeff0 = nextCoeff0;
const float coeff1 = nextCoeff1;
if ((i + 1) < numNormalConstr)
{
const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndex]);
nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndex]);
nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndex]);
nextResp0 = Pxldcg(nextC.resp0[threadIndex]);
nextResp1 = Pxldcg(nextC.resp1[threadIndex]);
nextCoeff0 = Pxldcg(nextC.coeff0[threadIndex]);
nextCoeff1 = Pxldcg(nextC.coeff1[threadIndex]);
}
else if (numFrictionConstr && doFriction)
{
nextRaxn_extraCoeff = Pxldcg(frictions[0].raXn_bias[threadIndex]);
nextRbxn_maxImpulseW = Pxldcg(frictions[0].rbXn_targetVelW[threadIndex]);
nextAppliedForce = Pxldcg(frictions[0].appliedForce[threadIndex]);
nextResp0 = Pxldcg(frictions[0].resp0[threadIndex]);
nextResp1 = Pxldcg(frictions[0].resp1[threadIndex]);
}
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
const float targetVelocity = raXn_extraCoeff.w;
const float maxImpulse = rbXn_maxImpulse.w;
const float unitResponse = ref0 * resp0 + ref1 * resp1;
const float recipResponse = (unitResponse > 0.f) ? (1.f / unitResponse) : 0.f;
float velMultiplier = recipResponse;
float impulseMul = 1.0f;
float unbiasedError = 0.0f;
float biasedErr = 0.0f;
computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0,
coeff1, velMultiplier, impulseMul, unbiasedError, biasedErr);
//Compute the normal velocity of the constraint.
const float v0 = linVel0.dot(normal) + angVel0.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
const float v1 = linVel1.dot(normal) + angVel1.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
const float normalVel = v0 - v1;
//KS - clamp the maximum force
const float tempDeltaF = biasedErr - normalVel * velMultiplier;
const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
const float _newForce = appliedForce * impulseMul + _deltaF;
const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
const float deltaF = newForce - appliedForce;
linVel0 += delLinVel0 * deltaF;
linVel1 -= delLinVel1 * deltaF;
angVel0 += raXn * (deltaF * angDom0);
angVel1 -= rbXn * (deltaF * angDom1);
if(error)
error->accumulateErrorLocal(deltaF, velMultiplier);
Pxstcg(&c.appliedForce[threadIndex], newForce);
accumulatedNormalImpulse = accumulatedNormalImpulse + newForce;
}
}
if (numFrictionConstr && doFriction)
{
const float dynamicFrictionCof = Pxldcg(frictionHeader->dynamicFriction[threadIndex]);
const float maxFrictionImpulse = staticFrictionCof * accumulatedNormalImpulse;
const float maxDynFrictionImpulse = dynamicFrictionCof * accumulatedNormalImpulse;
PxU32 broken = 0;
for (uint i = 0; i < numFrictionConstr; i++)
{
PxgBlockSolverContactFriction& f = frictions[i];
const float4 frictionNormal = Pxldg(frictionHeader->frictionNormals[i & 1][threadIndex]);
const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
const float4 rbXn_targetVelW = nextRbxn_maxImpulseW;
const float resp0 = nextResp0;
const float resp1 = nextResp1;
const float appliedForce = nextAppliedForce;
if ((i + 1) < numFrictionConstr)
{
const PxgBlockSolverContactFriction& f2 = frictions[i + 1];
nextRaxn_extraCoeff = Pxldcg(f2.raXn_bias[threadIndex]);
nextRbxn_maxImpulseW = Pxldcg(f2.rbXn_targetVelW[threadIndex]);
nextAppliedForce = Pxldcg(f2.appliedForce[threadIndex]);
nextResp0 = Pxldcg(f2.resp0[threadIndex]);
nextResp1 = Pxldcg(f2.resp1[threadIndex]);
}
const PxVec3 normal = PxVec3(frictionNormal.x, frictionNormal.y, frictionNormal.z);
const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
const PxVec3 rbXn = PxVec3(rbXn_targetVelW.x, rbXn_targetVelW.y, rbXn_targetVelW.z);
const float resp = ref0 * resp0 + ref1 * resp1;
const float velMultiplier = (resp > 0.f) ? (p8 / resp) : 0.f;
const float bias = raXn_extraCoeff.w;
const PxVec3 delLinVel0 = normal * invMassA;
const PxVec3 delLinVel1 = normal * invMassB;
const float targetVel = rbXn_targetVelW.w;
const float v0 = angVel0.dot(raXn) + linVel0.dot(normal);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
const float v1 = angVel1.dot(rbXn) + linVel1.dot(normal);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
const float normalVel = v0 - v1;
const float tmp1 = appliedForce - (bias - targetVel) * velMultiplier;
const float totalImpulse = tmp1 - normalVel * velMultiplier;
const bool clamp = fabsf(totalImpulse) > maxFrictionImpulse;
const float totalClamped = fminf(maxDynFrictionImpulse, fmaxf(-maxDynFrictionImpulse, totalImpulse));
const float newAppliedForce = clamp ? totalClamped : totalImpulse;
float deltaF = newAppliedForce - appliedForce;//FSub(newAppliedForce, appliedForce);
if (error)
error->accumulateErrorLocal(deltaF, velMultiplier);
linVel0 += delLinVel0 * deltaF;
linVel1 -= delLinVel1 * deltaF;
angVel0 += raXn * (deltaF * angDom0);
angVel1 -= rbXn * (deltaF * angDom1);
Pxstcg(&f.appliedForce[threadIndex], newAppliedForce);
broken = broken | clamp;
}
Pxstcg(&frictionHeader->broken[threadIndex], broken);
}
}
// Write back
b0LinVel = linVel0;
b0AngVel = angVel0;
b1LinVel = linVel1;
b1AngVel = angVel1;
}
// A light version of the function "solveContactBlock" to quickly check if there is any active contact.
// TODO: Make this even lighter.
//
// Walks the normal-constraint rows of one blocked contact batch and returns true
// as soon as any row would still apply a non-negligible impulse
// (|deltaF| > 1e-8) given the current body velocities; returns false when every
// row is converged. Performs no velocity updates and no global-memory writes.
//
// NOTE(review): Pxldcg presumably performs a cached/read-only global load
// (ld.global.cg style) - confirm against its definition in the CUDA utils header.
static __device__ bool checkActiveContactBlock(const PxgBlockConstraintBatch& batch, const PxVec3& linVel0,
	const PxVec3& angVel0, const PxVec3& linVel1, const PxVec3& angVel1, const PxU32 threadIndex,
	PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverContactPoint* contactPoints)
{
	using namespace physx;
	{
		PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		const uint numNormalConstr = Pxldcg(contactHeader->numNormalConstr[threadIndex]);
		PxgBlockSolverContactPoint* PX_RESTRICT contacts = &contactPoints[batch.startConstraintIndex];
		// NOTE(review): invMass0_1_angDom0_1 is loaded but never used in this
		// light-weight variant - presumably left over from solveContactBlock;
		// confirm whether the load can be removed.
		const float4 invMass0_1_angDom0_1 = Pxldcg(contactHeader->invMass0_1_angDom0_1[threadIndex]);
		const float4 normal_staticFriction = Pxldcg(contactHeader->normal_staticFriction[threadIndex]);
		const PxVec3 normal = PxVec3(normal_staticFriction.x, normal_staticFriction.y, normal_staticFriction.z);
		const float restitution = contactHeader->restitution[threadIndex];
		const PxU8 flags = contactHeader->flags[threadIndex];
		// Software-pipelined loads: "next" values are fetched one iteration
		// ahead of their use to hide global-memory latency inside the loop.
		float4 nextRaxn_extraCoeff;
		float4 nextRbxn_maxImpulseW;
		float nextAppliedForce;
		float nextResp0;
		float nextResp1;
		{
			// Prime the pipeline with row 0.
			nextRaxn_extraCoeff = Pxldcg(contacts[0].raXn_targetVelocity[threadIndex]);
			nextRbxn_maxImpulseW = Pxldcg(contacts[0].rbXn_maxImpulse[threadIndex]);
			nextAppliedForce = Pxldcg(contacts[0].appliedForce[threadIndex]);
			nextResp0 = Pxldcg(contacts[0].resp0[threadIndex]);
			nextResp1 = Pxldcg(contacts[0].resp1[threadIndex]);
			float nextCoeff0 = Pxldcg(contacts[0].coeff0[threadIndex]);
			float nextCoeff1 = Pxldcg(contacts[0].coeff1[threadIndex]);
			for (uint i = 0; i < numNormalConstr; i++)
			{
				// Consume the prefetched values for row i.
				const float4 raXn_extraCoeff = nextRaxn_extraCoeff;
				const float4 rbXn_maxImpulse = nextRbxn_maxImpulseW;
				const float appliedForce = nextAppliedForce;
				const float resp0 = nextResp0;
				const float resp1 = nextResp1;
				const float coeff0 = nextCoeff0;
				const float coeff1 = nextCoeff1;
				// Prefetch row i+1 (if any) for the next iteration.
				if ((i + 1) < numNormalConstr)
				{
					const PxgBlockSolverContactPoint& nextC = contacts[i + 1];
					nextRaxn_extraCoeff = Pxldcg(nextC.raXn_targetVelocity[threadIndex]);
					nextRbxn_maxImpulseW = Pxldcg(nextC.rbXn_maxImpulse[threadIndex]);
					nextAppliedForce = Pxldcg(nextC.appliedForce[threadIndex]);
					nextResp0 = Pxldcg(nextC.resp0[threadIndex]);
					nextResp1 = Pxldcg(nextC.resp1[threadIndex]);
					nextCoeff0 = Pxldcg(nextC.coeff0[threadIndex]);
					nextCoeff1 = Pxldcg(nextC.coeff1[threadIndex]);
				}
				const PxVec3 raXn = PxVec3(raXn_extraCoeff.x, raXn_extraCoeff.y, raXn_extraCoeff.z);
				const PxVec3 rbXn = PxVec3(rbXn_maxImpulse.x, rbXn_maxImpulse.y, rbXn_maxImpulse.z);
				const float targetVelocity = raXn_extraCoeff.w;
				const float maxImpulse = rbXn_maxImpulse.w;
				// NOTE(review): unlike solveContactBlock, the responses are not
				// weighted by the ref0/ref1 factors here - confirm this is an
				// intentional simplification of the light-weight check.
				const float unitResponse = resp0 + resp1;
				const float recipResponse = (unitResponse > 0.f) ? (1.f / unitResponse) : 0.f;
				float velMultiplier = recipResponse;
				float impulseMul = 1.0f;
				float unbiasedError = 0.0f;
				float biasedErr = 0.0f;
				computeContactCoefficients(flags, restitution, unitResponse, recipResponse, targetVelocity, coeff0,
					coeff1, velMultiplier, impulseMul, unbiasedError, biasedErr);
				//Compute the normal velocity of the constraint.
				const float v0 = linVel0.dot(normal) + angVel0.dot(raXn);//V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
				const float v1 = linVel1.dot(normal) + angVel1.dot(rbXn);//V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
				const float normalVel = v0 - v1;
				//KS - clamp the maximum force
				const float tempDeltaF = biasedErr - normalVel * velMultiplier;
				const float _deltaF = fmaxf(tempDeltaF, -appliedForce);//FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
				const float _newForce = appliedForce * impulseMul + _deltaF;
				const float newForce = fminf(_newForce, maxImpulse);//FMin(_newForce, maxImpulse);
				const float deltaF = newForce - appliedForce;
				// Check for active contact.
				if (PxAbs(deltaF) > 1.0e-8f)
				{
					return true;
				}
			}
		}
	}
	return false;
}
// Finalizes one blocked contact batch after the main solver iterations:
// for non-compliant contacts the normal rows switch over to their unbiased
// error term (coeff1 copied into coeff0), and the per-row friction bias
// (w component of raXn_bias) is cleared so later passes do not re-apply it.
static __device__ void concludeContactBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders,
	PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictions)
{
	using namespace physx;

	const PxgBlockSolverContactHeader* header = &contactHeaders[batch.mConstraintBatchIndex];
	const PxgBlockSolverFrictionHeader* fricHeader = &frictionHeaders[batch.mConstraintBatchIndex];

	const uint32_t nbContacts = header->numNormalConstr[threadIndex];
	const uint32_t nbFrictions = fricHeader->numFrictionConstr[threadIndex];

	// Assign the unbiased error to the biased error slot. A negative
	// restitution marks a compliant contact, which already enforces this and
	// therefore needs no update here.
	if (nbContacts != 0 && header->restitution[threadIndex] >= 0.f)
	{
		PxgBlockSolverContactPoint* rows = &contactPoints[batch.startConstraintIndex];
		for (uint32_t i = 0; i < nbContacts; ++i)
			rows[i].coeff0[threadIndex] = rows[i].coeff1[threadIndex];
	}

	// Zero the bias term on every friction row of this batch.
	PxgBlockSolverContactFriction* fricRows = &frictions[batch.startFrictionIndex];
	for (uint32_t i = 0; i < nbFrictions; ++i)
	{
		float4 raXn_bias = fricRows[i].raXn_bias[threadIndex];
		raXn_bias.w = 0.f;
		fricRows[i].raXn_bias[threadIndex] = raXn_bias;
	}
}
// Writes the converged solver results of one blocked contact batch back to
// host-visible buffers: per-contact applied normal forces, friction patch
// anchors/impulses (via writeBackContactBlockFriction), the friction "broken"
// flag, and - when the pair requested force thresholds - an element appended
// to the shared threshold stream for contact force reports.
static __device__ void writeBackContactBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex,
	const PxgSolverBodyData* bodies, Dy::ThresholdStreamElement* thresholdStream,
	PxI32* sharedThresholdStreamIndex, PxgBlockSolverContactHeader* contactHeaders, PxgBlockSolverFrictionHeader* frictionHeaders,
	PxgBlockSolverContactPoint* contactPoints, PxgBlockSolverContactFriction* frictions,
	PxF32* forcewritebackBuffer, PxgBlockFrictionPatch& frictionPatchBlock,
	PxgFrictionPatchGPU* frictionPatches)
{
	const PxU32 bodyAIndex = batch.bodyAIndex[threadIndex];
	const PxU32 bodyBIndex = batch.bodyBIndex[threadIndex];
	const PxgSolverBodyData& bd0 = bodies[bodyAIndex];
	const PxgSolverBodyData& bd1 = bodies[bodyBIndex];
	bool forceThreshold = false;
	float normalForce = 0.f;
	{
		const PxgBlockSolverContactHeader* PX_RESTRICT contactHeader = &contactHeaders[batch.mConstraintBatchIndex];
		const PxgBlockSolverFrictionHeader* PX_RESTRICT frictionHeader = &frictionHeaders[batch.mConstraintBatchIndex];
		// 0xFFFFFFFF marks "no force writeback requested" for this pair.
		PxU32 forceWritebackOffset = contactHeader->forceWritebackOffset[threadIndex];
		forceThreshold = contactHeader->flags[threadIndex] & PxgSolverContactFlags::eHAS_FORCE_THRESHOLDS;
		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr[threadIndex];
		const PxU32 numNormalConstr = contactHeader->numNormalConstr[threadIndex];
		if(forceWritebackOffset!=0xFFFFFFFF)
		{
			// Copy each contact's applied normal force out and accumulate the
			// batch total for the threshold-stream report below.
			PxReal* vForceWriteback = &forcewritebackBuffer[forceWritebackOffset];
			PxgBlockSolverContactPoint* c = &contactPoints[batch.startConstraintIndex];
			for(PxU32 i=0; i<numNormalConstr; i++)
			{
				const PxReal appliedForce = c[i].appliedForce[threadIndex];//FStore(c->getAppliedForce());
				*vForceWriteback++ = appliedForce;
				normalForce += appliedForce;
			}
		}
		// Persist friction anchors/impulses for this patch (if it has one).
		writeBackContactBlockFriction(threadIndex, numFrictionConstr, frictionHeader,
			frictionPatchBlock, frictions + batch.startFrictionIndex, frictionPatches);
		if(numFrictionConstr && frictionHeader->broken[threadIndex])
		{
			frictionPatchBlock.broken[threadIndex] = 1;
		}
	}
	float reportThreshold0 = bd0.reportThreshold;
	float reportThreshold1 = bd1.reportThreshold;
	// Only emit a threshold-stream element when the pair asked for force
	// thresholds, produced a non-zero force, and at least one body has a
	// finite report threshold.
	if((forceThreshold && normalForce !=0 && (reportThreshold0 < PX_MAX_REAL || reportThreshold1 < PX_MAX_REAL)))
	{
		//ToDo : support PxgThresholdStreamElement
		Dy::ThresholdStreamElement elt;
		elt.normalForce = normalForce;
		elt.threshold = PxMin<float>(reportThreshold0, reportThreshold1);
		elt.nodeIndexA = bd0.islandNodeIndex;
		elt.nodeIndexB = bd1.islandNodeIndex;
		elt.shapeInteraction = batch.shapeInteraction[threadIndex];
		// Canonical ordering (A < B) so pairs can be matched/deduplicated later.
		PxOrder(elt.nodeIndexA, elt.nodeIndexB);
		assert(elt.nodeIndexA < elt.nodeIndexB);
		// Atomically reserve a unique slot in the shared stream.
		PxI32 index = atomicAdd(sharedThresholdStreamIndex, 1);
		//KS - force a 16-byte coalesced write
		//((float4*)thresholdStream)[index] = *((float4*)&elt);
		thresholdStream[index] = elt;
	}
}
// Writes back the results of one blocked 1D (joint) constraint batch:
// accumulates the applied linear/angular impulses of all force-outputting rows,
// sums the squared residual error, and stores them - together with a
// "broken" flag for breakable joints - into the constraint writeback slot.
static __device__ void writeBack1DBlock(const PxgBlockConstraintBatch& batch, const PxU32 threadIndex, const PxgBlockSolverConstraint1DHeader* PX_RESTRICT headers,
	PxgBlockSolverConstraint1DCon* PX_RESTRICT rowsCon, PxgBlockSolverConstraint1DMod* PX_RESTRICT rowsMod,PxgConstraintWriteback* constraintWriteBacks)
{
	const PxgBlockSolverConstraint1DHeader* PX_RESTRICT header = &headers[batch.mConstraintBatchIndex];
	PxgBlockSolverConstraint1DCon* conBase = &rowsCon[batch.startConstraintIndex];
	PxgBlockSolverConstraint1DMod* PX_RESTRICT modBase = &rowsMod[batch.startConstraintIndex];
	// 0xFFFFFFFF marks "no writeback requested" for this constraint.
	PxU32 forceWritebackOffset = header->writeBackOffset[threadIndex];
	const PxU8 breakable = header->breakable[threadIndex];
	const PxU32 numRows = header->rowCounts[threadIndex];
	if (forceWritebackOffset != 0xFFFFFFFF)
	{
		PxgConstraintWriteback& writeback = constraintWriteBacks[forceWritebackOffset];
		PxVec3 linVel(0), angVel(0);
		PxReal constraintErrorSq = 0.0f;
		for (PxU32 i = 0; i < numRows; ++i)
		{
			PxgBlockSolverConstraint1DCon& con = conBase[i];
			PxgBlockSolverConstraint1DMod& mod = modBase[i];
			// Only rows flagged for force output contribute to the reported impulse.
			if (mod.flags[threadIndex] & DY_SC_FLAG_OUTPUT_FORCE)
			{
				const float4 lin0XYZ_minImpulse = con.lin0XYZ_minImpulse[threadIndex];
				const PxVec3 lin0(lin0XYZ_minImpulse.x, lin0XYZ_minImpulse.y, lin0XYZ_minImpulse.z);
				const PxVec3 ang0WriteBack = mod.ang0Writeback[threadIndex];
				const PxReal appliedForce = mod.appliedForce[threadIndex];
				// Applied impulse projected onto this row's linear/angular axes.
				linVel += lin0 * appliedForce;
				angVel += ang0WriteBack *appliedForce;
			}
			// Accumulate squared residual for residual reporting.
			PxReal err = mod.residual[threadIndex];
			constraintErrorSq += err * err;
		}
		const float4 body0WorldOffset_linBreakImpulse = header->body0WorldOffset_linBreakImpulse[threadIndex];
		const PxVec3 body0WorldOffset(body0WorldOffset_linBreakImpulse.x, body0WorldOffset_linBreakImpulse.y, body0WorldOffset_linBreakImpulse.z);
		// Shift the angular impulse so it is expressed about body0's origin
		// rather than the constraint anchor.
		angVel -= body0WorldOffset.cross(linVel);
		// A breakable joint is broken when either impulse magnitude exceeds its
		// break threshold (w of body0WorldOffset_linBreakImpulse holds the
		// linear break impulse).
		const PxU32 broken = breakable ? PxU32((linVel.magnitude() > body0WorldOffset_linBreakImpulse.w) || (angVel.magnitude() > header->angBreakImpulse[threadIndex])) : 0;
		writeback.angularImpulse_residual = make_float4(angVel.x, angVel.y, angVel.z, constraintErrorSq);
		// The broken flag is encoded in the sign bit of w (-0.0f) so the
		// magnitude remains zero either way.
		writeback.linearImpulse_broken = make_float4(linVel.x, linVel.y, linVel.z, broken ? -0.0f : 0.0f);
	}
}
#endif

View File

@@ -0,0 +1,67 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SOLVER_BLOCK_COMMON_CUH__
#define __SOLVER_BLOCK_COMMON_CUH__
// Persists per-patch friction data (anchor positions and the accumulated
// friction impulses) to the GPU friction-patch buffer. Each anchor owns two
// friction rows - one per tangent direction - so the anchor count is
// numFrictionConstr / 2 and anchor 'a' reads rows 2a and 2a+1.
// A patch index of 0xFFFFFFFF means this thread has no patch to write.
template <typename FRICTION_HEADER, typename FRICTION>
static __device__ void writeBackContactBlockFriction(
	const PxU32 threadIndex, PxU32 numFrictionConstr, const FRICTION_HEADER* PX_RESTRICT frictionHeader,
	PxgBlockFrictionPatch& frictionPatchBlock, FRICTION* fric, PxgFrictionPatchGPU* frictionPatches
)
{
	const PxU32 patch = frictionPatchBlock.patchIndex[threadIndex];
	if (patch == 0xFFFFFFFF)
		return;

	PxgFrictionPatchGPU& out = frictionPatches[patch];
	const float4 n0 = frictionHeader->frictionNormals[0][threadIndex];
	const float4 n1 = frictionHeader->frictionNormals[1][threadIndex];
	const PxVec3 dir0(n0.x, n0.y, n0.z);
	const PxVec3 dir1(n1.x, n1.y, n1.z);

	out.anchors = numFrictionConstr / 2;
	for (PxU32 a = 0; a < 2; ++a)
	{
		// Anchor 'a' exists only when its two friction rows exist.
		if (numFrictionConstr >= 2 * (a + 1))
		{
			const float4 p = frictionPatchBlock.anchorPoints[a][threadIndex];
			out.points[a] = PxVec3(p.x, p.y, p.z);
			const PxReal i0 = fric[2 * a].appliedForce[threadIndex];
			const PxReal i1 = fric[2 * a + 1].appliedForce[threadIndex];
			// Recombine the two scalar row impulses along their tangent axes.
			out.impulses[a] = dir0 * i0 + dir1 * i1;
		}
	}
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,120 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgDynamicsContext.h"
#include "PxgKernelWrangler.h"
#include "PxgArticulationCore.h"
#include "PxgCudaSolverCore.h"
namespace physx
{
// Constructs the GPU rigid-body dynamics context: initializes the "world"
// static body placeholders, creates the articulation and solver cores, sets up
// CUDA streams, and allocates the pinned host memory pools used to mirror
// contact/patch/force/friction-patch streams between host and device.
PxgDynamicsContext::PxgDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
	const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
	bool enableStabilization, bool useEnhancedDeterminism,
	PxReal maxBiasCoefficient,
	PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
	bool frictionEveryIteration, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled)
	:
	PxgGpuContext(flushPool, islandManager, maxNumPartitions, maxNumStaticPartitions, enableStabilization, useEnhancedDeterminism, maxBiasCoefficient, simStats, heapMemoryManager, lengthScale, enableDirectGPUAPI, contextID, isResidualReportingEnabled, false)
{
	// The "world" body represents the static environment: zero velocity,
	// zero inverse mass, and thresholds/clamps set so it never triggers
	// force reports or bias clamping.
	mWorldSolverBody.linearVelocity = PxVec3(0);
	mWorldSolverBody.angularVelocity = PxVec3(0);
	mWorldSolverBodyData.invMass = 0;
	mWorldSolverBodyData.reportThreshold = PX_MAX_REAL;
	mWorldSolverBodyData.maxImpulse = PX_MAX_REAL;
	mWorldSolverBodyData.penBiasClamp = -PX_MAX_REAL;
	mWorldSolverBodyData.initialAngVel = mWorldSolverBodyData.initialLinVel = PxVec3(0.f);
	mWorldSolverBodyData.body2World = PxAlignedTransform(PxIdentity);
	mWorldSolverBodyData.islandNodeIndex = PxNodeIndex(PX_INVALID_NODE);
	mWorldSolverBodyData.offsetSlop = 0.f;
	mWorldTxIData.sqrtInvInertia = PxMat33(PxZero);
	mWorldTxIData.deltaBody2World = PxTransform(PxIdentity);
	{
		// Create the GPU cores and cross-wire the articulation core back to
		// this context.
		mGpuArticulationCore = PX_NEW(PxgArticulationCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, heapMemoryManager);
		mGpuSolverCore = PX_NEW(PxgCudaSolverCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, this, heapMemoryManager, config, frictionEveryIteration);
		mGpuArticulationCore->setGpuContext(this);
	}
	// CUDA work below requires the context to be current on this thread;
	// released again at the end of the constructor.
	mGpuSolverCore->acquireContext();
	mGpuSolverCore->createStreams();
	createThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	createForceChangeThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	mPinnedMemoryAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.tempBufferCapacity);
	// Contact/patch streams are double-buffered; mCurrentContactStream selects
	// the active buffer.
	mCurrentContactStream = 0;
	mContactStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mContactStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mPatchStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	mPatchStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	// NOTE(review): force stream reserves 2 PxReal per contact - presumably
	// normal force + an auxiliary value; confirm against the force writeback
	// layout.
	mForceStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxReal) * 2);
	mFrictionPatchStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxFrictionPatch));
	// Point the stream pools at their backing pinned allocations and reset the
	// shared host/GPU write indices.
	mContactStreamPool.mDataStream = mContactStreamAllocators[mCurrentContactStream]->mStart;
	mContactStreamPool.mDataStreamSize = (PxU32)mContactStreamAllocators[mCurrentContactStream]->mTotalSize;
	mContactStreamPool.mSharedDataIndex = 0;
	mContactStreamPool.mSharedDataIndexGPU = 0;
	mPatchStreamPool.mDataStream = mPatchStreamAllocators[mCurrentContactStream]->mStart;
	mPatchStreamPool.mDataStreamSize = (PxU32)mPatchStreamAllocators[mCurrentContactStream]->mTotalSize;
	mPatchStreamPool.mSharedDataIndex = 0;
	mPatchStreamPool.mSharedDataIndexGPU = 0;
	mForceStreamPool.mDataStream = mForceStreamAllocator->mStart;
	mForceStreamPool.mDataStreamSize = (PxU32)mForceStreamAllocator->mTotalSize;
	mForceStreamPool.mSharedDataIndex = 0;
	mForceStreamPool.mSharedDataIndexGPU = 0;
	mFrictionPatchStreamPool.mDataStream = mFrictionPatchStreamAllocator->mStart;
	mFrictionPatchStreamPool.mDataStreamSize = PxTo32(mFrictionPatchStreamAllocator->mTotalSize);
	mFrictionPatchStreamPool.mSharedDataIndex = 0;
	mFrictionPatchStreamPool.mSharedDataIndexGPU = 0;
	//Arbitrarily-large number to reserve to minimize allocation churn.
	mConstraintsPerPartition.reserve(1024);
	mArtiConstraintsPerPartition.reserve(1024);
	mGpuSolverCore->releaseContext();
}
// Self-destruction entry point: runs the destructor explicitly and then frees
// the raw allocation via PX_FREE_THIS. Presumably the object was allocated
// through the PhysX allocator rather than plain new, so `delete this` must not
// be used here.
void PxgDynamicsContext::destroy()
{
	this->~PxgDynamicsContext();
	PX_FREE_THIS;
}
}

View File

@@ -0,0 +1,66 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgBroadPhase.h"
namespace physx
{
extern "C" void initSolverKernels0();
extern "C" void initSolverKernels1();
extern "C" void initSolverKernels2();
extern "C" void initSolverKernels3();
extern "C" void initSolverKernels4();
extern "C" void initSolverKernels5();
extern "C" void initSolverKernels6();
extern "C" void initSolverKernels7();
extern "C" void initSolverKernels9();
extern "C" void initSolverKernels10();
extern "C" void initSolverKernels11();
extern "C" void initSolverKernels13();
// Forces the GPU solver kernel translation units to be linked when PhysX GPU
// is built as a static library: referencing one exported symbol per kernel
// module prevents the linker from discarding them. A no-op in the
// PX_PHYSX_GPU_EXPORTS (shared library) build.
void createPxgSolver()
{
#if !PX_PHYSX_GPU_EXPORTS
	//this call is needed to force PhysXGpuSolver linkage as Static Library!
	initSolverKernels0();
	initSolverKernels1();
	initSolverKernels2();
	initSolverKernels3();
	initSolverKernels4();
	initSolverKernels5();
	initSolverKernels6();
	initSolverKernels7();
	// NOTE(review): initSolverKernels8() and initSolverKernels12() are neither
	// declared nor called - confirm those kernel modules are intentionally
	// absent from this build.
	initSolverKernels9();
	initSolverKernels10();
	initSolverKernels11();
	initSolverKernels13();
#endif
}
}

View File

@@ -0,0 +1,754 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgSolverCore.h"
#include "cudamanager/PxCudaContextManager.h"
#include "cudamanager/PxCudaContext.h"
#include "PxgSimulationController.h"
#include "PxgSimulationCore.h"
#include "common/PxProfileZone.h"
#include "PxgCudaUtils.h"
#include "PxgKernelWrangler.h"
#include "PxgKernelIndices.h"
#include "CudaKernelWrangler.h"
#include "PxgContext.h"
#include "PxgArticulationCore.h"
#include "PxgSolverKernelIndices.h"
#include "PxgDynamicsConfiguration.h"
#include "PxgFrictionPatch.h"
#include "PxgDynamicsContext.h"
#include "PxgArticulationCoreKernelIndices.h"
#include "DyConstraintPrep.h"
#include "PxgIslandContext.h"
#define GPU_CORE_DEBUG 0
using namespace physx;
// Binds all five radix-sort device buffers (input/output key and rank arrays
// plus the per-block radix histogram) to the heap allocator, tagged under the
// solver heap statistics. Actual sizing happens later in allocate().
PxgRadixSortBuffers::PxgRadixSortBuffers(PxgHeapMemoryAllocatorManager* heapMemoryManager) :
	mInputKeys(heapMemoryManager, PxsHeapStats::eSOLVER),
	mInputRanks(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutputKeys(heapMemoryManager, PxsHeapStats::eSOLVER),
	mOutputRanks(heapMemoryManager, PxsHeapStats::eSOLVER),
	mRadixCounts(heapMemoryManager, PxsHeapStats::eSOLVER)
{
}
// Fills the two ping-pong radix sort descriptors: pass 1 reads from the
// buffers pass 0 wrote and writes back into pass 0's inputs, so alternating
// passes bounce data between the same two key/rank buffer pairs. Both passes
// share the single radix histogram buffer.
void PxgRadixSortBuffers::constructRadixSortDesc(PxgRadixSortDesc* rsDesc) const
{
	PxU32* keysA = reinterpret_cast<PxU32*>(mInputKeys.getDevicePtr());
	PxU32* ranksA = reinterpret_cast<PxU32*>(mInputRanks.getDevicePtr());
	PxU32* keysB = reinterpret_cast<PxU32*>(mOutputKeys.getDevicePtr());
	PxU32* ranksB = reinterpret_cast<PxU32*>(mOutputRanks.getDevicePtr());
	PxU32* counts = reinterpret_cast<PxU32*>(mRadixCounts.getDevicePtr());

	rsDesc[0].inputKeys = keysA;
	rsDesc[0].inputRanks = ranksA;
	rsDesc[0].outputKeys = keysB;
	rsDesc[0].outputRanks = ranksB;
	rsDesc[0].radixBlockCounts = counts;

	rsDesc[1].inputKeys = keysB;
	rsDesc[1].inputRanks = ranksB;
	rsDesc[1].outputKeys = keysA;
	rsDesc[1].outputRanks = ranksA;
	rsDesc[1].radixBlockCounts = counts;
}
// Sizes the ping-pong key/rank buffers (32 PxU32 entries per contact batch,
// matching the identical expressions used for all four buffers) and the fixed
// 32 * 16 PxU32 radix count buffer.
void PxgRadixSortBuffers::allocate(PxU32 totalContactBatches)
{
	const size_t keyRankBytes = sizeof(PxU32) * totalContactBatches * 32;
	mInputKeys.allocate(keyRankBytes, PX_FL);
	mInputRanks.allocate(keyRankBytes, PX_FL);
	mOutputKeys.allocate(keyRankBytes, PX_FL);
	mOutputRanks.allocate(keyRankBytes, PX_FL);
	mRadixCounts.allocate(sizeof(PxU32) * 32 * 16, PX_FL);
}
// Caches the CUDA context from the context manager and binds every GPU buffer
// to the solver heap allocator (all tracked under PxsHeapStats::eSOLVER).
// Descriptor pointers (mSolverCoreDesc etc.) start NULL and are set up later;
// buffers are sized by the various allocate*() calls, not here.
// NOTE: initializer order must continue to match the member declaration order.
PxgSolverCore::PxgSolverCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, PxgGpuContext* dynamicContext, PxgHeapMemoryAllocatorManager* heapMemoryManager) :
// Non-owning references to the shared GPU infrastructure.
mGpuKernelWranglerManager(gpuKernelWrangler),
mCudaContextManager(cudaContextManager),
mCudaContext(cudaContextManager->getCudaContext()),
mGpuContext(dynamicContext),
mHeapMemoryManager(heapMemoryManager),
// Descriptors, filled in by the derived PGS/TGS cores.
mSolverCoreDesc(NULL),
mPrepareDesc(NULL),
mPrePrepDesc(NULL),
mRsDesc(NULL),
// Counters.
mNbStaticRigidSlabs(0),
mMaxNumStaticPartitions(0),
mTotalContactManagers(0),
mNbPrevExceededForceElements(0),
mNbArticSlabs(0),
// Block-format contact/friction/joint constraint streams.
mContactHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mContactBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mJointHeaderBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mJointRowBlockStreamCon(heapMemoryManager, PxsHeapStats::eSOLVER),
mJointRowBlockStreamMod(heapMemoryManager, PxsHeapStats::eSOLVER),
// Constraint prep pools (contact and 1D-constraint data).
mConstraintContactPrepBlockPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraint1DPrepBlockPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraint1DPrepBlockPoolVel(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraint1DPrepBlockPoolPar(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraintDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraintRowPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiConstraintDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiConstraintRowPool(heapMemoryManager, PxsHeapStats::eSOLVER),
// Solver body state pools and readback buffers.
mSolverBodyPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mTempStaticBodyOutputPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mIslandNodeIndices2(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverBodyIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mOutVelocityPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mOutBody2WorldPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverBodyDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverBodySleepDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mOutArtiVelocityPool(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverTxIDataPool(heapMemoryManager, PxsHeapStats::eSOLVER),
// Partitioning data.
mConstraintsPerPartition(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiConstraintsPerPartition(heapMemoryManager, PxsHeapStats::eSOLVER),
mMotionVelocityArray(heapMemoryManager, PxsHeapStats::eSOLVER),
mBlockConstraintBatches(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiOrderedStaticConstraints(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiOrderedStaticContacts(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverBodyReferences(heapMemoryManager, PxsHeapStats::eSOLVER),
mBlockWorkUnits(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionIndexData(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionNodeData(heapMemoryManager, PxsHeapStats::eSOLVER),
mSolverConstantData(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionStartBatchIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionArticulationStartBatchIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionJointBatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mPartitionArtiJointBatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mDestroyedEdgeIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
// Narrow-phase output / compressed contact buffers and write-back targets.
mNpIndexArray(heapMemoryManager, PxsHeapStats::eSOLVER),
mGpuContactBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mDataBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mCompressedContacts(heapMemoryManager, PxsHeapStats::eSOLVER),
mCompressedPatches(heapMemoryManager, PxsHeapStats::eSOLVER),
mConstraintWriteBackBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mForceBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionPatches(heapMemoryManager, PxsHeapStats::eSOLVER),
// Articulation/rigid static and self interaction bookkeeping.
mArtiStaticContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiStaticJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiStaticContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiStaticJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticContactStartIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mRigidStaticJointStartIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mTempContactUniqueIndicesBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mTempConstraintUniqueIndicesBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mTempContactHeaderBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mTempConstraintHeaderBlockBuffer(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiSelfContactIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiSelfJointIndices(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiSelfContactCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mArtiSelfJointCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mNodeInteractionCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
// Friction patch ping-pong streams (indexed by mCurrentIndex / 1 - mCurrentIndex).
mFrictionPatchBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionAnchorPatchBlockStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionIndexStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionPatchCounts(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionPatchStream(heapMemoryManager, PxsHeapStats::eSOLVER),
mFrictionAnchorPatchStream(heapMemoryManager, PxsHeapStats::eSOLVER),
// Misc state.
mCurrentIndex(0),
mPinnedEvent(NULL),
mCpuIslandNodeIndices(NULL),
mSolverBodyOutputVelocityOffset(0),
mRadixSort(heapMemoryManager)
{}
// These two structures must have the same layout. The compile-time check below
// can only verify that the sizes match; field-by-field equivalence is assumed.
PX_COMPILE_TIME_ASSERT(sizeof(PxgFrictionPatchGPU) == sizeof(PxFrictionPatch));
// Sizes the current frame's block-format friction patch and anchor streams.
// Rigid-contact and articulation-contact batches share the same stream, so the
// allocation covers the sum of both batch counts.
void PxgSolverCore::allocateFrictionPatchStream(PxI32 numContactBatches, PxI32 numArtiContactBatches)
{
	const PxI32 totalBatches = numContactBatches + numArtiContactBatches;
	mFrictionPatchBlockStream[mCurrentIndex].allocate(sizeof(PxgBlockFrictionPatch) * totalBatches, PX_FL);
	mFrictionAnchorPatchBlockStream[mCurrentIndex].allocate(sizeof(PxgBlockFrictionAnchorPatch) * totalBatches, PX_FL);
	/*frictionPatchStream[currentIndex].allocate(sizeof(PxgFrictionPatch) * numArtiContactBatches);
	frictionAnchorPatchStream[currentIndex].allocate(sizeof(PxgFrictionAnchorPatch) * numArtiContactBatches);*/
}
// Sizes the ping-pong per-edge friction patch count buffers.
// The previous frame's slot must keep its data across the resize, so it uses
// the copy-old-data path; the current slot is rewritten and plainly allocated.
void PxgSolverCore::allocateFrictionCounts(PxU32 totalEdges)
{
	const size_t byteSize = totalEdges * sizeof(PxU32);
	mFrictionPatchCounts[1 - mCurrentIndex].allocateCopyOldDataAsync(byteSize, mCudaContext, mStream, PX_FL);
	mFrictionPatchCounts[mCurrentIndex].allocate(byteSize, PX_FL);
}
// Grows the current friction index stream to hold totalFrictionPatchCount
// entries, preserving previously written indices via the async copy-old-data
// path, and returns the (possibly relocated) device pointer.
PxgBlockFrictionIndex* PxgSolverCore::allocateFrictionPatchIndexStream(PxU32 totalFrictionPatchCount)
{
	const size_t byteSize = sizeof(PxgBlockFrictionIndex) * totalFrictionPatchCount;
	mFrictionIndexStream[mCurrentIndex].allocateCopyOldDataAsync(byteSize, mCudaContext, mStream, PX_FL);
	return reinterpret_cast<PxgBlockFrictionIndex*>(mFrictionIndexStream[mCurrentIndex].getDevicePtr());
}
// Reserves one PxU32 interaction count per island node.
void PxgSolverCore::allocateNodeInteractionCounts(PxU32 nbNodes)
{
	mNodeInteractionCounts.allocate(sizeof(PxU32) * nbNodes, PX_FL);
}
// Asynchronously uploads the per-node interaction counts to the device buffer
// sized by allocateNodeInteractionCounts(). The host array must remain valid
// until mStream has been synchronized.
void PxgSolverCore::uploadNodeInteractionCounts(const PxU32* nodeInteractionCounts, PxU32 nbNodes)
{
	mCudaContext->memcpyHtoDAsync(mNodeInteractionCounts.getDevicePtr(), nodeInteractionCounts, nbNodes * sizeof(PxU32), mStream);
}
// DMAs solved body state (velocities, transforms, sleep data) back to the
// pinned host buffers, then launches a single-thread kernel that writes to the
// mapped mPinnedEvent flag - presumably to signal the CPU that all preceding
// work on mStream has completed. When the direct-GPU API is enabled the
// readback copies are skipped and results stay on the device.
void PxgSolverCore::gpuMemDMAbackSolverBodies(float4* solverBodyPool, PxU32 nbSolverBodies,
PxPinnedArray<PxAlignedTransform>& body2WorldPool,
PxPinnedArray<PxgSolverBodySleepData>& solverBodySleepDataPool,
const bool enableDirectGPUAPI)
{
PX_PROFILE_ZONE("GpuDynamics.DMABackBodies", 0);
if (!enableDirectGPUAPI)
{
// NOTE(review): the copy size uses sizeof(PxgSolverBody) while the source is
// mOutVelocityPool, which is allocated as 2 float4s per body - presumably
// sizeof(PxgSolverBody) == 2 * sizeof(float4); confirm against the type.
mCudaContext->memcpyDtoHAsync(solverBodyPool, mOutVelocityPool.getDevicePtr(), sizeof(PxgSolverBody) * nbSolverBodies, mStream);
mCudaContext->memcpyDtoHAsync(body2WorldPool.begin(), mOutBody2WorldPool.getDevicePtr(), sizeof(PxAlignedTransform) * nbSolverBodies, mStream);
mCudaContext->memcpyDtoHAsync(solverBodySleepDataPool.begin(), mSolverBodySleepDataPool.getDevicePtr(), sizeof(PxgSolverBodySleepData) * nbSolverBodies, mStream);
}
// Order mStream relative to mStream2 via mIntegrateEvent before signalling.
synchronizeStreams(mCudaContext, mStream2, mStream, mIntegrateEvent);
CUfunction signalFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::BP_SIGNAL_COMPLETE);
// Clear the host-visible flag; the kernel receives the mapped device alias of
// the same memory and completes the handshake from the GPU side.
*mPinnedEvent = 0;
void* devicePtr = getMappedDevicePtr(mCudaContext, mPinnedEvent);
PxCudaKernelParam signalParams[] =
{
PX_CUDA_KERNEL_PARAM(devicePtr)
};
// 1x1x1 grid, 1x1x1 block: a single thread is sufficient to set the flag.
mCudaContext->launchKernel(signalFunction, 1, 1, 1, 1, 1, 1, 0, mStream, signalParams, sizeof(signalParams), 0, PX_FL);
}
// Allocates the per-body device buffers shared by the PGS and TGS paths and
// uploads this frame's island node indices. numSolverBodies covers
// static + kinematic + dynamic bodies.
void PxgSolverCore::allocateSolverBodyBuffersCommon(PxU32 numSolverBodies, PxPinnedArray<PxNodeIndex>& islandNodeIndices)
{
// Two float4 entries per body.
mMotionVelocityArray.allocate(sizeof(float4) * numSolverBodies * 2, PX_FL);
mSolverBodyIndices.allocate(sizeof(PxU32) * numSolverBodies, PX_FL);
//allocate enough solver body data space(static + kinematic + dynamic), but we just need to dma static and kinematic solver body and preIntegration kernel will
//fill in dynamic solver body data
mSolverBodyDataPool.allocate(sizeof(PxgSolverBodyData) * numSolverBodies, PX_FL);
mSolverBodySleepDataPool.allocate(sizeof(PxgSolverBodySleepData) * numSolverBodies, PX_FL);
mSolverTxIDataPool.allocate(sizeof(PxgSolverTxIData) * numSolverBodies, PX_FL);
mOutVelocityPool.allocate(sizeof(float4)*numSolverBodies * 2, PX_FL); //Output buffer to read back solver body velocities
mOutBody2WorldPool.allocate(sizeof(PxAlignedTransform)*numSolverBodies, PX_FL); //output buffer to read back solver body transform
//allocate enough memory for numArticulations * maxLinks
//mOutArtiVelocityPool.allocate(sizeof(float4)*numActiveActiculations*maxArticulationLinks * 2);
mIslandNodeIndices2.allocate(sizeof(PxNodeIndex) * islandNodeIndices.size(), PX_FL);
// Fill the index table with 0xFFFFFFFF - presumably an "unassigned" sentinel.
mCudaContext->memsetD32Async(mSolverBodyIndices.getDevicePtr(), 0xFFffFFff, numSolverBodies, mStream);
mCudaContext->memcpyHtoDAsync(mIslandNodeIndices2.getDevicePtr(), islandNodeIndices.begin(), sizeof(PxNodeIndex) *islandNodeIndices.size(), mStream);
// Order the solver stream against the articulation core's stream before
// either side consumes the freshly uploaded data.
synchronizeStreams(mCudaContext, mStream, mGpuContext->getArticulationCore()->getStream());
// Keep a host-side view of the node indices for later CPU-side lookups.
mCpuIslandNodeIndices = islandNodeIndices.begin();
}
// Populates the PxgPrePrepDesc consumed by the constraint pre-prep kernels:
// batch counts produced by the partitioning pass, device pointers into this
// core's constraint/contact pools, the D6 joint buffers owned by the
// simulation core, host-side mirrors for CPU-prepped data, and the
// per-partition index/count tables.
void PxgSolverCore::constructConstraintPrePrepDesc(PxgPrePrepDesc& preDesc, PxU32 numBatches, PxU32 numStaticBatches, PxU32 numArtiBatches, PxU32 numArtiStaticBatches,
PxU32 numArtiSelfBatches, const PxgPartitionData& pData, PxContact* cpuCompressedContactsBase, PxContactPatch* cpuCompressedPatchesBase, PxReal* cpuForceBufferBase,
PxU32 nbD6RigidJoint, PxU32 nbD6ArtiJoint, PxU32 nbTotalArtiJoints,
PxsContactManagerOutputIterator& outputIterator, PxU32 maxConstraintPartitions, PxU32 totalActiveBodies, PxU32 nbArticulations,
PxU32 activeBodyStartOffset, Sc::ShapeInteraction** shapeInteractions, PxReal* restDistances, PxsTorsionalFrictionData* torsionalData,
PxU32 nbElementsPerBody, PxU32 numSlabs)
{
// Batch counts from the partitioning pass.
preDesc.blockBatches = reinterpret_cast<PxgBlockConstraintBatch*>(mBlockConstraintBatches.getDevicePtr());
preDesc.numBatches = numBatches;
preDesc.numStaticBatches = numStaticBatches;
preDesc.numArtiBatches = numArtiBatches;
preDesc.numArtiStaticBatches = numArtiStaticBatches; //this is just estimation. we write the actually numArticStaticBatches in artiSumInternalContactAndJointBatches2
preDesc.numArtiSelfBatches = numArtiSelfBatches; //this is also just an estimation.
preDesc.blockWorkUnit = reinterpret_cast<PxgBlockWorkUnit*>(mBlockWorkUnits.getDevicePtr());
// Totals carried over from PxgPartitionData.
preDesc.numTotalContacts = pData.numTotalContacts;
preDesc.numTotalConstraints = pData.numTotalConstraints;
preDesc.numTotalStaticConstraints = pData.numTotalStaticConstraints;
preDesc.numTotalStaticContacts = pData.numTotalStaticContacts;
preDesc.numTotalArtiContacts = pData.numTotalArtiContacts;
preDesc.numTotalArtiConstraints = pData.numTotalArtiConstraints;
preDesc.numTotalStaticArtiContacts = pData.numTotalArtiStaticContacts;
preDesc.numTotalStaticArtiConstraints = pData.numTotalArtiStaticConstraints;
preDesc.numTotalSelfArtiContacts = pData.numTotalArtiSelfContacts;
preDesc.numTotalSelfArtiConstraints = pData.numTotalArtiSelfConstraints;
preDesc.artiStaticConstraintBatchOffset = pData.artiStaticConstraintBatchOffset;
preDesc.artiStaticContactBatchOffset = pData.artiStaticContactBatchOffset;
// Device-side contact data and compressed contact streams.
preDesc.blockContactData = reinterpret_cast<PxgBlockContactData*>(mConstraintContactPrepBlockPool.getDevicePtr());
preDesc.blockContactPoints = reinterpret_cast<PxgBlockContactPoint*>(mGpuContactBlockBuffer.getDevicePtr());
preDesc.compressedContacts = reinterpret_cast<PxContact*>(mCompressedContacts.getDevicePtr());
preDesc.compressedPatches = reinterpret_cast<PxContactPatch*>(mCompressedPatches.getDevicePtr());
preDesc.forceBuffer = reinterpret_cast<PxU8*>(mForceBuffer.getDevicePtr());
preDesc.sharedJointRowIndex = 0;
preDesc.nbD6RigidJoints = nbD6RigidJoint;
preDesc.nbD6ArtiJoints = nbD6ArtiJoint;
preDesc.nbTotalArtiJoints = nbTotalArtiJoints;
// 1D-constraint (joint) prep pools.
preDesc.blockPrepData = reinterpret_cast<PxgBlockConstraint1DData*>(mConstraint1DPrepBlockPool.getDevicePtr());
preDesc.blockPrepVelocityData = reinterpret_cast<PxgBlockConstraint1DVelocities*>(mConstraint1DPrepBlockPoolVel.getDevicePtr());
preDesc.blockPrepParameterData = reinterpret_cast<PxgBlockConstraint1DParameters*>(mConstraint1DPrepBlockPoolPar.getDevicePtr());
//this is the first pass of constraint 1D data which filled in by the GPU for D6 joint. After that, if we have other joint type, which is filled in by CPU, we need to append
//the CPU result in this buffer and do the second pass of data filling for the block format in GPU
preDesc.constraintData = reinterpret_cast<PxgConstraintData*>(mConstraintDataPool.getDevicePtr());
preDesc.constraintRows = reinterpret_cast<Px1DConstraint*>(mConstraintRowPool.getDevicePtr());
preDesc.artiConstraintData = reinterpret_cast<PxgConstraintData*>(mArtiConstraintDataPool.getDevicePtr());
preDesc.artiConstraintRows = reinterpret_cast<Px1DConstraint*>(mArtiConstraintRowPool.getDevicePtr());
// D6 joint buffers are owned by the simulation core, not this solver core.
PxgSimulationCore* simCore = mGpuContext->getSimulationCore();
preDesc.rigidJointData = reinterpret_cast<PxgD6JointData*>(simCore->getD6RigidJointBuffer().getDevicePtr());//reinterpret_cast<PxgD6JointData*>(mD6JointDataPool.getDevicePtr(0));
preDesc.rigidConstraintPrePrep = reinterpret_cast<PxgConstraintPrePrep*>(simCore->getD6RigidJointPrePreBuffer().getDevicePtr());//reinterpret_cast<PxgConstraintPrePrep*>(mD6JointPrePrepPool.getDevicePtr(0));
preDesc.artiJointData = reinterpret_cast<PxgD6JointData*>(simCore->getD6ArtiJointBuffer().getDevicePtr());
preDesc.artiConstraintPrePrep = reinterpret_cast<PxgConstraintPrePrep*>(simCore->getD6ArtiJointPrePreBuffer().getDevicePtr());
// Host-side base pointers for CPU-produced contact/patch/force data.
preDesc.cpuCompressedContactsBase = cpuCompressedContactsBase;
preDesc.cpuCompressedPatchesBase = cpuCompressedPatchesBase;
preDesc.cpuForceBufferBase = cpuForceBufferBase;
preDesc.contactManagerOutputBase = reinterpret_cast<PxsContactManagerOutput*>(mGpuContactManagerOutputBase);
// Shared running counters, reset for this frame.
preDesc.sharedFrictionConstraintIndex = 0;
preDesc.sharedContactConstraintIndex = 0;
preDesc.sharedArticulationResponseIndex = 0;
preDesc.solverBodyIndices = reinterpret_cast<PxU32*>(mSolverBodyIndices.getDevicePtr());
// Partition tables.
preDesc.mPartitionIndices = reinterpret_cast<PartitionIndexData*>(mPartitionIndexData.getDevicePtr());
preDesc.mPartitionstartBatchIndices = reinterpret_cast<PxU32*>(mPartitionStartBatchIndices.getDevicePtr());
preDesc.mPartitionArtiStartBatchIndices = reinterpret_cast<PxU32*>(mPartitionArticulationStartBatchIndices.getDevicePtr());
preDesc.mPartitionJointCounts = reinterpret_cast<PxU32*>(mPartitionJointBatchCounts.getDevicePtr());
preDesc.mPartitionArtiJointCounts = reinterpret_cast<PxU32*>(mPartitionArtiJointBatchCounts.getDevicePtr());
// Friction patch counts ping-pong: current slot vs previous frame's slot.
preDesc.currFrictionPatchCount = reinterpret_cast<PxU32*>(mFrictionPatchCounts[mCurrentIndex].getDevicePtr());
preDesc.prevFrictionPatchCount = reinterpret_cast<PxU32*>(mFrictionPatchCounts[1 - mCurrentIndex].getDevicePtr());
preDesc.mNpOutputIndices = reinterpret_cast<PxU32*>(mNpIndexArray.getDevicePtr());
// Per-bucket offsets into the contact-manager output array.
for (PxU32 i = 0; i < GPU_BUCKET_ID::eCount; ++i)
preDesc.mCmOutputOffsets[i] = outputIterator.getIndex(i + GPU_BUCKET_ID::eCount);
preDesc.mSolverBodyData = reinterpret_cast<PxgSolverBodyData*>(mSolverBodyDataPool.getDevicePtr());
preDesc.mPartitionNodeData = reinterpret_cast<PartitionNodeData*>(mPartitionNodeData.getDevicePtr());
preDesc.mContactConstantData = reinterpret_cast<PxgSolverConstraintManagerConstants*>(mSolverConstantData.getDevicePtr());
preDesc.mBatchHeaders = reinterpret_cast<PxgConstraintBatchHeader*>(mConstraintBatchHeaders);
preDesc.mContactUniqueIndices = reinterpret_cast<PxU32*>(mContactUniqueIndices);
preDesc.mConstraintUniqueIndices = reinterpret_cast<PxU32*>(mConstraintUniqueIndices);
preDesc.mArtiConstraintUniqueIndices = reinterpret_cast<PxU32*>(mArtiConstraintUniqueIndices);
preDesc.mArtiContactUniqueIndices = reinterpret_cast<PxU32*>(mArtiContactUniqueIndices);
preDesc.mSolverBodyReferences = reinterpret_cast<PxgSolverReferences*>(mSolverBodyReferences.getDevicePtr());
// Scalar frame parameters.
preDesc.mMaxConstraintPartitions = maxConstraintPartitions;
preDesc.mTotalSlabs = numSlabs;
preDesc.mTotalActiveBodies = totalActiveBodies;
preDesc.mTotalActiveArticulations = nbArticulations;
preDesc.mActiveBodyStartOffset = activeBodyStartOffset;
preDesc.nbElementsPerBody = nbElementsPerBody;
preDesc.mRestDistances = restDistances;
preDesc.mTorsionalFrictionData = torsionalData;
preDesc.mShapeInteractions = shapeInteractions;
// Articulation static/self and rigid static interaction bookkeeping.
preDesc.mArtiStaticContactIndices = reinterpret_cast<PxU32*>(mArtiStaticContactIndices.getDevicePtr());
preDesc.mArtiStaticConstraintIndices = reinterpret_cast<PxU32*>(mArtiStaticJointIndices.getDevicePtr());
preDesc.mArtiStaticContactCounts = reinterpret_cast<PxU32*>(mArtiStaticContactCounts.getDevicePtr());
preDesc.mArtiStaticConstraintCounts = reinterpret_cast<PxU32*>(mArtiStaticJointCounts.getDevicePtr());
preDesc.mArtiSelfContactIndices = reinterpret_cast<PxU32*>(mArtiSelfContactIndices.getDevicePtr());
preDesc.mArtiSelfConstraintIndices = reinterpret_cast<PxU32*>(mArtiSelfJointIndices.getDevicePtr());
preDesc.mArtiSelfContactCounts = reinterpret_cast<PxU32*>(mArtiSelfContactCounts.getDevicePtr());
preDesc.mArtiSelfConstraintCounts = reinterpret_cast<PxU32*>(mArtiSelfJointCounts.getDevicePtr());
preDesc.mRigidStaticContactIndices = reinterpret_cast<PxU32*>(mRigidStaticContactIndices.getDevicePtr());
preDesc.mRigidStaticConstraintIndices = reinterpret_cast<PxU32*>(mRigidStaticJointIndices.getDevicePtr());
preDesc.mRigidStaticContactCounts = reinterpret_cast<PxU32*>(mRigidStaticContactCounts.getDevicePtr());
preDesc.mRigidStaticConstraintCounts = reinterpret_cast<PxU32*>(mRigidStaticJointCounts.getDevicePtr());
preDesc.mRigidStaticContactStartIndices = reinterpret_cast<PxU32*>(mRigidStaticContactStartIndices.getDevicePtr());
preDesc.mRigidStaticConstraintStartIndices = reinterpret_cast<PxU32*>(mRigidStaticJointStartIndices.getDevicePtr());
// Temporary per-block scratch buffers.
preDesc.mTempContactUniqueIndices = reinterpret_cast<PxU32*>(mTempContactUniqueIndicesBlockBuffer.getDevicePtr());
preDesc.mTempConstraintUniqueIndices = reinterpret_cast<PxU32*>(mTempConstraintUniqueIndicesBlockBuffer.getDevicePtr());
preDesc.mTempContactBlockHeader = reinterpret_cast<PxU32*>(mTempContactHeaderBlockBuffer.getDevicePtr());
preDesc.mTempConstraintBlockHeader = reinterpret_cast<PxU32*>(mTempConstraintHeaderBlockBuffer.getDevicePtr());
}
// Fills the PxgSolverSharedDescBase fields common to the PGS and TGS
// descriptors: timestep constants, the ping-pong friction patch streams
// (current = mCurrentIndex slot, previous = the other slot), and the
// articulation buffers owned by the simulation core.
void PxgSolverCore::constructSolverSharedDescCommon(PxgSolverSharedDescBase& sharedDesc, const PxgConstantData& cData,
Cm::UnAlignedSpatialVector* deferredZ, PxU32* articulationDirty, uint4* articulationSlabMask)
{
sharedDesc.dt = cData.dt;
sharedDesc.invDtF32 = cData.invDtF32;
sharedDesc.blockCurrentFrictionPatches = reinterpret_cast<PxgBlockFrictionPatch*>(mFrictionPatchBlockStream[mCurrentIndex].getDevicePtr());
sharedDesc.blockPreviousFrictionPatches = reinterpret_cast<PxgBlockFrictionPatch*>(mFrictionPatchBlockStream[1 - mCurrentIndex].getDevicePtr());
sharedDesc.currentFrictionPatches = reinterpret_cast<PxgFrictionPatch*>(mFrictionPatchStream[mCurrentIndex].getDevicePtr());
sharedDesc.previousFrictionPatches = reinterpret_cast<PxgFrictionPatch*>(mFrictionPatchStream[1 - mCurrentIndex].getDevicePtr());
// Articulation state lives in the simulation core; the remaining pointers are
// supplied by the caller.
PxgSimulationCore* core = mGpuContext->getSimulationCore();
sharedDesc.mBodySimBufferDeviceData = core->getBodySimBufferDeviceData().getPointer();
sharedDesc.articulations = reinterpret_cast<PxgArticulation*>(core->getArticulationBuffer().getDevicePtr());
sharedDesc.articulationDeferredZ = deferredZ;
sharedDesc.articulationDirty = articulationDirty;
sharedDesc.articulationSlabMask = articulationSlabMask;
sharedDesc.deltaOutOffset = mSolverBodyOutputVelocityOffset;
}
// PT: I don't understand the existing code. We already have a constructSolverSharedDescCommon function above, working on a
// PxgSolverSharedDescBase structure. But there is still plenty of "constructSolverDesc" code that could be shared between
// PGS and TGS when we initialize PxgSolverCoreDesc (which doesn't inherit from PxgSolverSharedDescBase). I just started moving
// that shared code here, without touching the other bits.
// Shared PGS/TGS initialization of PxgSolverCoreDesc: wires the descriptor's
// device pointers to this core's GPU buffers and copies the per-frame scalar
// parameters. Solver-flavour-specific fields are filled in by the caller.
void PxgSolverCore::constructSolverDesc(PxgSolverCoreDesc& scDesc, PxU32 numIslands, PxU32 numSolverBodies, PxU32 numConstraintBatchHeader, PxU32 numArticConstraints, PxU32 numSlabs, bool enableStabilization)
{
	// Per-body output buffers consumed after the solve.
	scDesc.outSolverVelocity = reinterpret_cast<float4*>(mOutVelocityPool.getDevicePtr());
	scDesc.outBody2World = reinterpret_cast<PxAlignedTransform*>(mOutBody2WorldPool.getDevicePtr());
	scDesc.outArtiVelocity = reinterpret_cast<float4*>(mOutArtiVelocityPool.getDevicePtr());

	// Solver body state pools.
	scDesc.solverBodyDataPool = reinterpret_cast<PxgSolverBodyData*>(mSolverBodyDataPool.getDevicePtr());
	scDesc.solverBodyTxIDataPool = reinterpret_cast<PxgSolverTxIData*>(mSolverTxIDataPool.getDevicePtr());
	scDesc.solverBodySleepDataPool = reinterpret_cast<PxgSolverBodySleepData*>(mSolverBodySleepDataPool.getDevicePtr());
	scDesc.solverBodyReferences = reinterpret_cast<PxgSolverReferences*>(mSolverBodyReferences.getDevicePtr());
	scDesc.motionVelocityArray = reinterpret_cast<float4*>(mMotionVelocityArray.getDevicePtr());

	// Constraint write-back, force and friction buffers.
	scDesc.constraintWriteBack = reinterpret_cast<PxgConstraintWriteback*>(mConstraintWriteBackBuffer.getDevicePtr());
	scDesc.forceBuffer = reinterpret_cast<PxF32*>(mForceBuffer.getDevicePtr());
	scDesc.frictionPatches = reinterpret_cast<PxFrictionPatch*>(mFrictionPatches.getDevicePtr());
	scDesc.contactManagerOutputBase = reinterpret_cast<PxsContactManagerOutput*>(mGpuContactManagerOutputBase);

	// Island and partition tables.
	scDesc.islandContextPool = reinterpret_cast<PxgIslandContext*>(mIslandContextPool);
	scDesc.constraintsPerPartition = reinterpret_cast<PxU32*>(mConstraintsPerPartition.getDevicePtr());
	scDesc.artiConstraintsPerPartition = reinterpret_cast<PxU32*>(mArtiConstraintsPerPartition.getDevicePtr());

	// Body-sim mirrors owned by the simulation core.
	PxgSimulationCore* simulationCore = mGpuContext->getSimulationCore();
	scDesc.mBodySimBufferDeviceData = simulationCore->getBodySimBufferDeviceData().getPointer();
	scDesc.mBodySimPrevVelocitiesBufferDeviceData = simulationCore->getBodySimPrevVelocitiesBufferDeviceData().getPointer();

	// Static rigid contact/joint bookkeeping.
	scDesc.mRigidStaticContactCounts = reinterpret_cast<PxU32*>(mRigidStaticContactCounts.getDevicePtr());
	scDesc.mRigidStaticContactStartIndices = reinterpret_cast<PxU32*>(mRigidStaticContactStartIndices.getDevicePtr());
	scDesc.mRigidStaticJointCounts = reinterpret_cast<PxU32*>(mRigidStaticJointCounts.getDevicePtr());
	scDesc.mRigidStaticJointStartIndices = reinterpret_cast<PxU32*>(mRigidStaticJointStartIndices.getDevicePtr());

	// Per-frame scalar parameters.
	scDesc.numIslands = numIslands;
	scDesc.numSolverBodies = numSolverBodies;
	scDesc.numBatches = numConstraintBatchHeader;
	scDesc.numArticBatches = numArticConstraints;
	scDesc.numSlabs = numSlabs;
	scDesc.accumulatedBodyDeltaVOffset = mSolverBodyOutputVelocityOffset;
	scDesc.maxLinksPerArticulation = simulationCore->getMaxArticulationLinks();
	scDesc.enableStabilization = enableStabilization;

	// Force-threshold stream state, reset each frame except the carried-over
	// count of previously exceeded elements.
	scDesc.sharedThresholdStreamIndex = 0;
	scDesc.nbForceChangeElements = 0;
	scDesc.nbExceededThresholdElements = 0;
	scDesc.nbPrevExceededThresholdElements = mNbPrevExceededForceElements;
}
void PxgSolverCore::gpuMemDMAUpJointData(const PxPinnedArray<PxgConstraintData>& cpuJointDataPool, const PxPinnedArray<Px1DConstraint>& cpuJointRowPool,
PxU32 nbCpuJoints, PxU32 nbGpuJoints, PxU32 totalCpuRows)
{
CUdeviceptr startPtr = mConstraintDataPool.getDevicePtr() + nbGpuJoints * sizeof(PxgConstraintData);
CUdeviceptr startRowPtr = mConstraintRowPool.getDevicePtr() + nbGpuJoints * sizeof(Px1DConstraint)*Dy::MAX_CONSTRAINT_ROWS;
mCudaContext->memcpyHtoDAsync(startPtr, cpuJointDataPool.begin(), nbCpuJoints * sizeof(PxgConstraintData), mStream);
mCudaContext->memcpyHtoDAsync(startRowPtr, cpuJointRowPool.begin(), totalCpuRows * sizeof(Px1DConstraint), mStream);
#if GPU_CORE_DEBUG
CUresult result = mCudaContext->streamSynchronize(mStream);
if (result != CUDA_SUCCESS)
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU DMA up cpu joint data fail!!\n");
#endif
}
void PxgSolverCore::gpuMemDMAUpArtiJointData(const PxPinnedArray<PxgConstraintData>& cpuArtiJointDataPool, const PxPinnedArray<Px1DConstraint>& cpuArtiJointRowPool,
PxU32 nbCpuArtiJoints, PxU32 nbGpuArtiJoints, PxU32 totalArtiRows)
{
CUdeviceptr startPtr = mArtiConstraintDataPool.getDevicePtr() + nbGpuArtiJoints * sizeof(PxgConstraintData);
CUdeviceptr startRowPtr = mArtiConstraintRowPool.getDevicePtr() + nbGpuArtiJoints * sizeof(Px1DConstraint)*Dy::MAX_CONSTRAINT_ROWS;
mCudaContext->memcpyHtoDAsync(startPtr, cpuArtiJointDataPool.begin(), nbCpuArtiJoints * sizeof(PxgConstraintData), mStream);
mCudaContext->memcpyHtoDAsync(startRowPtr, cpuArtiJointRowPool.begin(), totalArtiRows * sizeof(Px1DConstraint), mStream);
#if GPU_CORE_DEBUG
CUresult result = mCudaContext->streamSynchronize(mStream);
if (result != CUDA_SUCCESS)
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU DMA up articulation joint data fail!!\n");
#endif
}
// Runs the constraint pre-preparation pipeline asynchronously on mStream:
// 1) two kernels summing the per-body static rigid contact/joint batches, so
//    their start indices exist before the pre-prep kernels consume them,
// 2) the block-format contact constraint pre-prep (one PXG_BATCH_SIZE-wide
//    slot per constraint batch),
// 3) the non-block pre-prep for the nbD6Joints GPU-native D6 joints.
// Launch failures are asserted; execution errors only surface here when
// GPU_CORE_DEBUG is enabled (otherwise at the next stream synchronization).
void PxgSolverCore::constraintPrePrepParallel(PxU32 nbConstraintBatches, PxU32 nbD6Joints, PxU32 numBodies)
{
	PX_PROFILE_ZONE("GpuDynamics.ConstraintPrePrepParallel", 0);

	///////////////////////////////////////
	//New step here!!!
	//We need to prep up the static rigid body contact buffers prior to contact pre-prep
	{
		const CUfunction staticKernel1 = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::RIGID_SUM_STATIC_CONTACT1);

		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(mPrePrepDescd),
			PX_CUDA_KERNEL_PARAM(numBodies)
		};

		const PxU32 nbBlocksRequired = 32;

		CUresult launchResult = mCudaContext->launchKernel(staticKernel1, nbBlocksRequired, 1, 1, PxgKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);

#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU rigidSumInternalContactAndJointBatches1 kernel fail!\n");
#endif
	}

	{
		const CUfunction staticKernel2 = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::RIGID_SUM_STATIC_CONTACT2);

		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(mPrePrepDescd),
			PX_CUDA_KERNEL_PARAM(mSolverCoreDescd),
			PX_CUDA_KERNEL_PARAM(mPrepareDescd),
			PX_CUDA_KERNEL_PARAM(numBodies)
		};

		const PxU32 nbBlocksRequired = 32;

		CUresult launchResult = mCudaContext->launchKernel(staticKernel2, nbBlocksRequired, 1, 1, PxgKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);

#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		// Error message previously said "...Batches1" (copy-paste); this block
		// guards the second summation kernel.
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU rigidSumInternalContactAndJointBatches2 kernel fail!\n");
#endif
	}
	//////////////////////////////////////

	CUfunction kernelFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::CONTACT_CONSTRAINT_PREPREP_BLOCK);

	CUdeviceptr descd = mPrePrepDescd;
	CUdeviceptr shDescd = mSharedDescd;

	PxCudaKernelParam kernelParams[] =
	{
		PX_CUDA_KERNEL_PARAM(descd),
		PX_CUDA_KERNEL_PARAM(shDescd)
	};

	// One PXG_BATCH_SIZE-wide slot per constraint batch, rounded up to blocks.
	const PxU32 nbBlocksRequired = (nbConstraintBatches*PXG_BATCH_SIZE + PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK - 1) / PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK;

	if (nbBlocksRequired > 0)
	{
		CUresult launchResult = mCudaContext->launchKernel(kernelFunction, nbBlocksRequired, 1, 1, PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);
#if GPU_CORE_DEBUG
		CUresult result = mCudaContext->streamSynchronize(mStream);
		PX_ASSERT(result == CUDA_SUCCESS);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU constraintContactBlockPrePrepLaunch kernel fail!\n");
#endif
	}

	const PxU32 nbD6JointsBlocks = (nbD6Joints + PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK - 1) / PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK;
	if (nbD6JointsBlocks > 0)
	{
		//non-block joint constraint pre-prepare
		kernelFunction = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::JOINT_CONSTRAINT_PREPREP);

		CUresult result = mCudaContext->launchKernel(kernelFunction, nbD6JointsBlocks, 1, 1, PxgKernelBlockDim::CONSTRAINT_PREPREP_BLOCK, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(result == CUDA_SUCCESS);
		PX_UNUSED(result);
#if GPU_CORE_DEBUG
		result = mCudaContext->streamSynchronize(mStream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU constraintPrePrepare kernel fail!\n");
#endif
	}
}
// Launches the ZERO_BODIES (PGS) or ZERO_BODIES_TGS kernel on mStream to reset
// the solver body velocity buffers before the solve.
void PxgSolverCore::resetVelocities(bool isTGS)
{
PX_PROFILE_ZONE("GpuDynamics.ZeroBodies", 0);
{
// TGS and PGS use distinct kernels, selected by solver flavour.
CUfunction zeroBodiesFunction =
isTGS ? mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::ZERO_BODIES_TGS)
: mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(PxgKernelIds::ZERO_BODIES);
PxCudaKernelParam kernelParams[] =
{
PX_CUDA_KERNEL_PARAM(mSolverCoreDescd),
PX_CUDA_KERNEL_PARAM(mSharedDescd)
};
CUresult result = mCudaContext->launchKernel(zeroBodiesFunction, PxgKernelGridDim::ZERO_BODIES, 1, 1, PxgKernelBlockDim::ZERO_BODIES, 1, 1, 0, mStream, kernelParams, sizeof(kernelParams), 0, PX_FL);
if (result != CUDA_SUCCESS)
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU zero bodies fail to launch kernel!!\n");
// NOTE(review): this file defines GPU_CORE_DEBUG, but the guard below tests
// GPU_DEBUG - confirm which flag is intended (GPU_DEBUG may be defined
// earlier in this translation unit, outside this excerpt).
#if GPU_DEBUG
result = mCudaContext->streamSynchronize(mStream);
if (result != CUDA_SUCCESS)
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "GPU zero bodies kernel fail!\n");
#endif
}
}
// Launches the mark-active-slab kernel for the given island, tagging the
// constraint slabs that are touched by the last solver partition.
// The TGS and PGS code paths were previously two near-identical ~45-line
// branches; they differ only in the kernel entry point and in the two extra
// scalar parameters (minPen, elapsedTime) that the TGS kernel consumes, so the
// shared launch-configuration work is now computed once.
void PxgSolverCore::precomputeReferenceCount(PxgIslandContext* islandContext, PxU32 islandIndex, PxInt32ArrayPinned& constraintsPerPartition,
	PxInt32ArrayPinned& artiConstraintsPerPartition, bool isTGS, PxReal minPen, PxReal elapsedTime)
{
	PX_PROFILE_ZONE("GpuDynamics.precomputeReferenceCount", 0);

	PxgIslandContext& context = islandContext[islandIndex];
	if(!context.mNumPartitions)
		return;

	const PxU32 numThreadsPerWarp = WARP_SIZE;
	const PxU32 numWarpsPerBlock =
		PxgArticulationCoreKernelBlockDim::COMPUTE_UNCONSTRAINED_VELOCITES / numThreadsPerWarp;

	// Mark slabs that are active. Loosely following solveBlockUnified launches.
	CUfunction markActiveSlab = mGpuKernelWranglerManager->getKernelWrangler()->getCuFunction(
		isTGS ? PxgKernelIds::MARK_ACTIVE_SLAB_TGS : PxgKernelIds::MARK_ACTIVE_SLAB_PGS);

	// Pass the last partition so that all the partition iterations can be run
	// in a single kernel.
	const PxU32 lastPartition = context.mNumPartitions - 1;
	CUdeviceptr artiDescd = mGpuContext->getArticulationCore()->getArticulationCoreDescd();

	// Block counts for rigid-body and articulation constraints (ceil-div over
	// the solve-block partition size).
	const PxU32 nbBlocks = (constraintsPerPartition[lastPartition] * PXG_BATCH_SIZE +
							PxgKernelBlockDim::SOLVE_BLOCK_PARTITION - 1) /
							PxgKernelBlockDim::SOLVE_BLOCK_PARTITION;
	const PxU32 nbArtiBlocks = (artiConstraintsPerPartition[lastPartition] * PXG_BATCH_SIZE +
							PxgKernelBlockDim::SOLVE_BLOCK_PARTITION - 1) /
							PxgKernelBlockDim::SOLVE_BLOCK_PARTITION;
	const PxU32 maxBlocks = PxMax(nbBlocks, nbArtiBlocks);

	if (maxBlocks)
	{
		// A second grid row is launched only when articulation constraints exist.
		const PxU32 blockY = nbArtiBlocks > 0 ? 2 : 1;
		CUresult result;
		if (isTGS)
		{
			// TGS additionally passes the penetration clamp and elapsed sub-step time.
			PxCudaKernelParam kernelParamsTGS[] = {
				PX_CUDA_KERNEL_PARAM(mSolverCoreDescd), PX_CUDA_KERNEL_PARAM(mSharedDescd),
				PX_CUDA_KERNEL_PARAM(islandIndex), PX_CUDA_KERNEL_PARAM(lastPartition),
				PX_CUDA_KERNEL_PARAM(minPen), PX_CUDA_KERNEL_PARAM(elapsedTime),
				PX_CUDA_KERNEL_PARAM(artiDescd)
			};
			result = mCudaContext->launchKernel(markActiveSlab, maxBlocks, blockY, 1,
												numThreadsPerWarp, numWarpsPerBlock, 1, 0, mStream,
												kernelParamsTGS, sizeof(kernelParamsTGS), 0, PX_FL);
		}
		else
		{
			PxCudaKernelParam kernelParamsPGS[] = {
				PX_CUDA_KERNEL_PARAM(mSolverCoreDescd), PX_CUDA_KERNEL_PARAM(mSharedDescd),
				PX_CUDA_KERNEL_PARAM(islandIndex), PX_CUDA_KERNEL_PARAM(lastPartition),
				PX_CUDA_KERNEL_PARAM(artiDescd)
			};
			result = mCudaContext->launchKernel(markActiveSlab, maxBlocks, blockY, 1,
												numThreadsPerWarp, numWarpsPerBlock, 1, 0, mStream,
												kernelParamsPGS, sizeof(kernelParamsPGS), 0, PX_FL);
		}
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL,
									"GPU markActiveSlab fail to launch kernel!!\n");
#if GPU_DEBUG
		// Debug builds synchronize so in-kernel faults are reported here.
		result = mCudaContext->streamSynchronize(mStream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL,
									"GPU markActiveSlab kernel fail!\n");
#endif
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,122 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgTGSDynamicsContext.h"
#include "PxgKernelWrangler.h"
#include "PxgArticulationCore.h"
#include "PxgTGSCudaSolverCore.h"
namespace physx
{
	// GPU dynamics context for the TGS solver flavour.
	//
	// The constructor:
	//  - initialises the "world" solver body with zero velocity and zero inverse
	//    mass (i.e. it cannot be moved by the solver) and unbounded
	//    report/impulse thresholds;
	//  - creates the articulation core and the TGS CUDA solver core;
	//  - allocates the pinned host memory pools used to mirror contact, patch,
	//    force and friction-patch data between CPU and GPU.
	PxgTGSDynamicsContext::PxgTGSDynamicsContext(Cm::FlushPool& flushPool, PxsKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager,
	const PxGpuDynamicsMemoryConfig& config, IG::SimpleIslandManager& islandManager, PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
	bool enableStabilization, bool useEnhancedDeterminism,
	PxReal maxBiasCoefficient,
	PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
	bool externalForcesEveryTgsIterationEnabled, PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled)
	:
	// NOTE(review): the trailing `true` presumably selects the TGS path in the
	// shared base context — confirm against PxgGpuContext's constructor.
	PxgGpuContext(flushPool, islandManager, maxNumPartitions, maxNumStaticPartitions, enableStabilization, useEnhancedDeterminism,
	maxBiasCoefficient, simStats, heapMemoryManager, lengthScale, enableDirectGPUAPI, contextID, isResidualReportingEnabled, true)
	{
	// World solver body: zero velocities and invMass == 0, so the solver treats
	// it as immovable; PX_MAX_REAL thresholds disable force reporting/clamping
	// for it.
	mWorldSolverBody.linearVelocity = PxVec3(0);
	mWorldSolverBody.angularVelocity = PxVec3(0);
	mWorldSolverBodyData.invMass = 0;
	mWorldSolverBodyData.reportThreshold = PX_MAX_REAL;
	mWorldSolverBodyData.maxImpulse = PX_MAX_REAL;
	mWorldSolverBodyData.penBiasClamp = -PX_MAX_REAL;
	mWorldSolverBodyData.initialAngVel = mWorldSolverBodyData.initialLinVel = PxVec3(0.f);
	mWorldSolverBodyData.body2World = PxAlignedTransform(PxIdentity);
	mWorldSolverBodyData.islandNodeIndex = PxNodeIndex(PX_INVALID_NODE);
	mWorldSolverBodyData.offsetSlop = 0.f;
	mWorldTxIData.sqrtInvInertia = PxMat33(PxZero);
	mWorldTxIData.deltaBody2World = PxTransform(PxIdentity);
	{
	// Wire up the GPU cores; the articulation core needs a back-pointer to
	// this context.
	mGpuArticulationCore = PX_NEW(PxgArticulationCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, heapMemoryManager);
	mGpuSolverCore = PX_NEW(PxgTGSCudaSolverCore)(static_cast<PxgCudaKernelWranglerManager*>(gpuKernelWrangler), cudaContextManager, this, heapMemoryManager, config);
	mGpuArticulationCore->setGpuContext(this);
	}
	// All CUDA work below runs with the context acquired; released at the end
	// of the constructor.
	mGpuSolverCore->acquireContext();
	mGpuSolverCore->createStreams();
	createThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	createForceChangeThresholdStream(*heapMemoryManager->mMappedMemoryAllocators);
	mPinnedMemoryAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.tempBufferCapacity);
	// Contact and patch streams are double-buffered ([0]/[1]);
	// mCurrentContactStream selects the active buffer.
	mCurrentContactStream = 0;
	mContactStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mContactStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxContact));
	mPatchStreamAllocators[0] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	mPatchStreamAllocators[1] = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxContactPatch));
	// 2 PxReal per contact — presumably one impulse pair per contact point;
	// TODO(review) confirm against the force-stream writers.
	mForceStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidContactCount * sizeof(PxReal) * 2);
	mFrictionPatchStreamAllocator = PX_NEW(PxgPinnedHostLinearMemoryAllocator)(cudaContextManager, config.maxRigidPatchCount * sizeof(PxFrictionPatch));
	// Publish the allocator backing stores into the stream-pool descriptors and
	// reset both the CPU- and GPU-side shared write indices.
	mContactStreamPool.mDataStream = mContactStreamAllocators[mCurrentContactStream]->mStart;
	mContactStreamPool.mDataStreamSize = (PxU32)mContactStreamAllocators[mCurrentContactStream]->mTotalSize;
	mContactStreamPool.mSharedDataIndex = 0;
	mContactStreamPool.mSharedDataIndexGPU = 0;
	mPatchStreamPool.mDataStream = mPatchStreamAllocators[mCurrentContactStream]->mStart;
	mPatchStreamPool.mDataStreamSize = (PxU32)mPatchStreamAllocators[mCurrentContactStream]->mTotalSize;
	mPatchStreamPool.mSharedDataIndex = 0;
	mPatchStreamPool.mSharedDataIndexGPU = 0;
	mForceStreamPool.mDataStream = mForceStreamAllocator->mStart;
	mForceStreamPool.mDataStreamSize = (PxU32)mForceStreamAllocator->mTotalSize;
	mForceStreamPool.mSharedDataIndex = 0;
	mForceStreamPool.mSharedDataIndexGPU = 0;
	mFrictionPatchStreamPool.mDataStream = mFrictionPatchStreamAllocator->mStart;
	mFrictionPatchStreamPool.mDataStreamSize = PxTo32(mFrictionPatchStreamAllocator->mTotalSize);
	mFrictionPatchStreamPool.mSharedDataIndex = 0;
	mFrictionPatchStreamPool.mSharedDataIndexGPU = 0;
	//Arbitrarily-large number to reserve to minimize allocation churn.
	mConstraintsPerPartition.reserve(1024);
	mArtiConstraintsPerPartition.reserve(1024);
	mGpuSolverCore->releaseContext();
	mIsExternalForcesEveryTgsIterationEnabled = externalForcesEveryTgsIterationEnabled;
	}
	// Destroys the context: the destructor is invoked explicitly and the memory
	// is returned through the PhysX allocator (PX_FREE_THIS) rather than via
	// `delete`, matching the SDK's custom allocation scheme.
	void PxgTGSDynamicsContext::destroy()
	{
	this->~PxgTGSDynamicsContext();
	PX_FREE_THIS;
	}
}