feat(physics): wire physx sdk into build
This commit is contained in:
106
engine/third_party/physx/source/gpucommon/src/CUDA/MemCopyBalanced.cu
vendored
Normal file
106
engine/third_party/physx/source/gpucommon/src/CUDA/MemCopyBalanced.cu
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
|
||||
#ifndef PXG_MEM_COPY_BALANCED_CU
|
||||
#define PXG_MEM_COPY_BALANCED_CU
|
||||
|
||||
#include "foundation/PxMath.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "PxgCopyManager.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Host-side no-op. Intentionally empty: its only purpose is to give the host a
// symbol to reference so this translation unit (and the kernels it contains)
// is retained by the linker.
extern "C" __host__ void initCommonKernels0() {}
|
||||
|
||||
// Warp-cooperative buffer copy.
//
// Launch expectations (derived from the indexing below): blockDim.x == WARP_SIZE,
// blockDim.y == warpsPerBlock, and one block per copy descriptor (blockIdx.x
// selects the descriptor; blocks beyond 'count' do nothing). Every warp of the
// block caches the same desc[blockIdx.x], then all warps stride over the
// payload together, copying it 32-bit word by 32-bit word (coalesced).
//
// desc:  array of 'count' copy descriptors (source, dest, bytes).
// count: number of valid descriptors.
template<PxU32 warpsPerBlock>
__device__ void copyBalanced(
	PxgCopyManager::CopyDesc* PX_RESTRICT desc, /* Input */
	PxU32 count /* Input */
)
{
	// One cached descriptor slot per warp; lane 0 of each warp fills its own slot.
	__shared__ PxgCopyManager::CopyDesc copyDesc[warpsPerBlock];

	if (blockIdx.x < count)
	{
		const PxU32 idxInWarp = threadIdx.x;
		const PxU32 warpIdxInBlock = threadIdx.y;

		if (idxInWarp == 0)
		{
			PxgCopyManager::CopyDesc d = desc[blockIdx.x];
			copyDesc[warpIdxInBlock] = d;
		}

		// A warp-level barrier suffices: each warp only reads the slot that its
		// own lane 0 just wrote.
		__syncwarp();

		PxU32* srcPtr = reinterpret_cast<PxU32*>(copyDesc[warpIdxInBlock].source);
		PxU32* dstPtr = reinterpret_cast<PxU32*>(copyDesc[warpIdxInBlock].dest);
		PxU32 size = copyDesc[warpIdxInBlock].bytes / 4; //Size is in bytes, we're reading words...

		// NOTE(review): a 'bytes' value not divisible by 4 would leave a tail
		// uncopied — presumably descriptors are always whole words; confirm at
		// the call sites that fill CopyDesc.

		// Flat thread index within the block (blockDim.x == WARP_SIZE assumed).
		PxU32 groupThreadIdx = threadIdx.x + threadIdx.y * WARP_SIZE;

		// Block-strided copy loop: adjacent threads touch adjacent words.
		for (PxU32 a = groupThreadIdx; a < size; a += WARP_SIZE * warpsPerBlock)
		{
			PxU32 sourceVal = srcPtr[a];
			dstPtr[a] = sourceVal;
		}
	}
}
|
||||
|
||||
// Kernel entry point: performs 'count' independent buffer copies, one block
// per descriptor. Thin wrapper that fixes the warps-per-block template
// parameter; see copyBalanced for the expected launch geometry.
extern "C" __global__ void MemCopyBalanced(PxgCopyManager::CopyDesc* PX_RESTRICT desc, PxU32 count)
{
	copyBalanced<COPY_KERNEL_WARPS_PER_BLOCK>(desc, count);
}
|
||||
|
||||
// Clamps *value down to maxValue in place. The store only happens when the
// value actually exceeds the limit (write is conditional, as in the original).
// Every launched thread performs the identical read-modify-write, so a
// single-thread launch is the natural configuration.
extern "C" __global__ void clampMaxValue(PxU32* value, const PxU32 maxValue)
{
	const PxU32 current = *value;
	if (current > maxValue)
	{
		*value = maxValue;
	}
}
|
||||
|
||||
// temporary clamping function for contact counts: will be generalized in the future.
// Clamps three counters down to the same limit, in place. Writes stay
// conditional and happen in value0, value1, value2 order, exactly as before.
extern "C" __global__ void clampMaxValues(PxU32* value0, PxU32* value1, PxU32* value2, const PxU32 maxValue)
{
	PxU32* const counters[3] = { value0, value1, value2 };

	for (PxU32 i = 0; i < 3; ++i)
	{
		if (*counters[i] > maxValue)
			*counters[i] = maxValue;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
128
engine/third_party/physx/source/gpucommon/src/CUDA/MemoryAllocator.cuh
vendored
Normal file
128
engine/third_party/physx/source/gpucommon/src/CUDA/MemoryAllocator.cuh
vendored
Normal file
@@ -0,0 +1,128 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_MEMORY_ALLOCATOR_CUH__
|
||||
#define __CU_MEMORY_ALLOCATOR_CUH__
|
||||
|
||||
#include "cutil_math.h"
|
||||
#include "stdio.h"
|
||||
|
||||
// Linear (bump-pointer) allocator over a caller-provided scratch buffer,
// typically shared memory. Allocations only move 'currentSize' forward; there
// is no per-allocation free. Use ScratchMemoryMarker to roll the offset back
// in bulk.
class ScratchMemoryAllocator
{
public:
	// mem:           base of the scratch region (not owned).
	// allocatedSize: capacity of the region in bytes.
	__device__ ScratchMemoryAllocator(uchar* mem, uint allocatedSize) : startPtr(mem),
		totalAllocatedSize(allocatedSize), currentSize(0)
	{

	}

	// Bump-allocates 'requestedSize' bytes and returns the region as T*.
	// No alignment is applied: the caller must ensure the running offset is
	// already suitable for T, or use allocAligned. Returns NULL (after a
	// diagnostic printf) when the request would overflow the region; the
	// offset is left unchanged in that case.
	template <typename T>
	__device__ T* alloc(uint requestedSize)
	{
		T* ptr = reinterpret_cast<T*>(startPtr + currentSize);

		if (totalAllocatedSize < (currentSize + requestedSize))
		{
			printf("alloc out of sharedMemory !\n");
			return NULL;
		}

		currentSize += requestedSize;
		return ptr;
	}

	// Like alloc, but first rounds the current address up to 'alignment'
	// (assumed to be a power of two — the mask trick below requires it).
	// The padding bytes are charged against the budget. Returns NULL on
	// overflow, leaving the offset unchanged.
	template <typename T>
	__device__ T* allocAligned(uint requestedSize, size_t alignment = 4)
	{
		size_t baseAddress = size_t(startPtr + currentSize);
		// Round up to the next multiple of 'alignment'.
		size_t alignedAddress = (baseAddress + size_t(alignment - 1)) & (~(size_t(alignment-1)));
		uint paddingBytes = uint(alignedAddress - baseAddress);

		const uint newRequestedSize = requestedSize + paddingBytes;

		if (totalAllocatedSize < (currentSize + newRequestedSize))
		{
#if 1
			printf("allocAligned out of sharedMemory allocating %i bytes!\n", requestedSize);
#endif
			return NULL;
		}

		currentSize += newRequestedSize;

		T* ptr = reinterpret_cast<T*>(alignedAddress);
		return ptr;
	}

	uchar* startPtr;          // base of the scratch region (not owned)
	uint totalAllocatedSize;  // capacity in bytes
	uint currentSize;         // bytes consumed so far (the bump offset)

};
|
||||
|
||||
// Scoped rollback marker for ScratchMemoryAllocator: records the allocator's
// bump offset on construction and restores it on destruction (or on an
// explicit reset), freeing every allocation made after the marker in one step.
class ScratchMemoryMarker
{

	uint currentSize;               // offset saved at construction time
	ScratchMemoryAllocator& alloc;  // the allocator being guarded
public:
	__device__ ScratchMemoryMarker(ScratchMemoryAllocator& allocator) : alloc(allocator)
	{
		currentSize = alloc.currentSize;
	}


	// Restores the saved offset when the marker leaves scope.
	__device__ ~ScratchMemoryMarker()
	{
		alloc.currentSize = currentSize;
	}

	// Early rollback to the saved offset; the destructor will repeat the same
	// assignment later, which is harmless.
	__device__ void reset()
	{
		alloc.currentSize = currentSize;
	}
};
|
||||
|
||||
// Fixed-capacity array whose first SharedCapacity elements live in a
// caller-provided buffer (typically shared memory) and whose remaining
// Capacity - SharedCapacity elements live in per-instance local storage.
// operator[] routes each index to the right backing store transparently.
//
// NOTE(review): the constructor carries no __device__ qualifier while the
// surrounding code is device-side — presumably PX_FORCE_INLINE or the build
// flags make this callable from device code; confirm against the project's
// definition of PX_FORCE_INLINE.
// NOTE(review): Capacity == SharedCapacity would produce a zero-length
// locBuff array (non-standard C++); callers presumably keep Capacity larger.
template <typename Type, int SharedCapacity, int Capacity>
class HybridSharedArray
{
	Type* sharedBuffer;                      // first SharedCapacity elements (not owned)
	Type locBuff[Capacity - SharedCapacity]; // overflow storage for the rest

public:

	HybridSharedArray(Type* shBuff) : sharedBuffer(shBuff)
	{
	}

	PX_FORCE_INLINE const Type& operator[] (const uint index) const { return index < SharedCapacity ? sharedBuffer[index] : locBuff[index-SharedCapacity];}
	PX_FORCE_INLINE Type& operator[] (const uint index) { return index < SharedCapacity ? sharedBuffer[index] : locBuff[index - SharedCapacity]; }
};
|
||||
|
||||
#endif
|
||||
|
||||
762
engine/third_party/physx/source/gpucommon/src/CUDA/RadixSort.cuh
vendored
Normal file
762
engine/third_party/physx/source/gpucommon/src/CUDA/RadixSort.cuh
vendored
Normal file
@@ -0,0 +1,762 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
|
||||
#ifndef __CU_RADIX_SORT_CUH__
|
||||
#define __CU_RADIX_SORT_CUH__
|
||||
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
#include "vector_types.h"
|
||||
#include "PxgRadixSortDesc.h"
|
||||
#include "PxgRadixSortKernelIndices.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "stdio.h"
|
||||
|
||||
#include "reduction.cuh"
|
||||
|
||||
#include "foundation/PxMath.h"
|
||||
#include <assert.h>
|
||||
|
||||
using namespace physx;
|
||||
|
||||
#define RADIX_SIZE 16
|
||||
#define RADIX_ACCUM_SIZE 8
|
||||
|
||||
|
||||
// Extracts the 4-bit digit beginning at 'startBit' from each of the four
// packed keys in 'input'. The radix sort sorts 4 bits per pass, hence the
// 0xF mask (RADIX_SIZE == 16 buckets).
static __device__ uint4 getRadix(uint4 input, const PxU32 startBit)
{
	uint4 digits;
	digits.x = (input.x >> startBit) & 0xF;
	digits.y = (input.y >> startBit) & 0xF;
	digits.z = (input.z >> startBit) & 0xF;
	digits.w = (input.w >> startBit) & 0xF;
	return digits;
}
|
||||
|
||||
// Scalar overload: the 4-bit digit of 'input' beginning at 'startBit'.
static __device__ PxU32 getRadix(PxU32 input, const PxU32 startBit)
{
	const PxU32 shifted = input >> startBit;
	return shifted & 0xF;
}
|
||||
|
||||
|
||||
//accumulated each individual warps to a responding radix
//radix0[0, 7] ==> radixSum[7], radix1[8, 15] ==> radixSum[15], radix2[16, 23] ==>radixSum[23], radix3[24, 31] ==> radixSum[31],
//radix4[32, 39] ==> radixSum[39], radix5[40, 47] ==>radixSum[47], radix6[48, 55] ==>radixSum[55], radix7[56, 63] ==>radixSum[63]
//
// Segmented prefix sum within one warp: the warp is treated as groups of
// WARP_PERBLOCK_SIZE consecutive lanes, and a shuffle-based scan accumulates
// 'originalVal' within each group. The final 'val - value' lets the caller
// request either the inclusive result (value == 0) or an exclusive-style
// result (value == originalVal).
// Note: 'radixSum' is not referenced in this body — kept for signature
// compatibility with the call sites.
template <PxU32 WARP_PERBLOCK_SIZE>
static __device__ PxU32 scanRadixWarps(const PxU32 threadIndexInWarp, PxU32* radixSum, const PxU32 originalVal, const PxU32 value)
{
	const PxU32 idx = threadIdx.x;

	// Position of this lane within its WARP_PERBLOCK_SIZE-wide segment
	// (requires WARP_PERBLOCK_SIZE to be a power of two).
	const PxU32 radixIndex = threadIndexInWarp & (WARP_PERBLOCK_SIZE-1);

	int val = originalVal;

	// Hillis-Steele scan over the segment, doubling the hop each step.
	for(PxU32 a = 1; a < WARP_PERBLOCK_SIZE; a*=2)
	{
		// Lanes near a segment start fetch data from outside their segment,
		// but the radixIndex guard below discards it, so the stray shuffle
		// source is harmless.
		int temp = __shfl_sync(FULL_MASK, val, idx-a);

		if(radixIndex >= a)
		{
			val += temp;
		}
	}

	return val - value;
}
|
||||
|
||||
// Scans the global per-block radix counts. gData is laid out as RADIX_SIZE
// rows of gridDim.x entries (one entry per block); each warp walks whole
// radices (warp w handles radices w, w+WARP_PERBLOCK_SIZE, ...), with lane t
// covering block t of that radix. warpScanAddWriteToSharedMem produces the
// running sum across blocks and writes it into sData at radixSumIndex, and
// the last lane stores each radix's block-wide total into accumulateBuffer.
// NOTE(review): lane index is used directly as the block index, so this
// presumably requires gridDim.x == WARP_SIZE (callers assert 32 blocks) —
// confirm if the grid size ever changes.
template <PxU32 WARP_PERBLOCK_SIZE>
static __device__ void scanRadixes(const PxU32 warpIndexInBlock, const PxU32 threadIndexInWarp, PxU32* PX_RESTRICT gData, PxU32* PX_RESTRICT sData, PxU32* PX_RESTRICT accumulateBuffer)
{
	for(PxU32 i=warpIndexInBlock; i<RADIX_SIZE; i+=WARP_PERBLOCK_SIZE)
	{

		const PxU32 radixSumIndex = i*gridDim.x + threadIndexInWarp;

		const PxU32 value = gData[radixSumIndex];

		PxU32 output = warpScanAddWriteToSharedMem<WARP_SIZE>(FULL_MASK, radixSumIndex, threadIndexInWarp, sData, value, value);

		if(threadIndexInWarp == (WARP_SIZE-1))
			accumulateBuffer[i] = output + value;
	}
}
|
||||
|
||||
//there are 256 threads in a block and therefore 8 warp in a block
|
||||
|
||||
// Pads the tail of the key stream. 'keyValue' holds 4 consecutive keys whose
// first key has global index 'id' out of 'count' total keys; any component
// beyond 'count' is overwritten with 0xFFffFFff so padded keys sort to the
// very end. Returns count - id, i.e. the number of valid keys in this uint4
// (values >= 4 mean "all four valid"; the unsigned subtraction assumes
// id <= count, which callers guarantee by construction).
__device__ inline PxU32 sanitizeKeys(uint4& keyValue, PxU32 id, const PxU32 count)
{
	PxU32 goodVals = count - id;
	if (goodVals < 4)
	{
		PxU32 badVals = 4 - goodVals;

		// Intentional fallthrough: 3 bad values invalidates y, z AND w;
		// 2 invalidates z and w; 1 invalidates only w.
		switch (badVals)
		{
		case 3:
			keyValue.y = 0xFFffFFff;
		case 2:
			keyValue.z = 0xFFffFFff;
		case 1:
			keyValue.w = 0xFFffFFff;
		}
	}

	return goodVals;
}
|
||||
|
||||
// Per-thread radix histogram held in registers: RADIX_ACCUM_SIZE (8) words,
// each packing two 16-bit counters, covering all 16 radix buckets. Filled by
// radixSortWarp; unpacked via (val & 0xFFFF) / (val >> 16) by the callers.
struct RadixAccum
{
	PxU32 radixAccum[RADIX_ACCUM_SIZE];
};
|
||||
|
||||
// Histogram pass: accumulates, per thread, how many keys in the
// [startIdx, startIdx + count) slice of gInputKeys (4 keys per uint4) fall
// into each of the 16 radix buckets for the digit at 'startBit'. Results are
// packed into radixAccum (two 16-bit counters per word — see below).
// 'stride' is the thread-grid stride of the caller; 'totalCount' is the total
// number of scalar keys, used to pad the final partial uint4.
__device__ inline void radixSortWarp(const uint4* PX_RESTRICT gInputKeys, const PxU32 idx, const PxU32 count, const PxU32 stride,
	const PxU32 startBit, const PxU32 startIdx, const PxU32 totalCount, RadixAccum& radixAccum)
{
	//HISTOGRAM-KEYS && SCAN-BUCKET
	uint4 keyValue;
	uint4 radix;

	for(PxU32 i = idx; i < count; i += stride)
	{
		const PxU32 gInputIdx = i + startIdx;
		keyValue = gInputKeys[gInputIdx];

		// Force out-of-range tail keys to 0xFFffFFff. Return value unused here.
		const PxU32 nbVals = sanitizeKeys(keyValue, gInputIdx * 4, totalCount);

		radix = getRadix(keyValue, startBit);

		//each thread read 4 elements. We store the each element's radix[0, 15] into a local array. The code should be
		//radixAccum[radix.x]++; radixAccum[radix.y]++; radixAccum[radix.z]++; radixAccum[radix.w]++;
		//However, in order to save register, each radixAccum can store 2 radix's accumulation result. Therefore, radixAccum is
		//half of the size of RADIX_SIZE and each radix has 16 bits in the radixAccum.
		//The for loop is used to trick the compiler to keep radixAccum array in registers
		// NOTE(review): the (radix - bit) << 4 shift relies on shifts of >= 32
		// (or wrapped-around amounts) producing 0 so non-matching radices add
		// nothing — this matches GPU shift behavior the code was written for,
		// but is formally out-of-range shifting in ISO C++.
#pragma unroll
		for (PxU32 bit = 0; bit < 16; bit += 2)
		{
			PxU32 accum = (1u << ((radix.x - bit) << 4));
			accum += (1u << ((radix.y - bit) << 4));
			accum += (1u << ((radix.z - bit) << 4));
			accum += (1u << ((radix.w - bit) << 4));

			radixAccum.radixAccum[bit / 2] += accum;
		}
	}
}
|
||||
|
||||
|
||||
|
||||
// Pass 1 of the multi-block radix sort: each of the 32 blocks histograms its
// slice of the keys for the 4-bit digit at 'startBit' and writes, per radix,
// the count of keys this block owns into gRadixCount (laid out as
// RADIX_SIZE rows of gridDim.x entries). gInputRanks is unused in this pass.
// Expects a 1D block of WARP_SIZE * WARP_PERBLOCK_SIZE threads and exactly
// 32 blocks (compile-time asserted).
template <PxU32 WARP_PERBLOCK_SIZE>
__device__ inline void radixSortSingleBlock(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumKeys, const PxU32 startBit, PxU32* gRadixCount)
{
	const PxU32 nbBlocks = PxgRadixSortKernelGridDim::RADIX_SORT;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32);

	// Per-warp, per-radix partial counts: RADIX_SIZE rows of one slot per warp.
	__shared__ PxU32 sRadixSum[RADIX_SIZE*WARP_PERBLOCK_SIZE];

	//the number of inputKeys is random. However, this algorithm sort inputKeys at a time, so that we need to initialize the number of keys properly
	const PxU32 numKeys = (gNumKeys+3)/4;

	const PxU32 totalBlockRequired = (numKeys + (blockDim.x-1))/ blockDim.x;

	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	//const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/WARP_SIZE;
	const PxU32 warpIndexInBlock = threadIdx.x/WARP_SIZE;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE-1);

	const PxU32 idx = threadIdx.x;

	// Zero the packed per-thread histogram (registers).
	RadixAccum radixAccum;
#pragma unroll
	for(PxU32 i=0; i< RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}

	// This block's [inputKeyIndex, endIndex) slice of the uint4 key stream.
	const PxU32 inputKeyIndex = PxMin(numIterationPerBlock * blockIdx.x * blockDim.x, numKeys);
	const PxU32 endIndex = PxMin(inputKeyIndex + numIterationPerBlock*blockDim.x, numKeys);
	const PxU32 count = endIndex - inputKeyIndex;
	radixSortWarp(gInputKeys, idx, count, WARP_SIZE*WARP_PERBLOCK_SIZE, startBit, inputKeyIndex, gNumKeys, radixAccum);

	// Reduce the packed per-thread counters across each warp; lanes 0 and 1
	// unpack the low/high 16-bit halves into sRadixSum[radix][warp].
	PxU32 accumValue = 0;
#pragma unroll
	for(PxU32 i=0; i<RADIX_SIZE; i+=2)
	{
		const PxU32 accum = radixAccum.radixAccum[i/2];

		const PxU32 val = warpScanAdd<WARP_SIZE>(FULL_MASK, idx, threadIndexInWarp, (PxU32*)NULL, accum, accumValue);

		const PxU32 val2 = __shfl_sync(FULL_MASK, (int)val, (WARP_SIZE - 1));// getLastElementValueInAWarp(val, idx, sData, WARP_SIZE);

		if(threadIndexInWarp < 2)
		{
			sRadixSum[(i+threadIndexInWarp)*WARP_PERBLOCK_SIZE + idx/WARP_SIZE ] = (val2 >> (threadIndexInWarp*16)) & 0xFFFF;
		}
	}

	__syncthreads();

	//unsigned mask_warpIndexInBlock = __ballot_sync(syncMask, warpIndexInBlock < (WARP_PERBLOCK_SIZE / 2));
	// Half the warps scan sRadixSum segment-wise; the last lane of each
	// WARP_PERBLOCK_SIZE-wide segment holds that radix's block total.
	if (warpIndexInBlock < (WARP_PERBLOCK_SIZE/2))
	{
		const PxU32 originalValue = sRadixSum[idx];
		const PxU32 output = scanRadixWarps<WARP_PERBLOCK_SIZE>(threadIndexInWarp, sRadixSum, originalValue, 0);//output is the value should be in sRadixSum[idx]

		if((idx & (WARP_PERBLOCK_SIZE-1)) == (WARP_PERBLOCK_SIZE-1))
		{
			//copy to global memory
			//const PxU32 gRadixIndex = blockIdx.x * RADIX_SIZE + idx;
			const PxU32 gRadixIndex = blockIdx.x + idx/WARP_PERBLOCK_SIZE * gridDim.x;

			//gRadixCount have 16 radix and each radix has 32 blocks. Each block in gRadixCount store the numbers of elements in each radix
			gRadixCount[ gRadixIndex ] = output;
		}
	}
}
|
||||
|
||||
// Pass 2 of the multi-block radix sort: using the per-block radix counts
// produced by radixSortSingleBlock (gRadixCount), each block computes the
// global destination of every key it owns for the digit at 'startBit',
// locally orders its keys/ranks through shared memory, and scatters them to
// gOutputKeys/gOutputRanks. Expects a 1D block of
// WARP_SIZE * WARP_PERBLOCK_SIZE threads and exactly 32 blocks (asserted).
template <PxU32 WARP_PERBLOCK_SIZE>
__device__ inline void radixSortCalculateRanks(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 startBit, PxU32* gRadixCount, PxU32* gOutputKeys, PxU32* gOutputRanks)
{
	const PxU32 nbBlocks = PxgRadixSortKernelGridDim::RADIX_SORT;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32);

	__shared__ PxU32 sRadixSumBetweenBlocks[RADIX_SIZE];//how many 0 before 1

	// One backing array carved into four regions via the pointers below.
	__shared__ PxU32 sRadixCountBetweenBlocks[RADIX_SIZE * (WARP_PERBLOCK_SIZE +2) + WARP_PERBLOCK_SIZE * WARP_SIZE];

	PxU32* sBuckets = &sRadixCountBetweenBlocks[0];                       // warp-scan scratch
	PxU32* sRadixSum = sBuckets + WARP_PERBLOCK_SIZE * WARP_SIZE;         // per-radix, per-warp offsets
	PxU32* sRadixSumSum = sRadixSum + RADIX_SIZE*WARP_PERBLOCK_SIZE;      // per-radix block prefix
	PxU32* sRadixCount = sRadixSumSum + RADIX_SIZE;                       // per-radix totals this iteration

	// Locally reordered keys/ranks, staged before the global scatter.
	__shared__ PxU32 sKeys[WARP_SIZE *WARP_PERBLOCK_SIZE * 4];
	__shared__ PxU32 sRanks[WARP_SIZE * WARP_PERBLOCK_SIZE * 4];

	//in the CPU we pass an array of PxU32 as the source inputKeys, therefore, we need to get the correct number of keys in GPU
	const PxU32 numKeys = (gNumOfKeys+3)/4;

	const PxU32 idx = threadIdx.x;

	//const PxU32 gridThreadIdx = idx + blockIdx.x * blockDim.x;

	const PxU32 warpIndexInBlock = threadIdx.x/WARP_SIZE;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE-1);

	// Exclusive-scan the per-block counts of every radix into shared memory.
	scanRadixes<WARP_PERBLOCK_SIZE>(warpIndexInBlock, threadIndexInWarp, gRadixCount, sRadixCountBetweenBlocks, sRadixSumBetweenBlocks);

	__syncthreads();

	//accumulate total numbers of each radix in each warp inside the same block
	unsigned mask_idx = __ballot_sync(FULL_MASK, idx < RADIX_SIZE);
	if(idx < RADIX_SIZE)
	{
		const PxU32 value = sRadixSumBetweenBlocks[idx];

		// Global start of radix 'idx' plus this block's offset within it.
		const PxU32 output = warpScanAdd<RADIX_SIZE>(mask_idx, idx, threadIndexInWarp, sRadixSumBetweenBlocks, value, value);
		sRadixSumBetweenBlocks[idx] = output + sRadixCountBetweenBlocks[idx * nbBlocks + blockIdx.x];
	}

	__syncthreads();

	const PxU32 totalBlockRequired = (numKeys + (blockDim.x-1))/ blockDim.x;

	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;

	for(PxU32 i=0; i<numIterationPerBlock; ++i)
	{
		const PxU32 inputKeyIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		uint4 keyValue;
		uint4 radix;
		uint4 keyIndex;
		uint4 radixOffset;

		radixOffset.x = radixOffset.y = radixOffset.z = radixOffset.w = 0;

		//read 4 elements at a time
		if(inputKeyIndex < numKeys)
		{
			keyIndex = gInputRanks[inputKeyIndex];

			keyValue = gInputKeys[inputKeyIndex];

			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);

			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with sufficent large enough number(we have 8 passes and each pass just sort 4 bits so 0xff is sufficent large enough)
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}

		// Per-warp ranking: four radices per pass, each packed into one byte
		// of a 32-bit counter (same out-of-range-shift trick as radixSortWarp).
		//#pragma unroll
		for(PxU32 i=0; i<RADIX_SIZE; i+=4)
		{
			PxU32 accum = (1u << ((radix.x - i) << 3));
			accum += (1u << ((radix.y - i) << 3));
			accum += (1u << ((radix.z - i) << 3));
			accum += (1u << ((radix.w - i) << 3));

			PxU32 val = warpScanAdd<WARP_SIZE, PxU32>(FULL_MASK, idx, threadIndexInWarp, sBuckets, accum, 0);

			const PxU32 val2 = __shfl_sync(FULL_MASK, (int)val, (WARP_SIZE - 1)); //getLastElementValueInAWarp(val, idx, sBuckets, WARP_SIZE);

			// Lanes 0..3 unpack the warp totals of the four radices handled
			// this pass into sRadixSum[radix][warp].
			if(threadIndexInWarp < 4)
			{
				sRadixSum[(i+threadIndexInWarp)*WARP_PERBLOCK_SIZE + idx/WARP_SIZE ] = (val2 >> (8*threadIndexInWarp)) & 0xFF;
			}

			// Inclusive -> exclusive: remove this thread's own contribution.
			val -= accum;

			//radix offset inside a warp
			PxU32 shiftBits = (radix.x - i) << 3;
			PxU32 offset = ((val >> shiftBits) & 0xFF);
			radixOffset.x |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.y - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.y |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.z - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.z |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.w - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.w |= offset;
		}

		__syncthreads();

		// Save each radix's last-warp partial before sRadixSum is overwritten
		// by the in-place scan below.
		PxU32 lastRadixSum = 0;
		if(idx < RADIX_SIZE)
		{
			lastRadixSum = sRadixSum[idx*WARP_PERBLOCK_SIZE+(WARP_PERBLOCK_SIZE-1)];
		}
		__syncthreads();
		//scan sRadixSum for a block

		if(warpIndexInBlock < (WARP_PERBLOCK_SIZE/2))
		{
			const PxU32 tempVal = sRadixSum[idx];
			sRadixSum[idx] = scanRadixWarps<WARP_PERBLOCK_SIZE>(threadIndexInWarp, sRadixSum, tempVal, tempVal);
		}

		__syncthreads();

		// (Shadows the function-scope mask_idx — same ballot, inner scope.)
		unsigned mask_idx = __ballot_sync(FULL_MASK, idx < RADIX_SIZE);
		if(idx < RADIX_SIZE)
		{
			// Per-radix total for this iteration = scanned last slot + saved partial.
			const PxU32 value = sRadixSum[idx*WARP_PERBLOCK_SIZE+(WARP_PERBLOCK_SIZE-1)] + lastRadixSum;
			sRadixCount[idx] = value;
			sRadixSumSum[idx] = value;
			__syncwarp(mask_idx);

			warpScanAddWriteToSharedMem<RADIX_SIZE>(mask_idx, idx, threadIndexInWarp, sRadixSumSum, value, value);
		}
		__syncthreads();


		// Fold the per-radix block prefix into every warp's offset.
		if(idx < (WARP_PERBLOCK_SIZE * RADIX_SIZE))
		{
			sRadixSum[idx] += sRadixSumSum[idx/WARP_PERBLOCK_SIZE];
		}

		__syncthreads();

		// Convert to the global base address of each radix for the scatter.
		if(idx < RADIX_SIZE)
			sRadixSumSum[idx] = sRadixSumBetweenBlocks[idx] - sRadixSumSum[idx];

		if(inputKeyIndex < numKeys)
		{
			//radix offset between warps inside a block
			radixOffset.x += sRadixSum[(WARP_PERBLOCK_SIZE * radix.x) + warpIndexInBlock];
			radixOffset.y += sRadixSum[(WARP_PERBLOCK_SIZE * radix.y) + warpIndexInBlock];
			radixOffset.z += sRadixSum[(WARP_PERBLOCK_SIZE * radix.z) + warpIndexInBlock];
			radixOffset.w += sRadixSum[(WARP_PERBLOCK_SIZE * radix.w) + warpIndexInBlock];

			sKeys[radixOffset.x] = keyValue.x;
			sKeys[radixOffset.y] = keyValue.y;
			sKeys[radixOffset.z] = keyValue.z;
			sKeys[radixOffset.w] = keyValue.w;

			sRanks[radixOffset.x] = keyIndex.x;
			sRanks[radixOffset.y] = keyIndex.y;
			sRanks[radixOffset.z] = keyIndex.z;
			sRanks[radixOffset.w] = keyIndex.w;
		}

		__syncthreads();
		const PxU32 baseInputKeyIndex = inputKeyIndex-idx;

		if(baseInputKeyIndex < numKeys)
		{
			//If there were keys to process... The if statement defends against the PxU32 becoming huge and us overflowing the arrays
			//const PxU32 keysToProcess = min(WARP_SIZE*WARP_PERBLOCK_SIZE*4, (numKeys - baseInputKeyIndex)*4);
			const PxU32 keysToProcess = min(WARP_SIZE*WARP_PERBLOCK_SIZE * 4, (gNumOfKeys - baseInputKeyIndex*4));

			// Scatter the locally sorted keys/ranks to their global slots.
			for(PxU32 a = idx; a < keysToProcess; a += blockDim.x)
			{
				const PxU32 key = sKeys[a];
				const PxU32 radix = getRadix(key, startBit);
				const PxU32 writeIndex = a + sRadixSumSum[radix];

				gOutputKeys[writeIndex] = key;
				gOutputRanks[writeIndex] = sRanks[a];
			}
		}

		__syncthreads();

		// Advance the global bases by what this iteration consumed.
		if(idx < RADIX_SIZE)
		{
			sRadixSumBetweenBlocks[idx]+=sRadixCount[idx];
		}
	}
}
|
||||
|
||||
|
||||
// Single-warp radix sort pass (key + rank variant): histograms all numUint4
// uint4 groups of gInputKeys for the digit at 'startBit', builds the 16-entry
// exclusive run-sum in radixExclusiveRunsum, then streams keys and their
// ranks to gOutputKeys/gOutputRanks in bucket order. Intended to be executed
// by exactly one warp (all masks are FULL_MASK; idx is threadIdx.x only).
// radixExclusiveRunsum must hold RADIX_SIZE entries and is consumed/advanced
// in place as the output loop progresses.
static __device__ void radixSortPassSingleWarp(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 numUint4,
	PxU32* PX_RESTRICT gOutputKeys, PxU32* PX_RESTRICT gOutputRanks, const PxU32 startBit, PxU32* radixExclusiveRunsum)
{
	// Zero the packed per-thread histogram, then fill it over the whole input.
	RadixAccum radixAccum;
#pragma unroll
	for (PxU32 i = 0; i< RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}

	radixSortWarp(gInputKeys, threadIdx.x, numUint4, WARP_SIZE, startBit, 0, gNumOfKeys, radixAccum);

	// Reduce each packed counter pair across the warp and lay down the
	// exclusive run-sum over the 16 buckets (low/high 16-bit halves are the
	// two consecutive radices).
	PxU32 accumValue = 0;

#pragma unroll
	for (PxU32 i = 0; i<RADIX_SIZE; i += 2)
	{
		const PxU32 accum = radixAccum.radixAccum[i / 2];
		const PxU32 val = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, accum);
		const PxU32 v0 = val & 0xFFFF;
		const PxU32 v1 = val >> 16;

		radixExclusiveRunsum[i] = accumValue;
		accumValue += v0;
		radixExclusiveRunsum[i+1] = accumValue;
		accumValue += v1;
	}

	//Now we loop and output the elements in order from the input buffer to the output buffer...
	__syncwarp();

	for (PxU32 i = 0; i < numUint4; i += WARP_SIZE)
	{
		const PxU32 inputKeyIndex = i + threadIdx.x;
		//All threads enter this stage because we need to do some warp synchronous stuff...
		uint4 keyValue;
		uint4 radix;
		uint4 keyIndex;

		//read 4 elements at a time
		if (inputKeyIndex < numUint4)
		{
			keyIndex = gInputRanks[inputKeyIndex];

			keyValue = gInputKeys[inputKeyIndex];

			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);

			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with sufficent large enough number(we have 8 passes and each pass just sort 4 bits so 0xff is sufficent large enough)
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}

		// Four radices per pass, one byte of the counter each; radixRank* < 4
		// marks "this component belongs to the current radix window".
		//#pragma unroll
		for (PxU32 i = 0; i<RADIX_SIZE; i += 4)
		{
			PxU32 radixRankX = (radix.x - i);
			PxU32 radixRankY = (radix.y - i);
			PxU32 radixRankZ = (radix.z - i);
			PxU32 radixRankW = (radix.w - i);
			PxU32 accum0 = (1u << (radixRankX << 3));
			PxU32 accum1 = (1u << (radixRankY << 3));
			PxU32 accum2 = (1u << (radixRankZ << 3));
			PxU32 accum3 = (1u << (radixRankW << 3));

			PxU32 accum = accum0 + accum1 + accum2 + accum3;

			PxU32 val = warpScan<AddOpPxU32, PxU32>(FULL_MASK, accum);

			// Last lane's inclusive scan = warp totals for the 4 radices.
			const PxU32 val2 = __shfl_sync(FULL_MASK, val, (WARP_SIZE - 1));

			//Take off how many I have so I have my local offset...
			val -= accum;

			if (accum)
			{
				//We have something in this radix range...output it!
				if (radixRankX < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.x] + ((val >> (radixRankX<<3u))&0xFF);
					val += (1 << (radixRankX << 3u));
					gOutputKeys[outputIndex] = keyValue.x;
					gOutputRanks[outputIndex] = keyIndex.x;
				}

				if (radixRankY < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.y] + ((val >> (radixRankY << 3u)) & 0xFF);
					val += (1 << (radixRankY << 3u));
					gOutputKeys[outputIndex] = keyValue.y;
					gOutputRanks[outputIndex] = keyIndex.y;
				}

				if (radixRankZ < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.z] + ((val >> (radixRankZ << 3u)) & 0xFF);
					val += (1 << (radixRankZ << 3u));
					gOutputKeys[outputIndex] = keyValue.z;
					gOutputRanks[outputIndex] = keyIndex.z;
				}

				if (radixRankW < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.w] + ((val >> (radixRankW << 3u)) & 0xFF);
					val += (1 << (radixRankW << 3u));
					gOutputKeys[outputIndex] = keyValue.w;
					gOutputRanks[outputIndex] = keyIndex.w;
				}

			}

			__syncwarp();

			// Lane 0 advances the run-sums of the 4 radices just emitted by the
			// warp totals (one byte each in val2); barriers on both sides keep
			// the lane-0 update ordered against every lane's reads above/below.
			if (threadIdx.x == 0)
			{
				radixExclusiveRunsum[i] += (val2 & 0xFF);
				radixExclusiveRunsum[i + 1] += ((val2 >> 8) & 0xFF);
				radixExclusiveRunsum[i + 2] += ((val2 >> 16) & 0xFF);
				radixExclusiveRunsum[i + 3] += ((val2 >> 24) & 0xFF);
			}

			__syncwarp();
		}
	}
}
|
||||
|
||||
// Single radix-sort pass (4 bits starting at startBit) over gNumOfKeys keys, executed by
// ONE warp. Keys-only variant of radixSortPassSingleWarp: no rank array is moved.
// Phase 1 builds per-bucket counts, phase 2 converts them into an exclusive runsum over
// the RADIX_SIZE buckets, phase 3 scatters the keys into gOutputKeys in bucket order.
//
// radixExclusiveRunsum: RADIX_SIZE counters private to this warp (shared memory row).
// NOTE(review): assumes blockDim.x == WARP_SIZE so threadIdx.x is the lane id — confirm
// at the launch site.
static __device__ void radixSortPassSingleWarpKeysOnly(const uint4* PX_RESTRICT gInputKeys, const PxU32 gNumOfKeys, const PxU32 numUint4,
	PxU32* PX_RESTRICT gOutputKeys, const PxU32 startBit, PxU32* radixExclusiveRunsum)
{
	// Zero the packed per-bucket counters before counting.
	RadixAccum radixAccum;
#pragma unroll
	for (PxU32 i = 0; i < RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}

	// Counting pass: accumulate how many keys fall into each 4-bit bucket.
	radixSortWarp(gInputKeys, threadIdx.x, numUint4, WARP_SIZE, startBit, 0, gNumOfKeys, radixAccum);

	PxU32 accumValue = 0;

	// Warp-reduce the counters and unpack them into an exclusive prefix sum over the
	// buckets; each 32-bit reduction carries two 16-bit bucket totals (v0 = bucket i,
	// v1 = bucket i+1).
#pragma unroll
	for (PxU32 i = 0; i < RADIX_SIZE; i += 2)
	{
		const PxU32 accum = radixAccum.radixAccum[i / 2];
		const PxU32 val = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, accum);
		const PxU32 v0 = val & 0xFFFF;
		const PxU32 v1 = val >> 16;

		radixExclusiveRunsum[i] = accumValue;
		accumValue += v0;
		radixExclusiveRunsum[i + 1] = accumValue;
		accumValue += v1;
	}

	//Now we loop and output the elements in order from the input buffer to the output buffer...
	__syncwarp();

	// One chunk = WARP_SIZE uint4 loads = up to 128 keys, so an 8-bit per-bucket counter
	// (see the 0xFF masks below) can never overflow within a chunk.
	for (PxU32 i = 0; i < numUint4; i += WARP_SIZE)
	{
		const PxU32 inputKeyIndex = i + threadIdx.x;
		//All threads enter this stage because we need to do some warp synchronous stuff...
		uint4 keyValue;
		uint4 radix;

		//read 4 elements at a time
		if (inputKeyIndex < numUint4)
		{
			keyValue = gInputKeys[inputKeyIndex];

			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);

			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with sufficent large enough number(we have 8 passes and each pass just sort 4 bits so 0xff is sufficent large enough)
			// keyValue stays uninitialized here, but the 0xff radix makes every
			// radixRank test below fail, so it is never written out.
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}

		// Process the RADIX_SIZE buckets four at a time. NOTE: this loop variable shadows
		// the outer chunk index i.
		//#pragma unroll
		for (PxU32 i = 0; i < RADIX_SIZE; i += 4)
		{
			// Bucket of each key relative to the current group of 4; unsigned wrap makes
			// out-of-group buckets produce a rank >= 4.
			PxU32 radixRankX = (radix.x - i);
			PxU32 radixRankY = (radix.y - i);
			PxU32 radixRankZ = (radix.z - i);
			PxU32 radixRankW = (radix.w - i);
			// Pack one 8-bit counter per in-group bucket into a 32-bit word.
			// NOTE(review): for radixRank >= 4 the shift count is >= 32, which is UB in
			// ISO C++; this relies on the GPU shift producing 0 in that case so
			// out-of-group keys contribute nothing — confirm intended.
			PxU32 accum0 = (1u << (radixRankX << 3));
			PxU32 accum1 = (1u << (radixRankY << 3));
			PxU32 accum2 = (1u << (radixRankZ << 3));
			PxU32 accum3 = (1u << (radixRankW << 3));

			PxU32 accum = accum0 + accum1 + accum2 + accum3;

			// Inclusive warp scan of the packed counters gives each lane its position
			// within each of the 4 buckets.
			PxU32 val = warpScan<AddOpPxU32, PxU32>(FULL_MASK, accum);

			// Lane 31 holds the inclusive total: packed per-bucket counts for this chunk.
			const PxU32 val2 = __shfl_sync(FULL_MASK, val, (WARP_SIZE - 1));

			//Take off how many I have so I have my local offset...
			val -= accum;

			if (accum)
			{
				//We have something in this radix range...output it!
				if (radixRankX < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.x] + ((val >> (radixRankX << 3u)) & 0xFF);
					val += (1 << (radixRankX << 3u));
					gOutputKeys[outputIndex] = keyValue.x;
				}

				if (radixRankY < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.y] + ((val >> (radixRankY << 3u)) & 0xFF);
					val += (1 << (radixRankY << 3u));
					gOutputKeys[outputIndex] = keyValue.y;
				}

				if (radixRankZ < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.z] + ((val >> (radixRankZ << 3u)) & 0xFF);
					val += (1 << (radixRankZ << 3u));
					gOutputKeys[outputIndex] = keyValue.z;
				}

				if (radixRankW < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.w] + ((val >> (radixRankW << 3u)) & 0xFF);
					val += (1 << (radixRankW << 3u));
					gOutputKeys[outputIndex] = keyValue.w;
				}

			}

			__syncwarp();

			// Advance the runsum bases by the number of keys emitted into each of the 4
			// buckets during this chunk (unpacked from lane 31's inclusive total).
			if (threadIdx.x == 0)
			{
				radixExclusiveRunsum[i] += (val2 & 0xFF);
				radixExclusiveRunsum[i + 1] += ((val2 >> 8) & 0xFF);
				radixExclusiveRunsum[i + 2] += ((val2 >> 16) & 0xFF);
				radixExclusiveRunsum[i + 3] += ((val2 >> 24) & 0xFF);
			}

			__syncwarp();
		}
	}
}
|
||||
|
||||
|
||||
// Multi-pass LSD radix sort of (key, rank) pairs executed by independent warps; warp y
// uses row y of the shared histogram storage. Each pass sorts 4 bits, ping-ponging
// between the input and temp buffers.
// NOTE(review): for odd nbPasses the sorted result ends up in the temp buffers — confirm
// callers account for the final buffer location.
template <PxU32 nbWarps>
static __device__ void radixSortSingleWarp(uint4* PX_RESTRICT gInputKeys, uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 numUint4,
	uint4* PX_RESTRICT gTempKeys, uint4* PX_RESTRICT gTempRanks, const PxU32 nbPasses)
{
	// One RADIX_SIZE counter row per warp.
	__shared__ PxU32 radixExclusiveRunsum[nbWarps][RADIX_SIZE];

	// Ping-pong buffers: each pass reads "src" and scatters into "dst".
	uint4* PX_RESTRICT srcKeys = gInputKeys;
	uint4* PX_RESTRICT dstKeys = gTempKeys;
	uint4* PX_RESTRICT srcRanks = gInputRanks;
	uint4* PX_RESTRICT dstRanks = gTempRanks;

	PxU32 startBit = 0;
	for (PxU32 pass = 0; pass < nbPasses; ++pass)
	{
		radixSortPassSingleWarp(srcKeys, srcRanks, gNumOfKeys, numUint4, reinterpret_cast<PxU32*>(dstKeys), reinterpret_cast<PxU32*>(dstRanks), startBit,
			radixExclusiveRunsum[threadIdx.y]);

		// The output of this pass feeds the next one.
		uint4* PX_RESTRICT swp = srcKeys; srcKeys = dstKeys; dstKeys = swp;
		swp = srcRanks; srcRanks = dstRanks; dstRanks = swp;

		startBit += 4;	// each pass consumes the next 4 bits of the key
	}
}
|
||||
|
||||
// Multi-pass LSD radix sort of keys only (no rank array), one independent sort per warp;
// warp y uses row y of the shared histogram storage. Each pass sorts 4 bits, ping-ponging
// between the input and temp buffers.
// NOTE(review): for odd nbPasses the sorted result ends up in gTempKeys — confirm callers
// account for the final buffer location.
template <PxU32 nbWarps>
static __device__ void radixSortSingleWarpKeysOnly(uint4* PX_RESTRICT gInputKeys, const PxU32 gNumOfKeys, const PxU32 numUint4,
	uint4* PX_RESTRICT gTempKeys, const PxU32 nbPasses)
{
	// One RADIX_SIZE counter row per warp.
	__shared__ PxU32 radixExclusiveRunsum[nbWarps][RADIX_SIZE];

	uint4* PX_RESTRICT srcKeys = gInputKeys;
	uint4* PX_RESTRICT dstKeys = gTempKeys;

	PxU32 startBit = 0;
	for (PxU32 pass = 0; pass < nbPasses; ++pass)
	{
		radixSortPassSingleWarpKeysOnly(srcKeys, gNumOfKeys, numUint4, reinterpret_cast<PxU32*>(dstKeys), startBit,
			radixExclusiveRunsum[threadIdx.y]);

		// The output of this pass feeds the next one.
		uint4* PX_RESTRICT swp = srcKeys; srcKeys = dstKeys; dstKeys = swp;

		startBit += 4;	// each pass consumes the next 4 bits of the key
	}
}
|
||||
|
||||
/* bitonic sorting network for 32 inputs */
|
||||
/* sorts in-place without extra storage */
|
||||
// Bitonic sorting network across the 32 lanes of a warp: each lane contributes one
// (key, val) pair and on return the pairs are reordered across lanes by val, in place,
// with no shared memory. Only runs when the complete warp participates
// (mask == UINT_MAX); otherwise it is a no-op.
PX_FORCE_INLINE __device__ void bitonicSortWarp(const PxU32 mask, PxU32& key, PxU32& val)
{
	const PxU32 laneId = threadIdx.x & 0x1f;
	/* only if the complete warp participates */
	if (mask == UINT_MAX)
	{
		// BUG FIX: sVal was declared PxReal while val is PxU32. The shuffled 32-bit
		// integer was silently converted through float and back, losing precision for
		// values above 2^24 and producing incorrect orderings for such values.
		PxU32 sKey; PxU32 sVal; bool swap;
		for (int k = 2; k <= 32; k <<= 1)					// bitonic stage width
		{
			for (PxU32 stride = k / 2; stride > 0; stride >>= 1)	// compare-exchange distance
			{
				sKey = __shfl_xor_sync(mask, key, stride);
				sVal = __shfl_xor_sync(mask, val, stride);
				// Keep min or max depending on lane position within the stage; the
				// (laneId & k) term alternates the direction per k-block.
				swap = (((laneId & stride) != 0 ? val > sVal : val < sVal))^((laneId&k) == 0);
				key = swap ? sKey : key, val = swap ? sVal : val;
			}
		}
	}
}
|
||||
|
||||
#endif // !PXG_RADIX_SORT_CUH
|
||||
105
engine/third_party/physx/source/gpucommon/src/CUDA/SparseRemove.cuh
vendored
Normal file
105
engine/third_party/physx/source/gpucommon/src/CUDA/SparseRemove.cuh
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_SPARSE_REMOVE_CUH__
|
||||
#define __CU_SPARSE_REMOVE_CUH__
|
||||
|
||||
#include "reduction.cuh"
|
||||
|
||||
/**
|
||||
This function initializes a keep-drop buffer. Assuming an array of size N is having K elements removed, it initializes the first (N-K) elements to 0 and the next (K) elements to 1.
|
||||
*/
|
||||
|
||||
// Grid-stride initialization of the keep-drop buffer: slots below
// (totalCount - nbToRemove) are tagged 0 (surviving region), the trailing nbToRemove
// slots are tagged 1 (drop region).
// NOTE(review): the flat thread index assumes blockDim.x == WARP_SIZE — confirm at the
// launch site.
static __device__ void initializeKeepDropBuffer(PxU32* PX_RESTRICT globalRunSumBuffer, PxU32 totalCount, PxU32 nbToRemove)
{
	const PxU32 keepRegionSize = totalCount - nbToRemove;
	const PxU32 threadsPerBlock = blockDim.x * blockDim.y;
	const PxU32 firstIdx = threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * threadsPerBlock;
	const PxU32 gridStride = threadsPerBlock * gridDim.x;

	for (PxU32 i = firstIdx; i < totalCount; i += gridStride)
		globalRunSumBuffer[i] = (i < keepRegionSize) ? 0 : 1;
}
|
||||
|
||||
/**
|
||||
This function marks a keep-drop buffer based on an array of indices to remove. Assuming an array of length N with K elements being removed, this marks a 1 in any element in the first (N-K)
|
||||
elements that is being removed with a 1. It marks any element in the last K elements being removed with a 0. This assumes that "initializeKeepDropBuffer" was performed on the array first
|
||||
*/
|
||||
// Flags removals in the keep-drop buffer (grid-stride over the removal index list).
// A removed slot inside the surviving region [0, totalCount - nbToRemove) becomes 1
// (a hole that needs a replacement swapped in); a removed slot in the trailing drop
// region becomes 0. Requires initializeKeepDropBuffer() to have run on the buffer first.
static __device__ void markKeepDropBuff(const PxU32* PX_RESTRICT removeIndex, const PxU32 nbToRemove, PxU32* globalRunSumBuffer, const PxU32 totalCount)
{
	const PxU32 keepRegionSize = totalCount - nbToRemove;
	const PxU32 gridStride = blockDim.x * blockDim.y * gridDim.x;
	PxU32 i = threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * blockDim.x * blockDim.y;

	for (; i < nbToRemove; i += gridStride)
	{
		const PxU32 slot = removeIndex[i];
		globalRunSumBuffer[slot] = (slot < keepRegionSize) ? 1 : 0;
	}
}
|
||||
|
||||
// Phase 1 of the two-kernel prefix-sum over the keep-drop flags: scans the buffer in
// place and records per-block totals in crossBlockTotalAccumulator. Phase 2
// (accumulateKeepDrop) must run afterwards to produce the final runsum.
template<PxU32 blockSize, PxU32 gridSize>
static __device__ void processKeepDropBuff(PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalCount, PxU32* crossBlockTotalAccumulator)
{
	// Read and write the same buffer: the scan is performed in place.
	ReadArrayFunctor<PxU32> reader(globalRunSumBuffer);
	WriteArrayFunctor<PxU32> writer(globalRunSumBuffer);
	scanKernel1of2<blockSize, gridSize, AddOpPxU32, PxU32, ReadArrayFunctor<PxU32>, WriteArrayFunctor<PxU32> >(
		reader, writer, totalCount, crossBlockTotalAccumulator);
}
|
||||
|
||||
// Phase 2 of the two-kernel prefix-sum: folds the cross-block totals gathered by
// processKeepDropBuff back into the buffer, in place. The overall total is discarded
// (WriteValueNOPFunctor) rather than written anywhere.
template<PxU32 gridSize>
static __device__ void accumulateKeepDrop(PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalCount, PxU32* crossBlockTotalAccumulator)
{
	ReadArrayFunctor<PxU32> reader(globalRunSumBuffer);
	WriteArrayFunctor<PxU32> writer(globalRunSumBuffer);
	WriteValueNOPFunctor<PxU32> discardTotal;
	scanKernel2of2<gridSize, AddOpPxU32, PxU32, ReadArrayFunctor<PxU32>, WriteArrayFunctor<PxU32>, WriteValueNOPFunctor<PxU32> >(
		reader, writer, discardTotal, totalCount, crossBlockTotalAccumulator);
}
|
||||
|
||||
// Returns how many compaction swaps are needed: the scanned keep-drop buffer entry at
// the survivor/drop boundary (originalCount - nbToRemove), i.e. the number of removal
// marks accumulated over the surviving region. Requires both scan phases to have
// completed on the buffer.
static __device__ PxU32 getNbSwapsRequired(const PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 originalCount, const PxU32 nbToRemove)
{
	return globalRunSumBuffer[originalCount - nbToRemove];
}
|
||||
|
||||
// Resolves the indexToFind-th compaction swap into a destination and a source slot by
// binary-searching the scanned keep-drop buffer. The source search rank is offset by
// totalSwapsRequired, pairing the i-th hole in the surviving region with the i-th
// matching slot beyond it.
static __device__ void getSwapIndices(const PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalSize, const PxU32 indexToFind, const PxU32 totalSwapsRequired,
	PxU32& destIndex, PxU32& srcIndex)
{
	const PxU32 srcRank = indexToFind + totalSwapsRequired;
	srcIndex = binarySearch(globalRunSumBuffer, totalSize, srcRank);
	destIndex = binarySearch(globalRunSumBuffer, totalSize, indexToFind);
}
|
||||
|
||||
#endif
|
||||
160
engine/third_party/physx/source/gpucommon/src/CUDA/atomic.cuh
vendored
Normal file
160
engine/third_party/physx/source/gpucommon/src/CUDA/atomic.cuh
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_ATOMIC_CUH__
|
||||
#define __CU_ATOMIC_CUH__
|
||||
|
||||
#include "cuda.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgIntrinsics.h"
|
||||
#include "PxgArticulation.h"
|
||||
|
||||
// Accumulates all four components of b into a, one atomic per component.
// Not a single atomic transaction: concurrent readers may observe partially applied
// updates, but the final sums are exact.
static __device__ inline void AtomicAdd(float4& a, const float4 b)
{
	atomicAdd(&a.w, b.w);
	atomicAdd(&a.z, b.z);
	atomicAdd(&a.y, b.y);
	atomicAdd(&a.x, b.x);
}
|
||||
|
||||
// Accumulates a PxVec3 into a.xyz and a separate scalar into a.w, one atomic per
// component (not atomic as a whole).
static __device__ inline void AtomicAdd(float4& a, const physx::PxVec3 b, const physx::PxReal w)
{
	atomicAdd(&a.w, w);
	atomicAdd(&a.z, b.z);
	atomicAdd(&a.y, b.y);
	atomicAdd(&a.x, b.x);
}
|
||||
|
||||
// Accumulates a PxVec3 into a.xyz; a.w is left untouched. One atomic per component
// (not atomic as a whole).
static __device__ inline void AtomicAdd(float4& a, const physx::PxVec3 b)
{
	atomicAdd(&a.z, b.z);
	atomicAdd(&a.y, b.y);
	atomicAdd(&a.x, b.x);
}
|
||||
|
||||
|
||||
// Atomic accumulation of val into the i-th element of a float array.
__device__ inline void AtomicAdd(float* p, physx::PxU32 i, const physx::PxReal val)
{
	atomicAdd(p + i, val);
}
|
||||
|
||||
// Accumulates v into element i's xyz and the scalar w into its w component; four
// independent atomics (not atomic as a whole).
__device__ inline void AtomicAdd(float4* p, physx::PxU32 i, const physx::PxVec3& v, physx::PxReal w)
{
	float4& slot = p[i];
	atomicAdd(&slot.x, v.x);
	atomicAdd(&slot.y, v.y);
	atomicAdd(&slot.z, v.z);
	atomicAdd(&slot.w, w);
}
|
||||
|
||||
// Accumulates a PxVec4 into element i, component by component; four independent atomics
// (not atomic as a whole).
__device__ inline void AtomicAdd(float4* p, physx::PxU32 i, const physx::PxVec4& v)
{
	float4& slot = p[i];
	atomicAdd(&slot.x, v.x);
	atomicAdd(&slot.y, v.y);
	atomicAdd(&slot.z, v.z);
	atomicAdd(&slot.w, v.w);
}
|
||||
|
||||
// Accumulates v into element i's xyz; the w component is left untouched. Three
// independent atomics (not atomic as a whole).
__device__ inline void AtomicAdd(float4* p, physx::PxU32 i, const physx::PxVec3& v)
{
	float4& slot = p[i];
	atomicAdd(&slot.x, v.x);
	atomicAdd(&slot.y, v.y);
	atomicAdd(&slot.z, v.z);
}
|
||||
|
||||
// Accumulates only the xyz components of v into element i; v.w is ignored and the
// destination w is left untouched.
__device__ inline void AtomicAdd3(float4* p, physx::PxU32 i, const float4& v)
{
	float4& slot = p[i];
	atomicAdd(&slot.x, v.x);
	atomicAdd(&slot.y, v.y);
	atomicAdd(&slot.z, v.z);
}
|
||||
|
||||
// Component-wise atomic accumulation of one PxVec3 into another (three independent
// atomics, not atomic as a whole).
__device__ inline void AtomicAdd3(physx::PxVec3& p, const physx::PxVec3& v)
{
	atomicAdd(&p.z, v.z);
	atomicAdd(&p.y, v.y);
	atomicAdd(&p.x, v.x);
}
|
||||
|
||||
// Atomic minimum for 32-bit floats built on integer atomicCAS (the hardware atomicMin
// only supports integer types). Loops while our candidate is still smaller than the
// currently stored value, attempting to publish it; a failed CAS returns the latest
// value, which is re-tested. Returns the value observed at the address before our
// (possibly skipped) update.
// NOTE(review): a NaN val makes the comparison false, so NaNs are never stored —
// confirm that is the intended policy.
__device__ inline float AtomicMin(float* address, float val)
{
	int *address_as_int = (int*)address;
	int old = *address_as_int, assumed;

	while (val < __int_as_float(old))
	{
		assumed = old;
		// CAS on the bit pattern: succeeds only if nobody changed the value since we
		// read it; otherwise 'old' picks up the newer value and the loop re-tests.
		old = atomicCAS(address_as_int, assumed,
			__float_as_int(val));
	}

	return __int_as_float(old);
}
|
||||
|
||||
// Atomic maximum for 32-bit floats built on integer atomicCAS; mirror image of
// AtomicMin above. Loops while our candidate is still larger than the stored value and
// tries to publish it. Returns the value observed at the address before our (possibly
// skipped) update.
// NOTE(review): a NaN val makes the comparison false, so NaNs are never stored —
// confirm that is the intended policy.
inline __device__ float AtomicMax(float* address, float val)
{
	int *address_as_int = (int*)address;
	int old = *address_as_int, assumed;

	while (val > __int_as_float(old))
	{
		assumed = old;
		// CAS on the bit pattern; on failure 'old' picks up the newer value and the
		// loop re-tests.
		old = atomicCAS(address_as_int, assumed,
			__float_as_int(val));
	}

	return __int_as_float(old);
}
|
||||
|
||||
|
||||
//Some compiler was complaining about not supporting atomicOr on 64bit integers
|
||||
// Emulated 64-bit atomic OR, implemented as two independent 32-bit atomicOr operations
// on the two halves of the word (workaround for compilers lacking 64-bit atomicOr).
// Each half is atomic on its own, but the 64-bit value as a whole is NOT updated
// atomically: a concurrent reader may observe one half updated before the other. That
// is safe for monotonic bitmask accumulation where bits are only ever set.
PX_FORCE_INLINE static __device__ void AtomicOr(physx::PxU64* address, const physx::PxU64 mask)
{
	physx::PxU32* address32 = reinterpret_cast<physx::PxU32*>(address);
	const physx::PxU32* maskPtr = reinterpret_cast<const physx::PxU32*>(&mask);
	atomicOr(address32, maskPtr[0]);
	atomicOr(address32 + 1, maskPtr[1]);
}
|
||||
|
||||
/* use inline assembly with .global qualifier to perform the operation at the L2 cache
|
||||
* adds 20% performance in FLIP P2G compared to atomicAdd() or plain red.add.f32 */
|
||||
// Fire-and-forget float atomic add ("reduction" form: no return value is needed, so the
// PTX red.global.add.f32 instruction can be used). On SM35+ it emits the red instruction
// directly via inline PTX; on SM20..SM34 it falls back to atomicAdd; on older targets
// (or during the host compilation pass, where __CUDA_ARCH__ is undefined) it compiles
// to nothing.
PX_FORCE_INLINE __device__ void PxRedAddGlobal(float* addr, const float val)
{
#if __CUDA_ARCH__ >= 350
	asm volatile ("red.global.add.f32 [%0], %1;" :: __STG_PTR(addr) , "f"(val));
#else
#if __CUDA_ARCH__ >= 200
	atomicAdd(addr, val);
#else
	// No float atomics available: silently drop the operation.
	PX_UNUSED(addr);
	PX_UNUSED(val);
#endif
#endif
}
|
||||
|
||||
#endif
|
||||
191
engine/third_party/physx/source/gpucommon/src/CUDA/contactReduction.cuh
vendored
Normal file
191
engine/third_party/physx/source/gpucommon/src/CUDA/contactReduction.cuh
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_CONTACT_REDUCTION_CUH__
|
||||
#define __CU_CONTACT_REDUCTION_CUH__
|
||||
|
||||
#include "utils.cuh"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "shuffle.cuh"
|
||||
#include "nputils.cuh"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
//See trunk/internaldocumentation/Solver/PhysX 3 constraint solver.doc
|
||||
//* If the number of points in the patch is more than 6, for each patch
|
||||
//* We find the most extreme point, p0.This is the point that is farthest from the origin. ALGORITHM BELOW TAKES DEEPEST CONTACT AS p0.
|
||||
//* We find the point farthest from the most extreme point : p1.
|
||||
//* We find point p2, which is the point farthest from the segment p0p1.
|
||||
//* We find the direction from p2 to the closest point to p2 on segment p0p1.We then find point p3 : the point farthest
|
||||
// from p0p1 in that direction.
|
||||
//* These 4 points define the anchors for our clusters.We then assign the points to their respective clusters, i.e.the
|
||||
// cluster which the contact point is closest to.In a case where a point is equidistant between 2 anchors, the earlier anchor
|
||||
// in the array of anchors is arbitrarily chosen.
|
||||
//* We choose the deepest point in each cluster.We slightly bias the initial points p0 - p3 by considering them to have deeper
|
||||
// penetrations than they actually have; these are biased by an epsilon.This avoids oscillation between points when they have roughly
|
||||
// equal depth, which can cause instability in the friction model.The deepest contact point in each cluster is selected.
|
||||
//* Finally, we choose 2 remaining contacts.These contacts are the 2 deepest unselected contacts.
|
||||
|
||||
// Warp-cooperative contact reduction: each lane owns one candidate contact (pointRaw,
// separation) and allMask flags the lanes holding valid contacts. Returns a lane mask
// with at most TMaxPoints bits set, implementing the anchor-selection algorithm
// described in the comment block above. distanceAdjustment biases the extreme-point
// picks, initialPointCriteria is the score used to choose the seed point p0.
// Must be executed by a full warp (uses FULL_MASK shuffles and ballots on every lane).
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduceShared(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias, PxReal distanceAdjustment, PxReal initialPointCriteria)
{
	PxReal v, w;
	const PxU32 threadIndexInWarp = threadIdx.x & 31;

	// Optional duplicate elimination: a contact is a dupe if another valid lane holds a
	// (near-)identical position; the deeper one survives, ties broken by lane index.
	if (TDoDupeTest)
	{
		bool imADupe = false;
		for (PxU32 m = allMask; m; m = clearLowestSetBit(m))
		{
			int i = lowestSetIndex(m);
			PxReal d = (shuffle(FULL_MASK, pointRaw, i) - pointRaw).magnitudeSquared();
			PxReal sep = __shfl_sync(FULL_MASK, separation, i);
			if (d < 1e-8f && (sep < separation || (sep == separation && i > threadIndexInWarp)))
			{
				imADupe = true;
			}
		}
		allMask &= ~(__ballot_sync(FULL_MASK, imADupe));
	}

	PxU32 newCount = __popc(allMask);

	// Few enough contacts already: keep them all.
	if (newCount <= TMaxPoints)
	{
		return allMask;
	}

	// Work with positions projected into the plane orthogonal to the contact normal.
	const PxVec3 point = pointRaw - normal * pointRaw.dot(normal);

	//Distance calculation is altered by separation value to give further away contacts less weight
	int i0 = maxIndex(initialPointCriteria + distanceAdjustment, allMask, v); // p0 - most extreme contact (furthest away from origin)
	int mask = 1 << i0;
	PxReal dist = (shuffle(FULL_MASK, point, i0) - point).magnitude();
	int i1 = maxIndex(dist + distanceAdjustment, allMask&~mask, v); // p1 - furthest from p0, when projected onto normal plane
	mask |= 1 << i1;

	//Now we have the p0-p1 edge. We try to find the point furthest from it in the normal plane.
	//For that, we look for the 2 extreme points - one to the right and one to the left
	//One maximizes [(p1 - p0) x n] * (p - p0), the other one minimizes that
	//[(p1 - p0) x n] * (p - p0) = [n x (p - p0)] * (p1 - p0) = (n x p) * p1 - (n x p) * p0 - (n x p0) * p1 + (n x p0) * p0 =
	//= k1 - k0 - k1[0], as (n x p0) * p0 = 0

	PxVec3 dir = normal.cross(shuffle(FULL_MASK, point, i1) - shuffle(FULL_MASK, point, i0));

	// Signed distance of each lane's point from the p0-p1 edge, within the normal plane.
	PxReal d = dir.dot(point - shuffle(FULL_MASK, point, i0));

	int f = maxIndex(d + distanceAdjustment, allMask&~mask, v);
	mask |= (1 << f);

	int g = minIndex(d - distanceAdjustment, allMask&~mask, w);

	//if (__shfl_sync(FULL_MASK, d, f) * __shfl_sync(FULL_MASK, d, g) > 0.f)
	//{
	//	//We need to pick again...
	//	g = maxIndex(d, allMask&~mask, v);
	//}

	mask |= (1 << g);

	//if (TKeepAnotherDeepestForPCM && __popc(mask) == 4)
	bool predicate = (TKeepAnotherDeepestForPCM && __popc(mask) == 4);

	// Optionally keep one extra contact: the deepest (most negative separation) of the
	// not-yet-selected points.
	//unsigned mask_predicate = __ballot_sync(FULL_MASK, predicate);
	if (predicate && TMaxPoints > 4)
	{
		int i4 = minIndex(separation, allMask&~mask, v);
		mask |= (1 << i4);
	}

	// post-cull clustering for mesh collisions
	//unsigned mask_TClustering = __ballot_sync(syncMask, TClustering);
	if (TClustering)
	{
		// Bias already-selected anchors so they keep winning within their cluster; this
		// avoids frame-to-frame oscillation between near-equal-depth points.
		PxReal sep = separation;
		if (mask & (1 << threadIndexInWarp))
			sep -= clusterBias;

		int nbClusters = 0, label = -1; // label each point with its closest cluster (distance measured orthogonal to the normal)
		for (PxReal t = FLT_MAX; mask; nbClusters++, mask &= (mask - 1))
		{
			PxReal d = (point - shuffle(FULL_MASK, point, lowestSetIndex(mask))).magnitudeSquared();
			if (d < t)
				t = d, label = nbClusters;
		}

		mask = 0;

		for (int i = 0; i < nbClusters; i++) // find a point in each cluster (clusters can be empty if all input points are equal)
		{
			int cluster = __ballot_sync(FULL_MASK, label == i)&allMask;
			if (cluster)
				mask |= 1 << minIndex(sep, cluster, v);
		}

		for (int i = nbClusters; i < TMaxPoints; i++) // fill out the rest of the points
			mask |= 1 << minIndex(sep, allMask&~mask, v);
	}
	else
	{
		// No clustering: top up the selection with the deepest remaining contacts.
		PxU32 count = __popc(mask);

		for (PxU32 i = count; i < TMaxPoints; ++i)
			mask |= 1 << minIndex(separation, allMask&~mask, v);
	}

	return mask;
}
|
||||
|
||||
|
||||
// Warp-wide contact reduction that seeds the selection with the contact's separation and
// applies no distance adjustment; thin forwarding wrapper over contactReduceShared.
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduce(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias)
{
	const PxReal noDistanceAdjustment = 0.0f;
	const PxReal seedCriteria = separation;
	return contactReduceShared<TClustering, TKeepAnotherDeepestForPCM, TMaxPoints, TDoDupeTest>(
		pointRaw, separation, normal, allMask, clusterBias, noDistanceAdjustment, seedCriteria);
}
|
||||
|
||||
|
||||
// Variant of contactReduce that seeds the selection with the contact's distance from the
// origin measured in the plane orthogonal to the normal, and biases the extreme-point
// picks by -separation.
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduce2(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias)
{
	// Project the contact into the plane orthogonal to the normal before measuring.
	const PxVec3 projected = pointRaw - normal * pointRaw.dot(normal);
	const PxReal distToOrigin = projected.magnitude();
	return contactReduceShared<TClustering, TKeepAnotherDeepestForPCM, TMaxPoints, TDoDupeTest>(
		pointRaw, separation, normal, allMask, clusterBias, -separation, distToOrigin);
}
|
||||
|
||||
}
|
||||
#endif
|
||||
83
engine/third_party/physx/source/gpucommon/src/CUDA/copy.cuh
vendored
Normal file
83
engine/third_party/physx/source/gpucommon/src/CUDA/copy.cuh
vendored
Normal file
@@ -0,0 +1,83 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_COPY_CUH__
|
||||
#define __CU_COPY_CUH__
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "cutil_math.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Cooperative memcpy performed by one warp: lane L copies elements L, L+32, L+64, ...
// so consecutive lanes touch consecutive elements. totalSize is in BYTES and must be a
// multiple of sizeof(T); both pointers must be aligned for T. All 32 lanes are expected
// to call this together.
template<typename T>
__device__ void warpCopy(T* dest, const T* source, const uint totalSize)
{
	assert((size_t(dest) & (alignof(T)-1)) == 0);
	assert((size_t(source) & (alignof(T)-1)) == 0);
	assert(totalSize % sizeof(T) == 0);

	const uint elementCount = totalSize / sizeof(T);
	const uint lane = threadIdx.x & (WARP_SIZE - 1);

	for (uint elem = lane; elem < elementCount; elem += WARP_SIZE)
		dest[elem] = source[elem];
}
|
||||
|
||||
// Cooperative fill performed by one warp: every element of dest is set to the same
// value, lane L handling elements L, L+32, L+64, ... totalSize is in BYTES and must be
// a multiple of sizeof(T); dest must be aligned for T. All 32 lanes are expected to
// call this together.
template<typename T>
__device__ void warpCopy(T* dest, const T& value, const uint totalSize)
{
	assert(((size_t(dest) & (alignof(T)-1)) == 0));
	assert(totalSize % sizeof(T) == 0);

	const uint elementCount = totalSize / sizeof(T);
	const uint lane = threadIdx.x & (WARP_SIZE - 1);

	for (uint elem = lane; elem < elementCount; elem += WARP_SIZE)
		dest[elem] = value;
}
|
||||
|
||||
// Cooperative memcpy performed by a whole thread block, striding by blockDim.x.
// Treats the block as 1D (only threadIdx.x/blockDim.x are used). totalSize is in BYTES
// and must be a multiple of sizeof(T); both pointers must be aligned for T. All threads
// of the block are expected to call this together.
template<typename T>
__device__ void blockCopy(T* dest, const T* source, const uint totalSize)
{
	assert((size_t(dest) & (alignof(T)-1)) == 0);
	assert((size_t(source) & (alignof(T)-1)) == 0);
	assert(totalSize % sizeof(T) == 0);

	const uint elementCount = totalSize / sizeof(T);

	for (uint elem = threadIdx.x; elem < elementCount; elem += blockDim.x)
		dest[elem] = source[elem];
}
|
||||
|
||||
#endif
|
||||
70
engine/third_party/physx/source/gpucommon/src/CUDA/femMidphaseScratch.cuh
vendored
Normal file
70
engine/third_party/physx/source/gpucommon/src/CUDA/femMidphaseScratch.cuh
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#ifndef __CU_FEM_CLOTH_MIDPHASESCRATCH_CUH__
|
||||
#define __CU_FEM_CLOTH_MIDPHASESCRATCH_CUH__
|
||||
|
||||
#include "vector_types.h"
|
||||
|
||||
#define FEM_MIDPHASE_SCRATCH_SIZE 224 // 192 (WARP SIZE * 6) < 198 (sizeof(femMidphaseScratch)/sizeof(unsigned int)) < 224 (WARP SIZE * 7)
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Gu
|
||||
{
|
||||
struct BV32DataDepthInfo;
|
||||
struct BV32DataPacked;
|
||||
};
|
||||
}
|
||||
|
||||
// Shared-memory scratch layout used by FEM midphase kernels that traverse a BV32
// tree against a mesh. Sized to fit in WARP_SIZE * 7 unsigned ints (see
// FEM_MIDPHASE_SCRATCH_SIZE) — verified by the compile-time assert below.
struct femMidphaseScratch
{
	const float4* PX_RESTRICT meshVerts; // either tetrahedron mesh or triangle mesh
	const uint4* PX_RESTRICT meshVertsIndices; // either tetrahedron mesh or triangle mesh

	// const physx::Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	// const unsigned int* PX_RESTRICT bv32RemapPackedNodeIndex;
	// bv32 tree
	const physx::Gu::BV32DataPacked* bv32PackedNodes;

	// stack for traversal
	int sBv32Nodes[192]; // 6 depth of the bv32 tree
};
// Layout guard: the scratch structure must fit in 7 warp-widths of unsigned ints.
PX_COMPILE_TIME_ASSERT(sizeof(femMidphaseScratch) <= WARP_SIZE * 7 * sizeof(unsigned int));
|
||||
|
||||
// Extended scratch for the FEM cloth refit path: adds the BV32 depth info and the
// packed-node remap table on top of the base traversal scratch. Must still fit in
// the same 7-warp-width budget (asserted below).
class femClothRefitMidphaseScratch : public femMidphaseScratch
{
public:
	const physx::Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	const unsigned int* PX_RESTRICT bv32RemapPackedNodeIndex;

};
// Layout guard: the refit scratch must also fit in 7 warp-widths of unsigned ints.
PX_COMPILE_TIME_ASSERT(sizeof(femClothRefitMidphaseScratch) <= WARP_SIZE * 7 * sizeof(unsigned int));
|
||||
|
||||
#endif
|
||||
142
engine/third_party/physx/source/gpucommon/src/CUDA/gridCal.cuh
vendored
Normal file
142
engine/third_party/physx/source/gpucommon/src/CUDA/gridCal.cuh
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_GRID_CAL_CUH__
|
||||
#define __CU_GRID_CAL_CUH__
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
|
||||
// Maps a world-space position to integer grid coordinates: floor(pos / cellWidth)
// per axis. Negative coordinates are valid (floor rounds toward -inf).
// Uses floorf so the computation is guaranteed single precision on the device
// (the unsuffixed 'floor' risked resolving to the double overload).
PX_FORCE_INLINE static __device__ int3 calcGridPos(const float4& particlePos, const PxReal cellWidth)
{
	int3 gridPos;

	gridPos.x = floorf(particlePos.x / cellWidth);
	gridPos.y = floorf(particlePos.y / cellWidth);
	gridPos.z = floorf(particlePos.z / cellWidth);

	return gridPos;
}
|
||||
|
||||
// Calculate the linear cell address in the grid from a grid position (wrap-around).
// Masking with (size - 1) is modulo only when the grid dimensions are powers of two.
PX_FORCE_INLINE static __device__ physx::PxU32 calcGridHash(int3 gridPos, uint3 gridSize)
{
	// Wrap each coordinate into [0, gridSize).
	const physx::PxU32 wx = gridPos.x & (gridSize.x - 1);
	const physx::PxU32 wy = gridPos.y & (gridSize.y - 1);
	const physx::PxU32 wz = gridPos.z & (gridSize.z - 1);

	// Linearize: x fastest-varying, then y, then z.
	return (wz * gridSize.y + wy) * gridSize.x + wx;
}
|
||||
|
||||
// Linear cell address under periodic boundaries. A single add/subtract per axis is
// enough to wrap because gridPos is assumed to be at most one period outside
// [0, periodGridSize) — see the explanation below.
PX_FORCE_INLINE static __device__ PxU32 calcGridHashPeriodic(int3 gridPos, int3 gridSize, int3 periodGridSize)
{
	//With periodic boundaries, gridPos for a particle should be >= 0 always. Particle positions were wrapped based on periodic grid size so
	//the particles are within in the periodic range.
	//When reading neighboring cells, we wrap using a simple < and >= check. We don't need modulo.
	//We use the original grid size (non-periodic) to then work out the hash of the particle.

	if (gridPos.x < 0)
		gridPos.x += periodGridSize.x;
	else if (gridPos.x >= periodGridSize.x)
		gridPos.x -= periodGridSize.x;

	if (gridPos.y < 0)
		gridPos.y += periodGridSize.y;
	else if (gridPos.y >= periodGridSize.y)
		gridPos.y -= periodGridSize.y;

	if (gridPos.z < 0)
		gridPos.z += periodGridSize.z;
	else if (gridPos.z >= periodGridSize.z)
		gridPos.z -= periodGridSize.z;

	// Linearize against the original (non-periodic) grid size: x fastest, then y, then z.
	return ((gridPos.z * gridSize.y) * gridSize.x) + (gridPos.y * gridSize.x) + gridPos.x;
}
|
||||
|
||||
/**
 * takes a global grid range e.g. (-4 , 2, -1) to (-1, 3, 2) and computes a range size
 * which is clamped to the wrapped grid size. The result is just used to compute cell hashes
 * hence if the range in a dimension exceeds the wrapped grid size, all wrapped cells in that dimension
 * are covered anyways. The clamp to 0 is probably there to catch numerical issues.
 */
PX_FORCE_INLINE static __device__ uint3 calcWrappedGridRangeSize(int3 gridRangeMin, int3 gridRangeMax, uint3 gridSize)
{
	// +1 because the range is inclusive on both ends; clamp to [0, gridSize] per axis.
	const PxU32 x = PxClamp(gridRangeMax.x - gridRangeMin.x + 1, 0, PxI32(gridSize.x));
	const PxU32 y = PxClamp(gridRangeMax.y - gridRangeMin.y + 1, 0, PxI32(gridSize.y));
	const PxU32 z = PxClamp(gridRangeMax.z - gridRangeMin.z + 1, 0, PxI32(gridSize.z));
	return make_uint3(x, y, z);
}
|
||||
|
||||
/**
 * Takes a linear cell offset and maps it to a 3D offset within a 3D grid range
 * (x fastest-varying, then y, then z).
 */
PX_FORCE_INLINE static __device__ uint3 calcGridOffsetInRange(uint3 gridRangeSize, PxU32 offset)
{
	// Split the offset into a z-slice index and a remainder within that slice.
	const PxU32 sliceSize = gridRangeSize.x * gridRangeSize.y;
	const PxU32 z = offset / sliceSize;
	const PxU32 inSlice = offset % sliceSize;
	return make_uint3(inSlice % gridRangeSize.x, inSlice / gridRangeSize.x, z);
}
|
||||
|
||||
// Computes the inclusive grid-coordinate range covered by an AABB by converting
// its two corners with calcGridPos. The w components are unused padding.
PX_FORCE_INLINE static __device__ void calcGridRange(int3& gridPosMin, int3& gridPosMax, const PxBounds3& bounds, float cellWidth)
{
	const float4 lo = make_float4(bounds.minimum.x, bounds.minimum.y, bounds.minimum.z, 0.f);
	const float4 hi = make_float4(bounds.maximum.x, bounds.maximum.y, bounds.maximum.z, 0.f);

	gridPosMin = calcGridPos(lo, cellWidth);
	gridPosMax = calcGridPos(hi, cellWidth);
}
|
||||
|
||||
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_PGS 1.0f
|
||||
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_TGS 0.5f
|
||||
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_DIFFUSE 1.0f
|
||||
|
||||
/**
 * Returns position and radius of the volume that needs to be tested for particle collision.
 * The function reconstructs the full forward projection step in pre integration and selects the mid point and a
 * radius that covers the whole projected motion range and adds the contact distance.
 */
PX_FORCE_INLINE static __device__ PxReal getParticleSpeculativeContactVolume(PxVec3& cVolumePos,
	const PxVec3& currentPos, const PxVec3& predictedPos, const PxReal contactDist, const bool isDiffuse, const bool isTGS)
{
	// Inverse of the step scale used during forward projection; dividing by the scale
	// recovers the full projection step from the (possibly scaled-down) predicted delta.
	static const PxReal sScaleInvPGS = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_PGS);
	static const PxReal sScaleInvTGS = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_TGS);
	static const PxReal sScaleInvDiffuse = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_DIFFUSE);
	PxReal scaleInv;
	if (isDiffuse)
	{
		scaleInv = sScaleInvDiffuse;
	}
	else
	{
		// Non-diffuse particles pick the solver-specific scale (TGS uses 0.5, PGS 1.0).
		scaleInv = isTGS ? sScaleInvTGS : sScaleInvPGS;
	}
	// Half of the reconstructed full step: the volume is centered on the midpoint of
	// the projected motion, with radius = half-step length + contact distance.
	PxVec3 cVolumeOffset = (predictedPos - currentPos)*scaleInv*0.5f;
	cVolumePos = currentPos + cVolumeOffset;
	return cVolumeOffset.magnitude() + contactDist;
}
|
||||
|
||||
#endif
|
||||
252
engine/third_party/physx/source/gpucommon/src/CUDA/radixSortImpl.cu
vendored
Normal file
252
engine/third_party/physx/source/gpucommon/src/CUDA/radixSortImpl.cu
vendored
Normal file
@@ -0,0 +1,252 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
|
||||
#include "RadixSort.cuh"
|
||||
#include "PxgRadixSortDesc.h"
|
||||
#include "PxgRadixSortKernelIndices.h"
|
||||
#include "stdio.h"
|
||||
#include "PxNodeIndex.h"
|
||||
|
||||
extern "C" __host__ void initCommonKernels1() {}
|
||||
|
||||
|
||||
//gNumOfKeys is device ptr
// Radix count pass: blockIdx.y selects which sort descriptor this block works on;
// per-radix block counts for the digit starting at gStartBit are written to
// radixBlockCounts via radixSortSingleBlock.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiBlockLaunch(PxgRadixSortBlockDesc* desc, const PxU32 gStartBit)
{
	// Keys and ranks are read through uint4 pointers (vectorized 16-byte loads).
	const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
	const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
	PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;

	// Key count lives in device memory (written by an earlier kernel on device).
	const PxU32 numKeys = *desc[blockIdx.y].numKeys;

	radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, numKeys, gStartBit, gRadixCount);
}

//gNumOfKeys is device ptr
// Companion rank pass: consumes the block counts produced above and writes the
// reordered keys/ranks for this digit into the descriptor's output buffers.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiCalculateRanksLaunch(PxgRadixSortBlockDesc* desc, const PxU32 gStartBit)
{
	const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
	const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
	PxU32* gOutputKeys = desc[blockIdx.y].outputKeys;
	PxU32* gOutputRanks = desc[blockIdx.y].outputRanks;

	PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;

	const PxU32 numKeys = *desc[blockIdx.y].numKeys;

	radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, numKeys, gStartBit, gRadixCount, gOutputKeys, gOutputRanks);
}
|
||||
|
||||
extern "C" __global__
|
||||
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
|
||||
void radixSortMultiBlockLaunchWithoutCount(PxgRadixSortDesc* desc, const PxU32 gStartBit)
|
||||
{
|
||||
const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
|
||||
const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
|
||||
PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;
|
||||
|
||||
radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, desc[blockIdx.y].count, gStartBit, gRadixCount);
|
||||
}
|
||||
|
||||
//gNumOfKeys is device ptr
|
||||
extern "C" __global__
|
||||
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
|
||||
void radixSortMultiCalculateRanksLaunchWithoutCount(PxgRadixSortDesc* desc, const PxU32 gStartBit)
|
||||
{
|
||||
const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
|
||||
const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
|
||||
PxU32* gOutputKeys = desc[blockIdx.y].outputKeys;
|
||||
PxU32* gOutputRanks = desc[blockIdx.y].outputRanks;
|
||||
PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;
|
||||
|
||||
radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, desc[blockIdx.y].count, gStartBit, gRadixCount, gOutputKeys, gOutputRanks);
|
||||
}
|
||||
|
||||
|
||||
// Count-pass variant where numKeys is passed by value as a kernel argument
// (host-known count; note the original "device ptr" comment did not apply here).
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiBlockLaunchWithCount(PxgRadixSortDesc* desc, const PxU32 numKeys, const PxU32 gStartBit)
{
	const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
	const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
	PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;

	radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, numKeys, gStartBit, gRadixCount);
}

// Rank-pass variant matching radixSortMultiBlockLaunchWithCount: numKeys is passed
// by value as a kernel argument.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiCalculateRanksLaunchWithCount(PxgRadixSortDesc* desc, const PxU32 numKeys, const PxU32 gStartBit)
{
	const uint4* PX_RESTRICT gInputKeys = reinterpret_cast<uint4*>(desc[blockIdx.y].inputKeys);
	const uint4* PX_RESTRICT gInputRanks = reinterpret_cast<uint4*>(desc[blockIdx.y].inputRanks);
	PxU32* gOutputKeys = desc[blockIdx.y].outputKeys;
	PxU32* gOutputRanks = desc[blockIdx.y].outputRanks;
	PxU32* gRadixCount = desc[blockIdx.y].radixBlockCounts;

	radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(gInputKeys, gInputRanks, numKeys, gStartBit, gRadixCount, gOutputKeys, gOutputRanks);
}
|
||||
|
||||
extern "C" __global__ void radixSortCopyHigh32Bits(const PxU64* inValue, PxU32* outValue, PxU32* rank, const PxU32* numKeys)
|
||||
{
|
||||
const PxU32 numElems = *numKeys;
|
||||
const PxU32 numIternations = (numElems + blockDim.x * gridDim.x - 1) / blockDim.x * gridDim.x;
|
||||
|
||||
for (PxU32 i = 0; i < numIternations; ++i)
|
||||
{
|
||||
const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
|
||||
|
||||
if (workIndex >= numElems)
|
||||
return;
|
||||
|
||||
const PxU32 index = rank[workIndex];
|
||||
outValue[workIndex] = PxU32(inValue[index] >> 32);
|
||||
|
||||
//printf("Copy 32 workIndex %i index %i blockDim.x %i gridDim.x %i\n", workIndex, index, blockDim.x, gridDim.x);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void radixSortDoubleCopyHigh32Bits(const PxU64 * inValue0, PxU32 * outValue0, PxU32 * rank0, const PxU64 * inValue1, PxU32 * outValue1, PxU32 * rank1, const PxU32 * numKeys)
|
||||
{
|
||||
const PxU32 numElems = *numKeys;
|
||||
const PxU32 numIternations = (numElems + blockDim.x * gridDim.x - 1) / blockDim.x * gridDim.x;
|
||||
|
||||
for (PxU32 i = 0; i < numIternations; ++i)
|
||||
{
|
||||
const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
|
||||
|
||||
if (workIndex >= numElems)
|
||||
return;
|
||||
|
||||
const PxU32 index0 = rank0[workIndex];
|
||||
outValue0[workIndex] = PxU32(inValue0[index0] >> 32);
|
||||
|
||||
const PxU32 index1 = rank1[workIndex];
|
||||
outValue1[workIndex] = PxU32(inValue1[index1] >> 32);
|
||||
|
||||
//printf("Copy 32 workIndex %i index %i blockDim.x %i gridDim.x %i\n", workIndex, index, blockDim.x, gridDim.x);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" __global__ void radixSortCopy(const PxU64* inValue, PxU64* outValue, PxU32* rank, const PxU32* numKeys)
|
||||
{
|
||||
|
||||
const PxU32 numElems = *numKeys;
|
||||
const PxU32 numIternations = (numElems + blockDim.x * gridDim.x - 1) / blockDim.x * gridDim.x;
|
||||
|
||||
|
||||
for (PxU32 i = 0; i < numIternations; ++i)
|
||||
{
|
||||
const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
|
||||
|
||||
if (workIndex >= numElems)
|
||||
return;
|
||||
|
||||
const PxU32 index = rank[workIndex];
|
||||
outValue[workIndex] = inValue[index];
|
||||
|
||||
//const PxNodeIndex nodeIndex = reinterpret_cast<const PxNodeIndex&>(inValue[index]);
|
||||
|
||||
//printf("Copy 64 workIndex %i index %i value %i blockDim.x %i gridDim.x %i\n", workIndex, index, nodeIndex.index(), blockDim.x, gridDim.x);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
extern "C" __global__ void radixSortDoubleCopy(
|
||||
const PxU64 * inValue0, PxU64 * outValue0, PxU32 * rank0,
|
||||
const PxU64 * inValue1, PxU64 * outValue1, PxU32 * rank1,
|
||||
const PxU32 * numKeys)
|
||||
{
|
||||
|
||||
const PxU32 numElems = *numKeys;
|
||||
const PxU32 numIternations = (numElems + blockDim.x * gridDim.x - 1) / blockDim.x * gridDim.x;
|
||||
|
||||
|
||||
for (PxU32 i = 0; i < numIternations; ++i)
|
||||
{
|
||||
const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
|
||||
|
||||
if (workIndex >= numElems)
|
||||
return;
|
||||
|
||||
const PxU32 index0 = rank0[workIndex];
|
||||
outValue0[workIndex] = inValue0[index0];
|
||||
|
||||
const PxU32 index1 = rank1[workIndex];
|
||||
outValue1[workIndex] = inValue1[index1];
|
||||
|
||||
//const PxNodeIndex nodeIndex = reinterpret_cast<const PxNodeIndex&>(inValue[index]);
|
||||
|
||||
//printf("Copy 64 workIndex %i index %i value %i blockDim.x %i gridDim.x %i\n", workIndex, index, nodeIndex.index(), blockDim.x, gridDim.x);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
extern "C" __global__ void radixSortCopyBits2(const PxU64* inValue, PxU32* outValue, PxU32* rank, const PxU32 numKeys,
|
||||
const bool lowBit)
|
||||
{
|
||||
const PxU64 lowerMask = 0x00000000ffffffffull;
|
||||
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
if (globalThreadIndex < numKeys)
|
||||
{
|
||||
const PxU32 index = rank[globalThreadIndex];
|
||||
|
||||
//this is aggregate, nodeIndex should be 0xffffffff
|
||||
if (index == 0xffffffff)
|
||||
outValue[globalThreadIndex] = 0xffffffff;
|
||||
else
|
||||
outValue[globalThreadIndex] = lowBit ? PxU32(inValue[index] & lowerMask) : PxU32(inValue[index] >> 32ll);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" __global__ void radixSortCopy2(const PxU64* inValue, PxU64* outValue, PxU32* rank, const PxU32 numKeys)
|
||||
{
|
||||
|
||||
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
if(globalThreadIndex < numKeys)
|
||||
{
|
||||
const PxU32 index = rank[globalThreadIndex];
|
||||
const bool aggregate = (index == 0xffffffff);
|
||||
outValue[globalThreadIndex] = aggregate ? 0xffffffffffffffff : inValue[index];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
754
engine/third_party/physx/source/gpucommon/src/CUDA/reduction.cuh
vendored
Normal file
754
engine/third_party/physx/source/gpucommon/src/CUDA/reduction.cuh
vendored
Normal file
@@ -0,0 +1,754 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_REDUCTION_CUH__
|
||||
#define __CU_REDUCTION_CUH__
|
||||
|
||||
#include <float.h>
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxMath.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "assert.h"
|
||||
#include "utils.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Reduction operator: minimum of two floats. defaultValue() is the identity element
// (FLT_MAX) so non-participating inputs cannot affect the result.
struct MinOpFloat
{
	PX_CUDA_CALLABLE
	static float defaultValue()
	{
		return FLT_MAX;
	}

	PX_CUDA_CALLABLE
	static float op(float a, float b)
	{
		return fminf(a, b);
	}

	// Index-tracking variant: returns the smaller value and writes the winning
	// element's index to retIdx. Ties keep idxA (the first operand).
	PX_CUDA_CALLABLE
	static float op(unsigned int& retIdx, float a, unsigned int idxA, float b, unsigned int idxB)
	{
		if(b < a)
		{
			retIdx = idxB;
			return b;
		}
		else
		{
			retIdx = idxA;
			return a;
		}
	}
};

// Reduction operator: maximum of two floats; identity element is -FLT_MAX.
struct MaxOpFloat
{
	PX_CUDA_CALLABLE
	static inline float defaultValue()
	{
		return -FLT_MAX;
	}

	PX_CUDA_CALLABLE
	static inline float op(float a, float b)
	{
		return fmaxf(a, b);
	}

	// Index-tracking variant: ties keep idxA (the first operand).
	PX_CUDA_CALLABLE
	static float op(unsigned int& retIdx, float a, unsigned int idxA, float b, unsigned int idxB)
	{
		if(b > a)
		{
			retIdx = idxB;
			return b;
		}
		else
		{
			retIdx = idxA;
			return a;
		}
	}
};
|
||||
|
||||
// Reduction operator: maximum of two PxU32 values.
struct MaxOpPxU32
{
	// Identity element for unsigned max. Added for consistency with the other
	// reduction ops so this struct also satisfies code that calls OP::defaultValue().
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0;
	}

	PX_CUDA_CALLABLE
	static inline PxU32 op(const PxU32 a, const PxU32 b)
	{
		return max(a, b);
	}
};
|
||||
|
||||
// Reduction operator: bitwise AND over PxU32; identity element is all-ones.
struct AndOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0xFFffFFff;
	}

	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return a & b;
	}
};

// Reduction operator: sum over PxU32; identity element is zero.
struct AddOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0ul;
	}

	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return a + b;
	}
};
|
||||
|
||||
// Reduction operator: sum over PxReal; identity element is zero.
struct AddOpPxReal
{
	PX_CUDA_CALLABLE
	static PxReal defaultValue()
	{
		// Was '0ul' (unsigned long) — use a float literal so the PxReal identity
		// does not rely on an implicit integer-to-float conversion.
		return 0.0f;
	}

	PX_CUDA_CALLABLE
	static PxReal op(PxReal a, PxReal b)
	{
		return a + b;
	}
};
|
||||
|
||||
// Reduction operator: sum over PxI32; identity element is zero.
struct AddOpPxI32
{
	PX_CUDA_CALLABLE
	static PxI32 defaultValue()
	{
		return 0l;
	}

	PX_CUDA_CALLABLE
	static PxI32 op(PxI32 a, PxI32 b)
	{
		return a + b;
	}
};

// Reduction operator: bitwise OR over PxU32; identity element is zero.
struct OrOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0;
	}

	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return a | b;
	}
};
|
||||
|
||||
//This isn't a runsum. It will produce the sum and splat the result to all the active threads.
// Butterfly (XOR-shuffle) reduction over groups of 2^log2threadGroupSize lanes: after
// log2threadGroupSize rounds, every participating lane holds OP::op over all inputs
// of its group. syncMask must name exactly the lanes that execute this call.
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpReduction(const PxU32 syncMask, T input)
{
	const PxU32 threadGroupSize = (1U << log2threadGroupSize);

#pragma unroll
	for(PxU32 reductionRadius = threadGroupSize >> 1; reductionRadius > 0; reductionRadius >>= 1)
	{
		T val = __shfl_xor_sync(syncMask, input, reductionRadius, threadGroupSize);
		input = OP::op(input, val);
	}

	return input;
}

//This isn't a runsum. It will produce the sum and splat the result to all the active threads.
// Full-warp convenience wrapper (group size = WARP_SIZE).
template<typename OP, typename T>
__device__ static inline T warpReduction(const PxU32 syncMask, T input)
{
	return warpReduction<OP, T, LOG2_WARP_SIZE>(syncMask, input);
}

//makes sense only for comparison operations that don't alter the value. Expect -1 if the op is altering inputs
// In addition to the reduced value, reports the lowest-indexed lane whose input
// equals the reduced result (ties resolved to the lowest lane via the ballot).
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpReduction(const PxU32 syncMask, const T& input, PxU32& winnerLaneIndex)
{
	T best = warpReduction<OP, T, log2threadGroupSize>(syncMask, input);
	winnerLaneIndex = lowestSetIndex(__ballot_sync(syncMask, best == input));

	return best;
}

// Full-warp convenience wrapper for the winner-reporting variant.
template<typename OP, typename T>
__device__ static inline T warpReduction(const PxU32 syncMask, const T& input, PxU32& winnerLaneIndex)
{
	return warpReduction<OP, T, LOG2_WARP_SIZE>(syncMask, input, winnerLaneIndex);
}
|
||||
|
||||
// Block-wide reduction built from warp reductions: each warp reduces its lanes,
// lane 0 of each warp publishes its partial to shared memory, then warp 0 reduces
// the partials. Only warp 0 returns the true block result; every other warp
// returns initialValue. sharedMemoryOneEntryPerWarp must hold blockSize/WARP_SIZE
// entries, and blockSize is assumed to be a multiple of WARP_SIZE.
// NOTE(review): there is no __syncthreads() after the final read — callers that
// reuse the shared buffer immediately must synchronize first; confirm at call sites.
template<typename OP, typename T>
__device__ static inline T blockReduction(const PxU32 syncMask, const T& input, const T& initialValue, const PxU32 blockSize, volatile T* sharedMemoryOneEntryPerWarp)
{
	const PxU32 numWarpsPerBlock = blockSize / WARP_SIZE;

	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	T warpResult = warpReduction<OP, T>(syncMask, input);

	// One partial per warp, published by the warp's first lane.
	if (threadIndexInWarp == 0)
	{
		sharedMemoryOneEntryPerWarp[warpIndex] = warpResult;
	}
	__syncthreads();

	// Lanes beyond the number of warps contribute the neutral initialValue.
	T val = (threadIdx.x < numWarpsPerBlock) ? sharedMemoryOneEntryPerWarp[threadIndexInWarp] : initialValue;

	if (warpIndex == 0)
		return warpReduction<OP, T>(syncMask, val);
	else
		return initialValue;
}

// Convenience overload with blockSize as a compile-time constant: allocates the
// per-warp partials in statically sized shared memory (one buffer per template
// instantiation) and forwards to the runtime-sized version above.
template<typename OP, typename T, const PxU32 blockSize>
__device__ static inline T blockReduction(const PxU32 syncMask, const T& input, const T& initialValue)
{
	const PxU32 numWarpsPerBlock = blockSize / WARP_SIZE;

	volatile __shared__ T sData[numWarpsPerBlock];

	return blockReduction<OP, T>(syncMask, input, initialValue, blockSize, sData);
}
|
||||
|
||||
|
||||
//inclusive scan
// Hillis-Steele inclusive prefix scan within groups of 2^log2threadGroupSize lanes:
// after the loop, lane i holds OP::op over the inputs of lanes 0..i of its group.
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpScan(const PxU32 syncMask, T input)
{
	const PxU32 threadGroupSize = (1U << log2threadGroupSize);
	const PxU32 idxInGroup = threadIdx.x & (threadGroupSize-1);

#pragma unroll
	for(PxU32 reductionRadius = 1; reductionRadius < threadGroupSize; reductionRadius <<= 1)
	{
		T val = __shfl_up_sync(syncMask, input, reductionRadius, threadGroupSize);

		// Lanes below the shift distance have no source lane; keep their value unchanged.
		if (idxInGroup >= reductionRadius)
			input = OP::op(input, val);
	}

	return input;
}

//inclusive scan
// Full-warp convenience wrapper (group size = WARP_SIZE).
template<typename OP, typename T>
__device__ static inline T warpScan(const PxU32 syncMask, T input)
{
	return warpScan<OP, T, LOG2_WARP_SIZE>(syncMask, input);
}
|
||||
|
||||
|
||||
//exclusive scan
// Full-warp exclusive prefix scan: lane i receives OP::op over the inputs of lanes
// 0..i-1, and lane 0 receives OP::defaultValue(). Uses FULL_MASK, so the entire
// warp must be active when calling this.
template<typename OP, typename T>
__device__ static inline T warpScanExclusive(T input)
{
	T output = OP::defaultValue();

	const PxU32 idxInGroup = threadIdx.x & (WARP_SIZE - 1);

#pragma unroll
	for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
	{
		T val = __shfl_up_sync(FULL_MASK, input, reductionRadius);

		if (idxInGroup >= reductionRadius)
		{
			// 'input' tracks the inclusive scan (needed as the shuffle source);
			// 'output' accumulates the same terms minus the lane's own input.
			input = OP::op(input, val);
			output = OP::op(output, val);
		}
	}

	return output;
}
|
||||
|
||||
template<typename T>
|
||||
class ReadArrayFunctor
|
||||
{
|
||||
public:
|
||||
PX_CUDA_CALLABLE
|
||||
ReadArrayFunctor(const T* PX_RESTRICT arr): mArr(arr) {}
|
||||
|
||||
PX_CUDA_CALLABLE
|
||||
T operator()(PxU32 idx) const {return mArr[idx];}
|
||||
protected:
|
||||
const T* mArr;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class WriteArrayFunctor
|
||||
{
|
||||
public:
|
||||
PX_CUDA_CALLABLE
|
||||
WriteArrayFunctor(T* PX_RESTRICT arr): mArr(arr) {}
|
||||
|
||||
PX_CUDA_CALLABLE
|
||||
void operator()(PxU32 idx, const T& val) {mArr[idx] = val;}
|
||||
protected:
|
||||
T* mArr;
|
||||
};
|
||||
|
||||
// Write accessor for a single value at a fixed device address,
// e.g. used to record the grand total of a scan.
template<typename T>
class WriteValueFunctor
{
public:
	PX_CUDA_CALLABLE
	WriteValueFunctor(T* PX_RESTRICT addr): mAddr(addr) {}

	PX_CUDA_CALLABLE
	void operator()(const T& val)
	{
		*mAddr = val;
	}

protected:
	T* mAddr;
};
|
||||
|
||||
// No-op sink functor: discards the value written to it.
// Used in place of WriteValueFunctor when the grand total is not needed.
template<typename T>
class WriteValueNOPFunctor
{
public:
	PX_CUDA_CALLABLE
	WriteValueNOPFunctor() {}

	PX_CUDA_CALLABLE
	void operator()(const T&) {}
};
|
||||
|
||||
// Pass 1 of 2 of a device-wide exclusive scan with operator OP.
// Each CUDA block processes a contiguous range of blockSize-sized tiles
// (ceil(totalCount/blockSize) tiles split evenly across gridDim.x blocks),
// writes per-element exclusive partials through setTempF, and exports its
// block-local running total into crossBlockTotalAccumulator[blockIdx.x]
// for scanKernel2of2 to combine.
// Expects a 2D thread block of WARP_SIZE x (blockSize/WARP_SIZE) threads
// (threadIdx.x = lane in warp, threadIdx.y = warp index in block).
template<PxU32 blockSize, PxU32 gridSize, typename OP, typename T, typename GetInputFunctor, typename SetTempFunctor>
static __device__ void scanKernel1of2(
	const GetInputFunctor& getInputF,
	SetTempFunctor& setTempF,
	const PxU32 totalCount,
	T* crossBlockTotalAccumulator)
{
	// One slot per warp for the warp totals, plus the block-wide running total.
	__shared__ T crossWarpAccumulator[blockSize >> LOG2_WARP_SIZE];
	__shared__ T accum;

	if(threadIdx.x == 0)
		accum = OP::defaultValue();

	__syncthreads();

	const PxU32 nbThreads = blockSize;

	// Tiles needed to cover the input, and tiles handled per CUDA block.
	const PxU32 nbBlocksRequired = (totalCount + (nbThreads-1))/nbThreads;
	const PxU32 nbBlocksPerBlock = (nbBlocksRequired + gridDim.x-1)/gridDim.x;

	const PxU32 blockStartIndex = blockIdx.x * nbBlocksPerBlock;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 warpIndexInBlock = threadIdx.y;

	for (PxU32 i = 0; i < nbBlocksPerBlock; ++i)
	{
		const PxU32 threadIndex = threadIdx.x + WARP_SIZE * threadIdx.y + (blockStartIndex + i) * blockDim.x * blockDim.y;

		// Out-of-range lanes contribute the identity so warp shuffles stay full.
		T val = OP::defaultValue();

		if (threadIndex < totalCount)
			val = getInputF(threadIndex);

		// Per-warp exclusive scan of this tile.
		T res = warpScanExclusive<OP, T>(val);

		// Last lane publishes the warp's inclusive total (exclusive + own value).
		if (threadIndexInWarp == (WARP_SIZE - 1))
			crossWarpAccumulator[warpIndexInBlock] = OP::op(res, val);

		// Snapshot the running block total before warp 0 updates it below.
		T prevAccum = accum;

		__syncthreads();

		// Warp 0 scans the warp totals into per-warp exclusive offsets.
		if (warpIndexInBlock == 0)
		{
			T val2 = OP::defaultValue();

			if (threadIndexInWarp < (blockSize >> LOG2_WARP_SIZE))
				val2 = crossWarpAccumulator[threadIndexInWarp];

			T res2 = warpScanExclusive<OP, T>(val2);

			if (threadIndexInWarp < (blockSize >> LOG2_WARP_SIZE))
				crossWarpAccumulator[threadIndexInWarp] = res2;

			// Fold this tile's grand total into the block's running total.
			if (threadIndexInWarp == ((blockSize >> LOG2_WARP_SIZE) - 1))
			{
				accum = OP::op(accum, OP::op(res2, val2));
			}
		}

		__syncthreads();

		// temp[i] = exclusive-in-warp + warp's exclusive offset + total of earlier tiles.
		if (threadIndex < totalCount)
			setTempF(threadIndex, OP::op(res, OP::op(crossWarpAccumulator[warpIndexInBlock], prevAccum)));
	}

	// One thread exports the block's grand total for pass 2.
	if ((threadIdx.y * WARP_SIZE + threadIdx.x) == ((blockSize >> LOG2_WARP_SIZE)-1))
	{
		crossBlockTotalAccumulator[blockIdx.x] = accum;
	}
}
|
||||
|
||||
// Pass 2 of 2 of the device-wide exclusive scan.
// A single warp scans the per-block totals produced by scanKernel1of2 to
// obtain each block's global base offset, which is then added to every temp
// value and written out via setOutF. Block 0 also emits the grand total
// through writeTotalF.
// gridSize must equal the launch's gridDim.x; the block totals are scanned
// by one warp, so gridSize must not exceed WARP_SIZE.
template<PxU32 gridSize, typename OP, typename T, typename GetTempFunctor, typename SetOutputFunctor,
	typename WriteGrandTotalFunctor>
static __device__ void scanKernel2of2(
	const GetTempFunctor& getTempF,
	SetOutputFunctor& setOutF,
	WriteGrandTotalFunctor& writeTotalF,
	const PxU32 totalCount,
	const T* crossBlockTotalAccumulator)
{
	const PxU32 nbThreads = blockDim.x * blockDim.y;

	// Same blocked tile distribution as pass 1.
	const PxU32 nbBlocksRequired = (totalCount + (nbThreads-1))/nbThreads;
	const PxU32 nbBlocksPerBlock = (nbBlocksRequired + gridDim.x-1)/gridDim.x;

	const PxU32 blockStartIndex = blockIdx.x * nbBlocksPerBlock;

	// Exclusive scan of the per-block totals, one entry per CUDA block.
	__shared__ T blockAccum[gridSize];

	const PxU32 threadIndexInWarp = threadIdx.x;

	// The first warp scans the block totals.
	if (threadIdx.y == 0)
	{
		T val = OP::defaultValue();

		if (threadIdx.x < gridSize)
			val = crossBlockTotalAccumulator[threadIndexInWarp];

		T res = warpScanExclusive<OP, T>(val);

		if (threadIdx.x < gridSize)
			blockAccum[threadIndexInWarp] = res;

		// Last entry: exclusive prefix + its own total = overall grand total.
		if (threadIdx.x == gridSize - 1 && blockIdx.x == 0)
		{
			writeTotalF(OP::op(val, res));
		}
	}

	__syncthreads();

	// This block's global base offset.
	T accumulation = blockAccum[blockIdx.x];

	for(PxU32 i = 0; i < nbBlocksPerBlock; ++i)
	{
		const PxU32 threadIndex = threadIdx.x + WARP_SIZE * threadIdx.y + (blockStartIndex + i) * blockDim.x * blockDim.y;

		if(threadIndex < totalCount)
		{
			T val = OP::op(getTempF(threadIndex), accumulation);
			setOutF(threadIndex, val);
		}
	}
}
|
||||
|
||||
//keeping this for the broadphase:

//This is the parallel version of sum.
// Warp-level scan over the first nbElems lanes of a warp: each
// participating lane returns its inclusive prefix sum minus 'value'
// (pass value == the lane's own contribution to obtain an exclusive scan).
// index/sData are unused, kept for signature compatibility with
// warpScanAddWriteToSharedMem. Lanes >= nbElems return 0.
template<PxU32 nbElems, typename T>
static __inline__ __device__ T warpScanAdd(const PxU32 syncMask, const PxU32 /*index*/, const PxU32 threadIndexInWarp, T* /*sData*/, const T originalValue, const T value)
{
	// Shuffle mask restricted to the lanes that actually participate.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);

	if(threadIndexInWarp < nbElems)
	{
		T val = originalValue;

#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			// Partial from lane (threadIndexInWarp - i); only consumed when
			// that source lane exists (guard below).
			const T temp = __shfl_sync(mask_local, val, threadIndexInWarp-i);

			if(threadIndexInWarp >= i)
				val += temp;
		}
		return val - value;
	}

	return 0;
}
|
||||
|
||||
// Warp-level inclusive max-scan over the first nbElems lanes of a warp:
// each participating lane returns the maximum of its own value and all
// lower participating lanes' values. index/sData are unused, kept for
// signature compatibility with the shared-memory variants.
// Lanes >= nbElems return 0.
template<PxU32 nbElems>
static __inline__ __device__ PxU32 warpScanMax(const PxU32 syncMask, const PxU32 /*index*/, const PxU32 threadIndexInWarp, PxU32* /*sData*/, const PxU32 originalValue)
{
	// Shuffle mask restricted to the lanes that actually participate.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);

	if(threadIndexInWarp < nbElems)
	{
		PxU32 val = originalValue;

		// Note: a precomputed lane mask previously declared here was never
		// used; removed as dead code (mask_local drives the shuffles).
#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			// Partial max from lane (threadIndexInWarp - i); only consumed
			// when that source lane exists (guard below).
			const PxU32 temp = __shfl_sync(mask_local, (int)val, threadIndexInWarp-i);

			if(threadIndexInWarp >= i)
				val = PxMax(temp, val);
		}
		return val;
	}

	return 0;
}
|
||||
|
||||
//This is the parallel version of exclusive sum. We have 32 thread in a warp, so that we need to
//have 2 exp 5 step(16) of add
// Like warpScanAdd (pass value == the lane's own contribution for an
// exclusive scan), but additionally stores each lane's result into
// sData[index] and issues a __syncwarp so participating lanes can safely
// read each other's entries afterwards. Lanes >= nbElems return 0 and do
// not write to shared memory.
template<PxU32 nbElems, typename T>
static __inline__ __device__ T warpScanAddWriteToSharedMem(PxU32 syncMask, PxU32 index, PxU32 threadIndexInWarp, T* sData, T originalValue, T value)
{
	// Shuffle mask restricted to the participating lanes.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);

	if(threadIndexInWarp < nbElems)
	{
		T temp = 0;
		T val = originalValue;

#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			temp = __shfl_sync(mask_local, val, threadIndexInWarp-i);

			if(threadIndexInWarp >= i)
				val += temp;
		}

		// Inclusive-to-exclusive: subtract the caller's own contribution.
		val -= value;
		sData[index] = val;
		__syncwarp(mask_local); //Mem fence for shared data write
		return val;
	}

	return 0;
}
|
||||
|
||||
// Counts how many threads in the warp contribute an element and broadcasts
// that count to every lane named in 'mask'.
PX_FORCE_INLINE __device__ PxU32 warpCountAndBroadcast(PxU32 mask, bool threadContributesElement)
{
	const PxU32 contributingLanes = __ballot_sync(mask, threadContributesElement);
	return __popc(contributingLanes);
}
|
||||
|
||||
// Counts the contributing threads across the whole thread block and returns
// the total to every thread. sharedMem must provide one PxU32 slot per warp.
// NOTE(review): the final warpReduction reads sharedMem[threadIndexInWarp]
// for all 32 lanes — this presumes sharedMem holds 32 valid entries (i.e. a
// 32-warp block, or unused slots pre-zeroed); confirm at the call sites.
PX_FORCE_INLINE __device__ PxU32 threadBlockCountAndBroadcast(bool threadContributesElement, PxU32* sharedMem)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;

	// Per-warp contribution count, known to every lane of the warp.
	PxU32 perWarpCount = warpCountAndBroadcast(FULL_MASK, threadContributesElement);

	if (threadIndexInWarp == 0)
		sharedMem[warpIndex] = perWarpCount;

	__syncthreads();

	// Every warp redundantly sums the per-warp counts.
	return warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sharedMem[threadIndexInWarp]);
}
|
||||
|
||||
// Performs an exclusive scan over a warp. Every thread can only contribute 0 or 1 to the scan through the mask
// Returns the number of set bits in 'mask' strictly below this thread's lane.
PX_FORCE_INLINE __device__ PxU32 warpScanExclusive(PxU32 mask, PxU32 threadIndexInWarp)
{
	// Use an unsigned literal: for lane 31, (1 << 31) overflows a signed int
	// (undefined behavior before C++20), whereas (1u << 31) is well defined.
	return __popc(mask & ((1u << threadIndexInWarp) - 1));
}
|
||||
|
||||
//Performs an exclusive scan over a warp. Every thread can only contribute 0 or 1 to the sum. The return value will be the exclusive cumulative sum for every thread. totalSum will provide the total number of contributed elements.
//perWarpCountS must be shared memory with NbWarps entries available
// Note: despite the first line above, the scan spans the whole thread block
// (up to NbWarps warps, NbWarps <= 32); warp-level pieces are combined
// through shared memory. Must be called by full warps only.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(bool threadContributesElement, PxU32& totalSum, PxU32* perWarpCountS)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;

	// One bit per contributing lane in this warp.
	PxU32 ballotMask = __ballot_sync(FULL_MASK, threadContributesElement);

	PxU32 nbInteresting = __popc(ballotMask); //The number of elements emitted per warp

	if (threadIndexInWarp == 0)
		perWarpCountS[warpIndex] = nbInteresting;

	__syncthreads();

	// Every warp redundantly scans the per-warp counts (lanes >= NbWarps add 0).
	PxU32 warpCount = threadIndexInWarp < NbWarps ? perWarpCountS[threadIndexInWarp] : 0;

	PxU32 total = warpScan<AddOpPxU32, PxU32>(FULL_MASK, warpCount); //The inclusive cumulative sum per warp. The last warp has the total number of elements created by the full thread block

	PxU32 carryExclusiveScan = __shfl_sync(FULL_MASK, total, warpIndex) - perWarpCountS[warpIndex]; //Broadcast the exclusive cumulative base sum (inclusiveSum - perWarpCount = exclusiveSum)

	totalSum = __shfl_sync(FULL_MASK, total, 31); //Broadcast the total sum of the last warp (which is the overall total sum) to all warps

	return carryExclusiveScan + warpScanExclusive(ballotMask, threadIndexInWarp); //Combine base sum and the sum per warp
}
|
||||
|
||||
// Convenience overload that owns the per-warp scratch buffer in shared memory.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(bool threadContributesElement, PxU32& totalSum)
{
	__shared__ PxU32 perWarpCountS[NbWarps];
	const PxU32 exclusivePrefix = threadBlockScanExclusive<NbWarps>(threadContributesElement, totalSum, perWarpCountS);
	return exclusivePrefix;
}
|
||||
|
||||
//Performs an exclusive scan over a warp. Every thread can contribute an arbitrary value to the sum. The return value will be the exclusive cumulative sum for every thread. totalSum will provide the total number of contributed elements.
//perWarpCountS must be shared memory with NbWarps entries available
// Note: despite the first line above, the scan spans the whole thread block
// (up to NbWarps warps, NbWarps <= 32); warp partials are combined through
// shared memory. Must be called by full warps only.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(PxU32 numElementsFromCallingThread, PxU32& totalSum, PxU32* perWarpCountS)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;

	// Inclusive scan of this warp's contributions.
	PxU32 perWarpInclusiveScan = warpScan<AddOpPxU32, PxU32>(FULL_MASK, numElementsFromCallingThread);

	// Lane 31 holds the warp's total.
	if (threadIndexInWarp == 31)
		perWarpCountS[warpIndex] = perWarpInclusiveScan;

	__syncthreads();

	// Every warp redundantly scans the per-warp totals (lanes >= NbWarps add 0).
	PxU32 warpCount = threadIndexInWarp < NbWarps ? perWarpCountS[threadIndexInWarp] : 0;

	PxU32 total = warpScan<AddOpPxU32, PxU32>(FULL_MASK, warpCount); //The inclusive cumulative sum per warp. The last warp has the total number of elements created by the full thread block

	PxU32 carryExclusiveScan = __shfl_sync(FULL_MASK, total, warpIndex) - perWarpCountS[warpIndex]; //Broadcast the exclusive cumulative base sum (inclusiveSum - perWarpCount = exclusiveSum)

	totalSum = __shfl_sync(FULL_MASK, total, 31); //Broadcast the total sum of the last warp (which is the overall total sum) to all warps

	return carryExclusiveScan + perWarpInclusiveScan - numElementsFromCallingThread; //Combine base sum and the sum per warp
}
|
||||
|
||||
// Convenience overload that owns the per-warp scratch buffer in shared memory.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(PxU32 numElementsFromCallingThread, PxU32& totalSum)
{
	__shared__ PxU32 perWarpCountS[NbWarps];
	const PxU32 exclusivePrefix = threadBlockScanExclusive<NbWarps>(numElementsFromCallingThread, totalSum, perWarpCountS);
	return exclusivePrefix;
}
|
||||
|
||||
//Allows to get indices in an output array for every thread (even accross thread blocks) where every thread either emits an element or not.
//Only threads that contribute an element will get a valid index returned which is expected since all other threads don't output anything.
//The order of the ouput indices is not deterministic since atomic add is used
//The method must be called by one or preferrably multiple full warps, for a single warp see optimized version below
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 globalScanExclusive(bool threadContributesElement, PxU32* atomicCounter)
{
	// Barrier before reusing the shared scratch declared below.
	__syncthreads();

	__shared__ PxU32 perWarpCountS[NbWarps];
	PxU32 threadBlockTotalSum;
	// Exclusive index within this thread block, plus the block's total count.
	PxU32 indexInThreadBlock = threadBlockScanExclusive<NbWarps>(threadContributesElement, threadBlockTotalSum, perWarpCountS);

	__shared__ PxU32 globalOffset;
	if (threadIdx.x == 0)
	{
		//Only one thread per thread block needs to perform an atomic add
		globalOffset = atomicAdd(atomicCounter, threadBlockTotalSum);
	}
	__syncthreads();
	return indexInThreadBlock + globalOffset;
}
|
||||
|
||||
//Optimized version where a thread block only consist out of one warp. Does not need shared memory.
//Allows to get indices in an output array for every thread (even accross thread blocks) where every thread either emits an element or not.
//Only threads that contribute an element will get a valid index returned which is expected since all other threads don't output anything.
//The order of the ouput indices is not deterministic since atomic add is used
//The method must be called by one full warp
PX_FORCE_INLINE __device__ PxU32 globalScanExclusiveSingleWarp(bool threadContributesElement, PxU32* atomicCounter)
{
	PxU32 idxInWarp = threadIdx.x & 31;

	// One bit per contributing lane.
	const PxU32 resultWarp = __ballot_sync(FULL_MASK, threadContributesElement);
	const PxU32 offset = warpScanExclusive(resultWarp, idxInWarp); // __popc(resultWarp & ((1 << idxInWarp) - 1));
	const PxU32 validCount = __popc(resultWarp);

	// Sentinel for the nothing-contributed case; only lane 0 may overwrite it.
	PxU32 startIndex = 0xFFffFFff - 32; // -32 to prevent wrap-around
	if (idxInWarp == 0 && validCount > 0)
	{
		// Single atomic reserves a contiguous range for the whole warp.
		startIndex = atomicAdd(atomicCounter, validCount);
	}
	// Broadcast lane 0's start index (or the sentinel) and add each lane's offset.
	return __shfl_sync(FULL_MASK, startIndex, 0) + offset;
}
|
||||
|
||||
//Overrides "globalScanExclusiveSingleWarp" to support adding multiple elements per thread, rather than limiting to a single element.
// Must be called by one full warp; returns each lane's exclusive start index
// into the globally reserved range.
PX_FORCE_INLINE __device__ PxU32 globalScanExclusiveSingleWarp(PxU32 numElements, PxU32* atomicCounter)
{
	const PxU32 idxInWarp = threadIdx.x & 31;
	// Exclusive prefix sum of numElements across the warp.
	const PxU32 offset = warpScanExclusive<AddOpPxU32, PxU32>(numElements);

	// Sentinel for the nothing-contributed case; only lane 31 may overwrite it.
	PxU32 startIndex = 0xFFffFFff - 32; // -32 to prevent wrap-around
	if(idxInWarp == 31)
	{
		PxU32 totalSum = offset + numElements; // When executed by the last thread in the warp, this is the total sum of all
		// numElements over the whole warp
		if(totalSum > 0)
			startIndex = atomicAdd(atomicCounter, totalSum);
	}
	// Broadcast lane 31's start index (or the sentinel) and add each lane's offset.
	return __shfl_sync(FULL_MASK, startIndex, 31) + offset;
}
|
||||
|
||||
// returns the largest index into the (sorted!) data array s.t. data[index] <= value
// if there is no such index (i.e., data[0] > value) returns 0
template<class T>
static __device__ PxU32 binarySearch(const T* PX_RESTRICT data, const PxU32 numElements, const T& value)
{
	PxU32 lo = 0;
	PxU32 hi = numElements;

	// Invariant: every element below 'lo' is <= value,
	// every element at or beyond 'hi' is > value.
	while(lo < hi)
	{
		const PxU32 mid = (lo + hi) / 2;

		if(data[mid] <= value)
			lo = mid + 1;
		else
			hi = mid;
	}

	// 'lo' is now the count of elements <= value; report the last of them,
	// or 0 when there are none.
	return lo ? lo - 1 : 0;
}
|
||||
|
||||
#endif //CU_REDUCTION_CU
|
||||
60
engine/third_party/physx/source/gpucommon/src/CUDA/sbMidphaseScratch.cuh
vendored
Normal file
60
engine/third_party/physx/source/gpucommon/src/CUDA/sbMidphaseScratch.cuh
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#ifndef __CU_SB_MIDPHASESCRATCH_CUH__
|
||||
#define __CU_SB_MIDPHASESCRATCH_CUH__
|
||||
|
||||
#include "vector_types.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
namespace Gu
|
||||
{
|
||||
struct BV32DataDepthInfo;
|
||||
struct BV32DataPacked;
|
||||
};
|
||||
};
|
||||
|
||||
// Scratch layout used by the softbody midphase BV32 traversal.
// Presumably instantiated in shared memory by the traversal kernels —
// the compile-time assert below bounds it to a WARP_SIZE * 7 * sizeof(PxU32)
// budget (TODO confirm against the kernels that allocate it).
struct sbMidphaseScratch
{
	const float4 * PX_RESTRICT tetmeshVerts;		// tet mesh vertex positions
	const uint4 * PX_RESTRICT tetmeshTetIndices;	// four vertex indices per tetrahedron
	const PxU8* PX_RESTRICT tetmeshSurfaceHint;		// per-tet surface hint flags

	const Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	const PxU32* PX_RESTRICT bv32RemapPackedNodeIndex;
	//bv32 tree
	Gu::BV32DataPacked* bv32PackedNodes;

	//stack for traversal
	int sBv32Nodes[192]; //6 depth of the bv32 tree
};
PX_COMPILE_TIME_ASSERT(sizeof(sbMidphaseScratch) <= WARP_SIZE * 7 * sizeof(PxU32));
|
||||
|
||||
#endif
|
||||
110
engine/third_party/physx/source/gpucommon/src/CUDA/shuffle.cuh
vendored
Normal file
110
engine/third_party/physx/source/gpucommon/src/CUDA/shuffle.cuh
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_SHUFFLE_CUH__
|
||||
#define __CU_SHUFFLE_CUH__
|
||||
|
||||
#include "cuda.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
//#include "nputils.cuh"
|
||||
|
||||
// Broadcasts the PxVec3 held by lane 'i' (within a shuffle group of 'width'
// lanes) to all lanes named in syncMask, component by component.
static __device__ __forceinline__
physx::PxVec3 shuffle(const physx::PxU32 syncMask, const physx::PxVec3& v, int i, physx::PxU32 width = WARP_SIZE)
{
	const float x = __shfl_sync(syncMask, v.x, i, width);
	const float y = __shfl_sync(syncMask, v.y, i, width);
	const float z = __shfl_sync(syncMask, v.z, i, width);
	return physx::PxVec3(x, y, z);
}
|
||||
|
||||
// Broadcasts the float4 held by 'lane' to all lanes named in syncMask,
// component by component.
static __device__ __forceinline__
float4 shuffle(const physx::PxU32 syncMask, const float4& v, const int lane)
{
	const float x = __shfl_sync(syncMask, v.x, lane);
	const float y = __shfl_sync(syncMask, v.y, lane);
	const float z = __shfl_sync(syncMask, v.z, lane);
	const float w = __shfl_sync(syncMask, v.w, lane);
	return make_float4(x, y, z, w);
}
|
||||
|
||||
// Warp-wide componentwise minimum via an XOR butterfly; all 32 lanes must
// participate (FULL_MASK). After the loop every lane holds the same result.
static __device__ __forceinline__
physx::PxVec3 warpShuffleMin(physx::PxVec3 v)
{
	for (physx::PxU32 lanePairDistance = 1; lanePairDistance < WARP_SIZE; lanePairDistance <<= 1)
	{
		const float ox = __shfl_xor_sync(FULL_MASK, v.x, lanePairDistance);
		const float oy = __shfl_xor_sync(FULL_MASK, v.y, lanePairDistance);
		const float oz = __shfl_xor_sync(FULL_MASK, v.z, lanePairDistance);
		v.x = fminf(v.x, ox);
		v.y = fminf(v.y, oy);
		v.z = fminf(v.z, oz);
	}

	return v;
}
|
||||
|
||||
// Warp-wide componentwise maximum via an XOR butterfly; all 32 lanes must
// participate (FULL_MASK). After the loop every lane holds the same result.
static __device__ __forceinline__
physx::PxVec3 warpShuffleMax(physx::PxVec3 v)
{
	for (physx::PxU32 lanePairDistance = 1; lanePairDistance < WARP_SIZE; lanePairDistance <<= 1)
	{
		const float ox = __shfl_xor_sync(FULL_MASK, v.x, lanePairDistance);
		const float oy = __shfl_xor_sync(FULL_MASK, v.y, lanePairDistance);
		const float oz = __shfl_xor_sync(FULL_MASK, v.z, lanePairDistance);
		v.x = fmaxf(v.x, ox);
		v.y = fmaxf(v.y, oy);
		v.z = fmaxf(v.z, oz);
	}

	return v;
}
|
||||
|
||||
//// experimentally, seems more register-efficient to coalesce this
|
||||
//static __device__ __forceinline__
|
||||
//physx::PxReal shuffleDot(const physx::PxU32 syncMask, const physx::PxVec3& v0, int shuffle0, const physx::PxVec3& v1)
|
||||
//{
|
||||
// return __shfl_sync(syncMask, v0.x, shuffle0)*v1.x + __shfl_sync(syncMask, v0.y, shuffle0)*v1.y + __shfl_sync(syncMask, v0.z, shuffle0)*v1.z;
|
||||
//}
|
||||
//
|
||||
//static __device__ __forceinline__
|
||||
//physx::PxU32 maxIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& maxV)
|
||||
//{
|
||||
// maxV = mask & (1 << threadIdx.x) ? v : -FLT_MAX;
|
||||
//
|
||||
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 16));
|
||||
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 8));
|
||||
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 4));
|
||||
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 2));
|
||||
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 1));
|
||||
//
|
||||
// return lowestSetIndex(__ballot_sync(FULL_MASK, maxV == v)&mask);
|
||||
//}
|
||||
//
|
||||
//static __device__ __forceinline__
|
||||
//physx::PxU32 minIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& minV)
|
||||
//{
|
||||
// minV = mask & (1 << threadIdx.x) ? v : FLT_MAX;
|
||||
//
|
||||
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 16));
|
||||
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 8));
|
||||
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 4));
|
||||
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 2));
|
||||
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 1));
|
||||
//
|
||||
// return lowestSetIndex(__ballot_sync(FULL_MASK, minV == v)&mask);
|
||||
//}
|
||||
|
||||
|
||||
#endif //SHUFFLE_CUH
|
||||
98
engine/third_party/physx/source/gpucommon/src/CUDA/solverResidual.cuh
vendored
Normal file
98
engine/third_party/physx/source/gpucommon/src/CUDA/solverResidual.cuh
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#ifndef __CU_SOLVER_ERROR_CUH__
|
||||
#define __CU_SOLVER_ERROR_CUH__
|
||||
|
||||
#include "DyResidualAccumulator.h"
|
||||
#include "atomic.cuh"
|
||||
#include "reduction.cuh"
|
||||
|
||||
// Per-thread accumulator for solver residual statistics (sum of squared
// residuals, sample count, max absolute residual), with helpers to fold the
// local totals into a global Dy::ErrorAccumulator.
struct PxgErrorAccumulator
{
	PxReal sumOfSquares;	// running sum of squared residuals
	PxU32 counter;			// number of accumulated samples
	PxReal maxError;		// largest absolute residual seen so far

	PX_FORCE_INLINE __device__ PxgErrorAccumulator() : sumOfSquares(0.0f), counter(0), maxError(0.0f)
	{ }

	// Provides a calculateResidual function using fast GPU math instructions
	// (__fdividef trades a few ulps for speed); returns 0 when the
	// multiplier is 0 to avoid a division by zero.
	static PX_FORCE_INLINE __device__ PxReal calculateResidual(PxReal deltaF, PxReal velocityMultiplier)
	{
		return velocityMultiplier == 0.0f ? 0.0f : __fdividef(deltaF, velocityMultiplier);
	}

	// Folds one (deltaF, velocityMultiplier) sample into the local totals.
	PX_FORCE_INLINE __device__ void accumulateErrorLocal(PxReal deltaF, PxReal velocityMultiplier)
	{
		PxReal e = PxgErrorAccumulator::calculateResidual(deltaF, velocityMultiplier);
		sumOfSquares += e * e;
		++counter;
		maxError = PxMax(maxError, PxAbs(e));
	}

	// Convenience overload that accumulates two samples at once.
	PX_FORCE_INLINE __device__ void accumulateErrorLocal(PxReal deltaF0, PxReal deltaF1,
		PxReal velocityMultiplier0, PxReal velocityMultiplier1)
	{
		accumulateErrorLocal(deltaF0, velocityMultiplier0);
		accumulateErrorLocal(deltaF1, velocityMultiplier1);
	}

	/*PX_FORCE_INLINE __device__ void accumulateErrorGlobal(Dy::ErrorAccumulator& globalAccumulator)
	{
		atomicAdd(&globalAccumulator.mErrorSumOfSquares, sumOfSquares);
		atomicAdd(&globalAccumulator.mCounter, counter);
		if (maxError > globalAccumulator.mMaxError)
			AtomicMax(&globalAccumulator.mMaxError, maxError);
	}*/

	// Merges the local totals into the global accumulator without atomics —
	// the caller must guarantee no concurrent writers.
	PX_FORCE_INLINE __device__ void accumulateErrorGlobalNoAtomics(Dy::ErrorAccumulator& globalAccumulator)
	{
		globalAccumulator.mErrorSumOfSquares += sumOfSquares;
		globalAccumulator.mCounter += counter;
		if (maxError > globalAccumulator.mMaxError)
			globalAccumulator.mMaxError = maxError;
	}

	// Warp-cooperative merge: all 32 lanes reduce their local totals, then
	// lane 0 publishes the warp's result with atomics.
	PX_FORCE_INLINE __device__ void accumulateErrorGlobalFullWarp(Dy::ErrorAccumulator& globalAccumulator, PxU32 threadIndexInWarp)
	{
		PxReal s = warpReduction<AddOpPxReal, PxReal>(FULL_MASK, sumOfSquares);
		PxU32 count = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, counter);
		PxReal maxErr = warpReduction<MaxOpFloat, PxReal>(FULL_MASK, maxError);
		if (threadIndexInWarp == 0)
		{
			atomicAdd(&globalAccumulator.mErrorSumOfSquares, s);
			atomicAdd(&globalAccumulator.mCounter, count);
			// Cheap pre-check avoids the atomic when the max cannot change;
			// AtomicMax resolves any race correctly.
			if (maxErr > globalAccumulator.mMaxError)
				AtomicMax(&globalAccumulator.mMaxError, maxErr);
		}
	}
};
|
||||
|
||||
#endif
|
||||
175
engine/third_party/physx/source/gpucommon/src/CUDA/updateCacheAndBound.cuh
vendored
Normal file
175
engine/third_party/physx/source/gpucommon/src/CUDA/updateCacheAndBound.cuh
vendored
Normal file
@@ -0,0 +1,175 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_UPDATE_CACHE_AND_BOUND_CUH__
|
||||
#define __CU_UPDATE_CACHE_AND_BOUND_CUH__
|
||||
|
||||
#include "foundation/PxTransform.h"
|
||||
#include "PxsTransformCache.h"
|
||||
#include "PxgConvexConvexShape.h"
|
||||
#include "geometry/PxGeometry.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
|
||||
// Computes the shape's world-space pose. shape2Actor is first re-expressed in
// the body frame (body2Actor^-1 * shape2Actor), then carried into world space
// by body2World.
static __device__ PxTransform getAbsPose(const PxTransform& body2World, const PxTransform& shape2Actor, const PxTransform& body2Actor)
{
	return body2World.transform(body2Actor.transformInv(shape2Actor));
}
|
||||
|
||||
// Writes one entry of the transform cache: the world-space transform plus the
// associated flag bits, at slot 'index'.
static __device__ void setTransformCache(PxsCachedTransform* cacheArray, const PxTransform& transform, const PxU32 flags, const PxU32 index)
{
	PxsCachedTransform& entry = cacheArray[index];
	entry.transform = transform;
	entry.flags = flags;
}
|
||||
|
||||
// Half-extent of the axis-aligned box that encloses an oriented box with local
// half-extent 'extent' and orientation 'basis'. Per world axis, the maximum
// reach is the sum of the absolute components of the scaled basis vectors.
static __device__ PxVec3 basisExtent(const PxMat33& basis, const PxVec3& extent)
{
	// Basis vectors scaled by the corresponding half-extent.
	const PxVec3 ex = basis.column0 * extent.x;
	const PxVec3 ey = basis.column1 * extent.y;
	const PxVec3 ez = basis.column2 * extent.z;

	// Max distance per component = sum of abs() over the three vectors.
	return PxVec3(
		PxAbs(ex.x) + PxAbs(ey.x) + PxAbs(ez.x),
		PxAbs(ex.y) + PxAbs(ey.y) + PxAbs(ez.y),
		PxAbs(ex.z) + PxAbs(ey.z) + PxAbs(ez.z));
}
|
||||
|
||||
|
||||
// Recomputes the world-space AABB for shape 'index' from its local bounds and
// the given world pose, using a geometry-type-specific fast path where one
// exists. 'convexShapes' is only dereferenced for the eCONVEXMESH case.
static __device__ void updateBounds(const PxgShapeSim& shapeSim, const PxgShape* convexShapes, PxBounds3* boundsArray, const PxTransform& pose, const PxU32 index)
{

	const PxBounds3& localBound = shapeSim.mLocalBounds;
	PxBounds3& updatedBound = boundsArray[index];
	switch (shapeSim.mShapeType)
	{
	case PxGeometryType::eSPHERE:
	{
		// A sphere's bounds are rotation-invariant: just translate the local box.
		updatedBound.minimum = pose.p + localBound.minimum;
		updatedBound.maximum = pose.p + localBound.maximum;
	}
	break;

	case PxGeometryType::eCAPSULE:
	{
		// Local bounds encode the capsule: radius in maximum.y, and
		// (halfHeight + radius) in maximum.x along the local x axis.
		const PxF32 radius = localBound.maximum.y;
		const PxF32 halfHeight = localBound.maximum.x - radius;
		// Capsule axis in world space (local x axis of the pose).
		const PxVec3 d = pose.q.getBasisVector0();
		PxVec3 extents;
		// Per world axis: project the segment half-length onto the axis, pad by radius.
		for (PxU32 ax = 0; ax < 3; ax++)
			extents[ax] = PxAbs(d[ax]) * halfHeight + radius;
		updatedBound.minimum = pose.p - extents;
		updatedBound.maximum = pose.p + extents;
	}
	break;

	case PxGeometryType::eBOX:
	{
		// Local bounds are symmetric, so maximum holds the half-extents.
		const PxVec3 halfExtents = localBound.maximum;
		const PxVec3 extents = basisExtent(PxMat33(pose.q), halfExtents);
		updatedBound.minimum = pose.p - extents;
		updatedBound.maximum = pose.p + extents;
	}
	break;

	case PxGeometryType::eCONVEXMESH:
	{
		const PxU32 hullIndex = shapeSim.mHullDataIndex;

		if (hullIndex != 0xFFffFFff)
		{
			// Exact bounds: rotate (and scale) every hull vertex and take min/max.
			const PxgShape& shape = convexShapes[hullIndex];
			PxMat33 rot(pose.q);
			if (!shape.scale.isIdentity())
				rot = rot * shape.scale.toMat33();

			// Packed GPU hull layout: header data followed by the vertex array.
			// The second uint4 carries the vertex count in the high 16 bits of
			// the low 16-bit pair of tmp.x; vertices start after one uint4 and
			// two float4 header entries. NOTE(review): layout assumed from this
			// pointer arithmetic — must match the GPU convex-hull serializer.
			const PxU8* convexPtr = (PxU8*)shape.hullOrMeshPtr;
			const uint4 tmp = *(((uint4*)convexPtr) + 1);
			const float4* pVertices = reinterpret_cast<const float4*>(convexPtr + sizeof(uint4) + sizeof(float4) + sizeof(float4));

			//const PxU32 polyData0_NbEdgesNbHullVerticesNbPolygons = tmp.x;

			const PxU32 nbHullVertices = u16High(u32Low(tmp.x));//getNbHullVertices(polyData0_NbEdgesNbHullVerticesNbPolygons);

			//PxU32 nb = shape.hullData->mNbHullVertices;
			//const PxVec3* v = shape.hullData->getHullVertices();
			PxVec3 minV = PxVec3(PX_MAX_F32);
			PxVec3 maxV = PxVec3(-PX_MAX_F32);

			for (PxU32 i = 0; i < nbHullVertices; ++i)
			{
				const float4 vf = pVertices[i];
				const PxVec3 v = PxVec3(vf.x, vf.y, vf.z);
				const PxVec3 vertexV = rot.transform(v);
				minV = minV.minimum(vertexV);
				maxV = maxV.maximum(vertexV);
			}

			//const Vec4V posV = Vec4V_From_Vec3V(V3LoadU(&pose.p.x));
			// Translation is applied after the rotated min/max pass.
			maxV += pose.p;
			minV += pose.p;

			updatedBound.minimum = minV;
			updatedBound.maximum = maxV;
		}
		else
		{
			//ML: this is for GPU incompatible type, which is hull vertices >64 and each hull polygon has vertices > 31
			updatedBound = PxBounds3::transformFast(pose, localBound);
		}

	}
	break;
	default:
	{
		//This updates any dynamic meshes or HFs that may be attached to simulation shapes
		updatedBound = PxBounds3::transformFast(pose, localBound);
	}
	break;

	}
}
|
||||
|
||||
// Refreshes the transform-cache entry for shape 'index' and, for broad-phase
// shapes (isBP), also recomputes its world-space bounds.
__device__ static inline void updateCacheAndBound(const PxTransform& absPos, const PxgShapeSim& shapeSim, PxU32 index,
	PxsCachedTransform* cacheArray, PxBounds3* boundsArray, const PxgShape* shapes, bool isBP)
{
	//TODO: port the transform flags — until then, 0 is written for the flags.
	setTransformCache(cacheArray, absPos, 0, index);

	if (!isBP)
		return;

	updateBounds(shapeSim, shapes, boundsArray, absPos, index);
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
234
engine/third_party/physx/source/gpucommon/src/CUDA/utility.cu
vendored
Normal file
234
engine/third_party/physx/source/gpucommon/src/CUDA/utility.cu
vendored
Normal file
@@ -0,0 +1,234 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "foundation/PxMat44.h"
|
||||
#include "assert.h"
|
||||
#include "utils.cuh"
|
||||
#include "PxgInterpolation.h"
|
||||
#include <stdio.h>
|
||||
#include "PxDeformableSkinning.h"
|
||||
#include "atomic.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
extern "C" __host__ void initCommonKernels2() {}
|
||||
|
||||
// One thread per element: interleaves separate position and normal buffers
// into a single [p0, n0, p1, n1, ...] PxVec3 stream. The w component of each
// float4 is dropped. Expects a 1D launch covering at least 'length' threads.
extern "C" __global__ void interleaveBuffers(const float4* PX_RESTRICT vertices, const float4* PX_RESTRICT normals, PxU32 length, PxVec3* interleavedResultBuffer)
{
	const PxU32 i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= length)
		return;

	const float4 p = vertices[i];
	const float4 nrm = normals[i];

	PxVec3* dst = interleavedResultBuffer + 2 * i;
	dst[0] = PxVec3(p.x, p.y, p.z);
	dst[1] = PxVec3(nrm.x, nrm.y, nrm.z);
}
|
||||
|
||||
//A bit experimental. Can help to get smoother transition between triangles.
//Remaps each barycentric coordinate through the cubic smoothstep basis
//3*t^2 - 2*t^3 and renormalizes so the result still sums to 1.
__device__ static PxVec3 modifyBarycentrics(PxVec3 bary)
{
	//Use cubic basis function: t*t*(3 - 2*t) == 3*t^2 - 2*t^3
	bary.x = bary.x * bary.x * (3.0f - 2.0f * bary.x);
	bary.y = bary.y * bary.y * (3.0f - 2.0f * bary.y);
	bary.z = bary.z * bary.z * (3.0f - 2.0f * bary.z);

	const float sum = bary.x + bary.y + bary.z;
	//Guard against division by zero: an all-zero input would otherwise produce
	//NaNs. Valid barycentrics sum to 1, so at least one remapped component is
	//positive in the normal case and the guard is a no-op there.
	if (sum != 0.0f)
		bary *= 1.0f / sum;
	return bary;
}
|
||||
|
||||
//The paper https://perso.telecom-paristech.fr/boubek/papers/PhongTessellation/PhongTessellation.pdf uses alpha = 0.75 but a slightly lower value
//seems to be better. 0.5 is too low, so 0.625 (middle of 0.5 and 0.75) was chosen.
//
// Phong interpolation of a surface point on triangle (a, b, c) at barycentric
// coordinates uvw_, using per-vertex normals (nA, nB, nC). The linear point q
// is pushed toward the blend of its projections onto the three vertex tangent
// planes, with the displacement magnitude soft-clamped (via tanh) so it never
// exceeds halfSurfaceThickness. If halfSurfaceThickness <= 0 the displacement
// is zero and the plain linear point is returned.
// Template flag: when normalsAreNormalized is false, each projection scale is
// divided by the normal's squared magnitude to compensate.
template<bool normalsAreNormalized = true>
__device__ static PxVec3 evaluatePointPhongInterpolation(const PxVec3& a, const PxVec3& b, const PxVec3& c, const PxVec3& uvw_,
	const PxVec3& nA, const PxVec3& nB, const PxVec3& nC, PxReal halfSurfaceThickness, PxReal alpha = 0.625f)
{
	// NOTE(review): the 'false ?' makes the modifyBarycentrics() remap dead
	// code — it is a disabled experimental toggle, kept deliberately.
	PxVec3 uvw = false ? modifyBarycentrics(uvw_) : uvw_;
	// Plain linear (barycentric) interpolation of the position.
	PxVec3 q = uvw.x * a + uvw.y * b + uvw.z * c;

	// Project q onto the tangent plane through vertex a with normal nA.
	PxReal scale1 = (q - a).dot(nA);
	if (!normalsAreNormalized)
		scale1 /= nA.magnitudeSquared();
	PxVec3 projA = q - scale1 * nA;

	// Same for vertex b.
	PxReal scale2 = (q - b).dot(nB);
	if (!normalsAreNormalized)
		scale2 /= nB.magnitudeSquared();
	PxVec3 projB = q - scale2 * nB;

	// Same for vertex c.
	PxReal scale3 = (q - c).dot(nC);
	if (!normalsAreNormalized)
		scale3 /= nC.magnitudeSquared();
	PxVec3 projC = q - scale3 * nC;

	//uvw = Pow(uvw, 1.5); //Experimental

	// Blend the three plane projections with the same barycentric weights.
	PxVec3 qStar = uvw.x * projA + uvw.y * projB + uvw.z * projC;

	// Displacement direction and (alpha-scaled) magnitude from q toward qStar.
	PxVec3 dir = qStar - q;
	PxReal offset = dir.normalizeSafe() * alpha;

	//Asymptotic function applied to offset such that the magnitude of offset cannot exceed halfSurfaceThickness

	PxReal ratio = 0.0f;
	if (halfSurfaceThickness > 0.0f)
	{
		ratio = offset / halfSurfaceThickness;
		ratio = tanhf(ratio); //Derivative at zero of tanh is one and tanh asymptotically reaches 1 - this is kind of a softMin(val, 1)
	}
	offset = ratio * halfSurfaceThickness;

	return q + offset * dir;
}
|
||||
|
||||
// Accumulates area-weighted vertex normals for the guide mesh: each triangle
// atomically adds its unnormalized face normal (cross product, length
// proportional to triangle area) to its three vertex accumulators.
// Batch entry selected by blockIdx.y; grid-stride over triangles in x.
extern "C" __global__
void normalVectorsAreaWeighted(
	PxTrimeshSkinningGpuData* data)
{
	PxTrimeshSkinningGpuData& batch = data[blockIdx.y]; //TODO: Copy into shared memory

	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 nbTriangles = batch.nbGuideTriangles;
	for (PxU32 t = blockIdx.x * blockDim.x + threadIdx.x; t < nbTriangles; t += stride)
	{
		const PxU32* indices = &batch.guideTrianglesD[3 * t];

		const PxVec3 a = batch.guideVerticesD.at(indices[0]);
		const PxVec3 b = batch.guideVerticesD.at(indices[1]);
		const PxVec3 c = batch.guideVerticesD.at(indices[2]);

		// Unnormalized face normal — magnitude carries the area weighting.
		const PxVec3 areaNormal = (b - a).cross(c - a);

		AtomicAdd3(batch.guideNormalsD.atRef(indices[0]), areaNormal);
		AtomicAdd3(batch.guideNormalsD.atRef(indices[1]), areaNormal);
		AtomicAdd3(batch.guideNormalsD.atRef(indices[2]), areaNormal);
	}
}
|
||||
|
||||
// Sets every guide-vertex normal of the batch to zero.
// Batch entry selected by blockIdx.y; grid-stride over vertices in x.
extern "C" __global__
void zeroNormals(
	PxTrimeshSkinningGpuData* data)
{
	PxTrimeshSkinningGpuData& batch = data[blockIdx.y]; //TODO: Copy into shared memory

	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 nbVertices = batch.guideVerticesD.count;
	for (PxU32 v = blockIdx.x * blockDim.x + threadIdx.x; v < nbVertices; v += stride)
		batch.guideNormalsD.atRef(v) = PxVec3(0.0f);
}
|
||||
|
||||
// Normalizes every accumulated guide-vertex normal in place (normalizeSafe
// leaves degenerate/zero vectors untouched rather than dividing by zero).
// Batch entry selected by blockIdx.y; grid-stride over vertices in x.
extern "C" __global__
void normalizeNormals(
	PxTrimeshSkinningGpuData* data)
{
	PxTrimeshSkinningGpuData& batch = data[blockIdx.y]; //TODO: Copy into shared memory

	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 nbVertices = batch.guideVerticesD.count;
	for (PxU32 v = blockIdx.x * blockDim.x + threadIdx.x; v < nbVertices; v += stride)
		batch.guideNormalsD.atRef(v).normalizeSafe();
}
|
||||
|
||||
// Computes skinned cloth vertex positions from the deformed guide mesh.
// Each skinned vertex is embedded in a guide triangle (barycentric uv plus an
// offset along the interpolated normal). Positions use Phong interpolation of
// the clamped barycentrics, plus a linear correction (pointUVW - pointUVWProj)
// so vertices whose raw barycentrics fall outside the triangle (extrapolation)
// still move consistently.
// Batch entry selected by blockIdx.y; grid-stride over skinned vertices in x.
extern "C" __global__
void interpolateSkinnedClothVertices(
	PxTrimeshSkinningGpuData* data)
{
	PxTrimeshSkinningGpuData& d = data[blockIdx.y]; //TODO: Copy into shared memory

	const PxU32 xDim = gridDim.x * blockDim.x;
	const PxU32 loopEnd = d.skinnedVerticesD.count;
	for (PxU32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; threadIndex < loopEnd; threadIndex += xDim)
	{

		PxTriangleMeshEmbeddingInfo info = d.skinningInfoPerVertexD[threadIndex];
		const PxU32* tri = &d.guideTrianglesD[3 * info.guideTriangleId];
		// Third barycentric weight is implied by the first two.
		PxReal w = 1.0f - info.uv.x - info.uv.y;

		// Raw barycentrics (may lie outside [0,1] for extrapolated vertices)
		// and their projection back into the valid simplex: clamp negative
		// components to zero, then renormalize to sum 1.
		PxVec3 uvw(info.uv.x, info.uv.y, w);
		PxVec3 uvwProj = uvw.maximum(PxVec3(0.0));
		PxReal sumProj = uvwProj.x + uvwProj.y + uvwProj.z;
		if(sumProj > 0.0f)
		{
			uvwProj *= 1.0f / sumProj;
		}

		PxVec3 nA = d.guideNormalsD.at(tri[0]);
		PxVec3 nB = d.guideNormalsD.at(tri[1]);
		PxVec3 nC = d.guideNormalsD.at(tri[2]);

		// Normal interpolated with the clamped weights, then renormalized.
		PxVec3 normal = uvwProj.x * nA + uvwProj.y * nB + uvwProj.z * nC;
		normal.normalizeSafe();

		// Phong-interpolated position at the clamped barycentrics.
		PxVec3 pointPhong = evaluatePointPhongInterpolation(
			d.guideVerticesD.at(tri[0]),
			d.guideVerticesD.at(tri[1]),
			d.guideVerticesD.at(tri[2]),
			uvwProj, nA, nB, nC, d.halfSurfaceThickness);

		// Linear positions at raw and clamped barycentrics; their difference
		// re-applies the extrapolation that the clamping removed.
		PxVec3 pointUVW = uvw.x * d.guideVerticesD.at(tri[0]) + uvw.y * d.guideVerticesD.at(tri[1]) + uvw.z * d.guideVerticesD.at(tri[2]);
		PxVec3 pointUVWProj = uvwProj.x * d.guideVerticesD.at(tri[0]) + uvwProj.y * d.guideVerticesD.at(tri[1]) + uvwProj.z * d.guideVerticesD.at(tri[2]);

		//The offset could also be used to modify the alpha factor of the method EvaluatePoint. Or one could introduce an offset to EvaluatePoint that offsets along the same direction as the alpha factor
		PxVec3 offsetPoint = pointPhong + info.offsetAlongInterpolatedNormal * normal + pointUVW - pointUVWProj;

		d.skinnedVerticesD.atRef(threadIndex) = offsetPoint;
	}
}
|
||||
|
||||
|
||||
// Computes skinned soft-body vertex positions from the deformed guide
// tetrahedral mesh using linear barycentric interpolation within the embedding
// tetrahedron. Batch entry selected by blockIdx.y; grid-stride in x.
extern "C" __global__
void interpolateSkinnedSoftBodyVertices(
	PxTetmeshSkinningGpuData* data)
{
	PxTetmeshSkinningGpuData& batch = data[blockIdx.y]; //TODO: Copy into shared memory

	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 nbSkinned = batch.skinnedVerticesD.count;
	for (PxU32 v = blockIdx.x * blockDim.x + threadIdx.x; v < nbSkinned; v += stride)
	{
		//Uses linear barycentric interpolation - plenty of room for improvements

		const PxTetrahedronMeshEmbeddingInfo embedding = batch.skinningInfoPerVertexD[v];
		const PxU32* tet = &batch.guideTetrahedraD[4 * embedding.guideTetrahedronId];

		// The fourth barycentric weight is implied by the first three.
		const PxReal w3 = 1.0f - embedding.uvw.x - embedding.uvw.y - embedding.uvw.z;

		batch.skinnedVerticesD.atRef(v) =
			embedding.uvw.x * batch.guideVerticesD.at(tet[0]) +
			embedding.uvw.y * batch.guideVerticesD.at(tet[1]) +
			embedding.uvw.z * batch.guideVerticesD.at(tet[2]) +
			w3 * batch.guideVerticesD.at(tet[3]);
	}
}
|
||||
55
engine/third_party/physx/source/gpucommon/src/CUDA/utils.cuh
vendored
Normal file
55
engine/third_party/physx/source/gpucommon/src/CUDA/utils.cuh
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_UTILS_CUH__
|
||||
#define __CU_UTILS_CUH__
|
||||
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
|
||||
namespace physx
{
// Load/store helpers converting between CUDA's float4 and PhysX vector types.
// The local copy of the float4 argument encourages a single vectorized load.
__device__ PX_FORCE_INLINE PxVec3 PxLoad3(const float4& v) { float4 tmp = v; return PxVec3(tmp.x, tmp.y, tmp.z); }
// Variant that also returns the w component through the out-parameter.
__device__ PX_FORCE_INLINE PxVec3 PxLoad3(const float4& v, float& w) { float4 tmp = v; w = tmp.w; return PxVec3(tmp.x, tmp.y, tmp.z); }
__device__ PX_FORCE_INLINE PxVec4 PxLoad4(const float4& v) { float4 tmp = v; return PxVec4(tmp.x, tmp.y, tmp.z, tmp.w); }
// Stores a PxVec3 into a float4 with w = 0.
__device__ PX_FORCE_INLINE float4 PxSave3(const PxVec3& v) { return float4({ v.x, v.y, v.z, 0 }); }
__device__ PX_FORCE_INLINE float4 PxSave4(const PxVec4& v) { return float4({ v.x, v.y, v.z, v.w }); }


// Bit-manipulation helpers built on the CUDA integer intrinsics.
//Only works if val > 0
__device__ PX_FORCE_INLINE int lowestSetIndex(int val) { return __ffs(val) - 1; }
__device__ PX_FORCE_INLINE int highestSetIndex(int val) { return 31 - __clz(val); }
// Isolates the lowest set bit (val & -val), e.g. 0b0110 -> 0b0010.
__device__ PX_FORCE_INLINE int lowestSetBit(int val) { return val & -val; }
__device__ PX_FORCE_INLINE bool testBit(int map, int index) { return (map & 1 << index) != 0; }

//Returns the index of the lowest set bit. Returns 0xFFffFFff if no bit is set
//(__ffs returns 0 for input 0, so 0 - 1 wraps to 0xFFffFFff).
__device__ PX_FORCE_INLINE PxU32 lowestSetIndex(PxU32 val) { return __ffs(val) - 1; }
// Clears the lowest set bit: val & (val - 1).
__device__ PX_FORCE_INLINE PxU32 clearLowestSetBit(PxU32 val) { return val & (val - 1); }
}
|
||||
|
||||
#endif
|
||||
1010
engine/third_party/physx/source/gpucommon/src/CUDA/vector.cuh
vendored
Normal file
1010
engine/third_party/physx/source/gpucommon/src/CUDA/vector.cuh
vendored
Normal file
File diff suppressed because it is too large
Load Diff
48
engine/third_party/physx/source/gpucommon/src/PxgCommon.cpp
vendored
Normal file
48
engine/third_party/physx/source/gpucommon/src/PxgCommon.cpp
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgBroadPhase.h"
|
||||
|
||||
namespace physx
{

// Entry points defined in the compiled CUDA translation units; referencing
// them from here prevents the linker from discarding those object files.
extern "C" void initCommonKernels0();
extern "C" void initCommonKernels1();
extern "C" void initCommonKernels2();

// Forces the PhysXCommonGpu kernel translation units to be linked in when
// building as a static library. No-op when building the GPU DLL
// (PX_PHYSX_GPU_EXPORTS).
void createPxgCommon()
{
#if !PX_PHYSX_GPU_EXPORTS
	//this call is needed to force PhysXCommonGpu linkage as Static Library!
	initCommonKernels0();
	initCommonKernels1();
	initCommonKernels2();
#endif
}

}
|
||||
143
engine/third_party/physx/source/gpucommon/src/PxgCopyManager.cpp
vendored
Normal file
143
engine/third_party/physx/source/gpucommon/src/PxgCopyManager.cpp
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include <stdio.h>
|
||||
#include "CudaKernelWrangler.h"
|
||||
#include "PxgCopyManager.h"
|
||||
#include "cudamanager/PxCudaContextManager.h"
|
||||
#include "PxgKernelIndices.h"
|
||||
#include "PxgHeapMemAllocator.h"
|
||||
#include "PxgCudaUtils.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
|
||||
#define DEBUG_COPY_MANAGER 0
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Constructs the copy manager. The descriptor queue lives in mapped (host-
// visible, device-accessible) memory so the copy kernel can read it directly.
// The finished event starts null and must be created via createFinishedEvent().
PxgCopyManager::PxgCopyManager(PxgHeapMemoryAllocatorManager* heapMemoryManager) :
	mDescriptorsQueue(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
	mNumDescriptors(0),
	mFinishedEvent(0),
	mEventRecorded(false),
	mHeapMemoryManager(heapMemoryManager)

{
}
|
||||
|
||||
// Creates the CUDA event used to signal completion of a dispatched copy batch.
// Must be called once before dispatchCopy()/hasFinishedCopying() are used.
void PxgCopyManager::createFinishedEvent(PxCudaContext* cudaContext)
{
	cudaContext->eventCreate(&mFinishedEvent, CU_EVENT_DEFAULT);
}
|
||||
|
||||
// Destroys the completion event created by createFinishedEvent().
void PxgCopyManager::destroyFinishedEvent(PxCudaContext* cudaContext)
{
	cudaContext->eventDestroy(mFinishedEvent);
}
|
||||
|
||||
// Queues a host-to-device copy descriptor for the next dispatchCopy().
// The backing buffer is sized for the descriptor array followed by a
// 256-byte-aligned run-sum (PxU32 per descriptor) region used by the copy
// kernel; only the descriptor portion is written here.
void PxgCopyManager::pushDeferredHtoD(const CopyDesc& desc)
{
	PxU32 newSize = (mNumDescriptors + 1) * sizeof(CopyDesc);
	newSize = (newSize + 255) & ~255; //round up to ensure 256-bytes alignment of the following array
	newSize += (mNumDescriptors + 1) * sizeof(PxU32); //run-sum array

	if (newSize > mDescriptorsQueue.size())
	{
		// Grow geometrically to amortize repeated pushes.
		mDescriptorsQueue.resize(newSize * 2);
	}

	CopyDesc* descsCPU = reinterpret_cast<CopyDesc*>(mDescriptorsQueue.begin());
	descsCPU[mNumDescriptors++] = desc;
}
|
||||
|
||||
|
||||
// Non-blocking query: true once the most recently recorded completion event
// has fired (or nothing was ever recorded). cuEventQuery returns CUDA_SUCCESS
// when complete and CUDA_ERROR_NOT_READY while work is still pending; any
// other result is unexpected.
bool PxgCopyManager::hasFinishedCopying(PxCudaContext* cudaContext) const
{
	const CUresult status = cudaContext->eventQuery(mFinishedEvent);
	PX_ASSERT(status == CUDA_SUCCESS || status == CUDA_ERROR_NOT_READY);

	return status != CUDA_ERROR_NOT_READY;
}
|
||||
|
||||
// Blocks until the last dispatched copy batch has completed (if one was
// recorded), then clears the descriptor queue for reuse.
void PxgCopyManager::waitAndReset(PxCudaContext* cudaContext)
{
	// Only synchronize if dispatchCopy() actually recorded the event;
	// waiting on a never-recorded event would be invalid.
	if(mEventRecorded)
	{
		CUresult result = cudaContext->eventSynchronize(mFinishedEvent);
		PX_UNUSED(result);
		PX_ASSERT(result == CUDA_SUCCESS);
	}
	resetUnsafe();
}
|
||||
|
||||
|
||||
// Launches the balanced mem-copy kernel over all queued descriptors on the
// given stream and records the completion event behind it. One block is
// launched per descriptor (WARP_SIZE x COPY_KERNEL_WARPS_PER_BLOCK threads).
// The previous batch must already have finished (asserted below). If no
// descriptors are queued, nothing is launched and no event is recorded.
void PxgCopyManager::dispatchCopy(CUstream stream, PxCudaContextManager* cudaContextManager, KernelWrangler* kernelWrangler)
{
	PxCudaContext* cudaContext = cudaContextManager->getCudaContext();

	PX_ASSERT(hasFinishedCopying(cudaContext));

	PxU32 numDescs = mNumDescriptors;
	mEventRecorded = false;

	if (!numDescs)
		return;

	PxU32 numWarpsPerBlock = COPY_KERNEL_WARPS_PER_BLOCK;
	PxU32 numBlocks = numDescs;
	// Pre-SM30 devices lack the warp shuffle path, so the kernel needs extra
	// dynamic shared memory (one PxU32 per thread) instead.
	PxU32 numExtraShared = cudaContextManager->supportsArchSM30() ? 0 : numWarpsPerBlock * WARP_SIZE * sizeof(PxU32);

	CUfunction kernelFunction = kernelWrangler->getCuFunction(PxgKernelIds::MEM_COPY_BALANCED_KERNEL);

	{
		// The queue lives in mapped memory; translate the host pointer to its
		// device address for the kernel.
		CopyDesc* descsGPU = reinterpret_cast<CopyDesc*>(getMappedDevicePtr(cudaContext, mDescriptorsQueue.begin()));

		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(descsGPU),
			PX_CUDA_KERNEL_PARAM(numDescs)
		};

		CUresult result = cudaContext->launchKernel(kernelFunction, numBlocks, 1, 1, WARP_SIZE, numWarpsPerBlock, 1, numExtraShared, stream, kernelParams, sizeof(kernelParams), 0, PX_FL);

		if(result != CUDA_SUCCESS)
			printf("GPU MemCopyBalanced fail to launch kernel!!\n");

#if DEBUG_COPY_MANAGER
		// Debug-only: synchronize immediately so execution errors surface here.
		result = cudaContext->streamSynchronize(stream);
		if (result != CUDA_SUCCESS)
			printf("GPU MemCopyBalanced died!!\n");
#endif
	}

	// Record completion behind the launch so hasFinishedCopying()/waitAndReset()
	// can track this batch.
	CUresult result = cudaContext->eventRecord(mFinishedEvent, stream);
	mEventRecorded = true;
	PX_UNUSED(result);
	PX_ASSERT(result == CUDA_SUCCESS);
}
|
||||
164
engine/third_party/physx/source/gpucommon/src/PxgCudaBuffer.cpp
vendored
Normal file
164
engine/third_party/physx/source/gpucommon/src/PxgCudaBuffer.cpp
vendored
Normal file
@@ -0,0 +1,164 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgCudaBuffer.h"
|
||||
#include "foundation/PxMath.h"
|
||||
#include "foundation/PxAssert.h"
|
||||
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
#include "PxPhysXGpu.h"
|
||||
#include "PxsKernelWrangler.h"
|
||||
#include "common/PxPhysXCommonConfig.h"
|
||||
#include "PxgMemoryManager.h"
|
||||
|
||||
#define MEMCHECK_SUPPORT 0
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Ensures the device buffer holds at least 'size' bytes. Grows geometrically
// (max of requested size and double the current size); never shrinks, and is
// a no-op when the current capacity already suffices. The previous contents
// are NOT preserved across a reallocation (see allocateCopyOldDataAsync for
// the preserving variant). filename/line feed the allocation statistics.
// MEMCHECK_SUPPORT routes allocations straight through the context manager
// (bypassing the heap suballocator) so tools see individual allocations.
void PxgCudaBuffer::allocate(const PxU64 size, const char* filename, PxI32 line)
{
	PX_ASSERT(mHeapMemoryAllocator);

	if (mSize < size)
	{
		// Release the old allocation before acquiring the larger one.
		if (mSize > 0 && mPtr)
		{
#if MEMCHECK_SUPPORT
			PX_UNUSED(filename);
			PX_UNUSED(line);
			PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
			PxU8* ptr = reinterpret_cast<PxU8*>(mPtr);
			alloc->mContextManager->freeDeviceBuffer(ptr);
			mPtr = NULL;
#else
			mHeapMemoryAllocator->deallocate(reinterpret_cast<void*>(mPtr));
#endif
		}

		//Allocate either double current size or the requested size, depending on which is larger
		mSize = PxMax(size, mSize * 2);

#if MEMCHECK_SUPPORT
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		mPtr = CUdeviceptr(alloc->mContextManager->allocDeviceBuffer<PxU8>(PxU32(mSize)));
		PX_ASSERT(mPtr);
#else
		mPtr = reinterpret_cast<CUdeviceptr>(mHeapMemoryAllocator->allocate(mSize, mStatGroup, filename, line));
#endif

#if PX_STOMP_ALLOCATED_MEMORY
		// Debug aid: fill fresh memory with 0xcd so reads of uninitialized
		// device memory are recognizable.
		if (mPtr)
		{
			PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
			PxCudaContextManager* ccm = alloc->mContextManager;
			if (ccm)
			{
				PxScopedCudaLock scl(*ccm);
				CUresult result = ccm->getCudaContext()->memsetD8(mPtr, static_cast<unsigned char>(0xcd), mSize);
				if (result != 0)
					PX_ASSERT(result == 0);
			}
			else
			{
				PxGetFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, PX_FL,
					"Not possible to stomp PxgCudaBufferMemory because not cuda context manager is available.");
			}
		}
#endif
	}
}
|
||||
|
||||
// Grows the buffer to at least 'size' bytes and asynchronously copies the old
// contents into the new allocation on 'stream'. The old allocation is released
// via deferred deallocation so the memory cannot be re-used before the copy
// (and anything else queued on the heap) has been flushed.
void PxgCudaBuffer::allocateCopyOldDataAsync(const PxU64 size, PxCudaContext* cudaContext, CUstream stream, const char* filename, PxI32 line)
{
	PX_ASSERT(mHeapMemoryAllocator);

	PxU64 oldSize = mSize;

	//Allocate either double current size or the requested size, depending on which is larger
	mSize = (oldSize < size) ? PxMax(size, mSize * 2) : mSize;

	if (oldSize < size)
	{
		// Keep the old pointer alive until the device-to-device copy is queued.
		CUdeviceptr oldPtr = mPtr;
#if MEMCHECK_SUPPORT
		PX_UNUSED(filename);
		PX_UNUSED(line);
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		mPtr = CUdeviceptr(alloc->mContextManager->allocDeviceBuffer<PxU8>(PxU32(mSize)));
		PX_ASSERT(mPtr);
#else
		mPtr = reinterpret_cast<CUdeviceptr>(mHeapMemoryAllocator->allocate(mSize, mStatGroup, filename, line));
#endif

		if (oldSize > 0 && oldPtr)
		{
			cudaContext->memcpyDtoDAsync(mPtr, oldPtr, oldSize, stream);
			//Defer deletion. This makes sure nothing else gets this memory until after the memcopy has completed

#if MEMCHECK_SUPPORT
			//Since MEMCHECK_SUPPORT is only active for invalid memory access debugging, let it leak for now
#else
			mHeapMemoryAllocator->deallocateDeferred(reinterpret_cast<void*>(oldPtr));
#endif
		}
	}
}
|
||||
|
||||
// Releases the device allocation (if any) and resets the buffer to empty.
// Safe to call repeatedly; a subsequent allocate() starts from scratch.
void PxgCudaBuffer::deallocate()
{
	PX_ASSERT(mHeapMemoryAllocator);
	if (mSize && mPtr)
	{
#if MEMCHECK_SUPPORT
		// Memcheck build frees directly through the context manager,
		// mirroring the direct allocation path in allocate().
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		PxU8* ptr = reinterpret_cast<PxU8*>(mPtr);
		alloc->mContextManager->freeDeviceBuffer(ptr);
#else
		mHeapMemoryAllocator->deallocate(reinterpret_cast<void*>(mPtr));
#endif
		mPtr = 0;
		mSize = 0;
	}
}
|
||||
|
||||
// Queues the allocation for deferred release on the heap allocator.
// NOTE(review): unlike deallocate(), this does not reset mPtr/mSize —
// presumably callers discard or immediately re-fill the buffer; confirm.
void PxgCudaBuffer::deallocateDeferred()
{
#if MEMCHECK_SUPPORT
	//Since MEMCHECK_SUPPORT is only active for invalid memory access debugging, let it leak for now
#else
	PX_ASSERT(mHeapMemoryAllocator);
	if (mSize && mPtr)
		mHeapMemoryAllocator->deallocateDeferred(reinterpret_cast<void*>(mPtr));
#endif
}
|
||||
|
||||
// RAII cleanup: release any outstanding device allocation immediately.
PxgCudaBuffer::~PxgCudaBuffer()
{
	deallocate();
}
|
||||
|
||||
244
engine/third_party/physx/source/gpucommon/src/PxgCudaMemoryAllocator.cpp
vendored
Normal file
244
engine/third_party/physx/source/gpucommon/src/PxgCudaMemoryAllocator.cpp
vendored
Normal file
@@ -0,0 +1,244 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
#include "foundation/PxErrors.h"
|
||||
#include "foundation/PxMath.h"
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
|
||||
#if PX_LINUX && PX_CLANG
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wdocumentation"
|
||||
#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
|
||||
#endif
|
||||
#include <cuda.h>
|
||||
#if PX_LINUX && PX_CLANG
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
#include "foundation/PxAllocator.h"
|
||||
#include "foundation/PxAtomic.h"
|
||||
#include "foundation/PxAssert.h"
|
||||
#include "cudamanager/PxCudaContextManager.h"
|
||||
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
#include "common/PxPhysXCommonConfig.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// memory tracking.
|
||||
#if PX_DEBUG
|
||||
#include "PxgMemoryTracker.h"
|
||||
static MemTracker deviceMemTracker;
|
||||
static MemTracker hostMemTracker;
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Allocates page-locked (pinned) host memory that is device-mapped and
// portable across CUDA contexts. Returns NULL on failure (after reporting a
// warning). filename/line feed the debug-build memory tracker.
void* physx::PxgPinnedMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line)
{
	PxU8* ptr = NULL;
	CUresult result = cudaContext.memHostAlloc((void**)&ptr, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_PORTABLE);
	if (result != CUDA_SUCCESS || !ptr)
	{
		PxGetFoundation().error(PX_WARN, PX_FL, "Failed to allocate pinned memory.");
		return NULL;
	}

	PX_ASSERT((size_t)(ptr) % 256 == 0); //alignment check. I believe it should be guaranteed

#if PX_STOMP_ALLOCATED_MEMORY
	// fill pinned memory with markers to catch uninitialized memory earlier.
	// use alternating pattern to avoid pairs of start, end values to cancel each other out.
	// NOTE: writes size/4 whole words, so up to 3 trailing bytes stay unstomped.
	PxU32 pat[2] = { 0xcdcdcdcd, 0xdcdcdcdc };
	for (size_t i = 0; i < (size/4); ++i)
		reinterpret_cast<PxU32*>(ptr)[i] = pat[i % 2];
#endif

#if PX_DEBUG
	hostMemTracker.registerMemory(ptr, false, size, filename, line);
#else
	PX_UNUSED(filename);
	PX_UNUSED(line);
#endif

	return ptr;
}
|
||||
|
||||
// Frees pinned host memory obtained from PxgPinnedMemoryAllocate.
// NULL is tolerated as a no-op; failure only triggers a debug assert.
void physx::PxgPinnedMemoryDeallocate(PxCudaContext& cudaContext, void* ptr)
{
	if (ptr == NULL)
		return;

	CUresult result = cudaContext.memFreeHost(ptr);
	PX_UNUSED(result);
	PX_ASSERT(result == CUDA_SUCCESS);
#if PX_DEBUG
	hostMemTracker.unregisterMemory(ptr, false);
#endif
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Allocates raw device memory, preferring a user-provided
// PxDeviceAllocatorCallback over cuMemAlloc. On failure the context is put
// into abort mode (subsequent allocations bail out early) and NULL is
// returned. filename/line feed the debug-build memory tracker.
void* physx::PxgCudaDeviceMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line)
{
	// A previous allocation failure poisons the context; fail fast.
	if (cudaContext.isInAbortMode())
		return NULL;

	PxDeviceAllocatorCallback* callback = cudaContext.getAllocatorCallback();
	if (callback)
	{
		void* ptr = NULL;
		bool result = callback->memAlloc(&ptr, size);
		if (!result)
		{
			cudaContext.setAbortMode(true);
			PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxDeviceAllocatorCallback failed to allocate memory %zu bytes!", size);
			return NULL;
		}
#if PX_DEBUG
		if (result)
			deviceMemTracker.registerMemory(ptr, true, size, filename, line);
#else
		PX_UNUSED(filename);
		PX_UNUSED(line);
#endif
		return ptr;
	}
	else
	{
		CUdeviceptr ptr = 0;
		CUresult result = cudaContext.memAlloc(&ptr, size);
		// Driver allocations are expected to be at least 128-byte aligned.
		PX_ASSERT((ptr & 127) == 0);
		if (result != CUDA_SUCCESS)
		{
			cudaContext.setAbortMode(true);
			PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxgCudaDeviceMemoryAllocator failed to allocate memory %zu bytes! Result = %i", size, result);
			return NULL;
		}
#if PX_DEBUG
		if (result == CUDA_SUCCESS)
			deviceMemTracker.registerMemory(reinterpret_cast<void*>(ptr), true, size, filename, line);
#else
		PX_UNUSED(filename);
		PX_UNUSED(line);
#endif
		return reinterpret_cast<void*>(ptr);
	}
}
|
||||
|
||||
// Frees device memory allocated by PxgCudaDeviceMemoryAllocate, routing
// through the user callback when one is installed, else cuMemFree.
// Failures are reported as warnings, not errors.
void physx::PxgCudaDeviceMemoryDeallocate(PxCudaContext& cudaContext, void* ptr)
{
	PxDeviceAllocatorCallback* callback = cudaContext.getAllocatorCallback();
	if (callback)
	{
		bool result = callback->memFree(ptr);
		if (!result)
			PxGetFoundation().error(PX_WARN, PX_FL, "PxDeviceAllocatorCallback fail to deallocate memory!!\n");
	}
	else
	{
		CUresult result = cudaContext.memFree(reinterpret_cast<CUdeviceptr>(ptr));
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PX_WARN, PX_FL, "PxgCudaDeviceMemoryDeallocate fail to deallocate memory!! Result = %i\n", result);
	}
#if PX_DEBUG
	if (ptr)
		deviceMemTracker.unregisterMemory(ptr, true);
#endif
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Constructs the linear (bump) allocator and immediately reserves 'size'
// bytes of pinned host memory from the given context manager's context.
PxgPinnedHostLinearMemoryAllocator::PxgPinnedHostLinearMemoryAllocator(PxCudaContextManager* contextManager, const PxU64 size) :
	mCudaContext(contextManager->getCudaContext())
{
	reserve(size);
}
|
||||
|
||||
// Releases the backing pinned allocation.
PxgPinnedHostLinearMemoryAllocator::~PxgPinnedHostLinearMemoryAllocator()
{
	deallocate();
}
|
||||
|
||||
// Grows the backing block to at least 'size' bytes (1.5x growth factor).
// Old contents are discarded — the old block is freed before the new one is
// allocated — and the bump offset is reset to zero.
void PxgPinnedHostLinearMemoryAllocator::reserveAndGrow(const PxU64 size)
{
	// only reallocate when the new size is larger than what we had before.
	if (size > mTotalSize)
	{
		deallocate();

		const PxU64 newSize = PxMax(size, PxU64(PxCeil(mTotalSize * 1.5f)));

		mStart = reinterpret_cast<PxU8*>(PxgPinnedMemoryAllocate(*mCudaContext, newSize, PX_FL));

		mTotalSize = newSize;
		mCurrentSize = 0;
	}
}
|
||||
|
||||
// Unconditionally (re)allocates the backing pinned block to exactly 'size'
// bytes and resets the bump offset. Does not free a previous block —
// callers are expected to use it once, or via reserveAndGrow/deallocate.
void PxgPinnedHostLinearMemoryAllocator::reserve(const PxU64 size)
{
	// mCurrentSize is bumped via 64-bit atomics; make sure size_t matches.
	PX_COMPILE_TIME_ASSERT(sizeof(size_t) == sizeof(PxU64));

	mStart = reinterpret_cast<PxU8*>(PxgPinnedMemoryAllocate(*mCudaContext, size, PX_FL));

	mTotalSize = size;
	mCurrentSize = 0;
}
|
||||
|
||||
// Rewinds the bump offset; all previously returned pointers become invalid
// for reuse, but no memory is released.
void PxgPinnedHostLinearMemoryAllocator::reset()
{
	mCurrentSize = 0;
}
|
||||
|
||||
void* PxgPinnedHostLinearMemoryAllocator::allocate(const PxU64 size, const PxU64 alignment)
|
||||
{
|
||||
if(size > 0)
|
||||
{
|
||||
const PxI64 alignedSize = PxI64(size + alignment);
|
||||
PxU64 baseOffset = PxU64(physx::PxAtomicAdd(reinterpret_cast<PxI64*>(&mCurrentSize), alignedSize));
|
||||
|
||||
if (baseOffset > mTotalSize)
|
||||
{
|
||||
PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxgPinnedHostLinearMemoryAllocator: overflowing initial allocation size, increase capacity to at least %u\n", baseOffset);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// this takes baseOffset again because of the atomic.
|
||||
uintptr_t startAddress = (uintptr_t(mStart)) + (baseOffset - alignedSize);
|
||||
startAddress = (startAddress + alignment-1) & (~(alignment - 1));
|
||||
return (void*)startAddress;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void PxgPinnedHostLinearMemoryAllocator::deallocate()
|
||||
{
|
||||
if(mTotalSize && mStart)
|
||||
PxgPinnedMemoryDeallocate(*mCudaContext, mStart);
|
||||
}
|
||||
525
engine/third_party/physx/source/gpucommon/src/PxgHeapMemoryAllocator.cpp
vendored
Normal file
525
engine/third_party/physx/source/gpucommon/src/PxgHeapMemoryAllocator.cpp
vendored
Normal file
@@ -0,0 +1,525 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgHeapMemAllocator.h"
|
||||
#include "foundation/PxAllocator.h"
|
||||
#include "foundation/PxMath.h"
|
||||
#include "common/PxProfileZone.h"
|
||||
#include "PxsMemoryManager.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
#define EXCEPTIONAL_ALLOC_FACTOR 2
|
||||
|
||||
bool Block::isValid()
|
||||
{
|
||||
BlockHeader* current = mStartHeader;
|
||||
while (current)
|
||||
{
|
||||
BlockHeader* next = current->mNext;
|
||||
if (next)
|
||||
{
|
||||
if ((current->mRootIndex > next->mRootIndex) || ((current->mRootIndex == next->mRootIndex) && (current->mOffset >= next->mOffset)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
current = next;
|
||||
next = current->mNext;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
current = NULL;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Inserts a new free-header for (rootIndex, offset) into this block's
// doubly-linked list, keeping the list sorted by (rootIndex, offset) so
// isValid() holds afterwards.
void Block::insertBlockHeader(const PxU32 rootIndex, const PxU32 offset, PxPool<BlockHeader>& pool)
{
	PX_PROFILE_ZONE("Block::insertBlockHeader", 0);

	BlockHeader* newHeader = pool.allocate();
	newHeader->initialize(rootIndex, offset);

	if (mStartHeader)
	{
		// Linear scan for the first header that sorts after the new one.
		BlockHeader* header = mStartHeader;

		while (header && ((header->mRootIndex < rootIndex) || (header->mRootIndex == rootIndex && header->mOffset < offset)))
		{
			header = header->mNext;
		}

		//if we found a header, we need to insert the new header in front of the found header
		if (header)
		{
			BlockHeader* prevHeader = header->mPrev;

			newHeader->mNext = header;
			newHeader->mPrev = prevHeader;

			if (prevHeader)
			{
				prevHeader->mNext = newHeader;
			}
			else
			{
				// Inserting before the first node: new list head.
				mStartHeader = newHeader;
			}

			header->mPrev = newHeader;
		}
		else
		{
			//no later header exists, so append the new header at the end of the linked list
			mEndHeader->mNext = newHeader;
			newHeader->mPrev = mEndHeader;
			mEndHeader = newHeader;
		}
	}
	else
	{
		// Empty list: new header is both head and tail.
		mStartHeader = newHeader;
		mEndHeader = newHeader;
	}

	PX_ASSERT(isValid());
	mHeaderSizes++;
}
|
||||
|
||||
// Unlinks 'header' from this block's doubly-linked free-header list,
// fixing up head/tail pointers when removing an end node, and returns the
// node to the pool.
void Block::removeBlockHeader(BlockHeader* header, PxPool<BlockHeader>& pool)
{
	BlockHeader* prev = header->mPrev;
	BlockHeader* next = header->mNext;

	if (prev)
		prev->mNext = next;
	else
		mStartHeader = next;  // removed the head

	if (next)
		next->mPrev = prev;
	else
		mEndHeader = prev;    // removed the tail

	pool.deallocate(header);

	mHeaderSizes--;
}
|
||||
|
||||
// Linear search of the free-header list for the buddy at
// (rootIndex, offsetToFind). Returns NULL when the buddy is not free.
BlockHeader* Block::findBuddy(const PxU32 offsetToFind, const PxU32 rootIndex)
{
	for (BlockHeader* candidate = mStartHeader; candidate; candidate = candidate->mNext)
	{
		if (candidate->mOffset == offsetToFind && candidate->mRootIndex == rootIndex)
			return candidate;
	}
	return NULL;
}
|
||||
|
||||
// Buddy-style heap over pages from 'allocator'. 'byteSize' is the page size:
// must be a power of two and at least 128 (the minimum block granularity).
// The first page is allocated eagerly; on failure the heap starts empty and
// grows lazily via getNextFreeBlock.
PxgHeapMemoryAllocator::PxgHeapMemoryAllocator(const PxU32 byteSize, PxVirtualAllocatorCallback* allocator) : mBlockHeaderPool(PxAllocatorTraits<BlockHeader>::Type(), 128)
{
	PX_ASSERT(PxIsPowerOfTwo(byteSize));
	PX_ASSERT(byteSize >= 128);
	mAllocationSize = byteSize;
	mAllocator = allocator;

	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::initialization", 0);
	void* memory = mAllocator->allocate(mAllocationSize, 0, PX_FL);

	// AD: the allocation above can fail.
	if (memory)
	{
		mRoots.pushBack(memory);
		mTotalMem = mAllocationSize;
		initializeBlocks(0);
	}
	else
	{
		// Start with no free blocks; allocate() paths handle this via
		// getNextFreeBlock returning a fresh root or PXG_INVALID_BLOCK.
		mTotalMem = 0;
		mBitfield = 0;
	}

}
|
||||
|
||||
// Returns every root page and every still-live exceptional allocation to the
// backing allocator. Freed exceptional entries have address == NULL.
PxgHeapMemoryAllocator::~PxgHeapMemoryAllocator()
{
	if (mAllocator)
	{
		for (PxU32 i = 0; i < mRoots.size(); ++i)
		{
			mAllocator->deallocate(mRoots[i]);
		}
		for (PxU32 i = 0; i < mExceptionalAllocs.size(); ++i)
		{
			if(mExceptionalAllocs[i].address)
				mAllocator->deallocate(mExceptionalAllocs[i].address);
		}
		mRoots.clear();
		mExceptionalAllocs.clear();
		mAllocator = NULL;
	}
}
|
||||
|
||||
// Sets up the per-size free-list slots for one page and seeds the largest
// slot with a single free header covering the whole page at 'rootIndex'.
void PxgHeapMemoryAllocator::initializeBlocks(const PxU32 rootIndex)
{
	//calculate how many slots we need; the smallest blockSize in a slot is 128 bytes = pow(2, 7)
	const PxU32 highestBit = PxHighestSetBit(mAllocationSize) + 1 - 7;

	mBlocks.resize(highestBit);

	//initialize all blocks: slot i manages blocks of 2^(i+7) bytes
	for (PxU32 i = 0; i < highestBit; ++i)
	{
		mBlocks[i].mBlockSize = 1u << (i + 7u);
		mBlocks[i].mBlockIndex = i;
	}

	//all blocks are empty beside the highestBit
	mBitfield = (1u << (highestBit - 1u));

	mBlocks[highestBit - 1].insertBlockHeader(rootIndex, 0, mBlockHeaderPool);
}
|
||||
|
||||
// Finds the smallest slot >= blockIndex that has a free block, allocating a
// new root page when nothing suitable exists. Returns the slot index, or
// PXG_INVALID_BLOCK when the backing allocation fails. mBitfield has bit i
// set iff slot i's free list is non-empty.
PxU32 PxgHeapMemoryAllocator::getNextFreeBlock(const PxU32 blockIndex, const PxU32 allocationSize, const char* file, const int line)
{
	PX_ASSERT(PxIsPowerOfTwo(allocationSize));
	// Mask off all slots smaller than blockIndex.
	const PxU32 bits = mBitfield & (~((1 << blockIndex) - 1));
	//no bigger slot available
	if (bits == 0)
	{
		PX_PROFILE_ZONE("PxgHeapMemoryAllocator::getNextFreeBlock", 0);
		//we can't find any free blocks, we allocate more memory
		const PxU32 maxAllocationSize = PxMax(allocationSize, mAllocationSize);
		void* memorys = mAllocator->allocate(maxAllocationSize, 0, file, line);
		if (!memorys)
			return PXG_INVALID_BLOCK;

		mRoots.pushBack(memorys);

		mTotalMem += maxAllocationSize;

		//if the allocationSize is bigger than the default allocation size(mAllocationSize), we need to increase
		//the block slots
		if (blockIndex >= mBlocks.size())
		{
			PxU32 oldSize = mBlocks.size();
			mBlocks.resize(blockIndex + 1);

			for (PxU32 i = oldSize; i <= blockIndex; ++i)
			{
				//blockSize is power of two
				mBlocks[i].mBlockSize = 1u << (i + 7u);
				mBlocks[i].mBlockIndex = i;
			}
		}

		// Register the whole new page as one free block in its size slot.
		const PxU32 newBlockIndex = PxU32(PxMax(PxI32(PxHighestSetBit(maxAllocationSize)) - 7, 0));
		const PxU32 rootIndex = mRoots.size() - 1;
		Block* block = &mBlocks[newBlockIndex];

		block->insertBlockHeader(rootIndex, 0, mBlockHeaderPool);
		mBitfield = mBitfield | (1u << newBlockIndex);

		return newBlockIndex;
	}
	else
	{
		// Smallest slot with a free block that is large enough.
		return PxLowestSetBit(bits);
	}
}
|
||||
|
||||
// Thread-safe buddy allocation. Sizes larger than half a page bypass the
// buddy system ("exceptional" allocations). Otherwise the request is rounded
// up to a power of two (min 128 bytes) and carved from the free lists,
// splitting a larger free block when necessary. Returns NULL on failure.
// 'group' indexes the per-category heap statistics.
void* PxgHeapMemoryAllocator::allocate(const size_t byteSize, const int group, const char* file, const int line)
{
	if (byteSize == 0)
		return NULL;

	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::allocate", 0);

	PxMutex::ScopedLock myLock(mMutex);

	PX_ASSERT(group >= 0 && group < PxsHeapStats::eHEAPSTATS_COUNT);
	mHeapStats.stats[group] += byteSize;

	if ((byteSize * EXCEPTIONAL_ALLOC_FACTOR) > mAllocationSize)
	{
		PX_PROFILE_ZONE("PxgHeapMemoryAllocator::exceptionalAlloc", 0);
		//We are allocating over half the size of a page. In this case, we'll use a whole page so we might
		//as well just allocate an exceptional block for this using the built-in allocator...
		void* memorys = mAllocator->allocate(byteSize, 0, file, line);
		if (!memorys)
			return NULL;

		mTotalMem += byteSize;

		PxU32 index = mExceptionalAllocs.size();
		ExceptionalAlloc alloc;
		alloc.address = memorys;
		alloc.size = byteSize;
		mExceptionalAllocs.pushBack(alloc);

		// Exceptional entries are keyed by PXG_INVALID_BLOCK; mRootIndex
		// stores the index into mExceptionalAllocs instead of a root page.
		mHashMap.insert(memorys, AllocationValue(PXG_INVALID_BLOCK, index, byteSize, group));

#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(memorys), true, byteSize, file, line);
#endif

		return memorys;
	}

	const PxU32 maxSize = PxIsPowerOfTwo(PxU32(byteSize)) ? PxU32(byteSize) : PxNextPowerOfTwo(PxU32(byteSize));

	//get the slot index
	const PxU32 blockIndex = PxU32(PxMax(PxI32(PxHighestSetBit(maxSize)) - 7, 0));

	//Reserve enough memory for this block if it is needed
	const PxU32 freeBlockIndex = getNextFreeBlock(blockIndex, maxSize, file, line);

	// if the allocation of the free block failed, make sure we pass the error along.
	if (freeBlockIndex == PXG_INVALID_BLOCK)
		return NULL;

	if (mBlocks[blockIndex].isEmpty())
	{
		//We don't have a slot of this size, so recursively split higher blocks until we get to the desired size.
		//The above getNextFreeBlock(...) call will ensure that there is a suitable block to use.
		Block& tBlock = mBlocks[blockIndex];

		Block* freeBlock = &mBlocks[freeBlockIndex];

		PxU32 cBlockSize = freeBlock->mBlockSize;

		//remove the last free header

		BlockHeader* newBlockHeader = freeBlock->getFreeBlocks();

		const PxU32 rootIndex = newBlockHeader->mRootIndex;
		const PxU32 offset = newBlockHeader->mOffset;

		freeBlock->removeBlockHeader(newBlockHeader, mBlockHeaderPool);

		if (freeBlock->isEmpty())
		{
			// Last free block of that size consumed: clear its bitfield bit.
			mBitfield = mBitfield & (~(1u << freeBlockIndex));
		}

		// The allocation lives at the start of the split region.
		void* freeAddress = reinterpret_cast<void*>(reinterpret_cast<PxU8*>(mRoots[rootIndex]) + offset);
		PX_ASSERT(!mHashMap.find(freeAddress));

		// The allocation's immediate buddy (second half of the smallest split)
		// becomes a free block of the target size.
		mBlocks[blockIndex].insertBlockHeader(rootIndex, tBlock.mBlockSize + offset, mBlockHeaderPool);

		mBitfield = mBitfield | (1u << blockIndex);

		mHashMap.insert(freeAddress, AllocationValue(blockIndex, rootIndex, byteSize, group));

		//recursively split blocks: each halving frees the upper half at the
		//next-smaller slot until we are one level above the target size.
		PxU32 cOffset = offset;
		PxU32 cBlockIndex = freeBlock->mBlockIndex;

		const PxU32 tBlockSize = tBlock.mBlockSize << 1;

		while (cBlockSize > tBlockSize)
		{
			cBlockSize = cBlockSize >> 1;
			cOffset = cBlockSize + offset;
			cBlockIndex = cBlockIndex - 1;
			mBlocks[cBlockIndex].insertBlockHeader(rootIndex, cOffset, mBlockHeaderPool);
			mBitfield = mBitfield | (1u << cBlockIndex);
		}

#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(freeAddress), true, byteSize, file, line);
#endif

		return freeAddress;
	}
	else
	{
		// Fast path: a free block of exactly the right size exists.
		Block& tBlock = mBlocks[blockIndex];

		BlockHeader* newHeader = tBlock.getFreeBlocks();
		const PxU32 rootIndex = newHeader->mRootIndex;
		const PxU32 offset = newHeader->mOffset;

		tBlock.removeBlockHeader(newHeader, mBlockHeaderPool);
		if (tBlock.isEmpty())
		{
			mBitfield = mBitfield & (~(1u << blockIndex));
		}
		void* address = reinterpret_cast<void*>(reinterpret_cast<PxU8*>(mRoots[rootIndex]) + offset);
		PX_ASSERT(!mHashMap.find(address));

		mHashMap.insert(address, AllocationValue(blockIndex, rootIndex, byteSize, group));

#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(address), true, byteSize, file, line);
#endif

		return address;
	}
}
|
||||
|
||||
// Queues 'ptr' for release at the next flushDeferredDeallocs() call, so
// in-flight GPU work referencing the memory can complete first.
void PxgHeapMemoryAllocator::deallocateDeferred(void* ptr)
{
	deferredDeallocs.pushBack(ptr);
}
|
||||
|
||||
void PxgHeapMemoryAllocator::flushDeferredDeallocs()
|
||||
{
|
||||
for (PxU32 i = 0; i < deferredDeallocs.size(); ++i)
|
||||
deallocate(deferredDeallocs[i]);
|
||||
deferredDeallocs.forceSize_Unsafe(0);
|
||||
}
|
||||
|
||||
// Thread-safe release of a pointer previously returned by allocate().
// Exceptional allocations go straight back to the backing allocator; buddy
// allocations are returned to their free list and merged with their free
// buddy repeatedly, promoting the merged block one size slot per merge.
void PxgHeapMemoryAllocator::deallocate(void* ptr)
{
	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::deallocate", 0);
	if (ptr == NULL)
		return;

	PxMutex::ScopedLock myLock(mMutex);

	PX_ASSERT(mHashMap.find(ptr));
	//found the block index
	AllocationValue value = mHashMap.find(ptr)->second;

	mHeapStats.stats[value.mGroup] -= value.mByteSize;

	mHashMap.erase(ptr);

	if (value.mBlockIndex == PXG_INVALID_BLOCK)
	{
		//Exceptional allocation, we just release it back to the CUDA allocator...
		// (value.mRootIndex indexes mExceptionalAllocs for these entries.)

		mTotalMem -= mExceptionalAllocs[value.mRootIndex].size;

		mExceptionalAllocs[value.mRootIndex].address = NULL;
		mExceptionalAllocs[value.mRootIndex].size = 0;
		mAllocator->deallocate(ptr);

#if PX_DEBUG
		mMemTracker.unregisterMemory(ptr, true);
#endif
		return;
	}

	const PxU32 rootIndex = value.mRootIndex;
	PxU32 blockIndex = value.mBlockIndex;

	// Byte offset of the allocation within its root page.
	PxU32 offset = PxU32(reinterpret_cast<PxU8*>(ptr)-reinterpret_cast<PxU8*>(mRoots[rootIndex]));

	Block* block = &mBlocks[blockIndex];

	do
	{
		// The buddy of an even-numbered block is the next block up; of an
		// odd-numbered block, the previous one down.
		const PxU32 offsetToFind = (((offset / block->mBlockSize) & 1) == 0) ? offset + block->mBlockSize : offset - block->mBlockSize;

		BlockHeader* buddyHeader = block->findBuddy(offsetToFind, rootIndex);

		if (buddyHeader)
		{
			//current block need to remove the merged free header
			block->removeBlockHeader(buddyHeader, mBlockHeaderPool);

			if (block->isEmpty())
			{
				mBitfield = mBitfield & (~(1u << blockIndex));
			}

			// Merged block starts at the lower of the two buddy offsets and
			// lives in the next-larger size slot.
			offset = PxMin(offsetToFind, offset);

			blockIndex = blockIndex + 1;

			if (blockIndex < mBlocks.size())
			{
				block = &mBlocks[blockIndex];
			}
			else
			{
				// Reached the top slot: park the merged block there and stop.
				block->insertBlockHeader(rootIndex, offset, mBlockHeaderPool);
				mBitfield = mBitfield | (1u << blockIndex);
				break;
			}
		}
		else
		{
			PX_ASSERT(buddyHeader == NULL);
			//just put it back to the block
			block->insertBlockHeader(rootIndex, offset, mBlockHeaderPool);
			mBitfield = mBitfield | (1u << blockIndex);
			break;
		}
	} while (1);

#if PX_DEBUG
	mMemTracker.unregisterMemory(ptr, true);
#endif
}
|
||||
|
||||
// Total bytes obtained from the backing allocator (root pages plus live
// exceptional allocations) — capacity, not bytes currently handed out.
PxU64 PxgHeapMemoryAllocator::getTotalSize()
{
	return mTotalMem;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Creates one heap per memory space: device memory and host-mapped memory,
// both using 'heapCapacity' as the page size.
PxgHeapMemoryAllocatorManager::PxgHeapMemoryAllocatorManager(PxU32 heapCapacity, PxsMemoryManager* memoryManager)
{
	mDeviceMemoryAllocators = PX_NEW(PxgHeapMemoryAllocator)(heapCapacity, memoryManager->getDeviceMemoryAllocator());
	mMappedMemoryAllocators = PX_NEW(PxgHeapMemoryAllocator)(heapCapacity, memoryManager->getHostMemoryAllocator());
}
|
||||
|
||||
// Destroys both heaps; their destructors return all pages to the backing
// allocators.
PxgHeapMemoryAllocatorManager::~PxgHeapMemoryAllocatorManager()
{
	PX_DELETE(mDeviceMemoryAllocators);
	PX_DELETE(mMappedMemoryAllocators);
}
|
||||
|
||||
// Total device memory held by the device heap, or 0 when no heap exists.
PxU64 PxgHeapMemoryAllocatorManager::getDeviceMemorySize() const
{
	if (mDeviceMemoryAllocators)
		return mDeviceMemoryAllocators->getTotalSize();
	return 0;
}
|
||||
|
||||
// Per-group statistics of the device heap; a default-constructed PxsHeapStats
// when no device heap exists.
PxsHeapStats PxgHeapMemoryAllocatorManager::getDeviceHeapStats() const
{
	return mDeviceMemoryAllocators ? mDeviceMemoryAllocators->getHeapStats() : PxsHeapStats();
}
|
||||
|
||||
void PxgHeapMemoryAllocatorManager::flushDeferredDeallocs()
|
||||
{
|
||||
if (mDeviceMemoryAllocators) // this should actually never be null...
|
||||
mDeviceMemoryAllocators->flushDeferredDeallocs();
|
||||
}
|
||||
52
engine/third_party/physx/source/gpucommon/src/PxgKernelWrangler.cpp
vendored
Normal file
52
engine/third_party/physx/source/gpucommon/src/PxgKernelWrangler.cpp
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgKernelWrangler.h"
|
||||
#include "CudaKernelWrangler.h"
|
||||
#include "foundation/PxAllocator.h"
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Table of GPU kernel entry-point names, expanded from the X-macro list in
// PxgKernelNames.h. Presumably the array order matches the PxgKernelIds
// enumeration so a kernel id indexes directly into this table -- TODO confirm
// against PxgKernelNames.h / PxgKernelIndices.h.
static const char* kernelNames[]
{
#define KERNEL_DEF(id, name) name,
#include "PxgKernelNames.h"
#undef KERNEL_DEF
};
|
||||
|
||||
// Creates the KernelWrangler that loads every kernel listed in kernelNames
// for the given CUDA context; errors are routed to errorCallback.
PxgCudaKernelWranglerManager::PxgCudaKernelWranglerManager(PxCudaContextManager& cudaContextManager, PxErrorCallback& errorCallback)
{
	mCudaContextManager = &cudaContextManager;
	// sizeof/sizeof computes the element count of the static name table.
	mKernelWrangler = PX_NEW(KernelWrangler)(cudaContextManager, errorCallback, kernelNames, sizeof(kernelNames) / sizeof(char*));
}
|
||||
|
||||
// Destroys the KernelWrangler allocated in the constructor.
PxgCudaKernelWranglerManager::~PxgCudaKernelWranglerManager()
{
	PX_DELETE(mKernelWrangler);
}
|
||||
67
engine/third_party/physx/source/gpucommon/src/PxgMemCopyDispatcher.cpp
vendored
Normal file
67
engine/third_party/physx/source/gpucommon/src/PxgMemCopyDispatcher.cpp
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
|
||||
#include "PxgMemCopyDispatcher.h"
|
||||
|
||||
#include "cudamanager/PxCudaContext.h"
|
||||
#include "CudaKernelWrangler.h"
|
||||
#include "PxgKernelIndices.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
void PxgMemCopyDispatcher::flushCommands(CUstream stream, PxCudaContext* cudaContext, KernelWrangler* kernelWrangler)
|
||||
{
|
||||
// AD - this assumes the context lock is already held?
|
||||
if (mPinnedCopyBuffer.size())
|
||||
{
|
||||
mDeviceCopyCommands.allocate(mPinnedCopyBuffer.size() * sizeof(PxgPtrPair), PX_FL);
|
||||
cudaContext->memcpyHtoDAsync(mDeviceCopyCommands.getDevicePtr(), mPinnedCopyBuffer.begin(), mPinnedCopyBuffer.size() * sizeof(PxgPtrPair), stream);
|
||||
|
||||
CUfunction function = kernelWrangler->getCuFunction(PxgKernelIds::COPY_USER_DATA);
|
||||
|
||||
PX_ASSERT(mMaxSize <= PX_MAX_U32);
|
||||
const PxU32 maxS = PxU32(mMaxSize);
|
||||
|
||||
const PxU32 blockSize = 256;
|
||||
const PxU32 numBlocks = ((maxS/4) + blockSize-1)/ blockSize;
|
||||
|
||||
CUdeviceptr ptr = mDeviceCopyCommands.getDevicePtr();
|
||||
PxU32 count = mPinnedCopyBuffer.size();
|
||||
|
||||
PxCudaKernelParam kernelParams[] =
|
||||
{
|
||||
PX_CUDA_KERNEL_PARAM(ptr),
|
||||
PX_CUDA_KERNEL_PARAM(count)
|
||||
};
|
||||
|
||||
CUresult launchResult = cudaContext->launchKernel(function, numBlocks, count, 1, 256, 1, 1, 0, stream, kernelParams, sizeof(kernelParams), 0, PX_FL);
|
||||
PX_ASSERT(launchResult == CUDA_SUCCESS);
|
||||
PX_UNUSED(launchResult);
|
||||
}
|
||||
mPinnedCopyBuffer.forceSize_Unsafe(0);
|
||||
mMaxSize = 0;
|
||||
}
|
||||
}
|
||||
112
engine/third_party/physx/source/gpucommon/src/PxgMemoryManager.cpp
vendored
Normal file
112
engine/third_party/physx/source/gpucommon/src/PxgMemoryManager.cpp
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgMemoryManager.h"
|
||||
#include "PxgCudaMemoryAllocator.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
namespace physx
{
	// Caches the context manager and its CUDA context so derived allocator
	// callbacks can take the context lock and issue driver-level calls.
	PxgCudaAllocatorCallbackBase::PxgCudaAllocatorCallbackBase(PxCudaContextManager* contextManager) : mContextManager(contextManager), mCudaContext(contextManager->getCudaContext()) {}
}
|
||||
|
||||
namespace
|
||||
{
|
||||
// PT: this one calls PxgPinnedMemoryAllocate/PxgPinnedMemoryDeallocate, i.e. cuMemHostAlloc/cuMemFreeHost
|
||||
// PT: this one calls PxgPinnedMemoryAllocate/PxgPinnedMemoryDeallocate, i.e. cuMemHostAlloc/cuMemFreeHost
class PxgCudaHostMemoryAllocatorCallback : public PxgCudaAllocatorCallbackBase
{
public:
	PxgCudaHostMemoryAllocatorCallback(PxCudaContextManager* contextManager) : PxgCudaAllocatorCallbackBase(contextManager) {}

	// PxVirtualAllocatorCallback
	// Allocates pinned host memory; holds the CUDA context lock for the
	// duration of the driver call. The int parameter (group/flags) is unused.
	virtual void* allocate(size_t size, int, const char* file, int line) PX_OVERRIDE PX_FINAL
	{
		PxScopedCudaLock lock(*mContextManager);
		return PxgPinnedMemoryAllocate(*mCudaContext, size, file, line);
	}

	// Frees pinned host memory; null pointers are ignored without taking the lock.
	virtual void deallocate(void* ptr) PX_OVERRIDE PX_FINAL
	{
		if(ptr)
		{
			PxScopedCudaLock lock(*mContextManager);
			PxgPinnedMemoryDeallocate(*mCudaContext, ptr);
		}
	}
	//~PxVirtualAllocatorCallback
};
|
||||
|
||||
// PT: this one calls PxgCudaDeviceMemoryAllocate/PxgCudaDeviceMemoryDeallocate, i.e. cuMemAlloc/cuMemFree
|
||||
class PxgCudaDeviceMemoryAllocatorCallback : public PxgCudaAllocatorCallbackBase
|
||||
{
|
||||
public:
|
||||
PxgCudaDeviceMemoryAllocatorCallback(PxCudaContextManager* contextManager) : PxgCudaAllocatorCallbackBase(contextManager) {}
|
||||
// PxVirtualAllocatorCallback
|
||||
|
||||
virtual void* allocate(size_t size, int, const char* file, int line) PX_OVERRIDE PX_FINAL
|
||||
{
|
||||
PxScopedCudaLock lock(*mContextManager);
|
||||
return PxgCudaDeviceMemoryAllocate(*mCudaContext, size, file, line);
|
||||
}
|
||||
|
||||
virtual void deallocate(void* ptr) PX_OVERRIDE PX_FINAL
|
||||
{
|
||||
if(ptr)
|
||||
{
|
||||
PxScopedCudaLock lock(*mContextManager);
|
||||
PxgCudaDeviceMemoryDeallocate(*mCudaContext, ptr);
|
||||
}
|
||||
}
|
||||
//~PxVirtualAllocatorCallback
|
||||
};
|
||||
|
||||
// Concrete PxsMemoryManager that exposes a pinned-host allocator callback and
// a device allocator callback, both bound to the same CUDA context manager.
class PxgMemoryManager : public PxsMemoryManager
{
public:
	PxgMemoryManager(PxCudaContextManager* cudaContextManager) : mCudaContextManager(cudaContextManager), mHostMemoryAllocator(cudaContextManager), mDeviceMemoryAllocator(cudaContextManager) {}
	virtual ~PxgMemoryManager() {}

	// PxsMemoryManager
	virtual PxVirtualAllocatorCallback* getHostMemoryAllocator() PX_OVERRIDE { return &mHostMemoryAllocator; }
	virtual PxVirtualAllocatorCallback* getDeviceMemoryAllocator() PX_OVERRIDE { return &mDeviceMemoryAllocator; }
	//~PxsMemoryManager

	// Not owned; lifetime is managed by the caller of createPxgMemoryManager.
	PxCudaContextManager* mCudaContextManager;
	// Allocator members live and die with this manager; the getters above hand
	// out non-owning pointers into them.
	PxgCudaHostMemoryAllocatorCallback mHostMemoryAllocator;
	PxgCudaDeviceMemoryAllocatorCallback mDeviceMemoryAllocator;
};
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Factory for the GPU memory manager; the returned object is heap-allocated
// with PX_NEW and owns its host/device allocator callbacks.
PxsMemoryManager* physx::createPxgMemoryManager(PxCudaContextManager* cudaContextManager)
{
	return PX_NEW(PxgMemoryManager)(cudaContextManager);
}
|
||||
|
||||
Reference in New Issue
Block a user